r - 选择后 data.frame 内的修改日期变为 <NA>

标签 r date dataframe

我有一个数据框d包含一些 POSIX 日期,我想将其年份修改为 d$date$year <- 100 。起初这似乎工作正常,但是在选择此 data.frame 中的一些行后,除第一个修改日期之外的所有行都将转换为 <NA> 。我在这里做错了什么?请参阅下面的代码。 (R-Fiddle)

date <- c("2014-01-01","2015-01-02","2016-01-03")
val <- c("a","b","c")
d <- data.frame(date,val)
d$date <- strptime(d$date,format="%Y-%m-%d")
d 
#        date val
#1 2014-01-01   a
#2 2015-01-02   b
#3 2016-01-03   c
# correct date as expected

d[c(TRUE,TRUE,TRUE),] 
#        date val
#1 2014-01-01   a
#2 2015-01-02   b
#3 2016-01-03   c
# correct dates as expected

d$date2000 <- d$date
d$date2000$year <- 100 # set year to 2000

d 
#        date val   date2000
#1 2014-01-01   a 2000-01-01
#2 2015-01-02   b 2000-01-02
#3 2016-01-03   c 2000-01-03
# correct dates as expected

d[c(TRUE,TRUE,TRUE),] 
#        date val   date2000
#1 2014-01-01   a 2000-01-01
#2 2015-01-02   b       <NA>
#3 2016-01-03   c       <NA>
# first entry correct, second and third entry <NA>

最佳答案

这个问题什么时候出现?

它似乎发生在调用函数[.data.frame期间(参见d[c(TRUE,TRUE,TRUE),],还有 d[1:3,] 甚至 d[3,])。这是该函数的定义:

> `[.data.frame`
function (x, i, j, drop = if (missing(i)) TRUE else length(cols) == 
    1) 
{
    mdrop <- missing(drop)
    Narg <- nargs() - (!mdrop)
    has.j <- !missing(j)
    if (!all(names(sys.call()) %in% c("", "drop")) && !isS4(x)) 
        warning("named arguments other than 'drop' are discouraged")
    if (Narg < 3L) {
        if (!mdrop) 
            warning("'drop' argument will be ignored")
        if (missing(i)) 
            return(x)
        if (is.matrix(i)) 
            return(as.matrix(x)[i])
        nm <- names(x)
        if (is.null(nm)) 
            nm <- character()
        if (!is.character(i) && anyNA(nm)) {
            names(nm) <- names(x) <- seq_along(x)
            y <- NextMethod("[")
            cols <- names(y)
            if (anyNA(cols)) 
                stop("undefined columns selected")
            cols <- names(y) <- nm[cols]
        }
        else {
            y <- NextMethod("[")
            cols <- names(y)
            if (!is.null(cols) && anyNA(cols)) 
                stop("undefined columns selected")
        }
        if (anyDuplicated(cols)) 
            names(y) <- make.unique(cols)
        attr(y, "row.names") <- .row_names_info(x, 0L)
        attr(y, "class") <- oldClass(x)
        return(y)
    }
    if (missing(i)) {
        if (drop && !has.j && length(x) == 1L) 
            return(.subset2(x, 1L))
        nm <- names(x)
        if (is.null(nm)) 
            nm <- character()
        if (has.j && !is.character(j) && anyNA(nm)) {
            names(nm) <- names(x) <- seq_along(x)
            y <- .subset(x, j)
            cols <- names(y)
            if (anyNA(cols)) 
                stop("undefined columns selected")
            cols <- names(y) <- nm[cols]
        }
        else {
            y <- if (has.j) 
                .subset(x, j)
            else x
            cols <- names(y)
            if (anyNA(cols)) 
                stop("undefined columns selected")
        }
        if (drop && length(y) == 1L) 
            return(.subset2(y, 1L))
        if (anyDuplicated(cols)) 
            names(y) <- make.unique(cols)
        nrow <- .row_names_info(x, 2L)
        if (drop && !mdrop && nrow == 1L) 
            return(structure(y, class = NULL, row.names = NULL))
        else {
            attr(y, "class") <- oldClass(x)
            attr(y, "row.names") <- .row_names_info(x, 0L)
            return(y)
        }
    }
    xx <- x
    cols <- names(xx)
    x <- vector("list", length(x))
    x <- .Internal(copyDFattr(xx, x))
    oldClass(x) <- attr(x, "row.names") <- NULL
    if (has.j) {
        nm <- names(x)
        if (is.null(nm)) 
            nm <- character()
        if (!is.character(j) && anyNA(nm)) 
            names(nm) <- names(x) <- seq_along(x)
        x <- x[j]
        cols <- names(x)
        if (drop && length(x) == 1L) {
            if (is.character(i)) {
                rows <- attr(xx, "row.names")
                i <- pmatch(i, rows, duplicates.ok = TRUE)
            }
            xj <- .subset2(.subset(xx, j), 1L)
            return(if (length(dim(xj)) != 2L) xj[i] else xj[i, 
                , drop = FALSE])
        }
        if (anyNA(cols)) 
            stop("undefined columns selected")
        if (!is.null(names(nm))) 
            cols <- names(x) <- nm[cols]
        nxx <- structure(seq_along(xx), names = names(xx))
        sxx <- match(nxx[j], seq_along(xx))
    }
    else sxx <- seq_along(x)
    rows <- NULL
    if (is.character(i)) {
        rows <- attr(xx, "row.names")
        i <- pmatch(i, rows, duplicates.ok = TRUE)
    }
    for (j in seq_along(x)) {
        xj <- xx[[sxx[j]]]
        x[[j]] <- if (length(dim(xj)) != 2L) 
            xj[i]
        else xj[i, , drop = FALSE]
    }
    if (drop) {
        n <- length(x)
        if (n == 1L) 
            return(x[[1L]])
        if (n > 1L) {
            xj <- x[[1L]]
            nrow <- if (length(dim(xj)) == 2L) 
                dim(xj)[1L]
            else length(xj)
            drop <- !mdrop && nrow == 1L
        }
        else drop <- FALSE
    }
    if (!drop) {
        if (is.null(rows)) 
            rows <- attr(xx, "row.names")
        rows <- rows[i]
        if ((ina <- anyNA(rows)) | (dup <- anyDuplicated(rows))) {
            if (!dup && is.character(rows)) 
                dup <- "NA" %in% rows
            if (ina) 
                rows[is.na(rows)] <- "NA"
            if (dup) 
                rows <- make.unique(as.character(rows))
        }
        if (has.j && anyDuplicated(nm <- names(x))) 
            names(x) <- make.unique(nm)
        if (is.null(rows)) 
            rows <- attr(xx, "row.names")[i]
        attr(x, "row.names") <- rows
        oldClass(x) <- oldClass(xx)
    }
    x
}
<bytecode: 0x7fe8cc3a5548>
<environment: namespace:base>

相关部分发生在这里:

for (j in seq_along(x)) {
            xj <- xx[[sxx[j]]]
            x[[j]] <- if (length(dim(xj)) != 2L) 
                xj[i]
            else xj[i, , drop = FALSE]
        }

此时(例如在 d[3,] 示例中),我们有:

> str(xx)
'data.frame':   3 obs. of  3 variables:
 $ date    : POSIXlt, format: "2014-01-01" "2015-01-02" "2016-01-03"
 $ val     : Factor w/ 3 levels "a","b","c": 1 2 3
 $ date2000: POSIXlt, format: "2000-01-01" "2000-01-02" "2000-01-03"
> str(x)
List of 3
 $ date    : NULL
 $ val     : NULL
 $ date2000: NULL
> i
[1] 3
> str(sxx)
 int [1:3] 1 2 3

对于 j=3,我们有:

> str(xj)
 POSIXlt[1:3], format: "2000-01-01" "2000-01-02" "2000-01-03"
> dim(xj)
NULL
> xj[3]
[1] NA

所以这就是它失败的地方。 我认为问题来自于你用 1 个值而不是 3 替换了 d$date2000$year 的事实:

> xj$wday
[1] 3 5 0
> xj$year
[1] 100
> xj[3]
[1] NA
> xj$year <- c(100,100,100)
> xj[3]
[1] "2000-01-03 CET"

看起来,当显示xj(或d)时,xj$year的值被回收,但是当仅显示时>xj[3] 它尝试构建 POSIXlt,但由于缺少 year 元素而失败。事实上,如果我们尝试使用两个元素,而不是一三个元素,我们可以看到向量被回收:

> xj$year <- c(100,101)
> xj
[1] "2000-01-01 CET" "2001-01-02 CET" "2000-01-03 CET"
> xj[2]
[1] "2001-01-02 CET"
> xj[3]
[1] NA

关于r - 选择后 data.frame 内的修改日期变为 <NA>,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36648502/

相关文章:

R:如果行中的元素满足某些特征,如何删除行?

c++ - 数学将1970年以来的秒转换为日期,反之亦然

python - python pandas 根据日期将数据导出到多个excel文件

pandas - Spark DataFrame是否等效于Pandas Dataframe `.iloc()`方法?

python - 当另一列值更改时创建带有行号的列python

r - plotly 中的 add_trace,用于我的特定图形的跟踪

r - 加速矩阵 rowMeans 操作

r - ggplot geom_bar 与 stat = "sum"

php - MYSQL 日期字段始终输出 0000-00-00

r - 在 r 中连接两个字符串变量