r - 创建一个函数以从数据中添加因子列

标签 r function dplyr tidyr data-cleaning

我计划读取多个来自追踪数据的数据集。其中有若干行是多余的,我会在读取之前或之后直接跳过。需要保留的重要结构是 3 列的 x、y、t 数据。我还需要把“Label : Caja i”中的标签添加为新的一列,作为 ID 列。每个 i 对应一个实验单元;在完整数据集中 i 最大到 24,但并不总是按顺序排列。

接下来的问题是如何整理数据集,删除 x、y 和 t header 并添加“Caja”标签。

这是我的数据的一个子集,请注意各实验单元的记录之间存在空行(NA 行)。

dput(cucu)
structure(list(X = c("Key Images", "Title", "0:00:00:00", "", 
"Track", "Label :", "Coords (x,y:px; t:time)", "x", "0", "-2", 
"-11", "-5", "2", "5", "4", "2", "6", "6", "6", "6", "6", "6", 
"7", "-7", "-29", "-27", "-10", "-2", "5", "7", "7", "8", "13", 
"", "Track", "Label :", "Coords (x,y:px; t:time)", "x", "0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "26", 
"23", "29", "29", "29", "19", "13", "19", "25", "19", "6", "-1", 
"", "Track", "Label :", "Coords (x,y:px; t:time)", "x", "0", 
"3", "6", "17"), X.1 = c("", "Time", "0:00:00:00", "", "", "Caja 4", 
"", "y", "0", "10", "-11", "2", "17", "29", "25", "12", "-9", 
"0", "12", "28", "54", "84", "96", "105", "114", "114", "111", 
"112", "111", "116", "120", "120", "127", "", "", "Caja 5", "", 
"y", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "44", "27", "12", "1", "0", "2", "3", "2", "-2", "1", "2", 
"7", "", "", "Caja 6", "", "y", "0", "14", "32", "38"), X.2 = c("", 
"", "", "", "", "", "", "t", "0:00:00:00", "0:00:00:14", "0:00:38:71", 
"0:00:38:85", "0:00:39:00", "0:00:39:14", "0:00:39:28", "0:00:39:42", 
"0:00:39:57", "0:00:39:71", "0:00:39:85", "0:00:40:00", "0:00:40:14", 
"0:00:40:28", "0:00:40:42", "0:00:40:57", "0:00:40:71", "0:00:40:85", 
"0:00:41:00", "0:00:41:14", "0:00:41:28", "0:00:41:42", "0:00:41:57", 
"0:00:41:71", "0:00:41:85", "", "", "", "", "t", "0:00:00:00", 
"0:00:00:14", "0:00:00:28", "0:00:00:42", "0:00:00:57", "0:00:00:71", 
"0:00:00:85", "0:00:01:00", "0:00:01:14", "0:00:01:28", "0:00:01:42", 
"0:00:01:57", "0:00:01:71", "0:00:40:28", "0:00:40:42", "0:00:40:57", 
"0:00:40:71", "0:00:40:85", "0:00:41:00", "0:00:41:14", "0:00:41:28", 
"0:00:41:42", "0:00:41:57", "0:00:41:71", "0:00:41:85", "", "", 
"", "", "t", "0:00:00:00", "0:00:00:14", "0:00:00:28", "0:00:00:42"
)), .Names = c("X", "X.1", "X.2"), row.names = c(1L, 2L, 3L, 
4L, 5L, 6L, 7L, 8L, 9L, 10L, 280L, 281L, 282L, 283L, 284L, 285L, 
286L, 287L, 288L, 289L, 290L, 291L, 292L, 293L, 294L, 295L, 296L, 
297L, 298L, 299L, 300L, 301L, 302L, 303L, 304L, 305L, 306L, 307L, 
308L, 309L, 310L, 311L, 312L, 313L, 314L, 315L, 316L, 317L, 318L, 
319L, 320L, 590L, 591L, 592L, 593L, 594L, 595L, 596L, 597L, 598L, 
599L, 600L, 601L, 602L, 603L, 604L, 605L, 606L, 607L, 608L, 609L, 
610L), class = "data.frame")

这是一个虚构的预期输出示例,其中“Caja”作为 ID 列,取值为上面提到的各个水平。

dput(df)
structure(list(Caja = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6), x = c(-3.07268932779439, 
0.484266873196818, 0.917705630758503, 1.03584730496379, 0.838625757843941, 
0.647284457113794, 1.85357454343089, 1.33693715439301, 0.293179064796777, 
0.00633063048373611, 0.612938289390049, -0.50539082918022, 0.251496480385252, 
-1.75717878846605, 1.00013191487416, -1.52759310599809, 0.372782154041601, 
-1.43694367534352, 1.02558031999024, 1.35942769142916, -0.442425921092658, 
1.08025883023299, 0.715069454457284, -0.479919179648368, -1.02735585729109, 
-0.242179335814154, 1.12564595565253, 1.98471585124264, 0.804050335171037, 
-0.190691648766175), y = c(-0.897147705248384, 0.226793982297109, 
0.10644613224225, 0.99551047056361, -0.526239601986565, 0.75860661193569, 
-0.481560713881324, 0.280105671338355, 0.997811189730975, 0.491707670881505, 
-0.279446168649783, 0.440627202618172, 0.95354565418694, -1.70769016339517, 
-1.53381426766088, -0.0544834794352239, -1.20106285132598, -0.333373160001734, 
-1.20248240636362, -1.04874198341736, 0.181810095042002, -0.423574476932895, 
-1.62483188394934, 1.21775369607968, 1.14458843271056, 0.373070235147697, 
0.409431464558393, -0.213415686981125, -0.347800469559547, 0.185367109684458
), t = c(2.32451959770308, 0.692272863593035, -1.89883997859401, 
-0.223727358926319, 0.995802018891691, 0.732823865155816, 0.961381338943591, 
0.793294686992325, -1.54254907957044, 0.378897583645236, -0.818413418720775, 
-0.502999381649839, 0.103448616784216, -0.455956879543311, -1.33974211593966, 
1.33950932984407, 0.281470433388303, -0.670832974337081, -0.654599666494088, 
-0.486434674492362, -0.0600488930758214, 0.302971586427514, 1.91331234891505, 
-0.638758602719345, 0.975063194257583, -0.544269357921009, -0.129239918745744, 
0.167815176992696, 1.64640395321812, -1.26864285133868)), .Names = c("Caja", 
"x", "y", "t"), row.names = c(NA, -30L), class = "data.frame")

由于我计划对一批数据集执行此操作,因此我认为需要编写一个函数。我认为每个 Caja 的记录之间的空行可能有助于确定每段记录的长度。

    # Work-in-progress attempt at tidying one raw tracking export.
    # The argument (named `data.frame`) is expected to have a column `X.1`
    # holding the "Caja i" labels. The row-cleaning step is not implemented
    # yet, so this function is incomplete as written.
    tidy<- function (data.frame) {
    # Collect the second-column values of every row whose X.1 contains "Caja".
    # NOTE(review): `Caja` is computed but never used further down.
    Caja<-data.frame[grep("Caja",data.frame$X.1),2] # ("Caja 4" "Caja 5" "Caja 6")
    #This is the character vector I need to create a factor: "Caja 4" "Caja 5" "Caja 6"
    # Remove empty and extra Rows

# NOTE(review): four names are assigned here; the sample input shown with the
# question has only three columns (X, X.1, X.2), so this presumably fails
# until a Caja column has actually been added -- confirm.
names(data.frame)<-c("Caja","x","y","t")
return(data.frame)
}  

最佳答案

#' Tidy a raw tracking export into x/y/t records tagged with their Caja label
#'
#' The raw table has three columns. Each track is introduced by a row whose
#' second column holds a "Caja ..." label, followed by an "x"/"y"/"t" header
#' row and then the data rows. Rows with nothing in the third column (titles,
#' "Track" markers, blank separators) are discarded.
#'
#' @param df A three-column data frame as read from the raw export.
#' @return A data frame with columns `x`, `y`, `t` and `Caja`, one row per
#'   data record. `Caja` is `NA` for records that appear before any label.
tidy_df <- function(df) {
  # Rows with content in the third column: the "t" header and the data rows
  has_value <- grepl("\\w", df[[3]])

  # Rows carrying a "Caja" label in the second column
  is_label <- grepl("Caja", df[[2]])

  # Keep only label, header and data rows
  newdf <- df[has_value | is_label, , drop = FALSE]

  # One label per track, in order of appearance. Labels are deliberately NOT
  # de-duplicated: the running count below indexes tracks, so a label that
  # occurs twice must occupy two slots or later tracks get the wrong name.
  label_rows <- grepl("Caja", newdf[[2]])
  labels <- as.character(newdf[[2]][label_rows])

  # Running track number per row; rows before the first label have no track,
  # so map their index 0 to NA instead of silently shortening the vector.
  track_id <- cumsum(label_rows)
  track_id[track_id == 0L] <- NA_integer_
  newdf$Caja <- labels[track_id]

  # Remove non-value rows (headers, labels, stray text). A logical mask is
  # used because negative indexing with an empty grep() result would drop
  # every row.
  is_noise <- grepl("Label|x", newdf[[1]])
  newdf <- newdf[!is_noise, , drop = FALSE]

  # Not necessary but makes the output look better
  row.names(newdf) <- NULL
  names(newdf) <- c("x", "y", "t", "Caja")

  newdf
}


# Preview the first rows of the tidied result for the sample data `cucu`
head(tidy_df(cucu))
#     x   y          t   Caja
# 1   0   0 0:00:00:00 Caja 4
# 2  -2  10 0:00:00:14 Caja 4
# 3 -11 -11 0:00:38:71 Caja 4
# 4  -5   2 0:00:38:85 Caja 4
# 5   2  17 0:00:39:00 Caja 4
# 6   5  29 0:00:39:14 Caja 4

关于r - 创建一个函数以从数据中添加因子列,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/34361898/

相关文章:

使用c编程中的函数计算2个坐标之间的距离(x,y,z)

php - PHP中数组的串行逗号

c++ - 当第一行完成时井字游戏不会结束

r - 使用 dplyr 向列值添加抖动

r - 仅选择出现在其他因素的每个水平中的那些水平的因素

r - 为什么我得到 "warning longer object length is not a multiple of shorter object length"?

r - 将两个变量的数据从宽数据转换为长数据

r - 如何避免密度曲线在绘图中被切断

替换 dplyr 链所有列中的 NA

r - 在 R 中获取绘图边界