r - 在 R 中按条件创建附加变量

标签 r data.table

假设我的数据如下:

# Sample data set (dput-style literal): 17 rows, all for one id (39474239) and
# one product ("Flekosteel").
# The timestamp columns dt_l, date_appr and date_call are stored as factors of
# "dd.mm.yyyy H:MM" strings, not as Date/POSIXct values.
# NOTE(review): the `country` factor declares two levels that render
# identically ("Colombia", "Colombia"); row 4 is coded with the second level,
# so it is treated as a different country when grouping. Presumably one of the
# labels contained a look-alike character in the original post -- confirm.
mydata=structure(list(dt_l = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "06.10.2018 1:57", class = "factor"), 
    id = c(39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 
    39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 
    39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 39474239L
    ), date_appr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "22.06.2019 20:05", class = "factor"), 
    date_call = structure(c(3L, 4L, 2L, 5L, 6L, 7L, 8L, 8L, 9L, 
    10L, 11L, 1L, 12L, 13L, 13L, 14L, 15L), .Label = c("03.11.2018 0:04", 
    "06.10.2018 19:41", "06.10.2018 2:01", "06.10.2018 2:02", 
    "06.10.2018 20:04", "06.10.2018 21:30", "06.10.2018 21:31", 
    "06.10.2018 22:48", "07.10.2018 22:08", "07.10.2018 22:09", 
    "09.10.2018 23:26", "11.03.2019 22:44", "22.06.2019 19:09", 
    "22.06.2019 19:40", "22.06.2019 19:45"), class = "factor"), 
    X = c(7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 
    7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L
    ), goods = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Flekosteel", class = "factor"), 
    login = structure(c(6L, 6L, 2L, 4L, 9L, 9L, 5L, 5L, 3L, 3L, 
    7L, 1L, 8L, 11L, 11L, 10L, 11L), .Label = c("COLOP1AM-1135", 
    "COLOP1AM-722", "COLOP1AM-723", "COLOP1PM-1158", "COLOP1PM-1160", 
    "COLOP1PM-1200", "COLOP1PM-1210", "COLOP1PM-1212", "COLOP1PM-1287", 
    "COLOP1PM-1844", "COLOP1PM-1867"), class = "factor"), sex = structure(c(2L, 
    2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L), .Label = c("female", "male"), class = "factor"), age = c(25L, 
    25L, 27L, 22L, 29L, 29L, 37L, 37L, 30L, 30L, 21L, 45L, 33L, 
    21L, 21L, 21L, 21L), country = structure(c(1L, 1L, 1L, 2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Colombia", 
    "Colombia"), class = "factor")), .Names = c("dt_l", "id", "date_appr", 
"date_call", "X", "goods", "login", "sex", "age", "country"), class = "data.frame", row.names = c(NA, -17L))

如何为组id + goods + sex + country设置标记变量

创建 answer 变量:每个组中除最后一行以外的所有行都应标记为 0,只有该组的最后一行标记为 1

              dt_l       id        date_appr        date_call    X      goods         login
1  06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200
2  06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200

      sex age  country answer
1    male  25 Colombia      0
2    male  25 Colombia      1

如果某个 id + goods + sex + country 组只有 0 行,则应将其标记为“incorrect data”

例如

24.08.2017 10:26    21409266    24.08.2017 16:39    \N  59de92f6d28f32d15fd2201911d27a2e    Hammer of Thor  \N  \N  \N  \N  incorrect data

怎么做?

最佳答案

因此,您希望标记每个 (id, goods, sex, country) 组的最后一行?如果是这样,您可以检查行号向量 .I 是否等于该组中的最后一个行号 last(.I)。

library(data.table)

# Convert the data.frame to a data.table by reference (no copy is made).
setDT(mydata)

# Within each (id, goods, sex, country) group, .I holds the group's row numbers
# in the full table; only the group's final row equals last(.I), so that row
# gets answer = 1 and every other row gets 0. The trailing [] prints the
# result, because := updates the table by reference and returns it invisibly.
mydata[
  ,
  answer := as.integer(.I == last(.I)),
  by = c("id", "goods", "sex", "country")
][]

#                dt_l       id        date_appr        date_call    X      goods         login    sex age  country answer
#  1: 06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200   male  25 Colombia      0
#  2: 06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200   male  25 Colombia      0
#  3: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 19:41 7954 Flekosteel  COLOP1AM-722 female  27 Colombia      0
#  4: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 20:04 7954 Flekosteel COLOP1PM-1158 female  22 Colombia      1
#  5: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:30 7954 Flekosteel COLOP1PM-1287 female  29 Colombia      0
#  6: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:31 7954 Flekosteel COLOP1PM-1287 female  29 Colombia      0
#  7: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160   male  37 Colombia      0
#  8: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160   male  37 Colombia      0
#  9: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:08 7954 Flekosteel  COLOP1AM-723 female  30 Colombia      0
# 10: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:09 7954 Flekosteel  COLOP1AM-723 female  30 Colombia      0
# 11: 06.10.2018 1:57 39474239 22.06.2019 20:05 09.10.2018 23:26 7954 Flekosteel COLOP1PM-1210 female  21 Colombia      0
# 12: 06.10.2018 1:57 39474239 22.06.2019 20:05  03.11.2018 0:04 7954 Flekosteel COLOP1AM-1135 female  45 Colombia      0
# 13: 06.10.2018 1:57 39474239 22.06.2019 20:05 11.03.2019 22:44 7954 Flekosteel COLOP1PM-1212 female  33 Colombia      0
# 14: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female  21 Colombia      0
# 15: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female  21 Colombia      0
# 16: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:40 7954 Flekosteel COLOP1PM-1844   male  21 Colombia      1
# 17: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:45 7954 Flekosteel COLOP1PM-1867 female  21 Colombia      1

关于r - 在 R 中按条件创建附加变量,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/57594609/

相关文章:

R - 如何提取字符串和空行之间的文本?

r - 如何使用 dplyr 中的 mutate 创建一系列由指定突变值的向量定义和调用的列?

r - 为什么 unlist() 将日期类型转换为数字?

r - 按组查找最接近的值

r - 基于分组时间间隔的活跃月数

r - 为什么 data.table 在设置 with = F 且 j 为单列时输出一个 data.table?

r - 如何为每一行返回一系列列中的第一个非 NULL 值?而第二个非空值?

r - ggplot2 与 rworldmap 结合

r - 如何按组加速子集

r - 为什么 data.table 不知道 "J"?