使用 df
,我正在创建一个新的数据框 (final.df
),它在 startdate
和enddate
来自 df
datadframe。
df <- data.frame(claimid = c("123A",
"125B",
"151C",
"124A",
"325C"),
startdate = as.Date(c("2018-01-01",
"2017-05-20",
"2017-12-15",
"2017-11-05",
"2018-02-06")),
enddate = as.Date(c("2018-01-06",
"2017-06-21",
"2018-01-02",
"2017-11-15",
"2018-02-18")))
下面的嵌套函数是我目前用来创建 final.df
的函数,但是当遍历数十万个声明时,这种创建 final.df
的方法> 运行需要数小时。我正在寻找能够更有效地创建 final.df
的替代方案。
claim_level <- function(a) {
specific_row <- df[a, ]
dates <- seq(specific_row$startdate, specific_row$enddate, by="days")
day_level <- function(b) {
day <- dates[b]
data.frame(claimid = specific_row$claimid, date = day)
}
do.call("rbind", lapply(c(1:length(dates)), function(b) day_level(b)))
}
final.df <- do.call("rbind", lapply(c(1:nrow(df)), function(a) claim_level(a)))
print(subset(final.df, claimid == "123A"))
#claimid date
#123A 2018-01-01
#123A 2018-01-02
#123A 2018-01-03
#123A 2018-01-04
#123A 2018-01-05
#123A 2018-01-06
最佳答案
您可以使用 tidyr
中的 gather
将宽格式转换为长格式,然后使用 padr
中的 pad
将在开始日期和结束日期之间创建新的日期行。 group = "claimid"
参数允许您指定分组变量:
library(dplyr)
library(tidyr)
library(padr)
df %>%
gather(var, date, -claimid) %>%
pad(group = "claimid") %>%
select(-var)
或者使用 data.table
来提高效率:
library(data.table)
setDT(df)[,.(date = seq(startdate, enddate, "days")), claimid]
结果:
claimid date
1 123A 2018-01-01
2 123A 2018-01-02
3 123A 2018-01-03
4 123A 2018-01-04
5 123A 2018-01-05
6 123A 2018-01-06
7 124A 2017-11-05
8 124A 2017-11-06
9 124A 2017-11-07
10 124A 2017-11-08
11 124A 2017-11-09
12 124A 2017-11-10
13 124A 2017-11-11
14 124A 2017-11-12
15 124A 2017-11-13
16 124A 2017-11-14
17 124A 2017-11-15
18 125B 2017-05-20
19 125B 2017-05-21
20 125B 2017-05-22
...
基准:
初始化函数:
library(tidyverse)
library(padr)
library(data.table)
# OP's function
claim_level <- function(a) {
specific_row <- df[a, ]
dates <- seq(specific_row$startdate, specific_row$enddate, by="days")
day_level <- function(b) {
day <- dates[b]
data.frame(claimid = specific_row$claimid, date = day)
}
do.call("rbind", lapply(c(1:length(dates)), function(b) day_level(b)))
}
OP_f = function(){
do.call("rbind", lapply(c(1:nrow(df)), function(a) claim_level(a)))
}
# useR's tidyverse + padr
f1 = function(){
df %>%
gather(var, date, -claimid) %>%
pad(interval = "day", group = "claimid") %>%
select(-var)
}
# useR's data.table
DT = df
setDT(DT)
f2 = function(){
DT[,.(date = seq(startdate, enddate, "days")), claimid]
}
# Moody_Mudskipper's Base R
f3 = function(){
do.call(rbind,
Map(function(claimid, startdate, enddate)
data.frame(claimid, date=as.Date(startdate:enddate, origin = "1970-01-01")),
df$claimid, df$startdate, df$enddate))
}
# Moody_Mudskipper's tidyverse
f4 = function(){
df %>%
group_by(claimid) %>%
mutate(date = list(as.Date(startdate:enddate, origin = "1970-01-01"))) %>%
select(1, 4) %>%
unnest %>%
ungroup
}
# MKR's tidyr expand
f5 = function(){
df %>%
group_by(claimid) %>%
expand(date = seq(startdate, enddate, by="day"))
}
检查是否相同:
> identical(OP_f() %>% arrange(claimid), data.frame(f1()))
[1] TRUE
> identical(OP_f(), data.frame(f2()))
[1] TRUE
> identical(OP_f(), data.frame(f3()))
[1] TRUE
> identical(OP_f(), data.frame(f4()))
[1] TRUE
> identical(OP_f() %>% arrange(claimid), data.frame(f5()))
[1] TRUE
基准测试结果:
library(microbenchmark)
microbenchmark(OP_f(), f1(), f2(), f3(), f4(), f5())
Unit: milliseconds
expr min lq mean median uq max neval
OP_f() 26.421534 27.697194 30.342682 28.981143 31.537396 58.071238 100
f1() 36.133364 38.179196 40.749812 39.870931 41.367655 58.428888 100
f2() 1.005843 1.261449 1.450633 1.383232 1.559689 4.058900 100
f3() 2.373679 2.534148 2.786888 2.633035 2.797452 6.941421 100
f4() 22.659097 23.341435 25.275457 24.111411 26.499893 40.840061 100
f5() 46.445622 48.148606 52.565480 51.185478 52.845829 176.912276 100
data.table
在速度方面是赢家,@Moody_Mudskipper 的 Base R 解决方案位居第二。 padr::pad
和tidyr::expand
虽然看起来最方便,但也是最慢的(甚至比OP的原程序还慢)。
关于r - 使用 R 循环创建新数据框的更快方法,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/50648084/