我有两张 table 。 table1
看起来像这样
date hour data
2010-05-01 3 5
2010-05-02 7 7
2010-05-02 10 8
2010-07-03 18 3
2011-12-09 22 1
2012-05-01 3 0
它存储为 data.table,key 设置为 date 和 hour。
我还有另一张 table ,看起来像这样。这是我的outages
表。
resource date_out date_back
joey 2010-04-30 4:00:00 2010-05-02 8:30:00
billy 2009-04-20 7:00:00 2009-02-02 5:30:00
bob 2011-11-15 12:20:00 2010-12-09 23:00:00
joey 2012-04-28 1:00:00 2012-05-02 17:00:00
我想向 table1 添加若干列,每一列对应 outages 表中的一个资源(resource)。我希望这些列的值在该资源没有中断时为 0,在中断期间为 1。
此示例的结果应该是。
date hour data joey billy bob
2010-05-01 3 5 1 0 0
2010-05-02 7 7 1 0 0
2010-05-02 10 8 0 0 0
2010-07-03 18 3 0 0 0
2011-12-09 22 1 0 0 1
2012-05-01 3 0 1 0 0
实际上我的 table1 大约有 2500 行,而 outages 表有 19000 行。我能想到的唯一方法是循环遍历 outages 表的每一行,然后把 1 写入 table1 中对应的位置。我的代码依赖于 table1 是有序的,这样至少不必为 outages 的每一行扫描整张表。即便如此,下面的代码仍然需要运行 4 个多小时。
# Flag outage periods in table1: for every row of `outages`, set the column
# named after that row's resource to 1 on each table1 row whose timestamp
# (date + hour*60*60) lies between due_out and due_back.
# Each search starts from the middle row of table1 and walks outward, so the
# loop relies on table1 being sorted by date and hour.
# NOTE(review): the sample outages table shown with this code names its
# columns date_out/date_back, while the loop reads due_out/due_back - confirm
# which names the real table uses.
# NOTE(review): in every while condition the bounds checks on `midpoint` come
# after the timestamp comparison, so table1 is indexed before the index is
# validated - confirm what happens when a walk runs off either end.
for (out in 1:length(outages$resource)) {
# Resource name of the current outage row; doubles as the column name
a<-as.character(outages[out]$resource)
# If table1 has no column for this resource yet, add a zero-filled column
# called "new" and rename it to the resource name
if (a %in% colnames(table1)==FALSE) {
table1$new<-0
setnames(table1, "new", a)
}
# Every search starts at the middle row of table1
midpoint<-round(length(table1$date)/2,0)
# Case 1: the middle row itself lies inside the outage window
if (table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages [out]$due_back)
{
# Walk backward from the middle, flagging rows while they are at or after due_out
while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
# with=FALSE: presumably makes := use the value of `a` as the column name
# (older data.table idiom) - TODO confirm for the installed version
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint-1
}
# Return to the middle and walk forward, flagging rows while they are at or before due_back
midpoint<-round(length(table1$date)/2,0)
while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint+1
}
} else {
# Case 2: the middle row lies outside the outage window
# Middle row is after the outage: step backward past the rows later than
# due_back, then flag rows backward while they are at or after due_out
if (table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back) {
while(table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
midpoint<-midpoint-1
}
while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint-1
}
}
# Reset to the middle row before testing the other direction
midpoint<-round(length(table1$date)/2,0)
# Middle row is before the outage: step forward past the rows earlier than
# due_out, then flag rows forward while they are at or before due_back
if (table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out) {
while(table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
midpoint<-midpoint+1
}
while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint+1
}
}
}
# Remove the resource column again if no row has been flagged in it so far
if (sum(table1[,a,with=FALSE])==0) {
table1[,a:=NULL,with=FALSE]
}
}
引用大家最喜欢的电视广告台词“一定有更好的方法”。
最佳答案
这是实现您想要的目标的一种方法。这里假设您的 table1 的时间精度为 1 小时。它可以修改为任意精度,但时间粒度越粗性能越好,因为它会为每个 date_out 到 date_back 的范围构建出所有可能时间点的完整序列。请注意,我使用了与 OP 略有不同的表格,以便演示重叠的时间区间,并纠正 OP 数据中的一些错误。
# Sample inputs for the demonstration.
# table1: one observation per (date, hour).
table1 <- data.table(
  date = c(
    "2010-05-01", "2010-05-02", "2010-05-02",
    "2010-07-03", "2011-12-09", "2012-05-01"
  ),
  hour = c(3, 7, 10, 18, 22, 3),
  data = c(5, 7, 8, 3, 1, 0)
)
# outages: one row per outage window of a resource; joey and bob share the
# first window, and bob has a second one.
outages <- data.table(
  resource = c("joey", "bob", "billy", "bob", "joey"),
  date_out = c(
    "2010-04-30 4:00:00", "2010-04-30 4:00:00", "2009-04-20 7:00:00",
    "2011-11-15 12:20:00", "2012-04-28 1:00:00"
  ),
  date_back = c(
    "2010-05-02 8:30:00", "2010-05-02 8:30:00", "2009-06-02 5:30:00",
    "2011-12-09 23:00:00", "2012-05-02 17:00:00"
  )
)
# Expand every outage into one row per whole hour it covers.
# date_out is rounded up and date_back is rounded down to the hour, and a
# sequence spaced by 1 hour is generated in between.
# Outages that contain no whole-hour mark (shorter than an hour, or with
# date_back earlier than date_out) yield no rows instead of making seq() stop
# with a "wrong sign in 'by' argument" error.
outages.expanded <- outages[, {
  # + 30 min - 1 s before round(): round-to-nearest acts as a ceiling
  # (for whole-second timestamps)
  first_hour <- as.POSIXct(round(as.POSIXct(date_out) + 30 * 60 - 1, "hours"))
  # - 30 min before round(): round-to-nearest acts as a floor
  last_hour <- as.POSIXct(round(as.POSIXct(date_back) - 30 * 60, "hours"))
  list(datetime = if (first_hour <= last_hour) {
    seq(first_hour, last_hour, by = 60 * 60)
  } else {
    first_hour[0L] # empty POSIXct that keeps the class and time zone
  })
}, by = list(resource, date_out)]
# Key by datetime so table1 timestamps can be joined against this table
setkey(outages.expanded, datetime)
# Merge with the original table, then run "table" to get the
# frequencies/occurrences and cbind back with the original table.
# Steps, inside out:
#   1. build one datetime per table1 row from its date and hour;
#   2. join those timestamps against outages.expanded (keyed by datetime),
#      extracting `resource`;
#   3. table() cross-tabulates the join result, and unclass() strips the
#      "table" class (presumably leaving a plain matrix) so cbind() can append
#      one count column per resource to table1.
# Resources that never match a table1 timestamp (billy in this sample) do not
# appear as a column in the printed result below.
# NOTE(review): cells are occurrence counts, so presumably a value above 1 is
# possible if one resource has overlapping outages - confirm if strict 0/1
# dummies are required.
# NOTE(review): the printed result has one row per table1 row, which relies on
# the join returning the datetime key alongside `resource`; newer data.table
# releases may need by = .EACHI in the join for that - confirm against the
# installed version.
cbind(table1, unclass(table(
outages.expanded[table1[, list(datetime=as.POSIXct(paste0(date, " ", hour, ":00:00")))],
resource])))
# date hour data bob joey
#1: 2010-05-01 3 5 1 1
#2: 2010-05-02 7 7 1 1
#3: 2010-05-02 10 8 0 0
#4: 2010-07-03 18 3 0 0
#5: 2011-12-09 22 1 1 0
#6: 2012-05-01 3 0 0 1
关于r - 根据另一个表中的日期范围在一个表中创建虚拟变量,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/16423817/