非常感谢您对这个问题的帮助,我在 SO 上找不到足够接近的例子。
我有两个 data.table:第一个称为 customer.table,包含成员在特定时间戳 (AsOfDate) 的快照;第二个称为 activity.table,描述在 ActivityDate 发送给该客户的营销事件。
我想为 customer.table 中的每条记录,查找在其 AsOfDate 之前或当时发送给该成员的最新 ActivityDate(即最大日期)。
我看过几个问题(其中一个很接近:Handle a table with ID repetition),但我不确定如何把条件 (ActivityDate <= AsOfDate) 与取事件日期的最大值结合起来——而且我还想保留连接后两个表的所有列,因为我需要计算 ActivityDate 和 AsOfDate 之间的时间差。我仍然不清楚什么时候应该使用滚动连接 (rolling join)......
#libraries
library(lubridate)
library(data.table)
# Example data, written as structure() literals (presumably dput() output).
#
# customer.table: a data.frame with columns CustomerID, AsOfDate (POSIXct,
# tzone "UTC") and distance. It has 11 rows; CustomerID 4 appears twice, the
# second time with an earlier AsOfDate (1394150400) than every other row
# (1435622400).
customer.table = structure(list(CustomerID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
4), AsOfDate = structure(c(1435622400, 1435622400, 1435622400,
1435622400, 1435622400, 1435622400, 1435622400, 1435622400, 1435622400,
1435622400, 1394150400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), distance = c(2.17380476584343, 29.4024827688224,
3.01353310956009, 18.4923143452557, 294.878606580665, 11.8870209430565,
9.54438580030996, 24.2192034858273, 15.0069335290262, 10.4513664447137,
18.4923143452557)), .Names = c("CustomerID", "AsOfDate", "distance"
), row.names = c("1", "5", "8", "10", "18", "28", "33", "37",
"45", "47", "101"), class = "data.frame")
# activity.table: a data.frame with 53 rows and columns CustomerID,
# ActivityDate (POSIXct, tzone "UTC") and row.index (1:53, i.e. the original
# row position of each activity). A customer can have many activity rows.
activity.table = structure(list(CustomerID = c(3, 5, 8, 10, 4, 10, 2, 2, 5, 7,
5, 8, 4, 6, 10, 6, 5, 4, 2, 5, 5, 6, 5, 5, 10, 8, 6, 4, 5, 8,
7, 1, 8, 10, 7, 8, 4, 1, 1, 10, 9, 7, 4, 6, 9, 10, 8, 3, 5, 8,
1, 4, 4), ActivityDate = structure(c(1330560000, 1368144000,
1332855900, 1337817600, 1370822400, 1365984000, 1337817600, 1368144000,
1331164800, 1331164800, 1394150400, 1394150400, 1396224000, 1393891200,
1393891200, 1398643200, 1396310400, 1399334400, 1399939200, 1403222400,
1402358400, 1404086400, 1425254400, 1426464000, 1426464000, 1426464000,
1427155200, 1429056000, 1429056000, 1429056000, 1363737600, 1332201600,
1330560000, 1433116800, 1433289600, 1433289600, 1338462000, 1366628400,
1335885300, 1427241600, 1427241600, 1427241600, 1430265600, 1430265600,
1430265600, 1430265600, 1365503400, 1338394200, 1430265600, 1430265600,
1432598400, 1433894400, 1426723200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), row.index = 1:53), .Names = c("CustomerID", "ActivityDate",
"row.index"), row.names = c(NA, -53L), class = "data.frame")
# what does the data look like
> head(activity.table)
CustomerID ActivityDate row.index
1 3 2012-03-01 00:00:00 1
2 5 2013-05-10 00:00:00 2
3 8 2012-03-27 13:45:00 3
4 10 2012-05-24 00:00:00 4
5 4 2013-06-10 00:00:00 5
6 10 2013-04-15 00:00:00 6
> head(customer.table)
CustomerID AsOfDate distance
1 1 2015-06-30 2.173805
5 2 2015-06-30 29.402483
8 3 2015-06-30 3.013533
10 4 2015-06-30 18.492314
18 5 2015-06-30 294.878607
28 6 2015-06-30 11.887021
感谢您的帮助。
最佳答案
您似乎正在这里寻找一个简单的滚动连接 (rolling join)。首先,我们将两个表转换为 data.table 对象(请注意,此解决方案使用 CRAN 上的最新版本,即 v1.9.6+):
library(data.table) # V 1.9.6+
# Convert both data.frames to data.table. setDT() converts in place, which is
# why its return value is not assigned; it must be given the table's own name.
setDT(customer.table)
setDT(activity.table)
然后,对于 customer.table 中的每一行,我们在 activity.table 中查找同一 CustomerID 下最接近(不晚于 AsOfDate)的 ActivityDate 并进行连接,同时让滚动范围为无穷大 (roll = Inf):
# Rolling join: for every row of customer.table, find the row of
# activity.table with the same CustomerID whose ActivityDate is the latest one
# at or before that row's AsOfDate.
#   roll = Inf    carry the last earlier ActivityDate forward with no limit
#   which = TRUE  return row numbers of activity.table instead of joined rows
indx <- activity.table[
  customer.table,
  roll = Inf,
  which = TRUE,
  on = c(CustomerID = "CustomerID", ActivityDate = "AsOfDate")
]
print(indx)
# [1] 51 19 48 52 49 44 35 36 45 34 5
indx 是一个位置向量:对于 customer.table 中的每一行,它给出 activity.table 中日期最接近的那条记录所在的行号。
现在,只需把查到的日期连接回 customer.table:
# Use the row numbers in indx to pull the matching ActivityDate values and add
# them to customer.table, by reference, as a new MostRecentDate column.
customer.table[, MostRecentDate := activity.table$ActivityDate[indx]]
customer.table
# CustomerID AsOfDate distance MostRecentDate
# 1: 1 2015-06-30 2.173805 2015-05-26 00:00:00
# 2: 2 2015-06-30 29.402483 2014-05-13 00:00:00
# 3: 3 2015-06-30 3.013533 2012-05-30 16:10:00
# 4: 4 2015-06-30 18.492314 2015-06-10 00:00:00
# 5: 5 2015-06-30 294.878607 2015-04-29 00:00:00
# 6: 6 2015-06-30 11.887021 2015-04-29 00:00:00
# 7: 7 2015-06-30 9.544386 2015-06-03 00:00:00
# 8: 8 2015-06-30 24.219203 2015-06-03 00:00:00
# 9: 9 2015-06-30 15.006934 2015-04-29 00:00:00
# 10: 10 2015-06-30 10.451366 2015-06-01 00:00:00
# 11: 4 2014-03-07 18.492314 2013-06-10 00:00:00
关于r - Data.table:加入 ID 和日期键,但希望在第一个表中的日期键之前(或等于)日期键最接近的日期,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/33357341/