由于重复过滤大数据帧,R for 循环非常慢

标签 r vectorization

提前对大段数据表示歉意。只是想完整展示数据,并附上 dput() 输出。总而言之,这篇文章并不算太长。

我正在努力寻找一种更快的方法来计算数据框中列的值。我目前有以下大约 40000 行的数据框,我必须(我目前)逐行循环:

# ignore row numbers being multiples of 5
head(mydata, 50)
     iduser node_id    insert_timestamp last_10_timestamps
5    175094  171078 2018-02-11 18:33:38                  0
10   175094  171078 2018-02-22 18:33:48                  0
15   175094  171078 2018-02-26 18:33:47                  0
20   175094  171078 2018-03-02 18:33:51                  0
25   175094  171078 2018-03-07 18:33:56                  0
30   175094  171080 2018-03-15 00:47:28                  0
35   175094  171080 2018-04-07 00:46:23                  0
40   175094  171080 2018-04-15 00:46:03                  0
45   175094  171080 2018-04-21 00:46:00                  0
50   175094  171080 2018-04-29 00:46:01                  0
55   563240  171080 2017-12-08 04:44:57                  0
60   563240  171078 2017-12-10 21:32:44                  0
65   563240  171078 2017-12-13 21:32:58                  0
70   563240  171080 2017-12-16 04:57:35                  0
75   563240  171078 2017-12-18 21:33:04                  0
80   563240  171080 2017-12-21 04:47:52                  0
85   563240  171078 2017-12-23 21:32:49                  0
90   563240  171080 2017-12-26 04:47:24                  0
95   563240  171078 2017-12-28 21:33:11                  0
100  563240  171080 2017-12-31 04:44:59                  0
105  563240  171078 2018-01-02 21:32:58                  0
110  563240  171080 2018-01-05 04:59:26                  0
115  563240  171078 2018-01-07 21:33:12                  0
120  563240  171080 2018-01-10 04:45:18                  0
125  563240  171080 2018-01-13 04:45:21                  0
130  563240  171078 2018-01-15 21:33:23                  0
135  563240  171078 2018-01-18 21:33:03                  0
140  563240  171080 2018-01-21 05:06:57                  0
145  563240  171080 2018-01-24 04:45:26                  0
150  563240  171078 2018-01-26 21:33:19                  0
155  563240  171080 2018-01-29 05:20:39                  0
160  563240  171078 2018-01-31 21:34:33                  0
165  563240  171080 2018-02-03 05:06:58                  0
170  563240  171078 2018-02-05 21:34:40                  0
175  563240  171080 2018-02-08 05:38:11                  0
180  563240  171078 2018-02-10 21:34:28                  0
185  563240  171080 2018-02-13 04:45:54                  0
190  563240  171080 2018-04-03 03:47:03                  0
195  563240  171080 2018-04-08 03:46:41                  0
200  563240  171080 2018-04-13 03:47:31                  0
205  563240  171080 2018-04-18 03:47:28                  0
210  563240  171080 2018-04-23 03:47:02                  0
215  563240  171080 2018-04-29 03:46:55                  0
220 1018597  171080 2017-10-11 03:45:35                  0
225 1018597  171078 2017-10-13 20:33:16                  0
230 1018597  171080 2017-10-16 03:45:30                  0
235 1018597  171078 2017-10-18 20:33:15                  0
240 1020046  171078 2017-10-12 18:32:56                  0
245 1020046  171078 2017-10-15 18:32:55                  0
250 1020046  171078 2017-10-18 18:32:53                  0

# and the dput to reconstruct
dput(mydata)
structure(list(iduser = c(175094L, 175094L, 175094L, 175094L, 
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 563240L, 
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 
563240L, 563240L, 563240L, 563240L, 1018597L, 1018597L, 1018597L, 
1018597L, 1020046L, 1020046L, 1020046L), node_id = c(171078L, 
171078L, 171078L, 171078L, 171078L, 171080L, 171080L, 171080L, 
171080L, 171080L, 171080L, 171078L, 171078L, 171080L, 171078L, 
171080L, 171078L, 171080L, 171078L, 171080L, 171078L, 171080L, 
171078L, 171080L, 171080L, 171078L, 171078L, 171080L, 171080L, 
171078L, 171080L, 171078L, 171080L, 171078L, 171080L, 171078L, 
171080L, 171080L, 171080L, 171080L, 171080L, 171080L, 171080L, 
171080L, 171078L, 171080L, 171078L, 171078L, 171078L, 171078L
), insert_timestamp = structure(c(1518374018, 1519324428, 1519670027, 
1520015631, 1520447636, 1521074848, 1523061983, 1523753163, 1524271560, 
1524962761, 1512708297, 1512941564, 1513200778, 1513400255, 1513632784, 
1513831672, 1514064769, 1514263644, 1514496791, 1514695499, 1514928778, 
1515128366, 1515360792, 1515559518, 1515818721, 1516052003, 1516311183, 
1516511217, 1516769126, 1517002399, 1517203239, 1517434473, 1517634418, 
1517866480, 1518068291, 1518298468, 1518497154, 1522727223, 1523159201, 
1523591251, 1524023248, 1524455222, 1524973615, 1507693535, 1507926796, 
1508125530, 1508358795, 1507833176, 1508092375, 1508351573), class = c("POSIXct", 
"POSIXt"), tzone = "GMT"), last_10_timestamps = c(0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0)), .Names = c("iduser", "node_id", "insert_timestamp", 
"last_10_timestamps"), row.names = c(5L, 10L, 15L, 20L, 25L, 
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 
95L, 100L, 105L, 110L, 115L, 120L, 125L, 130L, 135L, 140L, 145L, 
150L, 155L, 160L, 165L, 170L, 175L, 180L, 185L, 190L, 195L, 200L, 
205L, 210L, 215L, 220L, 225L, 230L, 235L, 240L, 245L, 250L), class = "data.frame")

last_10_timestamps 是根据第二个数据帧计算的指标。 要计算任何给定行的last_10_timestamps,我必须通过第二个数据帧所具有的 iduser、node_id 和时间戳这三个参数来过滤第二个数据帧(将在下面显示)。首先,我将显示第二个数据帧,然后是我的缓慢 for 循环,它也清除了我想要做的事情。

head(mydata2, 50)
   iduser   meal_type        log_meal_GMT log_meal_hr
1  175094     "snack" 2018-02-06 00:12:33          19
2  175094     "snack" 2018-02-06 00:57:41          19
3  175094 "breakfast" 2018-02-06 12:19:19           7
4  175094     "lunch" 2018-02-06 20:25:18          15
5  175094    "dinner" 2018-02-06 23:48:35          18
6  175094 "breakfast" 2018-02-08 23:09:42          18
7  175094     "lunch" 2018-02-07 18:57:46          13
8  175094    "dinner" 2018-02-07 23:47:09          18
9  175094 "breakfast" 2018-02-03 16:41:20          11
10 175094     "lunch" 2018-02-03 18:56:16          13
11 175094    "dinner" 2018-02-03 23:33:15          18
12 175094 "breakfast" 2018-02-08 14:02:34           9
13 175094     "lunch" 2018-02-08 23:08:04          18
14 175094    "dinner" 2018-02-08 23:09:13          18
15 175094     "snack" 2018-02-09 03:16:02          22
16 175094 "breakfast" 2018-02-09 13:53:57           8
17 175094     "lunch" 2018-02-10 02:29:40          21
18 175094     "snack" 2018-02-05 01:37:46          20
19 175094 "breakfast" 2018-02-05 13:19:42           8
20 175094     "snack" 2018-02-05 13:55:06           8
21 175094     "lunch" 2018-02-05 16:24:44          11
22 175094    "dinner" 2018-02-05 20:58:21          15
23 175094    "dinner" 2018-02-05 20:58:41          15
24 175094     "snack" 2018-02-04 04:12:42          23
25 175094 "breakfast" 2018-02-04 13:17:59           8

dput(mydata2)
structure(list(iduser = c(175094L, 175094L, 175094L, 175094L, 
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 175094L
), meal_type = c("\"snack\"", "\"snack\"", "\"breakfast\"", "\"lunch\"", 
"\"dinner\"", "\"breakfast\"", "\"lunch\"", "\"dinner\"", "\"breakfast\"", 
"\"lunch\"", "\"dinner\"", "\"breakfast\"", "\"lunch\"", "\"dinner\"", 
"\"snack\"", "\"breakfast\"", "\"lunch\"", "\"snack\"", "\"breakfast\"", 
"\"snack\"", "\"lunch\"", "\"dinner\"", "\"dinner\"", "\"snack\"", 
"\"breakfast\""), log_meal_GMT = c("2018-02-06 00:12:33", "2018-02-06 00:57:41", 
"2018-02-06 12:19:19", "2018-02-06 20:25:18", "2018-02-06 23:48:35", 
"2018-02-08 23:09:42", "2018-02-07 18:57:46", "2018-02-07 23:47:09", 
"2018-02-03 16:41:20", "2018-02-03 18:56:16", "2018-02-03 23:33:15", 
"2018-02-08 14:02:34", "2018-02-08 23:08:04", "2018-02-08 23:09:13", 
"2018-02-09 03:16:02", "2018-02-09 13:53:57", "2018-02-10 02:29:40", 
"2018-02-05 01:37:46", "2018-02-05 13:19:42", "2018-02-05 13:55:06", 
"2018-02-05 16:24:44", "2018-02-05 20:58:21", "2018-02-05 20:58:41", 
"2018-02-04 04:12:42", "2018-02-04 13:17:59"), log_meal_hr = c(19, 
19, 7, 15, 18, 18, 13, 18, 11, 13, 18, 9, 18, 18, 22, 8, 21, 
20, 8, 8, 11, 15, 15, 23, 8)), .Names = c("iduser", "meal_type", 
"log_meal_GMT", "log_meal_hr"), row.names = c(NA, 25L), class = "data.frame")

最后,我有以下 for 循环,该循环需要很长时间。我在一个包含 40K 行的数据帧上进行 for 循环(在我的代码中),并且在每个 for 循环中,过滤一个本身为 100K 行的数据帧:

# nrow(mydata) is nearly 40,000
# For every row of mydata, look up this user's earlier "lunch" entries in
# mydata2 and store the mean of the last 10 logged meal hours.
mydata$last_10_timestamps = 0
for(i in 1:nrow(mydata)) {

  # the three keys used to filter mydata2 for the current row
  this_user = mydata$iduser[i]
  this_time = mydata$insert_timestamp[i]
  this_node = mydata$node_id[i]

  if(this_node == 171078) {
    # this filtering 2+ seconds each time, is a problem to do 40K times.
    # (it scans all ~100K rows of mydata2 on every iteration)
    starttime = Sys.time()
    lastTenTimes = tail(mydata2$log_meal_hr[mydata2$iduser == this_user & mydata2$log_meal_GMT < this_time & mydata2$meal_type == "\"lunch\"" ], 10)
    end_time = Sys.time()
    end_time - starttime
  }

  # fill value for timestamps at correct (ith) row
  # BUG: lastTenTimes is only assigned inside the if-branch above, but
  # mean() runs on EVERY iteration — rows with node_id != 171078 silently
  # reuse the value left over from the previous 171078 row (and if the
  # very first row is not 171078, this errors on an undefined variable).
  mydata$last_10_timestamps[i] = mean(lastTenTimes)  
}

任何有关如何加快速度的想法将不胜感激,谢谢!

编辑:我想提前指出,仅共享 100K 数据帧的 25-50 行可能不够,也不能完全突出显示数据。如果需要,很乐意分享更多内容。

最佳答案

查看 for 循环,并在运行 for 循环后查看 mydata,我怀疑输出并不是您想要的。查看 mydata 的第 11 行可以最好地说明这一点:该行的 iduser 在 mydata2 中并不存在,但仍然为它计算出了一个值。

for 循环中的另一个可能的问题是:即使某行的 node_id 不等于 171078,也会为该行计算(或沿用)一个值。

一个可能的解决方案是使用data.table包的连接功能。在以下解决方案中,我仅计算满足以下四个条件的行的值:

  • mydata$iduser == mydata2$iduser
  • mydata$node_id == 171078
  • mydata2$log_meal_GMT < mydata$insert_timestamp
  • mydata2$meal_type == '"lunch"'

代码:

# Convert 'log_meal_GMT' from character to POSIXct so it can be compared
# against mydata$insert_timestamp in the non-equi join below.
# Use tz = "GMT" to match the tzone = "GMT" attribute on insert_timestamp:
# without it, as.POSIXct() parses the strings in the session's local
# timezone, which silently shifts every '<' comparison by the local UTC
# offset.
mydata2$log_meal_GMT <- as.POSIXct(mydata2$log_meal_GMT, tz = "GMT")

# load the 'data.table' package and convert both data.frames to
# data.tables in place (setDT modifies by reference, no copy)
library(data.table)
setDT(mydata)
setDT(mydata2)

# use several nested joins to get the result:
#  1. keep only the '"lunch"' rows of mydata2
#  2. non-equi join them to the node_id == 171078 rows of mydata,
#     matching on iduser and keeping only meals logged strictly before
#     insert_timestamp (nomatch = 0 drops rows with no earlier meals;
#     allow.cartesian permits the one-to-many expansion)
#  3. per group, average the last 10 meal hours
#     (NOTE: after a non-equi join, the log_meal_GMT column holds the
#     insert_timestamp boundary value, so the grouping is effectively
#     by (iduser, insert_timestamp))
#  4. update-join the averages back onto mydata as 'last_10_ts';
#     unmatched rows stay NA, and the trailing [] prints the result
mydata[mydata2[meal_type == '"lunch"'
               ][mydata[node_id == 171078]
                 , on = .(iduser, log_meal_GMT < insert_timestamp)
                 , nomatch = 0
                 , allow.cartesian = TRUE
                 ][, .(last_10_ts = mean(tail(log_meal_hr, 10)))
                   , by = .(iduser, log_meal_GMT)]
       , on = .(iduser, insert_timestamp = log_meal_GMT)
       , last_10_ts := i.last_10_ts][]

给出(我还包括了 for 循环的输出):

     iduser node_id    insert_timestamp last_10_timestamps last_10_ts
 1:  175094  171078 2018-02-11 18:33:38           15.16667   15.16667
 2:  175094  171078 2018-02-22 18:33:48           15.16667   15.16667
 3:  175094  171078 2018-02-26 18:33:47           15.16667   15.16667
 4:  175094  171078 2018-03-02 18:33:51           15.16667   15.16667
 5:  175094  171078 2018-03-07 18:33:56           15.16667   15.16667
 6:  175094  171080 2018-03-15 00:47:28           15.16667         NA
 7:  175094  171080 2018-04-07 00:46:23           15.16667         NA
 8:  175094  171080 2018-04-15 00:46:03           15.16667         NA
 9:  175094  171080 2018-04-21 00:46:00           15.16667         NA
10:  175094  171080 2018-04-29 00:46:01           15.16667         NA
11:  563240  171080 2017-12-08 04:44:57           15.16667         NA
12:  563240  171078 2017-12-10 21:32:44                NaN         NA
13:  563240  171078 2017-12-13 21:32:58                NaN         NA
14:  563240  171080 2017-12-16 04:57:35                NaN         NA
15:  563240  171078 2017-12-18 21:33:04                NaN         NA
16:  563240  171080 2017-12-21 04:47:52                NaN         NA
17:  563240  171078 2017-12-23 21:32:49                NaN         NA
18:  563240  171080 2017-12-26 04:47:24                NaN         NA
19:  563240  171078 2017-12-28 21:33:11                NaN         NA
20:  563240  171080 2017-12-31 04:44:59                NaN         NA
21:  563240  171078 2018-01-02 21:32:58                NaN         NA
22:  563240  171080 2018-01-05 04:59:26                NaN         NA
23:  563240  171078 2018-01-07 21:33:12                NaN         NA
24:  563240  171080 2018-01-10 04:45:18                NaN         NA
25:  563240  171080 2018-01-13 04:45:21                NaN         NA
26:  563240  171078 2018-01-15 21:33:23                NaN         NA
27:  563240  171078 2018-01-18 21:33:03                NaN         NA
28:  563240  171080 2018-01-21 05:06:57                NaN         NA
29:  563240  171080 2018-01-24 04:45:26                NaN         NA
30:  563240  171078 2018-01-26 21:33:19                NaN         NA
31:  563240  171080 2018-01-29 05:20:39                NaN         NA
32:  563240  171078 2018-01-31 21:34:33                NaN         NA
33:  563240  171080 2018-02-03 05:06:58                NaN         NA
34:  563240  171078 2018-02-05 21:34:40                NaN         NA
35:  563240  171080 2018-02-08 05:38:11                NaN         NA
36:  563240  171078 2018-02-10 21:34:28                NaN         NA
37:  563240  171080 2018-02-13 04:45:54                NaN         NA
38:  563240  171080 2018-04-03 03:47:03                NaN         NA
39:  563240  171080 2018-04-08 03:46:41                NaN         NA
40:  563240  171080 2018-04-13 03:47:31                NaN         NA
41:  563240  171080 2018-04-18 03:47:28                NaN         NA
42:  563240  171080 2018-04-23 03:47:02                NaN         NA
43:  563240  171080 2018-04-29 03:46:55                NaN         NA
44: 1018597  171080 2017-10-11 03:45:35                NaN         NA
45: 1018597  171078 2017-10-13 20:33:16                NaN         NA
46: 1018597  171080 2017-10-16 03:45:30                NaN         NA
47: 1018597  171078 2017-10-18 20:33:15                NaN         NA
48: 1020046  171078 2017-10-12 18:32:56                NaN         NA
49: 1020046  171078 2017-10-15 18:32:55                NaN         NA
50: 1020046  171078 2017-10-18 18:32:53                NaN         NA

如果您不希望它在 node_id 上过滤,您可以使用:

# same nested-join approach as above, but without restricting the inner
# join to node_id == 171078 — every row of mydata (per iduser, with at
# least one earlier '"lunch"' entry) receives a last_10_ts value
mydata[mydata2[meal_type == '"lunch"'
               ][mydata
                 , on = .(iduser, log_meal_GMT < insert_timestamp)
                 , nomatch = 0
                 , allow.cartesian = TRUE
                 ][, .(last_10_ts = mean(tail(log_meal_hr, 10)))
                   , by = .(iduser, log_meal_GMT)]
       , on = .(iduser, insert_timestamp = log_meal_GMT)
       , last_10_ts := i.last_10_ts][]

给出:

     iduser node_id    insert_timestamp last_10_timestamps last_10_ts
 1:  175094  171078 2018-02-11 18:33:38           15.16667   15.16667
 2:  175094  171078 2018-02-22 18:33:48           15.16667   15.16667
 3:  175094  171078 2018-02-26 18:33:47           15.16667   15.16667
 4:  175094  171078 2018-03-02 18:33:51           15.16667   15.16667
 5:  175094  171078 2018-03-07 18:33:56           15.16667   15.16667
 6:  175094  171080 2018-03-15 00:47:28           15.16667   15.16667
 7:  175094  171080 2018-04-07 00:46:23           15.16667   15.16667
 8:  175094  171080 2018-04-15 00:46:03           15.16667   15.16667
 9:  175094  171080 2018-04-21 00:46:00           15.16667   15.16667
10:  175094  171080 2018-04-29 00:46:01           15.16667   15.16667
11:  563240  171080 2017-12-08 04:44:57           15.16667         NA
12:  563240  171078 2017-12-10 21:32:44                NaN         NA
13:  563240  171078 2017-12-13 21:32:58                NaN         NA
14:  563240  171080 2017-12-16 04:57:35                NaN         NA
15:  563240  171078 2017-12-18 21:33:04                NaN         NA
.....

关于由于重复过滤大数据帧,R for 循环非常慢,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/50186087/

相关文章:

r - 在 sqldf 中将引用声明为 "quote"

r - 有没有办法在绘图热图的右侧移动行标签?

c - C 中的矢量化 Trig 函数?

matlab - 多个起点和终点的高效冒号运算符

python - 在 Pandas 数据框中矢量化条件赋值

python - 在 cython 中向量化具有两个数组作为输入的类的函数

R 无法在包中找到特定功能

使用 list.files() 查找不从特定字符串开始的文件的正则表达式

r - 转换数据框中的混合日期格式

vectorization - "vectorization"是什么?