我有一个数据框,其中包含 key_var 和 Amount 两个字段,如下所示:
删除某些记录/元素之后,某个 key_var 的金额合计应落在 0 到 1 之间(即 0–0.99)。我需要找出这些可以删除的记录,并新建一个变量 “FLAG” 来标记它们。
可以生成多种组合,但我只需要使用 R 生成一组组合。
供参考:如果从下面的数据集中删除最后 12 条记录/元素,剩余金额的总和为 0.25。目前这一步是手动完成的,我需要用 R 代码自动完成同样的操作。
# Example data from the question: a 59-row data.frame with a single-level
# factor column `key_var` ("XYZ_1234") and a numeric column `Amount`.
df<-structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"), Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06, -7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59, -6538178.03, 3890858.28, 3727088.57, 3440399.02, 2612664.47, 2147241.37, -1381553.09, -1307455.22, 1253244.05, -1077622.65, 1035065.78, 1020172.5, -1018263.84, 893138.6, -892595.1, -676137.21, 565106.18, -451752.19, -444984.92, -333922.62, -333922.39, 281748.19, -235644.35, -159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23, -66468.98, 61162.81, 36316.05, 24832.04, 21011.73, 13469.54, -13143.08, -11365.96, 5528.03,3822.78, -3788.55, -1809.79, 995.66, -543.5, 511.52, -18.22, -1.81, 0.96, 0.87, 0.86, 1.66)), .Names = c("key_var", "Amount"), row.names = c(NA, -59L), class = "data.frame")
谢谢, 维纳亚克
最佳答案
您可以尝试“自上而下”的方法:从第一行开始计算 cumsum(累计和),并在累计和落入目标范围时停止。
library(tidyverse)
# "Top-down" approach: accumulate Amount from the first row and keep every row
# up to and including the first one whose running total lands in [0, 0.99].
df %>%
  mutate(
    Sum = cumsum(Amount),
    Flag = between(Sum, 0, 0.99)
  ) %>%
  # match() yields the position of the FIRST in-range total, or 0 when there is
  # none, so the filter keeps zero rows instead of failing. The previous
  # rep(T, which(Flag)) form only worked when exactly one row was flagged.
  filter(row_number() <= match(TRUE, Flag, nomatch = 0L))
key_var Amount Sum Flag
1 XYZ_1234 8200304.50 8200304.50 FALSE
2 XYZ_1234 8160830.25 16361134.75 FALSE
3 XYZ_1234 -8035850.35 8325284.40 FALSE
4 XYZ_1234 -7843855.06 481429.34 FALSE
5 XYZ_1234 -7638726.82 -7157297.48 FALSE
6 XYZ_1234 7635197.95 477900.47 FALSE
7 XYZ_1234 6947059.96 7424960.43 FALSE
8 XYZ_1234 -6779376.16 645584.27 FALSE
9 XYZ_1234 -6659630.59 -6014046.32 FALSE
10 XYZ_1234 -6538178.03 -12552224.35 FALSE
11 XYZ_1234 3890858.28 -8661366.07 FALSE
12 XYZ_1234 3727088.57 -4934277.50 FALSE
13 XYZ_1234 3440399.02 -1493878.48 FALSE
14 XYZ_1234 2612664.47 1118785.99 FALSE
15 XYZ_1234 2147241.37 3266027.36 FALSE
16 XYZ_1234 -1381553.09 1884474.27 FALSE
17 XYZ_1234 -1307455.22 577019.05 FALSE
18 XYZ_1234 1253244.05 1830263.10 FALSE
19 XYZ_1234 1020172.50 2850435.60 FALSE
20 XYZ_1234 -1018263.84 1832171.76 FALSE
21 XYZ_1234 893138.60 2725310.36 FALSE
22 XYZ_1234 -892595.10 1832715.26 FALSE
23 XYZ_1234 -676137.21 1156578.05 FALSE
24 XYZ_1234 565106.18 1721684.23 FALSE
25 XYZ_1234 -451752.19 1269932.04 FALSE
26 XYZ_1234 -444984.92 824947.12 FALSE
27 XYZ_1234 -333922.62 491024.50 FALSE
28 XYZ_1234 -333922.39 157102.11 FALSE
29 XYZ_1234 -235644.35 -78542.24 FALSE
30 XYZ_1234 -159120.68 -237662.92 FALSE
31 XYZ_1234 145970.31 -91692.61 FALSE
32 XYZ_1234 124236.96 32544.35 FALSE
33 XYZ_1234 124160.23 156704.58 FALSE
34 XYZ_1234 -98276.99 58427.59 FALSE
35 XYZ_1234 -88602.23 -30174.64 FALSE
36 XYZ_1234 -66468.98 -96643.62 FALSE
37 XYZ_1234 61162.81 -35480.81 FALSE
38 XYZ_1234 24832.04 -10648.77 FALSE
39 XYZ_1234 13469.54 2820.77 FALSE
40 XYZ_1234 -3788.55 -967.78 FALSE
41 XYZ_1234 995.66 27.88 FALSE
42 XYZ_1234 -543.50 -515.62 FALSE
43 XYZ_1234 511.52 -4.10 FALSE
44 XYZ_1234 0.96 -3.14 FALSE
45 XYZ_1234 0.87 -2.27 FALSE
46 XYZ_1234 0.86 -1.41 FALSE
47 XYZ_1234 1.66 0.25 TRUE
更通用的解决方案是自上而下地尝试所有起始位置:第一次使用第 1:nrow(df) 行,然后是 2:nrow(df)、3:nrow(df),依此类推。输出是一个 data.frame,给出满足条件的各个子集的起始行号和结束行号。因此 sum(df$Amount[1:47]) 以及三个单元素子集(例如 sum(df$Amount[44:44]))都能给出预期的结果。然后,您可以据此添加一个取值为 TRUE/FALSE 的列。
# For every possible start row, accumulate Amount from that row to the end of
# df and record the first row at which the running total falls in [0, 0.99].
# `res` holds one (Start, Stop) pair per start row that has such a hit.
n_rows <- nrow(df)
# seq_len() yields an empty sequence for an empty df, unlike 1:nrow(df).
starts <- seq_len(n_rows)

# Offset (1-based, relative to the start row) of the first in-range running
# total; NA when no prefix beginning at that row lands in the range.
first_hit <- map_dbl(starts, function(start) {
  running <- cumsum(df$Amount[start:n_rows])
  as.numeric(match(TRUE, between(running, 0, 0.99)))
})

res <- tibble(
  Start = as.numeric(starts),
  Stop = starts + first_hit - 1 # convert the relative offset to a row index
) %>%
  filter(!is.na(Stop))
res
# A tibble: 4 x 2
Start Stop
<dbl> <dbl>
1 1 47
2 44 44
3 45 45
4 46 46
# Add a Flag column that is TRUE for the rows covered by the first match in
# `res` (rows res$Start[1] through res$Stop[1]) and FALSE elsewhere.
# row_number() is used instead of the row names so no helper `rowname` column
# is left in the result and non-numeric row names cannot break the comparison.
df %>%
  mutate(Flag = between(row_number(), res$Start[1], res$Stop[1])) %>%
  head()
key_var Amount Flag
1 XYZ_1234 8200305 TRUE
2 XYZ_1234 8160830 TRUE
3 XYZ_1234 -8035850 TRUE
4 XYZ_1234 -7843855 TRUE
5 XYZ_1234 -7638727 TRUE
6 XYZ_1234 7635198 TRUE
数据
# Data used by the answer: a 59-row data.frame with a single-level factor
# column `key_var` ("XYZ_1234") and a numeric column `Amount`.
# NOTE(review): the row order here differs from the question's `df`; the
# printed cumulative sum above reaches 0.25 at row 47, leaving the final 12
# rows as the records to remove.
df <- structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"),
Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06,
-7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59,
-6538178.03, 3890858.28, 3727088.57, 3440399.02, 2612664.47,
2147241.37, -1381553.09, -1307455.22, 1253244.05, 1020172.5,
-1018263.84, 893138.6, -892595.1, -676137.21, 565106.18,
-451752.19, -444984.92, -333922.62, -333922.39, -235644.35,
-159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23,
-66468.98, 61162.81, 24832.04, 13469.54, -3788.55, 995.66,
-543.5, 511.52, 0.96, 0.87, 0.86, 1.66, -1077622.65, 1035065.78,
281748.19, 36316.05, 21011.73, -13143.08, -11365.96, 5528.03,
3822.78, -1809.79, -18.22, -1.81)), class = "data.frame", row.names = c(NA,
-59L), .Names = c("key_var", "Amount"))
关于r - 使用 R 查找总和为给定值的元素,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/50507514/