我试图计算分组数据的年增长率;按 1) 组、2) 产品类别和 3) 年分组。
我尝试按三个参数进行分组,然后将增长率计算为:((x/dplyr::lag(x,1))-1)*100。但是,此操作会导致新列完全不适用。
group_exports_g.rate <- baci_exports %>% ungroup() %>%
group_by(group,sna,t) %>% summarise(exports = sum(exports),
n= sum(n)) %>% ungroup() %>%
group_by(group,sna,t) %>% arrange(group,sna,t) %>%
mutate(gr.exports = 100*((exports/lag(exports,1))-1))
dput(baci_exports)
structure(list(t = c(1995, 1995, 1995, 1995, 1995, 1995, 1995,
1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995,
1995, 1995), i = c(4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
8, 12, 12, 12, 12, 12), sna = c("Capital goods", "Consumer goods",
"Fuels and lubricants", "Intermediate goods", "Parts and accessories of capital goods",
"Passenger motor cars", "Primary goods", "Capital goods", "Consumer goods",
"Fuels and lubricants", "Goods not elsewhere specified", "Intermediate goods",
"Parts and accessories of capital goods", "Passenger motor cars",
"Primary goods", "Capital goods", "Consumer goods", "Fuels and lubricants",
"Intermediate goods", "Parts and accessories of capital goods"
), exports = c(1195.2624224154, 22997.0533036558, 5.3693833059,
9720.564817782, 183.9809856813, 111.2556313572, 47976.2051093033,
5711.1685900189, 120659.569187797, 2264.5410811663, 75.806, 60184.2530566294,
2249.821, 280.439, 36458.4499193458, 30316.017597217, 150361.471507771,
8686043.01945958, 448718.59940394, 6633.9115459232), n = c(36L,
207L, 1L, 137L, 30L, 5L, 58L, 143L, 654L, 5L, 3L, 322L, 57L,
12L, 143L, 270L, 364L, 106L, 563L, 150L), name_baci = c("Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Albania", "Albania", "Albania", "Albania", "Albania",
"Albania", "Albania", "Albania", "Algeria", "Algeria", "Algeria",
"Algeria", "Algeria"), cntry = c("Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Albania", "Albania", "Albania", "Albania", "Albania", "Albania",
"Albania", "Albania", "Algeria", "Algeria", "Algeria", "Algeria",
"Algeria"), group = c("Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "SE",
"SE", "SE", "SE", "SE", "SE", "SE", "SE", "Algeria", "Algeria",
"Algeria", "Algeria", "Algeria")), row.names = c(NA, -20L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(t = c(1995,
1995, 1995), i = c(4, 8, 12), .rows = list(1:7, 8:15, 16:20)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
>
structure(list(t = c(1995, 1996, 1997, 1998, 1999, 2000, 2001,
2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
2013, 2014, 2015, 2016, 2017), i = c(4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4), sna = c("Capital goods",
"Capital goods", "Capital goods", "Capital goods", "Capital goods",
"Capital goods", "Capital goods", "Capital goods", "Capital goods",
"Capital goods", "Capital goods", "Capital goods", "Capital goods",
"Capital goods", "Capital goods", "Capital goods", "Capital goods",
"Capital goods", "Capital goods", "Capital goods", "Capital goods",
"Capital goods", "Capital goods"), exports = c(1195.2624224154,
1487.4614064276, 1525.1489543903, 1598.3509917338, 2687.9284436967,
1754.2923408387, 5913.8963941332, 2619.5146133123, 2915.904116471,
6412.63136988, 8158.0444400432, 4312.0519543819, 31170.4665315818,
39146.7973036179, 184970.163402516, 20611.8847778549, 28046.1267778067,
12125.1990587805, 15435.0095479273, 15942.6566817083, 10549.8178035657,
9092.5422325593, 7967.3875079918), n = c(36L, 49L, 71L, 50L,
64L, 88L, 107L, 76L, 105L, 138L, 191L, 178L, 175L, 264L, 448L,
306L, 210L, 232L, 306L, 280L, 207L, 180L, 198L), name_baci = c("Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan"), cntry = c("Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan"), group = c("Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan"
)), row.names = c(NA, -23L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), groups = structure(list(t = c(1995, 1996,
1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017),
i = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4), .rows = list(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L)), row.names = c(NA, -23L), class = c("tbl_df",
"tbl", "data.frame")))
我想要了解每个组、产品类别和年份的年增长率(百分比变化)。
最佳答案
正如 @Croote 指出的,您的样本数据只有一年,因此计算滞后将(正确地)产生 NA。我认为这就是您想要的,因为 1995 年的增长是不确定的。
为了测试两年数据的数据,这里有一些假数据,其中每个值约为。明年上涨 50%:
baci_exports2 <- bind_rows(
baci_exports %>% ungroup(),
baci_exports %>%
ungroup() %>%
mutate(t = 1996,
exports = exports * rnorm(n(), mean = 1.5, sd = 0.01))
)
此代码似乎产生了正确的输出:(请注意每个 gr.exports
如何接近 50,对应于 50% 的增长,并且 ~1.5x
应用于虚假数据的第二年。)
cgroup_exports_g.rate <- baci_exports2 %>%
group_by(group,sna,t) %>%
summarise(exports = sum(exports),
n= sum(n)) %>%
mutate(gr.exports = 100*((exports/lag(exports,1))-1)) %>%
ungroup()
> cgroup_exports_g.rate
# A tibble: 40 x 6
group sna t exports n gr.exports
<chr> <chr> <dbl> <dbl> <int> <dbl>
1 Afghanistan Capital goods 1995 1195. 36 NA
2 Afghanistan Capital goods 1996 1784. 36 49.3
3 Afghanistan Consumer goods 1995 22997. 207 NA
4 Afghanistan Consumer goods 1996 34932. 207 51.9
5 Afghanistan Fuels and lubricants 1995 5.37 1 NA
6 Afghanistan Fuels and lubricants 1996 8.00 1 49.0
7 Afghanistan Intermediate goods 1995 9721. 137 NA
8 Afghanistan Intermediate goods 1996 14647. 137 50.7
9 Afghanistan Parts and accessories of capital goods 1995 184. 30 NA
10 Afghanistan Parts and accessories of capital goods 1996 272. 30 47.9
# ... with 30 more rows
编辑,使用OP中的新阿富汗数据:
afghanistan %>%
ungroup() %>%
group_by(group,sna,t) %>%
summarise(exports = sum(exports),
n= sum(n)) %>%
mutate(gr.exports = 100*((exports/lag(exports,1))-1)) %>%
ungroup()
# A tibble: 23 x 6
group sna t exports n gr.exports
<chr> <chr> <dbl> <dbl> <int> <dbl>
1 Afghanistan Capital goods 1995 1195. 36 NA
2 Afghanistan Capital goods 1996 1487. 49 24.4
3 Afghanistan Capital goods 1997 1525. 71 2.53
4 Afghanistan Capital goods 1998 1598. 50 4.80
5 Afghanistan Capital goods 1999 2688. 64 68.2
6 Afghanistan Capital goods 2000 1754. 88 -34.7
7 Afghanistan Capital goods 2001 5914. 107 237.
8 Afghanistan Capital goods 2002 2620. 76 -55.7
9 Afghanistan Capital goods 2003 2916. 105 11.3
10 Afghanistan Capital goods 2004 6413. 138 120.
关于r - 分组数据增长率,3组,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/56763030/