r - 分组数据增长率,3组

标签 r dplyr tidyverse

我试图计算分组数据的年增长率;按 1) 组、2) 产品类别和 3) 年分组。

我尝试按这三个变量进行分组,然后按 ((x/dplyr::lag(x,1))-1)*100 计算增长率。但是,这样得到的新列全部为 NA。

# Attempt to compute the year-over-year growth rate (in percent) of exports.
# Exports and counts are first summed per (group, sna, t).
group_exports_g.rate <- baci_exports %>% ungroup() %>% 
  group_by(group,sna,t) %>% summarise(exports = sum(exports),
                                      n= sum(n)) %>% ungroup() %>% 
  # NOTE(review): the summary already has one row per (group, sna, t), so
  # grouping by all three again leaves a single row in every group; lag()
  # then has no previous row to return and gr.exports is NA throughout.
  group_by(group,sna,t) %>% arrange(group,sna,t) %>% 
  mutate(gr.exports = 100*((exports/lag(exports,1))-1))
# Print a text representation of the input data so it can be recreated.
dput(baci_exports)

structure(list(t = c(1995, 1995, 1995, 1995, 1995, 1995, 1995, 
1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 
1995, 1995), i = c(4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 
8, 12, 12, 12, 12, 12), sna = c("Capital goods", "Consumer goods", 
"Fuels and lubricants", "Intermediate goods", "Parts and accessories of capital goods", 
"Passenger motor cars", "Primary goods", "Capital goods", "Consumer goods", 
"Fuels and lubricants", "Goods not elsewhere specified", "Intermediate goods", 
"Parts and accessories of capital goods", "Passenger motor cars", 
"Primary goods", "Capital goods", "Consumer goods", "Fuels and lubricants", 
"Intermediate goods", "Parts and accessories of capital goods"
), exports = c(1195.2624224154, 22997.0533036558, 5.3693833059, 
9720.564817782, 183.9809856813, 111.2556313572, 47976.2051093033, 
5711.1685900189, 120659.569187797, 2264.5410811663, 75.806, 60184.2530566294, 
2249.821, 280.439, 36458.4499193458, 30316.017597217, 150361.471507771, 
8686043.01945958, 448718.59940394, 6633.9115459232), n = c(36L, 
207L, 1L, 137L, 30L, 5L, 58L, 143L, 654L, 5L, 3L, 322L, 57L, 
12L, 143L, 270L, 364L, 106L, 563L, 150L), name_baci = c("Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Albania", "Albania", "Albania", "Albania", "Albania", 
"Albania", "Albania", "Albania", "Algeria", "Algeria", "Algeria", 
"Algeria", "Algeria"), cntry = c("Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Albania", "Albania", "Albania", "Albania", "Albania", "Albania", 
"Albania", "Albania", "Algeria", "Algeria", "Algeria", "Algeria", 
"Algeria"), group = c("Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "SE", 
"SE", "SE", "SE", "SE", "SE", "SE", "SE", "Algeria", "Algeria", 
"Algeria", "Algeria", "Algeria")), row.names = c(NA, -20L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), groups = structure(list(t = c(1995, 
1995, 1995), i = c(4, 8, 12), .rows = list(1:7, 8:15, 16:20)), row.names = c(NA, 
-3L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
> 
structure(list(t = c(1995, 1996, 1997, 1998, 1999, 2000, 2001, 
2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 
2013, 2014, 2015, 2016, 2017), i = c(4, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4), sna = c("Capital goods", 
"Capital goods", "Capital goods", "Capital goods", "Capital goods", 
"Capital goods", "Capital goods", "Capital goods", "Capital goods", 
"Capital goods", "Capital goods", "Capital goods", "Capital goods", 
"Capital goods", "Capital goods", "Capital goods", "Capital goods", 
"Capital goods", "Capital goods", "Capital goods", "Capital goods", 
"Capital goods", "Capital goods"), exports = c(1195.2624224154, 
1487.4614064276, 1525.1489543903, 1598.3509917338, 2687.9284436967, 
1754.2923408387, 5913.8963941332, 2619.5146133123, 2915.904116471, 
6412.63136988, 8158.0444400432, 4312.0519543819, 31170.4665315818, 
39146.7973036179, 184970.163402516, 20611.8847778549, 28046.1267778067, 
12125.1990587805, 15435.0095479273, 15942.6566817083, 10549.8178035657, 
9092.5422325593, 7967.3875079918), n = c(36L, 49L, 71L, 50L, 
64L, 88L, 107L, 76L, 105L, 138L, 191L, 178L, 175L, 264L, 448L, 
306L, 210L, 232L, 306L, 280L, 207L, 180L, 198L), name_baci = c("Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan"), cntry = c("Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan"), group = c("Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan"
)), row.names = c(NA, -23L), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"), groups = structure(list(t = c(1995, 1996, 
1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 
2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017), 
    i = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
    4, 4, 4, 4, 4, 4), .rows = list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
        8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 
        19L, 20L, 21L, 22L, 23L)), row.names = c(NA, -23L), class = c("tbl_df", 
"tbl", "data.frame")))

我想要了解每个组、产品类别和年份的年增长率(百分比变化)。

最佳答案

正如 @Croote 指出的,您的样本数据只有一年,因此计算滞后值会(正确地)得到 NA。我认为这正是您想要的结果,因为 1995 年没有上一年的数据,其增长率无法定义。

为了用两年的数据进行测试,这里构造了一些假数据,其中每个值在下一年大约上涨 50%:

# Fake a second year: copy the ungrouped data, relabel it as 1996 and scale
# every export value by a random factor close to 1.5, then stack the copy
# underneath the original (ungrouped) rows.
baci_exports2 <- baci_exports %>%
  ungroup() %>%
  mutate(
    t = 1996,
    exports = exports * rnorm(n(), mean = 1.5, sd = 0.01)
  ) %>%
  bind_rows(ungroup(baci_exports), .)

此代码似乎产生了正确的输出(请注意每个 gr.exports 都接近 50,即约 50% 的增长,这与假数据中第二年所乘的约 1.5 倍系数一致):

# Total exports and counts per (group, sna, year). summarise() drops the last
# grouping variable (t), so the rows stay grouped by group and sna and lag()
# compares each year with the previous one within the same product category.
cgroup_exports_g.rate <- baci_exports2 %>%
  group_by(group, sna, t) %>%
  summarise(
    exports = sum(exports),
    n = sum(n)
  ) %>%
  mutate(gr.exports = (exports / lag(exports) - 1) * 100) %>%
  ungroup()


> cgroup_exports_g.rate
# A tibble: 40 x 6
   group       sna                                        t  exports     n gr.exports
   <chr>       <chr>                                  <dbl>    <dbl> <int>      <dbl>
 1 Afghanistan Capital goods                           1995  1195.      36       NA  
 2 Afghanistan Capital goods                           1996  1784.      36       49.3
 3 Afghanistan Consumer goods                          1995 22997.     207       NA  
 4 Afghanistan Consumer goods                          1996 34932.     207       51.9
 5 Afghanistan Fuels and lubricants                    1995     5.37     1       NA  
 6 Afghanistan Fuels and lubricants                    1996     8.00     1       49.0
 7 Afghanistan Intermediate goods                      1995  9721.     137       NA  
 8 Afghanistan Intermediate goods                      1996 14647.     137       50.7
 9 Afghanistan Parts and accessories of capital goods  1995   184.      30       NA  
10 Afghanistan Parts and accessories of capital goods  1996   272.      30       47.9
# ... with 30 more rows

编辑:使用 OP 补充的阿富汗数据(注意 summarise() 之后不要再按 t 分组,否则每组只有一行,lag() 只会返回 NA):

# Same growth-rate pipeline on the multi-year Afghanistan data: aggregate per
# (group, sna, year), then take the percent change from the previous year
# within each group/sna pair and print the ungrouped result.
afghanistan %>%
  ungroup() %>%
  group_by(group, sna, t) %>%
  summarise(exports = sum(exports), n = sum(n)) %>%
  mutate(gr.exports = (exports / lag(exports) - 1) * 100) %>%
  ungroup()

# A tibble: 23 x 6
   group       sna               t exports     n gr.exports
   <chr>       <chr>         <dbl>   <dbl> <int>      <dbl>
 1 Afghanistan Capital goods  1995   1195.    36      NA   
 2 Afghanistan Capital goods  1996   1487.    49      24.4 
 3 Afghanistan Capital goods  1997   1525.    71       2.53
 4 Afghanistan Capital goods  1998   1598.    50       4.80
 5 Afghanistan Capital goods  1999   2688.    64      68.2 
 6 Afghanistan Capital goods  2000   1754.    88     -34.7 
 7 Afghanistan Capital goods  2001   5914.   107     237.  
 8 Afghanistan Capital goods  2002   2620.    76     -55.7 
 9 Afghanistan Capital goods  2003   2916.   105      11.3 
10 Afghanistan Capital goods  2004   6413.   138     120.

关于r - 分组数据增长率,3组,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/56763030/

相关文章:

R {dplyr}:对 `rowwise` 列表列中的 data.frame 进行 `rename` 或 `mutate`,且 LHS 上的列名各不相同

r - 根据经纬度获取 K 个最近邻

r - 生存曲线中多组的不同颜色类型和线型

c++ - 如何使用 R 和 Rcpp 删除 NumericVector 中的元素以进行递归

读取单列的txt文件,记录多行每行一个单元格,到一个整洁的数据框中

r - 尝试加载 nfl_elo 的 CSV 文件;缺少参数 "dat",没有默认值

r - 将所有列转换为 data.frame 中的字符

r - 通过 "multiplying"通过其他列的名称转换列的元素

r - 从flextable中的字符串动态设置colnames

r - 从管道分隔数字列表中获取模式