我有一个非常广泛的数据集,我试图在一定程度上但不完全地逆透视。本质上,我试图根据下划线之前的字符串将某些列分组在一起,并分别以这些组为中心。我当前的方法使用两个相反的枢轴、一个 for 循环和一个中间列表来实现我的目标。我能够得到我的最终产品,但据我所知,我想知道是否有更优雅的解决方案。我意识到我可能没有很好地解释事情,因此我使用虚拟数据集重新创建了该场景。
#Required packages
library(tidyverse)
#Dummy data
file <- as_tibble(data.frame(id = c("QQQ", "WWW", "EEE", "RRR", "TTT"),
state = c("aa", "bb", "cc", "dd", "ee"),
city = c("ff", "gg", "hh", "ii", "jj"),
a_1 = runif(5),
a_2 = runif(5),
a_3 = runif(5),
a_4 = runif(5),
a_5 = runif(5),
a_6 = runif(5),
a_7 = runif(5),
a_8 = runif(5),
a_9 = runif(5),
a_10 = runif(5),
b_1 = runif(5),
b_2 = runif(5),
b_3 = runif(5),
b_4 = runif(5),
b_5 = runif(5),
b_6 = runif(5),
b_7 = runif(5),
b_8 = runif(5),
b_9 = runif(5),
b_10 = runif(5),
c_1 = runif(5),
c_2 = runif(5),
c_3 = runif(5),
c_4 = runif(5),
c_5 = runif(5),
c_6 = runif(5),
c_7 = runif(5),
c_8 = runif(5),
c_9 = runif(5),
c_10 = runif(5)))
#My solution
longer <- file %>%
pivot_longer(cols = c(-id:-city),
names_to = c(".value", "section"),
names_pattern = "(.+)_([0-9]+$)"
)
num_letterGroup <- ncol(longer) - 4 #4 is the number of columns i want to retain
wide_list <- vector(mode = "list", length = num_letterGroup)
name_list <- vector(mode = "character", length = num_letterGroup)
for (i in 1:num_letterGroup) {
col_num <- 4 + i
col_name <- colnames(longer)[col_num]
wide <- longer %>%
select(1:4, all_of(col_name)) %>%
pivot_wider(names_from = section, values_from = col_name) %>%
mutate(letterGroup = col_name)
wide_list[[i]] <- wide
name_list[i] <- col_name
}
names(wide_list) <- name_list
wide_df <- bind_rows(wide_list)
我意识到给出的数据量可能看起来过多,但我需要列号是连续的并且达到两位数。预先感谢您提供的任何帮助。
编辑澄清:wide_df 是我想要的最终产品
最佳答案
编辑
这实际上比原来的答案简单得多。 (感谢@thelatemail)
library(tidyr)
pivot_longer(file,
cols = -c(id:city),
names_to = c('letterGroup', '.value'),
names_sep = '_')
# A tibble: 15 x 14
# id state city letterGroup `1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
# <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 QQQ aa ff a 0.894 0.534 0.583 0.327 0.497 0.254 0.877 0.236 0.585 0.436
# 2 QQQ aa ff b 0.861 0.897 0.244 0.292 0.818 0.428 0.732 0.322 0.702 0.158
# 3 QQQ aa ff c 0.371 0.842 0.918 0.615 0.346 0.675 0.821 0.718 0.461 0.374
# 4 WWW bb gg a 0.573 0.00886 0.555 0.810 0.480 0.763 0.624 0.0667 0.705 0.872
# 5 WWW bb gg b 0.994 0.652 0.961 0.825 0.398 0.0138 0.560 0.695 0.0171 0.704
# 6 WWW bb gg c 0.113 0.988 0.663 0.0461 0.335 0.478 0.291 0.338 0.386 0.183
# 7 EEE cc hh a 0.482 0.197 0.630 0.442 0.633 0.932 0.317 0.119 0.872 0.678
# 8 EEE cc hh b 0.834 0.378 0.504 0.911 0.644 0.976 0.777 0.485 0.470 0.560
# 9 EEE cc hh c 0.819 0.240 0.683 0.570 0.969 0.956 0.745 0.790 0.0548 0.314
#10 RRR dd ii a 0.887 0.818 0.0266 0.444 0.554 0.817 0.332 0.0801 0.966 0.252
#11 RRR dd ii b 0.416 0.211 0.931 0.105 0.948 0.555 0.201 0.656 0.794 0.526
#12 RRR dd ii c 0.652 0.897 0.741 0.254 0.815 0.154 0.422 0.361 0.925 0.696
#13 TTT ee jj a 0.391 0.626 0.358 0.296 0.804 0.743 0.655 0.000308 0.257 0.415
#14 TTT ee jj b 0.764 0.686 0.0174 0.460 0.0164 0.0718 0.700 0.558 0.341 0.411
#15 TTT ee jj c 0.812 0.995 0.845 0.513 0.987 0.249 0.429 0.749 0.557 0.369
原始答案
您可以完全获取长格式的数据(不需要中间列),将列名分隔在两个不同的列中并获取宽格式的数据。
file %>%
pivot_longer(cols = -c(id:city)) %>%
separate(name, into = c('letterGroup', 'col'), sep = "_") %>%
pivot_wider(names_from = col, values_from = value)
关于r - 控制 R 中枢轴的细节级别 (tidyverse),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/62925545/