我正在使用 R 编程语言。
我有以下数据集:
library(dplyr)
# Example data, apparently pasted from dput() output: a 100-row data.frame
# (row.names = c(NA, -100L)) with two columns:
#   ethnicity         - character, values "a", "b" or "c"
#   number_of_degrees - integer, values 1L, 2L or 3L
df = structure(list(ethnicity = c("c", "c", "c", "b", "c", "b", "b",
"b", "c", "a", "b", "b", "a", "b", "c", "a", "c", "c", "a", "a",
"a", "a", "c", "b", "c", "b", "a", "b", "c", "b", "a", "c", "c",
"a", "c", "b", "a", "c", "a", "a", "b", "c", "c", "a", "c", "a",
"c", "b", "a", "b", "a", "a", "c", "a", "b", "a", "a", "c", "a",
"b", "a", "c", "a", "c", "b", "c", "b", "b", "c", "b", "b", "c",
"c", "a", "b", "b", "a", "b", "a", "a", "b", "c", "c", "a", "b",
"a", "b", "a", "c", "c", "b", "c", "a", "b", "b", "c", "b", "a",
"c", "c"), number_of_degrees = c(3L, 2L, 2L, 3L, 1L, 1L, 3L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 3L, 2L,
3L, 1L, 3L, 3L, 3L, 1L, 3L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 2L, 1L,
2L, 1L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 2L, 2L, 1L, 3L, 2L, 1L, 3L,
3L, 3L, 1L, 2L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 2L, 3L, 3L, 1L,
3L, 2L, 1L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 3L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 3L, 3L, 2L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-100L))
df %>%
  # One summary row per distinct number_of_degrees
  group_by(number_of_degrees) %>%
  # Share of rows (in percent) taken by each ethnicity inside the group;
  # mean() of a logical vector is the fraction of TRUE values
  summarise(
    percent_a = 100 * mean(ethnicity == "a"),
    percent_b = 100 * mean(ethnicity == "b"),
    percent_c = 100 * mean(ethnicity == "c")
  )
这会产生以下输出:
# A tibble: 3 x 4
number_of_degrees percent_a percent_b percent_c
<int> <dbl> <dbl> <dbl>
1 1 33.3 36.7 30
2 2 31.6 21.1 47.4
3 3 34.4 40.6 25
我的问题:是否有更“紧凑”的方式来编写此代码,这样我就不必手动编写“percent_a”、“percent_b”等?这样写起来会更快,并且能自动对 ethnicity(种族)列的所有取值执行此操作。
最佳答案
也许您可以尝试这个基本 R 选项(列名称可能与所需的输出有点不同;另外结果是 0 到 1 之间的比例,如需百分比,乘以 100 即可)
# Base R: for every number_of_degrees, tabulate ethnicity and convert the
# counts to proportions (0-1). The per-ethnicity results print as
# ethnicity.a, ethnicity.b, ethnicity.c. Requires R >= 4.1 for the \(x) lambda.
aggregate(. ~ number_of_degrees, df, \(x) proportions(table(x)))
number_of_degrees ethnicity.a ethnicity.b ethnicity.c
1 1 0.3333333 0.3666667 0.3000000
2 2 0.3157895 0.2105263 0.4736842
3 3 0.3437500 0.4062500 0.2500000
或
# Cross-tabulate ethnicity x number_of_degrees, normalise each
# number_of_degrees column (margin 2) so it sums to 1, then spread the
# long (ethnicity, number_of_degrees, Freq) table into one column per ethnicity.
df |>
  table() |>
  proportions(2) |>
  as.data.frame() |>
  reshape(
    idvar = "number_of_degrees",
    timevar = "ethnicity",
    direction = "wide"
  )
这给出了
number_of_degrees Freq.a Freq.b Freq.c
1 1 0.3333333 0.3666667 0.3000000
4 2 0.3157895 0.2105263 0.4736842
7 3 0.3437500 0.4062500 0.2500000
或者,使用 dplyr(以及 tibble 包中的 rownames_to_column)提供一个不太紧凑的选项(抱歉,我对 tidyverse 的了解有限)
# Proportion of each ethnicity within every number_of_degrees.
# rev(df) swaps the column order so number_of_degrees becomes the table's
# row dimension; margin = 1 then normalises each row to sum to 1.
table(rev(df)) %>%
  proportions(margin = 1) %>%
  as.data.frame.matrix() %>%
  # rownames_to_column() is exported by tibble, not dplyr; qualify it so the
  # snippet works with only library(dplyr) attached
  tibble::rownames_to_column(var = "number_of_degrees") %>%
  # Row names come back as character; restore the original integer type
  mutate(number_of_degrees = as.integer(number_of_degrees))
这给出了
number_of_degrees a b c
1 1 0.3333333 0.3666667 0.3000000
2 2 0.3157895 0.2105263 0.4736842
3 3 0.3437500 0.4062500 0.2500000
关于R:对所有值使用 group_by,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/76392631/