我尝试编写一段代码来计算我的数据的平均值和标准误差,并将其放入一个新的 tibble 中。
但是感觉很笨拙。有谁知道可以使我的代码更优雅的包或其他技巧?
我需要计算多个子组 (days_incubated) 的均值和 se。
library(dplyr)
library(tibble)
library(tidyr)
library(data.table)
library(plotrix)
# Per-day mean and standard error of every measurement column in df1.
#
# days_incubated is stored as character, so grouping on it directly sorts the
# groups alphabetically ("0", "10", "116", "17", ...). Grouping on a factor
# whose levels follow order of first appearance keeps the rows of df2 in the
# order the data lists them (0, 4, 10, ..., 116), which is the order that any
# later positional relabelling of the days relies on.
df2 <- df1 %>%
  group_by(
    days_incubated = factor(days_incubated, levels = unique(days_incubated))
  ) %>%
  # across() replaces the deprecated summarise_each()/funs(); the output
  # columns are still named <column>_mean and <column>_se.
  summarise(across(everything(), list(mean = mean, se = std.error))) %>%
  # Restore the original character type of the grouping column.
  mutate(days_incubated = as.character(days_incubated)) %>%
  # Round every numeric summary to two decimals.
  mutate(across(where(is.numeric), ~ round(.x, digits = 2)))
# Flip df2 so that every summary column becomes a row and every day a column.
df2_trans <- transpose(df2)
# Columns take the row labels of df2; rows take the column names of df2.
names(df2_trans) <- rownames(df2)
row.names(df2_trans) <- names(df2)
# Turn the row names into a regular column called "mass".
df2_trans <- rownames_to_column(df2_trans, var = "mass")
# Break "mass" apart at the underscore into "mass" and "type".
df3_trans <- separate(
  df2_trans,
  mass,
  into = c("mass", "type"),
  sep = "([_])"
)
# Mean table: keep the "mean" rows together with the day-header row (type
# "incubated"), label the day columns, then drop the header row again.
# NOTE(review): columns "1".."9" are assumed to hold days 0, 4, 10, 17, 24,
# 66, 81, 94 and 116 in that order - confirm against the row order of df2.
mean_target <- c("mean", "incubated")
mean <- df3_trans %>%
  filter(type %in% mean_target) %>%
  # select() renames while it picks, so one call covers both steps.
  select(
    "mass",
    "mean day 0" = "1",
    "mean day 4" = "2",
    "mean day 10" = "3",
    "mean day 17" = "4",
    "mean day 24" = "5",
    "mean day 66" = "6",
    "mean day 81" = "7",
    "mean day 94" = "8",
    "mean day 116" = "9"
  ) %>%
  slice(-1)
# SE table: keep the "se" rows together with the day-header row (type
# "incubated"), label the day columns, then drop the header row again.
# NOTE(review): columns "1".."9" are assumed to hold days 0, 4, 10, 17, 24,
# 66, 81, 94 and 116 in that order - confirm against the row order of df2.
se_target <- c("se", "incubated")
se <- df3_trans %>%
  filter(type %in% se_target) %>%
  # select() renames while it picks, so one call covers both steps.
  select(
    "mass",
    "se day 0" = "1",
    "se day 4" = "2",
    "se day 10" = "3",
    "se day 17" = "4",
    "se day 24" = "5",
    "se day 66" = "6",
    "se day 81" = "7",
    "se day 94" = "8",
    "se day 116" = "9"
  ) %>%
  slice(-1)
# Combine the mean and SE tables on "mass" ...
mean_se <- full_join(mean, se, by = "mass")
# ... and interleave the columns so each day's mean sits next to its SE.
mean_se <- select(
  mean_se,
  "mass",
  "mean day 0", "se day 0",
  "mean day 4", "se day 4",
  "mean day 10", "se day 10",
  "mean day 17", "se day 17",
  "mean day 24", "se day 24",
  "mean day 66", "se day 66",
  "mean day 81", "se day 81",
  "mean day 94", "se day 94",
  "mean day 116", "se day 116"
)
这是数据:
# Example data: a plain data.frame with 45 rows. days_incubated holds the
# incubation day as a character string ("0", "4", "10", ...), five rows per
# day; the five i.x* columns are numeric measurements.
df1 <- structure(list(days_incubated = c("0", "0", "0", "0", "0", "4",
"4", "4", "4", "4", "10", "10", "10", "10", "10", "17", "17",
"17", "17", "17", "24", "24", "24", "24", "24", "66", "66", "66",
"66", "66", "81", "81", "81", "81", "81", "94", "94", "94", "94",
"94", "116", "116", "116", "116", "116"), i.x33.031 = c(7.45,
0, 78.2, 16.49, 18.77, 104.5, 28.95, 26.05, 4.11, 62.09, 1.95,
6.75, 1.41, 3.34, 3.02, 0, 100.28, 0.2, 32.66, 0, 0, 370.57,
7.24, 133.63, 55.26, 0.16, 5.5, 25.17, 16.59, 3.3, 23.95, 30.61,
4.04, 0, 6.58, 0.08, 0.01, 0, 0.38, 0, 0, 0, 0, 0.18, 0), i.x35.034 = c(0,
0, 0.15, 0.02, 0.01, 0.04, 0.04, 0.05, 0.02, 0.09, 0.02, 0, 0.04,
0.01, 0, 0, 0.22, 0, 0.08, 0, 0, 0.66, 0.01, 0.2, 0.12, 0.01,
0.01, 0.04, 0.01, 0.01, 0.01, 0.04, 0, 0, 0, 0, 0, 0, 0.01, 0,
0, 0.02, 0, 0, 0.02), i.x36.017 = c(0.47, 0.09, 0.28, 0.02, 0.03,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0.06, 0.32, 0, 0, 0, 0, 0.12,
0, 0.02), i.x39.959 = c(0.02, 0, 0.08, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.04, 0, 0, 0, 0, 0, 0.01, 0, 0,
0, 0, 0, 0.01, 0.02, 0.06, 0.03, 0.03, 0, 0, 0.02, 0.01, 0, 0,
0), i.x40.023 = c(0.35, 0.02, 0.48, 0.06, 0, 1.25, 0.09, 0.1,
0.03, 0, 0.09, 0.07, 0.55, 0.09, 0.07, 0, 0.63, 0, 0.09, 0.07,
0.02, 1.11, 0.04, 0.59, 0.13, 0, 0.01, 0.02, 0, 0, 0, 0, 0.01,
0.02, 0.06, 0.01, 0.01, 0.01, 0.01, 0.04, 0, 0.08, 0, 0, 0.01
)), row.names = c(NA, -45L), class = "data.frame")
最佳答案
这能满足您的需求吗?它似乎重现了输出。请注意,我认为您的结果中各列的标签实际上标错了:您的其中一个步骤按字符(而非数值)对各组进行了排序,因此列的顺序与您稍后重命名时所期望的顺序不同。
我们要做的是首先使用 pivot_longer()
以便我们有整洁的数据。从那里,我们可以分组和总结以计算均值和标准误差。然后我们 pivot_wider()
将其移回结果的宽格式。
library(dplyr)
library(tidyr)
library(plotrix)
# Target column order: a mean/se pair for each day, with the days in order of
# first appearance in df1. c("mean", "se") is recycled across the doubled days.
col_order <- paste0(
  c("mean", "se"), "_day_",
  rep(unique(df1$days_incubated), each = 2)
)

df1 %>%
  # Stack the measurement columns: their names go to "mass", values to "value".
  pivot_longer(cols = -days_incubated, names_to = "mass") %>%
  group_by(days_incubated, mass) %>%
  # One row per day/mass combination, then drop the grouping.
  summarise(
    mean = mean(value),
    se = std.error(value),
    .groups = "drop"
  ) %>%
  # Spread back out: one mean_day_<d> and one se_day_<d> column per day.
  pivot_wider(
    names_from = days_incubated,
    values_from = c("mean", "se"),
    names_glue = "{.value}_day_{days_incubated}"
  ) %>%
  # Put mass first, followed by the columns in the order built above.
  relocate(mass, all_of(col_order))
# A tibble: 5 x 19
mass mean_day_0 se_day_0 mean_day_4 se_day_4 mean_day_10 se_day_10 mean_day_17 se_day_17 mean_day_24 se_day_24 mean_day_66 se_day_66 mean_day_81
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 i.x3~ 24.2 13.9 45.1 17.5 3.29 0.932 26.6 19.5 113. 68.6 10.1 4.67 13.0
2 i.x3~ 0.036 0.0287 0.048 0.0116 0.014 0.00748 0.06 0.0429 0.198 0.121 0.016 0.006 0.01
3 i.x3~ 0.178 0.0867 0 0 0 0 0 0 0.01 0.01 0 0 0
4 i.x3~ 0.02 0.0155 0 0 0 0 0 0 0.008 0.008 0.002 0.002 0.006
5 i.x4~ 0.182 0.0978 0.294 0.240 0.174 0.0941 0.158 0.119 0.378 0.210 0.006 0.004 0.018
# ... with 5 more variables: se_day_81 <dbl>, mean_day_94 <dbl>, se_day_94 <dbl>, mean_day_116 <dbl>, se_day_116 <dbl>
逐行
首先,我添加了一个简短的步骤来帮助列排序。
col_order
是我们根据我们正在寻找的正确最终顺序创建的字符向量。基本上我们只是连接字符串以匹配列名。您可以跳过此步骤,只需像在select()
语句中那样单独键入每个名称,但这会节省键入时间。
现在进入管道。
pivot_longer()
将从列中获取数据并将它们堆叠起来。列名现在将位于名为mass
的变量中。默认情况下,这些值将位于一个名为value
的新变量中。group_by()
对指定变量进行分组,这些变量是您希望在最终表格中出现的行和列组合。summarize()
将每个mass
和days_incubated
组合的多行折叠成一个新行,包含两列:均值和 se。.groups = "drop"
参数表示删除组(即 tibble 不再分组,请查看帮助文件了解更多详细信息)。pivot_wider()
采用长格式摘要 tibble 并将数据旋转到列中,这与您的格式非常相似。它说从days_incubated
中获取列名,并从mean
和se
列中获取值。names_glue
参数指定如何命名新列。它使用glue
包中的语法,那里的{
括号基本上意味着“在这里替换一个值”。因此,每个名称都是 '[name of value column]_day_[days_incubated]'。relocate()
只是对列重新排序。这对于dplyr
来说相对较新,并且是现在重新排列列而不是select()
的首选方式。它说首先获取mass
,然后是all_of()
我们之前创建的col_order
向量中的列。
关于r - 用于计算平均值和标准误差的笨拙代码,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/65201684/