我有以下数据:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
set.seed(1)
df <- data_frame(
genes = paste("Gene_",letters[0:10],sep=""),
X = rnorm(10, 0, 1),
Y = rnorm(10, 0, 2),
Z = rnorm(10, 0, 4))
df
#> # A tibble: 10 × 4
#> genes X Y Z
#> <chr> <dbl> <dbl> <dbl>
#> 1 Gene_a -0.6264538 3.02356234 3.6759095
#> 2 Gene_b 0.1836433 0.77968647 3.1285452
#> 3 Gene_c -0.8356286 -1.24248116 0.2982599
#> 4 Gene_d 1.5952808 -4.42939977 -7.9574068
#> 5 Gene_e 0.3295078 2.24986184 2.4793030
#> 6 Gene_f -0.8204684 -0.08986722 -0.2245150
#> 7 Gene_g 0.4874291 -0.03238053 -0.6231820
#> 8 Gene_h 0.7383247 1.88767242 -5.8830095
#> 9 Gene_i 0.5757814 1.64244239 -1.9126002
#> 10 Gene_j -0.3053884 1.18780264 1.6717662
我可以这样计算 X 列:
fit_X <- MASS::fitdistr(df$X,"normal")
broom::tidy(fit_X)
#> term estimate std.error
#> 1 mean 0.1322028 0.2341758
#> 2 sd 0.7405289 0.1655873
broom::glance(fit_X)
#> n logLik AIC BIC
#> 1 10 -11.18548 26.37096 26.97613
如何对所有列(除了第一列 - genes
)执行此操作,以便最终我得到:
mean.estimate sd.estimate mean.stderror sd.stderror n loglik AIC BIC
X 0.1322028 0.7405289 0.2341758 0.1655873 10 -11.18548 26.37096 26.97613
Y 0.4976899 2.0292617 0.6417089 0.4537 10 -21.26611 46.53221 47.13738
Z -0.534693 3.626276 1.1467291 0.8108599 10 -27.07145 58.14289 58.74806
最佳答案
您可以使用 tidyr::gather
、purrr::map
和 broom::glance
,如下所示:
df_new <- gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>%
mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>%
mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>%
mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>%
mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd")) %>%
mutate(mod_glance = map(mod, glance)) %>%
select(-vec, -mod) %>%
unnest()
该代码块的分割如下:
获取列中每个组的向量
gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list)
# A tibble: 3 × 2
var vec
<chr> <list>
1 X <dbl [10]>
2 Y <dbl [10]>
3 Z <dbl [10]>
在新列中添加模型
df_new <- gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") ))
# A tibble: 3 × 3
var vec mod
<chr> <list> <list>
1 X <dbl [10]> <S3: fitdistr>
2 Y <dbl [10]> <S3: fitdistr>
3 Z <dbl [10]> <S3: fitdistr>
添加估算列
df_new <- gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>%
mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>%
mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>%
mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>%
mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd"))
# A tibble: 3 × 7
var vec mod mod_est_mean mod_est_sd mod_std_mean mod_std_error
<chr> <list> <list> <dbl> <dbl> <dbl> <dbl>
1 X <dbl [10]> <S3: fitdistr> 0.1322028 0.7405289 0.2341758 0.1655873
2 Y <dbl [10]> <S3: fitdistr> 0.4976899 2.0292617 0.6417089 0.4537567
3 Z <dbl [10]> <S3: fitdistr> -0.5346930 3.6262759 1.1467291 0.8108599
浏览模型并解除嵌套
gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>%
mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>%
mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>%
mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>%
mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd")) %>%
mutate(mod_glance = map(mod, glance)) %>%
select(-vec, -mod) %>%
unnest()
# A tibble: 3 × 9
var mod_est_mean mod_est_sd mod_std_mean mod_std_error n logLik AIC BIC
<chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 X 0.1322028 0.7405289 0.2341758 0.1655873 10 -11.18548 26.37096 26.97613
2 Y 0.4976899 2.0292617 0.6417089 0.4537567 10 -21.26611 46.53221 47.13738
3 Z -0.5346930 3.6262759 1.1467291 0.8108599 10 -27.07145 58.14289 58.74806
关于r - 如何使用 broom 和 dplyr 适应多列分布,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43849952/