r - Purrr 映射到拆分的数据帧上以获得每个组的 AUROC

标签 r dplyr tidyverse tibble

我已经在这里问过这个问题 ( Purrr map over splitted training-dataframe to get auroc for each model ),但数据真的很糟糕,这个问题可能有点令人困惑。另外我找到了解决这个问题的方法,但感觉不是一个好方法。

所以问题如下。我根据 geology 列拆分数据框。然后,我想使用训练数据框中的其他两列来预测列 trigger 。我想为每个地质学(列表的每个元素 - 数据框)执行此操作。 然后我想在新的测试数据帧上预测这个模型并计算每个地质的 AUROC。

这是训练数据:

# the training data
structure(list(p3 = c(5, 0, 0, 2, 0, 0, 0, 2.8999999165535, 0, 
0, 0, 0, 17.5999999046326, 0, 0, 0.899999976158142, 10.0000000149012, 
1.79999995231628, 0, 8.80000019073486, 0, 0, 11.8999999761581, 
0, 9.60000026226044, 5.19999980926514, 11.7000002861023, 20.0999999046326, 
34.6000008583069, 0, 8.70000028610229, 33.8000000119209, 2.40000009536743, 
17, 27, 36.7999992370605, 0, 15.8999997973442, 0.300000011920929, 
7.69999980926514, 0, 0.899999976158142, 1.5, 0.300000011920929, 
3.50000002980232, 51.3999991416931, 6.09999990463257, 0.400000005960464, 
22, 65.3000020980835), p15 = c(34.2999999374151, 10.4999997392297, 
8.30000010877848, 69.4999992623925, 1.30000001192093, 0, 62.3999992161989, 
71.8999995738268, 32.6999994888902, 4, 0, 0.400000005960464, 
35.699999935925, 24.1000001206994, 0, 53.8999998271465, 41.8999992236495, 
37.8999994322658, 24.0999998524785, 65.5999999046326, 0, 0, 20.5000002905726, 
75.4000002145767, 68.1999989748001, 45.9000007808208, 180.999998480082, 
70.5000009462237, 112.099999666214, 11.3000001907349, 88.0999987274408, 
103.499998867512, 100.399998664856, 59.9999995827675, 200.699998855591, 
21.2999993562698, 47.1999997496605, 42.1999989748001, 58.6000000834465, 
161.299998879433, 43.3999999314547, 110.899999141693, 73.9000004529953, 
46.7999998703599, 25.7999995350838, 21.1000004559755, 86.100000500679, 
15.8999998569489, 5.3999999538064, 143.399998903275), trigger = c(FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), 
    geology = c("Sedimentary rocks", "Crystalline basement", 
    "Sedimentary rocks", "Sedimentary rocks", "Porphyry", "Porphyry", 
    "Porphyry", "Sedimentary rocks", "Sedimentary rocks", "Crystalline basement", 
    "Calcschists with ophiolites", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Porphyry", "Crystalline basement", 
    "Crystalline basement", "Sedimentary rocks", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Sedimentary rocks", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Porphyry", 
    "Calcschists with ophiolites", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Crystalline basement", 
    "Porphyry", "Crystalline basement", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Calcschists with ophiolites", 
    "Plutonite", "Crystalline basement", "Crystalline basement", 
    "Porphyry", "Sedimentary rocks")), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))

这里是测试数据


structure(list(p3 = c(6.40000009536743, 0, 0, 16.3000003397465, 
0, 0, 0, 0, 1, 1.5, 29.1000003814697, 7.49999982118607, 3.5, 
18.3999996185303, 2.69999990612268, 11.3000001907349, 0, 2, 9.5, 
0, 10.1999998092651, 0, 3.60000005364418, 0, 5.29999995231628, 
112.599998474121, 118.099997758865, 54.8999996185303, 72.8000011444092, 
79.9000015258789, 88.7000015377998, 0, 54.6000022888184, 144.599998474121, 
111.200000762939, 7.10000009834766, 32.0999999046326, 0.5, 5.3999999165535, 
0.300000011920929, 0, 36.7999982833862, 101.599998474121, 121.699998855591, 
31.0999994277954, 66.8000020980835, 139.200000762939, 9.50000011920929, 
135.300003051758, 110.900001525879), p15 = c(12.3999996185303, 
63.8000009655952, 20.7000007629395, 121.299998179078, 10.4000001549721, 
27.1999999880791, 49.5000003874302, 13.3000001907349, 31.3999998569489, 
15.4000002890825, 64.3999997377396, 25.1000001430511, 43.6999994516373, 
50.799999833107, 35.1999998092651, 35.1999998837709, 67.1000003442168, 
19.400000333786, 49.300000667572, 21.3999996706843, 75.600000411272, 
38.700000859797, 30.2999994754791, 14.9000003933907, 53.2000011727214, 
137.900000333786, 0.100000001490116, 119.300001859665, 139.700000107288, 
147.799997329712, 45.3000004068017, 56.5000000670552, 47.7999995946884, 
2.90000009536743, 139.499999403954, 26.6999999284744, 6.5, 149.700001835823, 
210.299998342991, 114.499999642372, 3.60000002384186, 60.099999524653, 
97.5999984890223, 153.100000120699, 245.299996376038, 123.49999922514, 
3.70000004768372, 90.5999985486269, 49.1000001132488, 138.599999785423
), trigger = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE)), row.names = c(NA, -50L), groups = structure(list(
    trigger = c(FALSE, TRUE), .rows = structure(list(1:25, 26:50), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

我现在为每个地质学拟合逻辑回归模型 trigger ~ p3 + 15 并获取每个类的 auroc 所做的工作如下:


res = train %>% split(., .$geology) %>%
  map( ~ glm(trigger ~ p3 + p15, data = .x, family = "binomial")) %>%
  map( ~ predict(.x, newdata = test, type = "response")) %>%
  map(function(x) {
    df = data.frame(ref = test$trigger)
    df[["pred"]] = x
    df
  }) %>% map_dfr(function(x) {
   auc = as.numeric(roc(ref ~ pred, data = x)$auc)
  }) %>% pivot_longer(cols = everything(), names_to = "geology", values_to="auc")

但有些部分(function(x){...})我想用更简洁的 purrr 风格的语法替换。我在思考 .x. 以及何时使用 {} 来防止结果传入时遇到了一些麻烦小标题(在某些时候可能是必要的)。 那么我怎样才能获得相同的结果,但是省略了 function(x) 语法呢?

最佳答案

您可以使用 group_bysummarise 而不是使用 split()。起初列表列使用起来有点棘手,但一旦你习惯了它们,它们在很多情况下都非常有用。我会尝试这样的事情:

library(tidyverse)

train %>% 
  group_by(geology) %>% 
  summarise(
    model = list(glm(trigger ~ p3 + p15, data = cur_data(), family = "binomial")),
    yhat = map(model, ~predict(.x, newdata = test, type = "response")),
    auc = map_dbl(yhat, ~pROC::roc(test$trigger, .x)$auc)
  ) %>% 
  select(geology, auc)

## A tibble: 5 x 2
#  geology                       auc
#  <chr>                       <dbl>
# 1 Calcschists with ophiolites 0.794
# 2 Crystalline basement        0.84 
# 3 Plutonite                   0.5  
# 4 Porphyry                    0.864
# 5 Sedimentary rocks           0.912

或者不创建临时列

train %>% 
  group_by(geology) %>% 
  summarise(
    auc = glm(trigger ~ p3 + p15, data = cur_data(), family = "binomial") %>% 
      predict(newdata = test, type = "response") %>% 
      pROC::roc(test$trigger, .) %>% `$`("auc") %>% as.numeric()
  )

reprex package 创建于 2021-06-28 (v1.0.0)

关于r - Purrr 映射到拆分的数据帧上以获得每个组的 AUROC,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/68159781/

相关文章:

r - 将 ggplot 保存为 EPS 格式以便在 Adob​​e Illustrator 中进行编辑——文本问题

graphics - 如何确定函数是否生成图形

r - 不要在 dplyr 中自动调用 db_disconnector

r - 将字符串匹配转换为二进制元素向量

c++ - 在 Rcpp(和 RcppArmadillo)中,如何检查 vec 是否包含复数?

r - dplyr 根据多个条件替换列中的 na 值

r - 错误 : The dbplyr package is required to communicate with database backends

r - 如果 R 中的所有值都是 NA,则删除特定列

r - 如何将具有相同协变量的个体的二项式响应数据分解为伯努利,反之亦然?

返回连续值满足条件的行