r - 根据条件在数据集中创建新行

标签 r

所以我有三列数据。您会注意到辛辛那提有 10 行,而克莱姆森有 8 行。

数据

# Example data in dput()-style form: a data.frame with 18 rows and three
# columns -- player_id (integer), rating (numeric), school (character).
# The first 10 rows belong to "Cincinnati", the remaining 8 to "Clemson".
structure(list(player_id = c(473L, 653L, 816L, 885L, 906L, 969L, 
998L, 102L, 106L, 107L, 23L, 33L, 44L, 67L, 74L, 80L, 87L, 91L
), rating = c(0.8756, 0.8646, 0.8572, 0.8547, 0.8539, 0.8519, 
0.8506, 0.8498, 0.8477, 0.8477, 0.9867, 0.9822, 0.9764, 0.9673, 
0.9654, 0.9644, 0.9614, 0.9566), school = c("Cincinnati", "Cincinnati", 
"Cincinnati", "Cincinnati", "Cincinnati", "Cincinnati", "Cincinnati", 
"Cincinnati", "Cincinnati", "Cincinnati", "Clemson", "Clemson", 
"Clemson", "Clemson", "Clemson", "Clemson", "Clemson", "Clemson"
)), class = "data.frame", row.names = c(NA, -18L))

player_id   rating  school
473 0.8756  Cincinnati
653 0.8646  Cincinnati
816 0.8572  Cincinnati
885 0.8547  Cincinnati
906 0.8539  Cincinnati
969 0.8519  Cincinnati
998 0.8506  Cincinnati
102 0.8498  Cincinnati
106 0.8477  Cincinnati
107 0.8477  Cincinnati
23  0.9867  Clemson
33  0.9822  Clemson
44  0.9764  Clemson
67  0.9673  Clemson
74  0.9654  Clemson
80  0.9644  Clemson
87  0.9614  Clemson
91  0.9566  Clemson

我的目标是让所有不足 10 行的学校补足到 10 行:为这些学校创建“占位符”行,其评分取该学校 rating 字段当前的平均值。因此,对于克莱姆森,需要补充两个评分为 0.9701 的“占位符”行。最终的输出应如下所示:

player_id   rating  school
473 0.8756  Cincinnati
653 0.8646  Cincinnati
816 0.8572  Cincinnati
885 0.8547  Cincinnati
906 0.8539  Cincinnati
969 0.8519  Cincinnati
998 0.8506  Cincinnati
102 0.8498  Cincinnati
106 0.8477  Cincinnati
107 0.8477  Cincinnati
23  0.9867  Clemson
33  0.9822  Clemson
44  0.9764  Clemson
67  0.9673  Clemson
74  0.9654  Clemson
80  0.9644  Clemson
87  0.9614  Clemson
91  0.9566  Clemson
0   0.9701  Clemson
0   0.9701  Clemson

实现这一目标的最有效方法是什么?

最佳答案

我们可以先按 school 分组,用 row_number() 生成组内行号列,再用 complete 按该行号列扩展数据集,最后把 rating 中的 NA 元素替换为该组 rating 的均值(mean)。

library(dplyr)
library(tidyr)
# Target row count per school. Only the commented-out `seq_len(n)` variant
# below reads it; the active pipeline pads every school up to the size of the
# largest school instead.
n <- 10

df1 %>%
  group_by(school) %>%
  # Number the rows inside each school so complete() can see which
  # positions are missing.
  mutate(rn = row_number()) %>%
  ungroup() %>%
  # Add one row for every missing (school, rn) combination. New rows get
  # player_id = 0; their other columns are NA.
  complete(school, rn = unique(rn), fill = list(player_id = 0)) %>%
  # If every group should have a fixed number of rows, use this instead:
  # complete(school, rn = seq_len(n), fill = list(player_id = 0)) %>%
  group_by(school) %>%
  # Placeholder rows take the mean rating of their own school.
  mutate(rating = replace_na(rating, mean(rating, na.rm = TRUE))) %>%
  # Drop the grouping so that later verbs do not silently run per school.
  ungroup()
# A tibble: 20 x 4
#   school        rn player_id rating
#   <chr>      <int>     <dbl>  <dbl>
# 1 Cincinnati     1       473  0.876
# 2 Cincinnati     2       653  0.865
# 3 Cincinnati     3       816  0.857
# 4 Cincinnati     4       885  0.855
# 5 Cincinnati     5       906  0.854
# 6 Cincinnati     6       969  0.852
# 7 Cincinnati     7       998  0.851
# 8 Cincinnati     8       102  0.850
# 9 Cincinnati     9       106  0.848
#10 Cincinnati    10       107  0.848
#11 Clemson        1        23  0.987
#12 Clemson        2        33  0.982
#13 Clemson        3        44  0.976
#14 Clemson        4        67  0.967
#15 Clemson        5        74  0.965
#16 Clemson        6        80  0.964
#17 Clemson        7        87  0.961
#18 Clemson        8        91  0.957
#19 Clemson        9         0  0.970
#20 Clemson       10         0  0.970

使用 OP 的新数据集

# Minimum number of rows every school should end up with.
n <- 20

# Inside filter(), `n()` is dplyr's group-size helper while the bare `n` is
# the threshold defined above.

# Schools that already have at least n rows are passed through unchanged.
already_full <- df1 %>%
  group_by(school) %>%
  filter(n() >= n) %>%
  ungroup()

out <- df1 %>%
  group_by(school) %>%
  # Keep only the schools that are short of n rows; only these get padded.
  filter(n() < n) %>%
  mutate(rn = row_number()) %>%
  ungroup() %>%
  # Pad each remaining school to exactly n rows.
  # NOTE(review): `rank` is presumably a column of the OP's new dataset,
  # which is not shown here -- confirm the column name before reuse.
  complete(school, rn = seq_len(n), fill = list(rank = 0)) %>%
  group_by(school) %>%
  # Placeholder rows take the mean rating of their own school.
  mutate(rating = replace_na(rating, mean(rating, na.rm = TRUE))) %>%
  # Drop the grouping so `out` does not carry it into later steps.
  ungroup() %>%
  bind_rows(already_full)

range(table(out$school))
#[1] 20 57

或者使用data.table

library(data.table)
library(zoo)
# data.table variant, written as one chain:
#   1. per school, index .SD up to max(n, .N) rows; positions past the
#      existing rows come back as all-NA rows,
#   2. per school, replace the NA ratings with that school's mean rating,
#   3. run zoo::na.locf0 over every column to fill the remaining NAs.
# NOTE(review): step 3 carries the last observed value forward, so padded
# rows appear to inherit the preceding row's values (e.g. its player_id)
# rather than 0 -- confirm that this is intended.
setDT(df1)[
  ,
  .SD[seq_len(max(n, .N))],
  by = school
][
  ,
  rating := fifelse(is.na(rating), mean(rating, na.rm = TRUE), rating),
  by = school
][
  ,
  lapply(.SD, na.locf0)
]

数据

# Example data: one row per player with an integer id, a numeric rating and
# the school name. Cincinnati contributes 10 rows, Clemson 8.
df1 <- data.frame(
  player_id = c(
    473L, 653L, 816L, 885L, 906L, 969L, 998L, 102L, 106L, 107L,
    23L, 33L, 44L, 67L, 74L, 80L, 87L, 91L
  ),
  rating = c(
    0.8756, 0.8646, 0.8572, 0.8547, 0.8539, 0.8519, 0.8506, 0.8498,
    0.8477, 0.8477,
    0.9867, 0.9822, 0.9764, 0.9673, 0.9654, 0.9644, 0.9614, 0.9566
  ),
  school = rep(c("Cincinnati", "Clemson"), times = c(10L, 8L)),
  stringsAsFactors = FALSE
)

关于r - 根据条件在数据集中创建新行,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/63365858/

相关文章:

R 包小插图

r - 生存/回归分析结果的最佳/有效绘图

r - 没有开罗的ggplot unicode字符?

r - 在 R 中使用 layout() 划分图形的线

r - 我可以像使用 ffmpeg 一样使用 R 来修剪视频吗?

r - 在 R 中使用按比例长度绘制的页面绘制 3 次

r - 如何让用户在 R Shiny 中的 ggplot2 和 gVis 图之间切换?

regex - 如何替换r中某列中字符串的第n个字符

r - setkey 和 := operator, 数据表,R

r - knitr 报告的良好本地化策略是什么?