我有以下代码:
library(tidyverse)
# Reduce `astronauts` to one row per distinct
# (name, nationality, total_hrs_sum) combination: summarise() is called
# without any summary expressions, so only the grouping columns remain.
astronauts %>%
  group_by(name, nationality, total_hrs_sum) %>%
  summarise()
输出:
name nationality total_hrs_sum
<chr> <chr> <dbl>
Acaba, Joseph M. U.S. 7272.23
Acton, Loren Wilbur U.S. 190.94
Adamson, James C. U.S. 334.00
Afanasyev, Viktor Mikhaylovich U.S.S.R/Russia 13338.55
Aidyn (Aydyn) Akanovich Aimbetov Kazakhstan 236.23
Akers, Thomas D. U.S. 814.00
Akiyama, Toyohiro Japan 189.90
Aksyonov, Vladimir U.S.S.R/Russia 284.18
Al Mansoori, Hazzaa UAE 189.00
Al-saud, Sultan bin Salman Saudi Arabia 170.00
我的问题:
我想进一步筛选这个数据框,使每个国籍只保留 1 个姓名,即该国籍中 total_hrs_sum 列取值最高的那一行。我最希望得到 dplyr 的解决方案,但也欢迎其他可行的方法。
示例数据(`dput()` 的输出;将其赋值给 `df` 后即可运行下面答案中的代码):
structure(list(name = c("Acaba, Joseph M.", "Acton, Loren Wilbur",
"Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", "Aidyn (Aydyn) Akanovich Aimbetov",
"Akers, Thomas D.", "Akiyama, Toyohiro", "Aksyonov, Vladimir",
"Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", "Aldrin, Edwin Eugene, Jr.",
"Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", "Allen, Andrew M.",
"Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison",
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)",
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.",
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II",
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri",
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena",
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.",
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.",
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick",
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel",
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.",
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan",
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria",
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.",
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia",
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.",
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia",
"U.S.S.R/Russia", "U.S.S.R/Russia"), total_hrs_sum = c(7272.23,
190.94, 334, 13338.55, 236.23, 814, 189.9, 284.18, 189, 170,
289, 47, 7434.03, 904, 314, 1224, 147, 4046, 593, 614.37, 261.525,
579, 847, 639.5, 206, 188.71, 307, 8784, 3471.35, 377.5, 664,
5686.82, 190.2, 4722, 17942.23, 338, 686, 965, 4297.28, 5085,
734, 190.94, 473.75, 169.63, 1671.75, 708, 190, 26.03, 94.83,
5073.07)), row.names = c(NA, -50L), groups = structure(list(name = c("Acaba, Joseph M.",
"Acton, Loren Wilbur", "Adamson, James C.", "Afanasyev, Viktor Mikhaylovich",
"Aidyn (Aydyn) Akanovich Aimbetov", "Akers, Thomas D.", "Akiyama, Toyohiro",
"Aksyonov, Vladimir", "Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman",
"Aldrin, Edwin Eugene, Jr.", "Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr",
"Allen, Andrew M.", "Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison",
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)",
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.",
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II",
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri",
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena",
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.",
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.",
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick",
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel",
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.",
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan",
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria",
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.",
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia",
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.",
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia",
"U.S.S.R/Russia", "U.S.S.R/Russia"), .rows = structure(list(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 50L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
最佳答案
我们可以先按 “nationality” 以及降序的 “total_hrs_sum” 排序,
再按 “nationality” 分组,然后用 slice 取出每组的第一行
library(dplyr)
# Keep one row per nationality: the one with the largest total_hrs_sum.
df %>%
  # Order by nationality, and within each nationality put the largest
  # total_hrs_sum first.
  arrange(nationality, desc(total_hrs_sum)) %>%
  # Regroup by nationality only, so slice() works per nationality.
  group_by(nationality) %>%
  # The first row of each group is now that group's maximum; slice(1)
  # returns exactly one row per group even when the maximum is tied.
  slice(1)
或者使用 `top_n`(输出见下方注释;注意:当最大值出现并列时,`top_n` 会保留所有并列的行,并且自 dplyr 1.0.0 起它已被 `slice_max()` 取代):
# Alternative: keep, per nationality, the row(s) with the largest
# total_hrs_sum.
# NOTE: top_n() keeps every row tied for the maximum, so a nationality with
# a tied maximum would yield more than one row. top_n() is also superseded;
# slice_max(total_hrs_sum, n = 1, with_ties = FALSE) is the current
# replacement when exactly one row per group is required.
df %>%
  group_by(nationality) %>%
  top_n(n = 1, total_hrs_sum)
# Printed result of the call above:
# A tibble: 10 x 3
# Groups:   nationality [10]
#    name                              nationality    total_hrs_sum
#    <chr>                             <chr>                  <dbl>
#  1 Acaba, Joseph M.                  U.S.                   7272.
#  2 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
#  3 Akiyama, Toyohiro                 Japan                   190.
#  4 Al Mansoori, Hazzaa               UAE                     189
#  5 Al-saud, Sultan bin Salman        Saudi Arabia            170
#  6 Aleksandrov, Aleksandr            Bulgaria                 47
#  7 André-Deshays, Claudie (Haigneré) France                  614.
#  8 Arnaldo Tamayo Mendez             Cuba                    189.
#  9 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
# 10 Bella, Ivan                       Slovakia                190
关于“r - 如何仅保留数据框中的最高重复值?”,我们在 Stack Overflow 上找到一个类似的问题:https://stackoverflow.com/questions/62883293/