r - 如何用另一个数据框中的信息替换数据框的行名称?

标签 r dplyr

我有两个数据框 exp.kirp.log2 和 g_list,其中 exp.kirp.log2 的行名与 g_list$hgnc_symbol 匹配,但它们的顺序不同。我想根据匹配的 g_list$hgnc_symbol,将 exp.kirp.log2 的行名称替换为 g_list$ensembl_gene_id。

# Obtain g_list

# Use the row names of the expression data (HGNC symbols) as lookup keys.
genes <- rownames(exp.kirp.log2)

# Connect to the Ensembl "genes" BioMart, human dataset, GRCh37 build.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl",
  GRCh = 37,
  verbose = TRUE
)
# NOTE(review): the dataset was already passed to useEnsembl() above, so this
# call looks redundant; it is kept to preserve the original behaviour -- confirm.
ensembl <- useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)

# Query the Ensembl gene ID for every symbol in `genes`.
g_list <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "hgnc_symbol",
  values = genes,
  mart = ensembl
)

# Sort the symbol-to-ID mapping by symbol.
g_list <- g_list[order(g_list$hgnc_symbol), ]

# NOTE(review): this is the asker's failing attempt (it appears to be the line
# that raises the "invalid 'row.names' length" error quoted below).
# `x %in% table` returns a logical vector with one element per element of `x`,
# here g_list$ensembl_gene_id. The right-hand side is therefore TRUE/FALSE
# values of length nrow(g_list), not Ensembl IDs of length
# nrow(exp.kirp.log2).
rownames(exp.kirp.log2) <- g_list$ensembl_gene_id %in% g_list[match(rownames(exp.kirp.log2), g_list$hgnc_symbol),2]

回溯:

Error in `.rowNamesDF<-`(x, value = value) : invalid 'row.names' length

exp.kirp

dput(exp.kirp.log2[1:20,1:20])
structure(list(TCGA.2K.A9WE.01A = c(7.65342121905285, 2.03892776611756, 
-0.96100202120249, 6.35598354101006, 14.3511850042327, -Inf, 
10.3737643425674, 3.79382306985866, -Inf, 10.0819596419255, 9.44832324553207, 
4.20886056913751, -0.96100202120249, -Inf, -Inf, -Inf, 5.36085937172008, 
9.78880184184623, 10.3776687573505, 11.16757118884), TCGA.2Z.A9J1.01A = c(5.09389393824392, 
5.2160706644244, -Inf, 6.93597002271109, 12.4136523086721, -Inf, 
11.1918237390263, 2.98724809259115, -Inf, 10.1912122382252, 9.9623840324273, 
4.71517403960983, -Inf, -Inf, -Inf, -Inf, 6.22565668754941, 10.3398477765017, 
10.3103072842012, 11.1287210937383), TCGA.2Z.A9J2.01A = c(5.51854458067276, 
4.11644793551166, -Inf, 7.5307754013178, 12.2744679621487, -Inf, 
9.93114303849412, -Inf, -Inf, 10.3189198720956, 10.2574585613045, 
4.11644793551166, -Inf, -Inf, -Inf, 0.309059742501585, 6.16707286132018, 
10.2991951943744, 10.5852157015366, 11.5823040757623), TCGA.2Z.A9J3.01A = c(4.70168212029528, 
3.34111759260469, 3.57815377007565, 7.54694769203808, 10.1689338100564, 
-Inf, 9.96839262629172, 5.28865017271056, -Inf, 9.87305770150294, 
9.75535162798677, 3.57815377007565, -Inf, -Inf, -Inf, -Inf, 6.170389794965, 
10.238532641469, 9.94050095178643, 11.0690397931313), TCGA.2Z.A9J5.01A = c(7.99645936536463, 
5.20408983959317, 1.64952349150802, 6.89258167250936, 13.6832285748428, 
-Inf, 10.3714563849361, 2.6495004987031, -Inf, 10.4176870383992, 
10.0652444551968, 6.86867071663319, -Inf, -Inf, -Inf, -0.350522494468264, 
5.98935248863472, 10.1079093507719, 11.2050505161752, 11.6645692817891
), TCGA.2Z.A9J6.01A = c(5.13719199914349, 6.63590381796106, -1.00057719346275, 
6.92859654071157, 12.0367193976262, -Inf, 10.8202555636581, 5.50707469845262, 
-Inf, 10.3262700402849, 9.91216810777653, 5.94179415093086, -Inf, 
-Inf, -Inf, -Inf, 5.52284042813955, 10.0653680664815, 10.5954686012028, 
11.2355920880251), TCGA.2Z.A9J7.01A = c(6.95117512427229, 2.24944534108584, 
-Inf, 7.25014205824679, 10.9656928148969, 1.07949758402178, 10.5991523452113, 
4.32744306245973, -Inf, 10.4556415168452, 9.46537845450025, 3.66448284036468, 
-Inf, -Inf, -0.920570684997085, -Inf, 6.72337288854289, 10.0139441477751, 
9.28408724134641, 11.4833270722276), TCGA.2Z.A9J8.01A = c(3.61712213221935, 
5.39472334226273, -Inf, 7.92111077839189, 11.9975977242282, -1.58251200148633, 
9.91379851626213, 5.10394657615628, -Inf, 9.95999222715916, 9.90779350021794, 
5.82683572660569, -Inf, -Inf, -Inf, -Inf, 7.85831302551685, 10.3997246047534, 
11.7402171909708, 11.7246448152361), TCGA.2Z.A9J9.01A = c(6.05389548011115, 
-1.41888982477445, -Inf, 5.73237232300075, 14.8647326244225, 
-1.41888982477445, 10.4697437586612, 4.51174000542664, -0.419082711336792, 
9.28576885726015, 9.21399581402162, -1.41888982477445, -Inf, 
-Inf, -Inf, -Inf, 6.0887945976893, 9.39798410008432, 9.51616568261168, 
11.1846268780251), TCGA.2Z.A9JD.01A = c(3.15639661659767, 4.2623504664045, 
-Inf, 8.66937017282105, 10.9506421354115, -Inf, 10.8015070949819, 
1.37484474128973, -Inf, 9.51027580369917, 10.0114476969219, 4.30081927275683, 
-Inf, -Inf, -Inf, -Inf, 5.9232585434528, 10.1410206692489, 10.9093204661635, 
11.1601119970792), TCGA.2Z.A9JE.01A = c(4.55671142771396, 0.976583597876997, 
-Inf, 7.39711669725343, 11.338916568945, 0.976583597876997, 11.4922670603536, 
2.97656526897918, -Inf, 10.098748119083, 9.84075338026455, 5.28034764124523, 
-Inf, -Inf, -Inf, -Inf, 5.7709822912126, 10.6735346943631, 10.5435725361632, 
11.1417882886223), TCGA.2Z.A9JG.01A = c(7.27924748225939, 3.74051746156171, 
2.8145094080944, 5.79197812282256, 12.0865056775331, -Inf, 10.327056990394, 
3.53698537579194, -Inf, 10.5780583590704, 10.8649068477431, 6.57473428966518, 
-Inf, -Inf, -Inf, -Inf, 7.25413670394464, 9.83577157545057, 10.6502486545903, 
11.1906393649786), TCGA.2Z.A9JI.01A = c(8.20162111992077, 5.71548707501235, 
0.393526032228356, 5.73826794012289, 12.7954861179578, 2.61595767716595, 
10.0094938620897, 5.20091233325219, -Inf, 9.93400880935778, 9.75330735360058, 
5.21798676951157, -Inf, -Inf, -Inf, -Inf, 7.0082672553098, 9.74081003982032, 
10.7382235152475, 11.6720072357516), TCGA.2Z.A9JJ.01A = c(6.26475409489153, 
-0.415229871442725, 4.46742296017359, 6.39123499553712, 12.8198023802381, 
-Inf, 11.7916439373724, 2.04421923663312, -Inf, 9.53606689182685, 
10.3591574288036, -1.41503749927884, -Inf, -Inf, -Inf, -Inf, 
5.71406413888836, 10.0630462305102, 10.2195932783632, 11.455724780085
), TCGA.2Z.A9JK.01A = c(5.9386386584728, 6.15383572961508, -Inf, 
6.96731849612975, 14.0309071832818, -1.46076672913396, 10.1870344226096, 
1.99855658313033, -Inf, 9.35930526481309, 9.09945932891836, 4.68887411799513, 
1.99855658313033, -Inf, -Inf, -1.46076672913396, 6.2670469517503, 
9.44376099865969, 10.3192560726034, 11.2260646035167), TCGA.2Z.A9JL.01A = c(7.02541755196655, 
3.6235742674856, 0.623585972399712, 6.53709878587009, 12.9644083075158, 
-0.961282892427146, 11.008714051357, 7.39616034553605, -Inf, 
10.2035151452418, 9.55233575995403, 4.03861176676445, -Inf, -Inf, 
-Inf, -Inf, 6.02729341208669, 10.2444014184424, 10.8539915160092, 
10.9360755463691), TCGA.2Z.A9JM.01A = c(5.91786850792426, 5.24907362097786, 
1.28557977040279, 6.72907414917957, 13.7490268867095, -Inf, 10.9548399985167, 
4.00806552147407, -Inf, 10.1222822552784, 9.55238706244896, 5.47146712618562, 
-Inf, -Inf, -Inf, -1.03622996892924, 6.23045858328003, 10.2600151849016, 
10.4635183450003, 10.7143794759862), TCGA.2Z.A9JN.01A = c(4.09355172548716, 
-Inf, -Inf, 7.28292161137752, 12.2911433534863, -Inf, 10.8165919471646, 
-1.22398025935253, -Inf, 8.92443634456794, 9.76180168782627, 
-1.22398025935253, -1.22398025935253, -Inf, -Inf, -Inf, 7.12910679128811, 
10.6459390192132, 10.9762453349304, 11.3740122319006), TCGA.2Z.A9JO.01A = c(5.31084177712261, 
-0.707218250772154, 0.292663973396858, 7.23068817327933, 13.1978705809921, 
3.38016145559557, 9.83862580682156, -Inf, -Inf, 9.54417855835282, 
10.3479785104067, 5.93655314137344, -Inf, -Inf, -Inf, -Inf, 6.95803180123955, 
10.5275135616911, 10.7557094705532, 11.5723066760841), TCGA.2Z.A9JP.01A = c(6.28494714327597, 
0.226878404196269, 1.22687840419627, 7.45773299323875, 13.34336295947, 
-Inf, 11.208058065601, 1.22687840419627, -Inf, 10.0414301364209, 
9.66875435596374, 4.51225217454896, -Inf, -Inf, -Inf, 0.226878404196269, 
6.145710058105, 9.94538047931203, 10.6250568350002, 11.0719455710567
)), row.names = c("A1BG", "A1CF", "A2BP1", "A2LD1", "A2M", "A2ML1", 
"A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AACSL", "AADAC", 
"AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", "AAK1", "AAMP"
), class = "data.frame")

g_list

> dput(g_list[1:20,1:2])
structure(list(ensembl_gene_id = c("ENSG00000121410", "ENSG00000148584", 
"ENSG00000175899", "ENSG00000166535", "ENSG00000128274", "ENSG00000118017", 
"ENSG00000094914", "ENSG00000081760", "ENSG00000114771", "ENSG00000261846", 
"ENSG00000197953", "ENSG00000188984", "ENSG00000204518", "ENSG00000109576", 
"ENSG00000103591", "ENSG00000115977", "ENSG00000127837", "ENSG00000129673", 
"ENSG00000090861", "ENSG00000124608"), hgnc_symbol = c("A1BG", 
"A1CF", "A2M", "A2ML1", "A4GALT", "A4GNT", "AAAS", "AACS", "AADAC", 
"AADACL2", "AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", 
"AAK1", "AAMP", "AANAT", "AARS", "AARS2")), row.names = c(NA, 
20L), class = "data.frame")

最佳答案

  1. 行名称即使没有被 dplyr 和 tidyverse 其余部分中的许多函数主动删除,也是不被鼓励使用的。您最好将它们迁移到数据中的“真实列”,或者不再尝试使用它们(我猜您需要这些信息,因此前者是您的最佳途径)。为此,您可以这样做:

    # Move the gene symbols out of the row names into a regular column named
    # "hgnc_symbol", then join with g_list on that column. g_list is passed as
    # the first table, so its columns appear first in the result; right_join()
    # keeps every row of the piped-in expression data (the `.` argument).
    exp.kirp.log2 %>%
      tibble::rownames_to_column("hgnc_symbol") %>%
      right_join(g_list, ., by = "hgnc_symbol") %>%
      tibble()  # this is only "needed" to facilitate visualization here on Stack
    # # A tibble: 21 x 22
    #    ensembl_gene_id hgnc_symbol TCGA.2K.A9WE.01A TCGA.2Z.A9J1.01A TCGA.2Z.A9J2.01A TCGA.2Z.A9J3.01A TCGA.2Z.A9J5.01A
    #    <chr>           <chr>                  <dbl>            <dbl>            <dbl>            <dbl>            <dbl>
    #  1 ENSG00000121410 A1BG                   7.65              5.09             5.52             4.70             8.00
    #  2 ENSG00000148584 A1CF                   2.04              5.22             4.12             3.34             5.20
    #  3 ENSG00000175899 A2M                   14.4              12.4             12.3             10.2             13.7 
    #  4 ENSG00000166535 A2ML1               -Inf              -Inf             -Inf             -Inf             -Inf   
    #  5 ENSG00000128274 A4GALT                10.4              11.2              9.93             9.97            10.4 
    #  6 ENSG00000118017 A4GNT                  3.79              2.99          -Inf                5.29             2.65
    #  7 ENSG00000094914 AAAS                  10.1              10.2             10.3              9.87            10.4 
    #  8 ENSG00000081760 AACS                   9.45              9.96            10.3              9.76            10.1 
    #  9 ENSG00000114771 AADAC                 -0.961          -Inf             -Inf             -Inf             -Inf   
    # 10 ENSG00000261846 AADACL2             -Inf              -Inf             -Inf             -Inf             -Inf   
    # # ... with 11 more rows, and 15 more variables: TCGA.2Z.A9J6.01A <dbl>, TCGA.2Z.A9J7.01A <dbl>, TCGA.2Z.A9J8.01A <dbl>,
    # #   TCGA.2Z.A9J9.01A <dbl>, TCGA.2Z.A9JD.01A <dbl>, TCGA.2Z.A9JE.01A <dbl>, TCGA.2Z.A9JG.01A <dbl>, TCGA.2Z.A9JI.01A <dbl>,
    # #   TCGA.2Z.A9JJ.01A <dbl>, TCGA.2Z.A9JK.01A <dbl>, TCGA.2Z.A9JL.01A <dbl>, TCGA.2Z.A9JM.01A <dbl>, TCGA.2Z.A9JN.01A <dbl>,
    # #   TCGA.2Z.A9JO.01A <dbl>, TCGA.2Z.A9JP.01A <dbl>
    

    仅供参考:我选择使用 right_join 主要是为了便于在 Stack 上演示,这样这两列会保留在数据框的左侧,从而出现在此处被截断的输出中。通常,许多人更习惯 %>% left_join(g_list, by = "hgnc_symbol") 这种“思路”;这两个调用除了结果的列顺序之外,在功能上是相同的。

  2. 如果您确实需要将它们作为行名称,那么您始终可以使用上面的方法,把结果从 tibble 转换回普通数据框,然后用 ensembl_gene_id 设置行名称。不幸的是,用您当前的数据无法演示这一点,因为符号 AADACL2 有重复,所以设置行名称要么失败,要么为了强制唯一而被修改;在后一种情况下,您的某些行名称就不再是原始值了。

关于r - 如何用另一个数据框中的信息替换数据框的行名称?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/74245603/

相关文章:

r - 使用 dplyr 向列值添加抖动

R 汇总 dplyr 分组数据,并根据另一列排除某些行

r - 使用索引来引用 dplyr 中 summarise() 中的列 - R

Python 相当于 R 的 mclapply()

r - 如何向 facet_grid() 对象的每个面板添加一条平均线?

r - 查找列之间的一对一、一对多和多对一关系

r - 如果用 R 所有值都为 0,则用 0 填充缺失

r - 热图错误:'x' must be a numeric matrix

r - 计算 Rao 二次熵

r - 使用 R 编程将用文字书写的数字转换为数字