我有两个数据帧 exp.kirp.log2
和 g_list
,其中 exp.kirp.log2
的行名与 g_list$hgnc_symbol 匹配
,但它们的顺序不同。
我想根据匹配的 g_list$hgnc_symbol
将 exp.kirp.log2
的行名称替换为 g_list$ensembl_gene_id
。
# Obtain g_list: query Ensembl (GRCh = 37) through biomaRt for the Ensembl gene
# ID of every HGNC symbol used as a row name of exp.kirp.log2.
# The result need not contain exactly one row per input symbol: in the sample
# shown further down, AADACL2 appears twice and A2BP1 does not appear at all.
genes <- rownames(exp.kirp.log2)
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl",
  GRCh = 37,
  verbose = TRUE
)
# Re-selects the same dataset on the mart object created above.
ensembl <- useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
g_list <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "hgnc_symbol",
  values = genes,
  mart = ensembl
)
# Sort the lookup table alphabetically by gene symbol.
g_list <- g_list[order(g_list$hgnc_symbol), ]
# NOTE: this is the line that raises the error quoted below. `%in%` returns a
# logical vector (TRUE/FALSE) with one element per row of g_list, not the
# Ensembl IDs themselves, so its length follows nrow(g_list) rather than
# nrow(exp.kirp.log2) and it is rejected as a set of row names.
rownames(exp.kirp.log2) <- g_list$ensembl_gene_id %in% g_list[match(rownames(exp.kirp.log2), g_list$hgnc_symbol),2]
回溯:
Error in `.rowNamesDF<-`(x, value = value) : invalid 'row.names' length
exp.kirp
dput(exp.kirp.log2[1:20,1:20])
structure(list(TCGA.2K.A9WE.01A = c(7.65342121905285, 2.03892776611756,
-0.96100202120249, 6.35598354101006, 14.3511850042327, -Inf,
10.3737643425674, 3.79382306985866, -Inf, 10.0819596419255, 9.44832324553207,
4.20886056913751, -0.96100202120249, -Inf, -Inf, -Inf, 5.36085937172008,
9.78880184184623, 10.3776687573505, 11.16757118884), TCGA.2Z.A9J1.01A = c(5.09389393824392,
5.2160706644244, -Inf, 6.93597002271109, 12.4136523086721, -Inf,
11.1918237390263, 2.98724809259115, -Inf, 10.1912122382252, 9.9623840324273,
4.71517403960983, -Inf, -Inf, -Inf, -Inf, 6.22565668754941, 10.3398477765017,
10.3103072842012, 11.1287210937383), TCGA.2Z.A9J2.01A = c(5.51854458067276,
4.11644793551166, -Inf, 7.5307754013178, 12.2744679621487, -Inf,
9.93114303849412, -Inf, -Inf, 10.3189198720956, 10.2574585613045,
4.11644793551166, -Inf, -Inf, -Inf, 0.309059742501585, 6.16707286132018,
10.2991951943744, 10.5852157015366, 11.5823040757623), TCGA.2Z.A9J3.01A = c(4.70168212029528,
3.34111759260469, 3.57815377007565, 7.54694769203808, 10.1689338100564,
-Inf, 9.96839262629172, 5.28865017271056, -Inf, 9.87305770150294,
9.75535162798677, 3.57815377007565, -Inf, -Inf, -Inf, -Inf, 6.170389794965,
10.238532641469, 9.94050095178643, 11.0690397931313), TCGA.2Z.A9J5.01A = c(7.99645936536463,
5.20408983959317, 1.64952349150802, 6.89258167250936, 13.6832285748428,
-Inf, 10.3714563849361, 2.6495004987031, -Inf, 10.4176870383992,
10.0652444551968, 6.86867071663319, -Inf, -Inf, -Inf, -0.350522494468264,
5.98935248863472, 10.1079093507719, 11.2050505161752, 11.6645692817891
), TCGA.2Z.A9J6.01A = c(5.13719199914349, 6.63590381796106, -1.00057719346275,
6.92859654071157, 12.0367193976262, -Inf, 10.8202555636581, 5.50707469845262,
-Inf, 10.3262700402849, 9.91216810777653, 5.94179415093086, -Inf,
-Inf, -Inf, -Inf, 5.52284042813955, 10.0653680664815, 10.5954686012028,
11.2355920880251), TCGA.2Z.A9J7.01A = c(6.95117512427229, 2.24944534108584,
-Inf, 7.25014205824679, 10.9656928148969, 1.07949758402178, 10.5991523452113,
4.32744306245973, -Inf, 10.4556415168452, 9.46537845450025, 3.66448284036468,
-Inf, -Inf, -0.920570684997085, -Inf, 6.72337288854289, 10.0139441477751,
9.28408724134641, 11.4833270722276), TCGA.2Z.A9J8.01A = c(3.61712213221935,
5.39472334226273, -Inf, 7.92111077839189, 11.9975977242282, -1.58251200148633,
9.91379851626213, 5.10394657615628, -Inf, 9.95999222715916, 9.90779350021794,
5.82683572660569, -Inf, -Inf, -Inf, -Inf, 7.85831302551685, 10.3997246047534,
11.7402171909708, 11.7246448152361), TCGA.2Z.A9J9.01A = c(6.05389548011115,
-1.41888982477445, -Inf, 5.73237232300075, 14.8647326244225,
-1.41888982477445, 10.4697437586612, 4.51174000542664, -0.419082711336792,
9.28576885726015, 9.21399581402162, -1.41888982477445, -Inf,
-Inf, -Inf, -Inf, 6.0887945976893, 9.39798410008432, 9.51616568261168,
11.1846268780251), TCGA.2Z.A9JD.01A = c(3.15639661659767, 4.2623504664045,
-Inf, 8.66937017282105, 10.9506421354115, -Inf, 10.8015070949819,
1.37484474128973, -Inf, 9.51027580369917, 10.0114476969219, 4.30081927275683,
-Inf, -Inf, -Inf, -Inf, 5.9232585434528, 10.1410206692489, 10.9093204661635,
11.1601119970792), TCGA.2Z.A9JE.01A = c(4.55671142771396, 0.976583597876997,
-Inf, 7.39711669725343, 11.338916568945, 0.976583597876997, 11.4922670603536,
2.97656526897918, -Inf, 10.098748119083, 9.84075338026455, 5.28034764124523,
-Inf, -Inf, -Inf, -Inf, 5.7709822912126, 10.6735346943631, 10.5435725361632,
11.1417882886223), TCGA.2Z.A9JG.01A = c(7.27924748225939, 3.74051746156171,
2.8145094080944, 5.79197812282256, 12.0865056775331, -Inf, 10.327056990394,
3.53698537579194, -Inf, 10.5780583590704, 10.8649068477431, 6.57473428966518,
-Inf, -Inf, -Inf, -Inf, 7.25413670394464, 9.83577157545057, 10.6502486545903,
11.1906393649786), TCGA.2Z.A9JI.01A = c(8.20162111992077, 5.71548707501235,
0.393526032228356, 5.73826794012289, 12.7954861179578, 2.61595767716595,
10.0094938620897, 5.20091233325219, -Inf, 9.93400880935778, 9.75330735360058,
5.21798676951157, -Inf, -Inf, -Inf, -Inf, 7.0082672553098, 9.74081003982032,
10.7382235152475, 11.6720072357516), TCGA.2Z.A9JJ.01A = c(6.26475409489153,
-0.415229871442725, 4.46742296017359, 6.39123499553712, 12.8198023802381,
-Inf, 11.7916439373724, 2.04421923663312, -Inf, 9.53606689182685,
10.3591574288036, -1.41503749927884, -Inf, -Inf, -Inf, -Inf,
5.71406413888836, 10.0630462305102, 10.2195932783632, 11.455724780085
), TCGA.2Z.A9JK.01A = c(5.9386386584728, 6.15383572961508, -Inf,
6.96731849612975, 14.0309071832818, -1.46076672913396, 10.1870344226096,
1.99855658313033, -Inf, 9.35930526481309, 9.09945932891836, 4.68887411799513,
1.99855658313033, -Inf, -Inf, -1.46076672913396, 6.2670469517503,
9.44376099865969, 10.3192560726034, 11.2260646035167), TCGA.2Z.A9JL.01A = c(7.02541755196655,
3.6235742674856, 0.623585972399712, 6.53709878587009, 12.9644083075158,
-0.961282892427146, 11.008714051357, 7.39616034553605, -Inf,
10.2035151452418, 9.55233575995403, 4.03861176676445, -Inf, -Inf,
-Inf, -Inf, 6.02729341208669, 10.2444014184424, 10.8539915160092,
10.9360755463691), TCGA.2Z.A9JM.01A = c(5.91786850792426, 5.24907362097786,
1.28557977040279, 6.72907414917957, 13.7490268867095, -Inf, 10.9548399985167,
4.00806552147407, -Inf, 10.1222822552784, 9.55238706244896, 5.47146712618562,
-Inf, -Inf, -Inf, -1.03622996892924, 6.23045858328003, 10.2600151849016,
10.4635183450003, 10.7143794759862), TCGA.2Z.A9JN.01A = c(4.09355172548716,
-Inf, -Inf, 7.28292161137752, 12.2911433534863, -Inf, 10.8165919471646,
-1.22398025935253, -Inf, 8.92443634456794, 9.76180168782627,
-1.22398025935253, -1.22398025935253, -Inf, -Inf, -Inf, 7.12910679128811,
10.6459390192132, 10.9762453349304, 11.3740122319006), TCGA.2Z.A9JO.01A = c(5.31084177712261,
-0.707218250772154, 0.292663973396858, 7.23068817327933, 13.1978705809921,
3.38016145559557, 9.83862580682156, -Inf, -Inf, 9.54417855835282,
10.3479785104067, 5.93655314137344, -Inf, -Inf, -Inf, -Inf, 6.95803180123955,
10.5275135616911, 10.7557094705532, 11.5723066760841), TCGA.2Z.A9JP.01A = c(6.28494714327597,
0.226878404196269, 1.22687840419627, 7.45773299323875, 13.34336295947,
-Inf, 11.208058065601, 1.22687840419627, -Inf, 10.0414301364209,
9.66875435596374, 4.51225217454896, -Inf, -Inf, -Inf, 0.226878404196269,
6.145710058105, 9.94538047931203, 10.6250568350002, 11.0719455710567
)), row.names = c("A1BG", "A1CF", "A2BP1", "A2LD1", "A2M", "A2ML1",
"A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AACSL", "AADAC",
"AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", "AAK1", "AAMP"
), class = "data.frame")
g_list
> dput(g_list[1:20,1:2])
structure(list(ensembl_gene_id = c("ENSG00000121410", "ENSG00000148584",
"ENSG00000175899", "ENSG00000166535", "ENSG00000128274", "ENSG00000118017",
"ENSG00000094914", "ENSG00000081760", "ENSG00000114771", "ENSG00000261846",
"ENSG00000197953", "ENSG00000188984", "ENSG00000204518", "ENSG00000109576",
"ENSG00000103591", "ENSG00000115977", "ENSG00000127837", "ENSG00000129673",
"ENSG00000090861", "ENSG00000124608"), hgnc_symbol = c("A1BG",
"A1CF", "A2M", "A2ML1", "A4GALT", "A4GNT", "AAAS", "AACS", "AADAC",
"AADACL2", "AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB",
"AAK1", "AAMP", "AANAT", "AARS", "AARS2")), row.names = c(NA,
20L), class = "data.frame")
最佳答案
即使没有被主动删除,行名称在
dplyr
和 tidyverse 其余部分的许多函数中也是不被鼓励使用的。您最好将它们迁移到数据中的“真实列”,或者停止尝试使用它们(我猜您需要这些信息,因此前者是您的最佳路径)。在这方面,你可以这样做:
exp.kirp.log2 %>%
  tibble::rownames_to_column("hgnc_symbol") %>%
  right_join(g_list, ., by = "hgnc_symbol") %>%
  tibble() # this is only "needed" to facilitate visualization here on Stack
# # A tibble: 21 x 22
#    ensembl_gene_id hgnc_symbol TCGA.2K.A9WE.01A TCGA.2Z.A9J1.01A TCGA.2Z.A9J2.01A TCGA.2Z.A9J3.01A TCGA.2Z.A9J5.01A
#    <chr>           <chr>                  <dbl>            <dbl>            <dbl>            <dbl>            <dbl>
#  1 ENSG00000121410 A1BG                    7.65             5.09             5.52             4.70             8.00
#  2 ENSG00000148584 A1CF                    2.04             5.22             4.12             3.34             5.20
#  3 ENSG00000175899 A2M                     14.4             12.4             12.3             10.2             13.7
#  4 ENSG00000166535 A2ML1                   -Inf             -Inf             -Inf             -Inf             -Inf
#  5 ENSG00000128274 A4GALT                  10.4             11.2             9.93             9.97             10.4
#  6 ENSG00000118017 A4GNT                   3.79             2.99             -Inf             5.29             2.65
#  7 ENSG00000094914 AAAS                    10.1             10.2             10.3             9.87             10.4
#  8 ENSG00000081760 AACS                    9.45             9.96             10.3             9.76             10.1
#  9 ENSG00000114771 AADAC                 -0.961             -Inf             -Inf             -Inf             -Inf
# 10 ENSG00000261846 AADACL2                 -Inf             -Inf             -Inf             -Inf             -Inf
# # ... with 11 more rows, and 15 more variables: TCGA.2Z.A9J6.01A <dbl>, TCGA.2Z.A9J7.01A <dbl>, TCGA.2Z.A9J8.01A <dbl>,
# #   TCGA.2Z.A9J9.01A <dbl>, TCGA.2Z.A9JD.01A <dbl>, TCGA.2Z.A9JE.01A <dbl>, TCGA.2Z.A9JG.01A <dbl>, TCGA.2Z.A9JI.01A <dbl>,
# #   TCGA.2Z.A9JJ.01A <dbl>, TCGA.2Z.A9JK.01A <dbl>, TCGA.2Z.A9JL.01A <dbl>, TCGA.2Z.A9JM.01A <dbl>, TCGA.2Z.A9JN.01A <dbl>,
# #   TCGA.2Z.A9JO.01A <dbl>, TCGA.2Z.A9JP.01A <dbl>
仅供引用,我选择使用
right_join
主要是为了在 Stack 上进行演示:我希望这两列位于数据框的左侧,这样它们就能出现在此处被截断的输出中。通常,许多人更喜欢 %>% left_join(g_list, by = "hgnc_symbol")
的“思考”,但这两个调用除了结果的列顺序之外在功能上是相同的。如果您确实需要它们作为行名称,那么您始终可以使用上面的内容,取消
tibble
框架,然后应用来自ensembl_gene_id
的行名称。不幸的是,无法用您当前的数据演示这一点,因为其中存在重复的符号 AADACL2
,因此设置行名称要么会失败,要么行名称会被修改以强制保持唯一;在后一种情况下,您的部分行名称将不再是原始值。
关于r - 如何用另一个数据框中的信息替换数据框的行名称?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/74245603/