r - 使用来自不同变量的数据在条形图中绘制百分比

标签 r plot ggplot2 bar-chart

我是 R 的新手,我有以下问题。

我有一个像这样的数据框:

# Sample data: one row per observation (sample ID, project, gene symbol).
mydata.df <- data.frame(
  ID.name = c(
    "1000", "1000", "1000",
    "1001", "1001", "1001",
    "1002", "1002"
  ),
  project_name = c(
    "project1", "project1", "project1",
    "project1", "project1", "project1",
    "project2", "project2"
  ),
  Canonical_Hugo_Symbol = c(
    "gene1", "gene3", "gene1",
    "gene1", "gene3", "gene4",
    "gene1", "gene2"
  )
)

我想绘制一张条形图:x 轴是不同的基因,y 轴是百分比,以体现某些基因只出现在一个项目中,而另一些基因出现在另一个项目中。

我试过:

# Asker's attempt: one bar per gene, filled by project, with y mapped to each
# count divided by the sum of the counts.
q <- qplot(Canonical_Hugo_Symbol, data= mydata.df, geom="bar", fill= project_name, y = (..count..)/sum(..count..))
# Rotate the x-axis labels and add the same ratio, rounded, as a "%" text label.
q + theme(axis.text.x=element_text(angle=-90, hjust = 1)) + stat_bin(geom = "text", aes(label = paste(round((..count..)/sum(..count..)*100), "%")), vjust = 5)

但举例来说,gene2 只出现在 project2 中,它的百分比应该是 100%,因为所有(仅一位)患者都带有这个基因。

非常感谢!

编辑:我补充一份更接近“真实”数据的 data.frame:

dput(subset_mydata[c(1:10,1065:1077),c(3,9,20)])
structure(list(ID.name = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 140L, 140L, 140L, 140L, 141L, 141L, 141L, 141L, 
141L, 141L, 142L, 142L, 142L), .Label = c("", "1075", "1104", 
"1108", "1120", "1121", "1137", "1258", "1264", "1280", "1286", 
"1310", "1317", "1338", "1392", "1401", "1435", "1477", "1480", 
"1494", "1519", "1574", "1588", "1595", "1607", "1611", "1644", 
"1645", "1651", "1653", "1654", "1673", "1687", "1702", "1714", 
"1740", "1776", "1781", "1812", "1835", "1838", "1857", "1874", 
"1890", "1899", "1911", "1933", "1936", "1999", "2006", "2046", 
"2063", "2079", "2081", "2088", "2116", "2135", "2144", "2147", 
"2155", "2166", "2167", "2176", "2183", "2200", "2209", "2223", 
"2253", "2256", "2442", "2444", "2453", "2456", "2462", "2467", 
"2472", "2482", "2497", "2504", "2507", "2513", "2518", "2523", 
"2567", "2568", "2576", "2578", "2598", "2600", "2619", "2623", 
"2625", "2632", "2636", "2646", "2652", "2659", "2660", "2676", 
"2680", "2682", "2705", "2711", "2756", "2765", "2772", "2793", 
"2803", "2854", "2856", "2882", "2912", "2916", "2919", "3058", 
"3063", "3114", "3116", "3117", "3125", "3132", "3140", "3145", 
"3175", "3181", "3248", "3383", "3431", "3436", "3442", "3472", 
"3576", "3639", "4093", "FL1_01215", "FL10_01501", "FL12_01593", 
"FL13_01598", "FL16_01738", "FL17_01752", "FL18_01763", "FL19_01881", 
"FL2_01222", "FL22_02025", "FL23_02032", "FL24_02085", "FL25_02175", 
"FL26_02242", "FL28_02459", "FL3_01235", "FL30_02558", "FL35_02726", 
"FL37_02808", "FL41_02865", "FL43_02926", "FL44_02994", "FL45_03018", 
"FL47_03119", "FL48_03128", "FL55_03303", "FL62_03406", "FL64_03418", 
"FL65_03421", "FL69_03484", "FL7_01306", "FL70_03517", "FL71_03534", 
"FL76_03644", "FL77_03651", "FL8_01425"), class = "factor"), 
project_name = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("", "FL_Ontogeny", "Hemoseq1"), class = "factor"), 
Canonical_Hugo_Symbol = structure(c(174L, 158L, 113L, 113L, 
97L, 175L, 14L, 9L, 35L, 65L, 14L, 14L, 14L, 111L, 123L, 
113L, 113L, 160L, 50L, 50L, 25L, 25L, 131L), .Label = c("", 
"ABCA7", "ACTA2", "ACTB", "ALMS1", "ARID1A", "ARID1B", "ARID2", 
"ARID3A", "ASXL1", "ATM", "B2M", "BCAS3", "BCL2", "BCL7A", 
"BCL9", "BCORL1", "BCR", "BIRC3", "BLNK", "BRAF", "BRD2", 
"BRWD3", "BTG1", "BTG2", "BTG3", "CA10", "CARD11", "CCDC80", 
"CCND1", "CCND2", "CCND3", "CD40", "CD58", "CD79A", "CD79B", 
"CDH23", "CDK4", "CDK6", "CDKN2A", "CDKN2B", "CELSR2", "CHD1", 
"CHD2", "CHRM3", "CIITA", "CNOT1", "COL4A2", "COL6A2", "CREBBP", 
"CRLF2", "CTSS", "CUL9", "CXCR4", "DDX3X", "DGKG", "DIRAS3", 
"DLEU2", "DMD", "DNAH9", "DNM2", "DNMT3A", "DST", "DTX1", 
"EP300", "EPHA6", "EPHA7", "EPRS", "ETS1", "ETV6", "EZH2", 
"FAM135B", "FAM155A", "FAM18A", "FAS", "FAT2", "FBXO11", 
"FCGR2A", "FCGR3A", "FOXO1", "GNA13", "GNAS", "GNB1", "GNB2", 
"GOPC", "HCK", "HEATR1", "HUWE1", "ID3", "IDH1", "IDH2", 
"IKBKE", "IKZF1", "IKZF2", "IKZF3", "IRF4", "IRF8", "JAK1", 
"KAT2A", "KAT2B", "KAT5", "KDM6A", "KIF20B", "KLHL6", "LRRC16A", 
"MALT1", "MAP3K14", "MAP4K1", "MCL1", "MDM2", "MEF2B", "MLL", 
"MLL2", "MPDZ", "MSH6", "MTAP", "MUC16", "MUM1", "MYC", "MYD88", 
"NBAS", "NF1", "NFKB2", "NOTCH1", "NOTCH2", "NPM1", "NR3C1", 
"OSBP2", "P2RY8", "PASD1", "PAX5", "PCLO", "PDGFC", "PDGFRA", 
"PDGFRB", "PHF6", "PIK3CA", "PIK3CD", "PIK3R1", "PIM1", "POSTN", 
"PRKDC", "PTEN", "RASSF8", "RB1", "RELN", "RET", "RFTN1", 
"RFX7", "RHOA", "RIPK3", "RNF213", "ROS1", "SAMD9", "SBF1", 
"SETD2", "SF3B1", "SGK1", "SIN3A", "SLITRK6", "SMARCA2", 
"SMARCA4", "SMARCB1", "SOCS1", "SOX5", "STAT3", "STAT6", 
"SWAP70", "SYK", "TBL1XR1", "TET2", "TMEM30A", "TNFAIP3", 
"TNFRSF14", "TP53", "TRAF2", "TRAF3", "TRAF6", "TTC27", "TYK2", 
"U2AF1", "UBR5", "UGGT1", "ULK4", "UNC13B", "UNC5C", "UNC5D", 
"USP6", "VPS13A", "XPO1", "ZMAT4", "ZNF608", "ZNF708", "ZRSR2"
), class = "factor")), .Names = c("ID.name", "project_name", 
"Canonical_Hugo_Symbol"), row.names = c(1L, 49L, 94L, 96L, 111L, 
 115L, 125L, 127L, 134L, 138L, 18794L, 18796L, 18797L, 18798L, 
 18800L, 18801L, 18802L, 18803L, 18805L, 18806L, 18809L, 18810L, 
 18814L), class = "data.frame")

最佳答案

您之所以没有得到 gene2 的“100%”,是因为您的百分比是用每个子组(基因与项目的组合)的观测数除以 mydata.df 中全部观测的总数得到的。

如果我没理解错的话,您想要的是在每种基因内部,各个项目所占的百分比。

# Example data: the question's sample plus two extra gene4 rows in project2.
# One row per observation (sample ID, project, gene symbol).
mydata.df <- data.frame(
  ID.name = c(
    "1000", "1000", "1000", "1001", "1001",
    "1001", "1002", "1002", "1000", "1003"
  ),
  project_name = c(
    "project1", "project1", "project1", "project1", "project1",
    "project1", "project2", "project2", "project2", "project2"
  ),
  Canonical_Hugo_Symbol = c(
    "gene1", "gene3", "gene1", "gene1", "gene3",
    "gene4", "gene1", "gene2", "gene4", "gene4"
  )
)

# Count rows per gene. Every row is counted, so an ID that occurs more than
# once for the same gene contributes more than one count.
table1 <- aggregate(
  mydata.df$ID.name,
  by = list(Canonical_Hugo_Symbol = mydata.df$Canonical_Hugo_Symbol),
  FUN = length
)

# Count rows per gene/project combination.
table2 <- aggregate(
  mydata.df$ID.name,
  by = list(
    Canonical_Hugo_Symbol = mydata.df$Canonical_Hugo_Symbol,
    project_name = mydata.df$project_name
  ),
  FUN = length
)

# Join the two tables on the gene symbol so each gene/project row also carries
# the gene total. Both count columns are called "x", so merge() returns them
# as columns 2 and 4 ("x.x", "x.y"); rename them to something readable.
temp <- merge(table1, table2, by = "Canonical_Hugo_Symbol")
names(temp)[c(2, 4)] <- c("Total_gene", "Total_gene_project")

# Share of each project within its gene, as a whole-number percentage.
percentage <- round(temp$Total_gene_project / temp$Total_gene * 100, 0)

temp <- data.frame(temp, pctg = percentage)

# y positions intended for the percentage labels: rows of "project1" use their
# own count, all other rows use the gene total.
pos <- ifelse(
  temp$project_name == "project1",
  temp$Total_gene_project,
  temp$Total_gene
)

# Final plotting input: counts, percentages and label positions.
df <- data.frame(temp, pos = pos)

# Stacked bar chart: one bar per gene, segment heights are the per gene/project
# counts in `df`, and each segment is labelled with its percentage of the gene
# total (`pctg`).
library(ggplot2) # unlike require(), stops with an error if ggplot2 is missing

ggplot(
  data = df,
  aes(x = Canonical_Hugo_Symbol, y = Total_gene_project, fill = project_name)
) +
  geom_bar(stat = "identity") +
  xlab("Type of Gene") +
  ylab("Number of Patients") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = -90, hjust = 1)) +
  # Stack the labels exactly like the bars and centre each one inside its own
  # segment. This needs no precomputed label height and no hard-coded project
  # name, and it stays correct whatever order the segments are stacked in
  # (position_stack(vjust = ) requires ggplot2 >= 2.2.0).
  geom_text(
    aes(label = paste(pctg, "%")),
    position = position_stack(vjust = 0.5)
  )

enter image description here

更新 - 该代码也适用于您的真实数据。

enter image description here

上面的代码假设数据框名为 mydata.df。用于真实数据时,只需把计算对象 pos 的 ifelse 中的 "project1" 换成实际的项目名称(例如 "FL_Ontogeny")。

关于r - 使用来自不同变量的数据在条形图中绘制百分比,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/21724007/

相关文章:

r - 如何向 ggridges 中的脊线图添加垂直颜色渐变?

r - 在 geom_tile 中按面拆分重叠的图 block

r - 如何将函数应用于一系列数据表列而无需指定新列名?

r - 使混合排序区分大小写? [r]

r - 如何在 do.call 中添加函数的更多参数?

r - 在 R 中使用 `$` 和 `K` 作为数千美元的 y 轴标签

r - 我的分组横杆不躲避,而我的箱形图却躲避

r - builtins(internal = TRUE) 返回什么?

python - 如何增加子图的大小并减少水平间距?

python - 如何创建分类进度条