我正在使用 ggplot2 为两个不同的参数创建直方图。我当前的方法附在问题末尾（包括一个可以直接从 pastebin.com 加载使用的数据集），它创建：
- 基于“位置”属性(“WITHIN”或“NOT_WITHIN”)直观显示记录用户数据空间分布频率的直方图。
- 基于“上下文”属性(“点击 A”或“点击 B”)直观显示记录用户数据分布频率的直方图。
# Load the example dataset from pastebin (semicolon-separated CSV)
RawDataSet <- read.csv("http://pastebin.com/raw/uKybDy03", sep = ";")
# Load packages
library(plyr)
library(dplyr)
library(reshape2)
library(ggplot2)
###### Create frequency table for location information
# One row per user: total number of records plus the number of records
# whose location is "WITHIN" and "NOT_WITHIN"
LocationFrequency <- ddply(RawDataSet, .(UserEmail), summarize,
  All = length(UserEmail),
  Within_area = sum(location == "WITHIN"),
  Not_within_area = sum(location == "NOT_WITHIN"))
# Number the users 1..n directly as numbers (no rownames -> string -> numeric
# round-trip); the id is shown on the x axis instead of UserEmail
LocationFrequency$id <- seq_len(nrow(LocationFrequency))
# Put the id column first, selecting columns by name rather than by position
LocationFrequency <- LocationFrequency[, c("id", "UserEmail", "All",
                                           "Within_area", "Not_within_area")]
# Melt to long format: one row per user and location type
LocationFrequency.m <- melt(LocationFrequency,
                            id.vars = c("UserEmail", "All", "id"))
# Plot data as stacked bars, one bar per user
p <- ggplot(LocationFrequency.m, aes(x = id, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  theme_grey(base_size = 16) +
  labs(title = "Histogram showing the distribution of all spatial information per user.") +
  labs(x = "User", y = "Number of notifications interaction within/not within the area") +
  # one axis tick per user id, derived from the data instead of a
  # hard-coded 1..30 list, so any number of users is labelled
  scale_x_continuous(breaks = LocationFrequency$id)
# Change legend title
p + labs(fill = "Type of location")
##### Create frequency table for interaction information
# One row per user: total number of records plus the number of records
# whose context is "Clicked A" and "Clicked B"
InterationFrequency <- ddply(RawDataSet, .(UserEmail), summarize,
  All = length(UserEmail),
  Clicked_A = sum(context == "Clicked A"),
  Clicked_B = sum(context == "Clicked B"))
# Number the users 1..n directly as numbers (no rownames -> string -> numeric
# round-trip); the id is shown on the x axis instead of UserEmail
InterationFrequency$id <- seq_len(nrow(InterationFrequency))
# Put the id column first, selecting columns by name rather than by position
InterationFrequency <- InterationFrequency[, c("id", "UserEmail", "All",
                                               "Clicked_A", "Clicked_B")]
# Melt to long format: one row per user and interaction type
InterationFrequency.m <- melt(InterationFrequency,
                              id.vars = c("UserEmail", "All", "id"))
# Plot data as stacked bars, one bar per user
p <- ggplot(InterationFrequency.m, aes(x = id, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  theme_grey(base_size = 16) +
  labs(title = "Histogram showing the distribution of all interaction types per user.") +
  labs(x = "User", y = "Number of interaction") +
  # one axis tick per user id, derived from the data instead of a
  # hard-coded 1..30 list, so any number of users is labelled
  scale_x_continuous(breaks = InterationFrequency$id)
# Change legend title
p + labs(fill = "Type of interaction")
但是我想要实现的是：如何将两个直方图组合在一个图中？是否可以以某种方式为每个部分标注相应的百分比？类似下面的草图：它表示每个用户的观察总数（条形的完整高度），并使用不同的分段来可视化相应的数据。每个条形将分为两个部分（within 和 not_within），其中每个部分又分为两个子部分，显示交互类型的百分比（“点击 A”或“点击 B”）。
最佳答案
根据更新描述,我将制作一个包含两部分的组合条形图:负部分和正部分。为了实现这一点,您必须将数据转换为正确的格式:
# load needed libraries
library(dplyr)
library(tidyr)
library(ggplot2)
# Summarise the data: one row per user / location / context combination
new.df <- RawDataSet %>%
  group_by(UserEmail, location, context) %>%
  tally() %>%
  # n2: the count with a sign, negative for NOT_WITHIN and positive otherwise
  mutate(n2 = n * ifelse(location == "NOT_WITHIN", -1, 1)) %>%
  group_by(UserEmail, location) %>%
  # p: signed share of each context within its user / location group
  mutate(p = n2 / sum(n))
new.df
数据框如下所示:
> new.df
Source: local data frame [90 x 6]
Groups: UserEmail, location [54]
UserEmail location context n n2 p
(fctr) (fctr) (fctr) (int) (dbl) (dbl)
1 andre NOT_WITHIN Clicked A 3 -3 -1.0000000
2 bibi NOT_WITHIN Clicked A 4 -4 -0.5000000
3 bibi NOT_WITHIN Clicked B 4 -4 -0.5000000
4 bibi WITHIN Clicked A 9 9 0.6000000
5 bibi WITHIN Clicked B 6 6 0.4000000
6 corinn NOT_WITHIN Clicked A 10 -10 -0.5882353
7 corinn NOT_WITHIN Clicked B 7 -7 -0.4117647
8 corinn WITHIN Clicked A 9 9 0.7500000
9 corinn WITHIN Clicked B 3 3 0.2500000
10 dpfeifer NOT_WITHIN Clicked A 7 -7 -1.0000000
.. ... ... ... ... ... ...
接下来您可以使用以下命令创建绘图:
# Diverging stacked bar chart of the signed counts n2, one bar per user,
# drawn as two layers (one per location) and split by context via fill.
ggplot() +
  # layer for the NOT_WITHIN rows
  geom_bar(
    data = new.df[new.df$location == "NOT_WITHIN", ],
    aes(x = UserEmail, y = n2, color = "darkgreen", fill = context),
    size = 1, stat = "identity", width = 0.7
  ) +
  # layer for the WITHIN rows
  geom_bar(
    data = new.df[new.df$location == "WITHIN", ],
    aes(x = UserEmail, y = n2, color = "darkred", fill = context),
    size = 1, stat = "identity", width = 0.7
  ) +
  # label both directions of the y axis with absolute counts
  scale_y_continuous(
    breaks = seq(-20, 20, 5),
    labels = abs(seq(-20, 20, 5))
  ) +
  # fill encodes the interaction type
  scale_fill_manual(
    "Type of interaction",
    values = c("lightyellow", "lightblue"),
    labels = c("Clicked A", "Clicked B")
  ) +
  # the bar outline colour encodes the location
  scale_color_manual(
    "Location of interaction",
    values = c("darkgreen", "darkred"),
    labels = c("NOT_WITHIN", "WITHIN")
  ) +
  # legend keys: outline-only keys for location (order reversed),
  # thin black-bordered filled keys for interaction type
  guides(
    color = guide_legend(
      override.aes = list(
        color = c("darkred", "darkgreen"),
        fill = NA,
        size = 2
      ),
      reverse = TRUE
    ),
    fill = guide_legend(
      override.aes = list(
        fill = c("lightyellow", "lightblue"),
        color = "black",
        size = 0.5
      )
    )
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
    axis.title = element_blank(),
    legend.title = element_text(face = "italic", size = 14),
    legend.key.size = unit(1, "lines"),
    legend.text = element_text(size = 11)
  )
结果是:
如果您想使用百分比值，可以使用 p 列来绘制图表：
# Diverging stacked bar chart of the signed shares p, one bar per user,
# drawn as two layers (one per location), split by context via fill and
# flipped so that the users run along the vertical axis.
ggplot() +
  # layer for the NOT_WITHIN rows
  geom_bar(
    data = new.df[new.df$location == "NOT_WITHIN", ],
    aes(x = UserEmail, y = p, color = "darkgreen", fill = context),
    size = 1, stat = "identity", width = 0.7
  ) +
  # layer for the WITHIN rows
  geom_bar(
    data = new.df[new.df$location == "WITHIN", ],
    aes(x = UserEmail, y = p, color = "darkred", fill = context),
    size = 1, stat = "identity", width = 0.7
  ) +
  # label both directions of the value axis with absolute percentages
  scale_y_continuous(
    breaks = seq(-1, 1, 0.25),
    labels = scales::percent(abs(seq(-1, 1, 0.25)))
  ) +
  # fill encodes the interaction type
  scale_fill_manual(
    "Type of interaction",
    values = c("lightyellow", "lightblue"),
    labels = c("Clicked A", "Clicked B")
  ) +
  # the bar outline colour encodes the location
  scale_color_manual(
    "Location of interaction",
    values = c("darkgreen", "darkred"),
    labels = c("NOT_WITHIN", "WITHIN")
  ) +
  coord_flip() +
  # legend keys: outline-only keys for location (order reversed),
  # thin black-bordered filled keys for interaction type
  guides(
    color = guide_legend(
      override.aes = list(
        color = c("darkred", "darkgreen"),
        fill = NA,
        size = 2
      ),
      reverse = TRUE
    ),
    fill = guide_legend(
      override.aes = list(
        fill = c("lightyellow", "lightblue"),
        color = "black",
        size = 0.5
      )
    )
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.title = element_blank(),
    legend.title = element_text(face = "italic", size = 14),
    legend.key.size = unit(1, "lines"),
    legend.text = element_text(size = 11)
  )
结果是:
回复评论
如果您想将文本标签放置在条形内,您还必须计算位置变量:
# Summarise per user / location / context and add pos, the y position at
# which a text label for each bar segment is placed (on the scale of p).
new.df <- RawDataSet %>%
  group_by(UserEmail, location, context) %>%
  tally() %>%
  # n2: the count with a sign, negative for NOT_WITHIN and positive otherwise
  mutate(n2 = n * ifelse(location == "NOT_WITHIN", -1, 1)) %>%
  group_by(UserEmail, location) %>%
  mutate(
    # p: signed share of each context within its user / location group
    p = n2 / sum(n),
    # pos: "Clicked A" is centred at p / 2 and "Clicked B" at 1 - |p| / 2
    # (with the location's sign); any other context gets 0.
    # NOTE(review): this assumes the "Clicked A" segment is the one stacked
    # next to zero -- verify against the stacking order of your ggplot2 version
    pos = ifelse(
      context == "Clicked A",
      p / 2,
      ifelse(
        context == "Clicked B",
        ifelse(location == "NOT_WITHIN", -1, 1) * (1 - abs(p) / 2),
        0
      )
    )
  )
然后将以下行添加到百分比图的 ggplot 代码中，放在 geom_bar 之后（pos 是按百分比 p 计算的）：
# Label each bar segment with its count n, placed at the precomputed pos
geom_text(data = new.df, aes(x = UserEmail, y = pos, label = n))
结果是:
您还可以使用 label = scales::percent(abs(p)) 代替 label = n 来显示百分比。
关于r - 使用两个属性的组合频率直方图,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/35255234/