我有以下数据框
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Report the installed matplotlib version; the value recorded below is the one the question was written against.
print('matplotlib: {}'.format(matplotlib.__version__))
# 3.5.3
# Sample data for the question: every row is one observation with a
# category ("Type"), a numeric feature ("Length") and a 0/1 class ("label").
sample_columns = {
    'Type': ['Sentence', 'Array', 'String', '-', '-',
             'Sentence', 'Array', 'String', '-', '-',
             'Sentence'],
    'Length': [42, 21, 11, 6, 6,
               42, 21, 11, 6, 6,
               42],
    'label': [1, 1, 0, 0, 0,
              1, 1, 0, 0, 0,
              1],
}
df = pd.DataFrame(sample_columns)
# Show the frame on the console.
print(df)
# Type Length label
#0 Sentence 42 1
#1 Array 21 1
#2 String 11 0
#3 - 6 0
#4 - 6 0
#5 Sentence 42 1
#6 Array 21 1
#7 String 11 0
#8 - 6 0
#9 - 6 0
#10 Sentence 42 1
我想为数据框中的任意列(数值列,例如 Length;或分类列,例如 Type)绘制堆积条形图,按 label 列进行堆叠,并用计数/百分比对每一段进行注释,同时对于罕见观测值对应的较小数值也要显示注释。以下脚本给出了错误的结果:
# Question's first attempt, kept as posted apart from the restored
# indentation: plot the raw frame as stacked bars and label every rectangle.
# NOTE(review): this plots the frame's raw values per row rather than counts
# per label, which is presumably why the result is wrong — confirm.
ax = df.plot.bar(stacked=True)
#ax = df[["Type","label"]].plot.bar(stacked=True)
#ax = df.groupby('Type').size().plot(kind='bar', stacked=True)
ax.legend(["0: normanl", "1: Anomaly"])
# Write each rectangle's height, formatted as a percentage, at its centre.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    ax.text(x+width/2,
            y+height/2,
            '{:.0f} %'.format(height),
            horizontalalignment='center',
            verticalalignment='center')
我猜想,需要先以某种方式计算所选列相对于 label 列的计数:
## frequency of every distinct value, computed column by column;
## these counts are meant to become the bar labels
counts = df.apply(lambda column: column.value_counts())
## normalise each row by its own total; the resulting fractions are meant
## to determine the height of each bar
row_totals = counts.sum(axis=1)
percents = counts.div(row_totals, axis=0)
我尝试使用 df.groupby(['selected column', 'label']) 来解决该问题,但没有成功。我在这个 Google Colab Notebook 中收集了所有可能的解决方案;尽管如此,我仍找不到一种直接的方法将它们应用到这个数据框上。
到目前为止,我受这篇帖子(post)的启发尝试了以下解决方案:使用 df.groupby(['selected column', 'label']) 来解决问题,但没有成功;在执行 total = sum(dff.sum()) 时得到了 TypeError: unsupported operand type(s) for +: 'int' and 'str'。
不知道问题出在哪里:是出在索引上,还是出在 df 的转换上?
顺便说一句,我在这个 Google Colab Notebook 中收集了所有可能的解决方案;尽管如此,我找不到一种直接的方法通过 Matplotlib 将它们应用到这个数据框上。所以我正在寻找一种使用 Seaborn 或 plotly 的优雅方式。
# Question's second attempt, reproduced as posted; the question reports that it does not work.
# Count the values of the remaining column for every (Type, label) pair.
# NOTE(review): after this groupby, "Type" and "label" presumably live in the index rather than in columns — confirm.
df = df.groupby(["Type","label"]).count()
#dfp_Type = df.pivot_table(index='Type', columns='label', values= 'Length', aggfunc='mean')
# NOTE(review): `Series.size` is an integer attribute in pandas, so calling `df.Type.size()` is expected to raise;
# `values` should presumably name a column instead — confirm.
dfp_Type = df.pivot_table(index='Type', columns='label', values= df.Type.size(), aggfunc='mean')
#dfp_Length = df.pivot_table(index='Length', columns='label', values= df.Length.size(), aggfunc='mean')
ax = dfp_Type.plot(kind='bar', stacked=True, rot=0)
# iterate through each bar container
# NOTE(review): `labels` is rebuilt for every container but is never passed to bar_label below.
for c in ax.containers: labels = [v.get_height() if v.get_height() > 0 else '' for v in c]
# add the annotations
# NOTE(review): as written this call sits outside the loop, so only the last container `c` is annotated.
ax.bar_label(c, fmt='%0.0f%%', label_type='center')
# move the legend
ax.legend(title='Class', bbox_to_anchor=(1, 1.02), loc='upper left')
plt.show()
输出:
预期输出:
最佳答案
预期输出中的值与 OP 中的 df 不匹配,因此示例 DataFrame 已更新。使用 pandas.DataFrame.plot 绘图,并设置 kind='bar' 和 stacked=True。pandas 使用并导入 matplotlib 作为默认绘图后端,因此无需导入其他绘图库。参考资源:
- How to aggregate unique count with pandas pivot_table:了解在 .pivot_table 中使用 aggfunc=len 的详细信息。
- How to add value labels on a bar chart:有关 .bar_label 的详细信息和示例。
- How to add multiple annotations to a bar plot 与 How to create and annotate a stacked proportional bar chart:用于向条形图添加计数和百分比。
在 python 3.10、pandas 1.4.3、matplotlib 3.5.1 中测试:
import pandas as pd
# Sample data; the labels of the last two rows differ from the question's
# frame (the answer updated the example to match the expected output).
df = pd.DataFrame(
    {
        'Type': ['Sentence', 'Array', 'String', '-', '-',
                 'Sentence', 'Array', 'String', '-', '-',
                 'Sentence'],
        'Length': [42, 21, 11, 6, 6, 42, 21, 11, 6, 6, 42],
        'label': [1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0],
    }
)
# Count the rows for every (Type, label) pair: one row per Type, one column
# per label; pairs that never occur come out as NaN.
dfp = pd.pivot_table(df, index='Type', columns='label', values='Length', aggfunc=len)
# Number of observations per Type across all labels.
total = dfp.sum(axis=1)
# Share of each label within its Type, in percent, rounded to two decimals.
per = (dfp.div(total, axis=0) * 100).round(2)
# plot the pivoted dataframe as stacked bars: one bar per Type, one segment per label
ax = dfp.plot(kind='bar', stacked=True, figsize=(10, 8), rot=0)
# text colour used for the annotations of each Class, keyed by the container label (a string)
segment_colors = {'0': 'white', '1': 'black'}
# iterate through the containers
for c in ax.containers:
    # get the current segment label (a string); corresponds to column / legend
    label = c.get_label()
    # create custom "count\n(percent%)" labels from the bar height and the matching per column;
    # bars whose height is not > 0 (zero or NaN) get an empty label
    # the column labels in per and dfp are int, so convert label to int
    labels = [f'{v.get_height()}\n({row}%)' if v.get_height() > 0 else '' for v, row in zip(c, per[int(label)])]
    # add the annotation, centred inside each segment
    ax.bar_label(c, labels=labels, label_type='center', fontweight='bold', color=segment_colors[label])
# move the legend: anchor its upper-left corner at the top-right of the axes
_ = ax.legend(title='Class', bbox_to_anchor=(1, 1.01), loc='upper left')
评论更新
- 如果数据中没有 'Array',如何始终为其保留位置:
  - 如果 'Array' 不在 dfp.index 中,请将其添加到 dfp 中。
  - df.Type = pd.Categorical(df.Type, ['-', 'Array', 'Sentence', 'String'], ordered=True) 不能确保缺失的类别会被绘制。
- 如何显示所有注释,即使数值很小:
  - 不要堆叠条形图,并设置 logy=True。
- 这里使用了链接中提供的完整数据。
# pivot the dataframe and get len (row count for every Type / label pair)
dfp = df.pivot_table(index='Type', columns='label', values='Length', aggfunc=len)
# append an all-NaN 'Array' row if it's not included, so the category still gets a slot on the x-axis
if 'Array' not in dfp.index:
    # float('nan') is the same value as numpy's nan; no numpy import is visible in this file
    dfp = pd.concat([dfp, pd.DataFrame({0: [float('nan')], 1: [float('nan')]}, index=['Array'])])
# order the index
dfp = dfp.loc[['-', 'Array', 'Sentence', 'String'], :]
# calculate the percent for each row
per = dfp.div(dfp.sum(axis=1), axis=0).mul(100).round(2)
# plot the pivoted dataframe as side-by-side (unstacked) bars on a log-scaled y-axis
ax = dfp.plot(kind='bar', stacked=False, figsize=(10, 8), rot=0, logy=True, width=0.75)
# iterate through the containers
for c in ax.containers:
    # get the current segment label (a string); corresponds to column / legend
    label = c.get_label()
    # create custom "count\n(percent%)" labels from the bar height and the matching per column;
    # bars whose height is not > 0 (zero or NaN) get an empty label
    # the column labels in per and dfp are int, so convert label to int
    labels = [f'{v.get_height()}\n({row}%)' if v.get_height() > 0 else '' for v, row in zip(c, per[int(label)])]
    # add the annotation at the edge of each bar
    ax.bar_label(c, labels=labels, label_type='edge', fontsize=10, fontweight='bold')
# move the legend: anchor its upper-left corner at the top-right of the axes
ax.legend(title='Class', bbox_to_anchor=(1, 1.01), loc='upper left')
# pad the spacing between the number and the edge of the figure
_ = ax.margins(y=0.1)
数据帧 View
- 基于 OP 中的示例数据
df
Type Length label
0 Sentence 42 1
1 Array 21 1
2 String 11 0
3 - 6 0
4 - 6 0
5 Sentence 42 1
6 Array 21 1
7 String 11 0
8 - 6 0
9 - 6 1
10 Sentence 42 0
dfp
label 0 1
Type
- 3.0 1.0
Array NaN 2.0
Sentence 1.0 2.0
String 2.0 NaN
total
Type
- 4.0
Array 2.0
Sentence 3.0
String 2.0
dtype: float64
per
label 0 1
Type
- 75.00 25.00
Array NaN 100.00
Sentence 33.33 66.67
String 100.00 NaN
关于python - 创建堆积条形图并用计数和百分比进行注释,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/73568416/