我有一个数据集 example_df,其结构如下所示:
# Toy fixture: 2 measurements x 30 minutes x 10 objects per minute = 600 rows.
# Random columns (grp2, x, y) are drawn in the same order as before so the
# global RNG stream is consumed identically.
example_df = pd.DataFrame({
    'measurement_id': np.repeat([0, 1], 300),
    'min': np.tile(np.repeat(np.arange(30), 10), 2),
    'grp': list(np.repeat(['A', 'B'], 5)) * 60,
    'grp2': list(np.random.choice([0, 1, 2], 10)) * 60,
    'obj': np.tile(np.arange(10), 60),
    'x': np.random.normal(0.0, 10.0, 600),
    'y': np.random.normal(50.0, 40.0, 600),
})
我还有一个函数,它将点组列表作为输入并执行一些计算。我想准备数据并在分组数据框中创建点列表。
我目前的解决方案如下:
def df_to_points(df):
    """Return the rows of *df* as a list of plain tuples.

    Replaces the original iterrows() loop: iterrows() builds a pandas Series
    for every row, which is the dominant cost here. itertuples(index=False,
    name=None) yields the same plain (x, y, ...) tuples directly and is
    typically an order of magnitude faster on numeric frames.
    """
    return list(df.itertuples(index=False, name=None))
# For every (measurement_id, min, grp) group, build a list of point lists:
# one inner list of (x, y) tuples per distinct `grp2` value within the group.
res = example_df \
    .groupby(['measurement_id', 'min', 'grp']) \
    .apply(lambda x: [df_to_points(g[['x', 'y']]) for _, g in x.groupby('grp2')])
res.head(5)  # Series indexed by the 3-level group key
measurement_id min grp
0 0 A [[(7.435996920897324, 63.64844826366264), (-9....
1 B [[(-10.213911323779579, 108.64263032884301), (...
2 A [[(6.004534743892181, 38.11898691750269), (12....
3 B [[(-11.486905682289555, 68.26172126981378), (-...
4 A [[(7.5612638943199295, 28.756743327333556), (-...
其中 res 系列的每一行如下所示:
[[(7.435996920897324, 63.64844826366264),
(-9.722976872232584, 11.831678494223155),
(10.809492206072777, 82.9238481225157),
(-7.918248246978473, 58.46902598333271)],
[(6.270634566510545, 59.10653240815831),
(-5.765185730532471, 22.232739287056663),
(-13.129531349093371, 85.02932179274353)],
[(0.6686875099768917, 60.634711491838786),
(-7.373072676442981, 30.897262347426693),
(-11.489744246260528, 6.834296232736001)]]
问题是我原来的 DataFrame 有几百万行,感觉这个解决方案可以从一些优化中受益。
示例的当前运行时间是:
%timeit res = example_df \
.groupby(['measurement_id', 'min', 'grp']) \
.apply(lambda x: [df_to_points(g[['x', 'y']]) for _, g in x.groupby('grp2')])
289 ms ± 1.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
因此,我的问题是:
- 用 numpy 多维数组替换元组列表会提高性能吗?
- 为了提高速度,是否应该避免任何重大瓶颈?
@Edit:以下示例中,由 grp 定义的各组包含不同数量的对象:
# Variant fixture where the groups defined by `grp` hold unequal object
# counts (A:4, B:4, C:2 within every 10-row minute slice). RNG calls are
# made in the same order as before.
example_df2 = pd.DataFrame({
    'measurement_id': np.repeat([0, 1], 300),
    'min': np.tile(np.repeat(np.arange(30), 10), 2),
    'grp': list(np.repeat(['A', 'B', 'C'], [4, 4, 2])) * 60,
    'grp2': list(np.random.choice([0, 1, 2], 10)) * 60,
    'obj': np.tile(np.arange(10), 60),
    'x': np.random.normal(0.0, 10.0, 600),
    'y': np.random.normal(50.0, 40.0, 600),
})
最佳答案
您可以使用 .pivot_table() 和简单的 aggfunc=list:
# Pack (x, y) into a single list-valued column, then pivot so each grp2 value
# becomes a column whose cells are lists of [x, y] pairs (aggfunc=list
# collects every pair that falls into a (group, grp2) cell).
example_df['combined'] = example_df[['x', 'y']].values.tolist()
example_df = example_df.pivot_table(index=['measurement_id', 'min', 'grp'], columns=['grp2'], values=['combined'], aggfunc=list)
# Collapse the per-grp2 columns into one list-of-lists column.
# NOTE(review): the points come out as [x, y] lists, not (x, y) tuples like
# the df_to_points solution — confirm the downstream function accepts both.
example_df['res'] = example_df.values.tolist()
example_df = example_df.drop(columns=['combined'])
打印:
res
grp2
measurement_id min grp
0 0 A [[[0.9303000896627107, 42.806752849742715], [-...
1 B [[[-18.605643711859955, 117.83261611194004], [...
2 A [[[-7.304055455430749, 18.06452177236371], [-1...
...
<hr/>
使用 timeit 进行基准测试:
# Benchmark fixture. NOTE: `grp` here alternates in blocks of 10 repeated
# 30x, unlike the question's blocks of 5 repeated 60x; values of the random
# columns are drawn in the same order as the original snippet.
example_df = pd.DataFrame({
    'measurement_id': np.repeat([0, 1], 300),
    'min': np.tile(np.repeat(np.arange(30), 10), 2),
    'grp': list(np.repeat(['A', 'B'], 10)) * 30,
    'grp2': list(np.random.choice([0, 1, 2], 10)) * 60,
    'obj': np.tile(np.arange(10), 60),
    'x': np.random.normal(0.0, 10.0, 600),
    'y': np.random.normal(50.0, 40.0, 600),
})
def get_df():
    # Hand each benchmarked solution a fresh copy so that the mutations
    # (e.g. adding the 'combined' column) do not leak between runs.
    return example_df.copy()
def solution_1():
    """Questioner's approach: per-group iterrows() conversion (slow baseline)."""

    def df_to_points(df):
        # Row-by-row conversion via iterrows(); deliberately kept as-is so the
        # benchmark still measures the original implementation.
        pts = []
        for _, row in df.iterrows():
            pts.append(tuple(row))
        return pts

    frame = get_df()
    grouped = frame.groupby(['measurement_id', 'min', 'grp'])
    return grouped.apply(
        lambda sub: [df_to_points(g[['x', 'y']]) for _, g in sub.groupby('grp2')]
    )
def solution_2():
    """Pivot-table approach: vectorized packing of (x, y) pairs per grp2."""
    frame = get_df()
    frame['combined'] = frame[['x', 'y']].values.tolist()
    pivoted = frame.pivot_table(
        index=['measurement_id', 'min', 'grp'],
        columns=['grp2'],
        values=['combined'],
        aggfunc=list,
    )
    # Collect the per-grp2 cells (still including 'combined') into one column,
    # then drop the intermediate columns — same order of operations as before.
    pivoted['res'] = pivoted.values.tolist()
    return pivoted.drop(columns=['combined'])
# Fix: `timeit` is used here but never imported in this first benchmark
# snippet (the import only appears in the later one).
from timeit import timeit

# Run each solution 100 times and report the total wall time.
t1 = timeit(solution_1, number=100)
t2 = timeit(solution_2, number=100)
print(t1)
print(t2)
打印:
21.74300919502275
3.124330924008973
<hr/>
编辑:通过更新的问题,您可以执行以下操作:
# Updated-question variant: groups under `grp` may contain different numbers
# of objects, so some (group, grp2) cells come out of the pivot as NaN.
example_df['combined'] = example_df[['x', 'y']].values.tolist()
example_df = example_df.pivot_table(index=['measurement_id', 'min', 'grp'], columns=['grp2'], values=['combined'], aggfunc=list)
# Per row, keep only the non-NaN cells and collect them into one list.
# NOTE(review): this expression's result is not assigned; in a script you
# would bind it to a variable.
example_df.apply(lambda x: list(x[x.notna()]), axis=1)
基准测试:
from timeit import timeit
# NOTE(review): two fixtures are built back to back; the second assignment
# immediately overwrites the first, so only the frame with the unequal
# ['A', 'B', 'C'] group sizes is actually benchmarked below.
example_df = pd.DataFrame({'measurement_id': np.concatenate([[0] * 300, [1] * 300]),
                           'min': np.concatenate([np.repeat(range(0, 30), 10),
                                                  np.repeat(range(0, 30), 10)]),
                           'grp': list(np.repeat(['A', 'B'], 5)) * 60,
                           'grp2': list(np.random.choice([0, 1, 2], 10)) * 60,
                           'obj': np.array(list(range(0, 10)) * 60),
                           'x': np.random.normal(0.0, 10.0, 600),
                           'y': np.random.normal(50.0, 40.0, 600)})
# Updated-question fixture: group sizes within each minute are A:4, B:4, C:2.
example_df = pd.DataFrame({'measurement_id': np.concatenate([[0] * 300, [1] * 300]),
                           'min': np.concatenate([np.repeat(range(0, 30), 10),
                                                  np.repeat(range(0, 30), 10)]),
                           'grp': list(np.repeat(['A', 'B', 'C'], [4, 4, 2])) * 60,
                           'grp2': list(np.random.choice([0, 1, 2], 10)) * 60,
                           'obj': np.array(list(range(0, 10)) * 60),
                           'x': np.random.normal(0.0, 10.0, 600),
                           'y': np.random.normal(50.0, 40.0, 600)})
def get_df():
    # Hand each benchmarked solution a fresh copy so that the mutations
    # (e.g. adding the 'combined' column) do not leak between runs.
    return example_df.copy()
def solution_1():
    """Questioner's approach: per-group iterrows() conversion (slow baseline)."""

    def df_to_points(df):
        # Row-by-row conversion via iterrows(); deliberately kept as-is so the
        # benchmark still measures the original implementation.
        pts = []
        for _, row in df.iterrows():
            pts.append(tuple(row))
        return pts

    frame = get_df()
    grouped = frame.groupby(['measurement_id', 'min', 'grp'])
    return grouped.apply(
        lambda sub: [df_to_points(g[['x', 'y']]) for _, g in sub.groupby('grp2')]
    )
def solution_2():
    """Pivot-table approach for unequal group sizes: grp2 values absent from
    a group pivot to NaN and are filtered out per row."""
    frame = get_df()
    frame['combined'] = frame[['x', 'y']].values.tolist()
    pivoted = frame.pivot_table(
        index=['measurement_id', 'min', 'grp'],
        columns=['grp2'],
        values=['combined'],
        aggfunc=list,
    )
    return pivoted.apply(lambda row: list(row[row.notna()]), axis=1)
# Run each solution 100 times and report total wall time
# (`timeit` was imported at the top of this snippet).
t1 = timeit(lambda: solution_1(), number=100)
t2 = timeit(lambda: solution_2(), number=100)
print(t1)
print(t2)
打印:
45.391786905995104
13.506823723029811
关于python - 在每组 Pandas 数据框中创建点列表,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59590971/