假设我有两个 numpy 数组,形状为 (d, f)
的 A
和形状为 (d,) 的 I,其中包含 0..n 范围内的索引,例如
I = np.array([0, 0, 1, 0, 2, 1])
A = np.arange(12).reshape(6, 2)
我正在寻找一种对所有切片 A[I == i, :] 快速执行归约的方法,特别是 sum、mean 和 max;一个慢速版本如下:
# Slow reference version: one boolean-masked mean per distinct label in I.
results = np.zeros((I.max() + 1, A.shape[1]))
for label in np.unique(I):
    results[label, :] = np.mean(A[I == label, :], axis=0)
在这种情况下给出
results = [[ 2.66666667, 3.66666667],
           [ 7.        , 8.        ],
           [ 8.        , 9.        ]]
编辑:我根据 Divakar 的回答和之前发帖人的(已删除)基于 pandas
的回答做了一些计时。
时间码:
from __future__ import division, print_function
import numpy as np, pandas as pd
from time import time
np.random.seed(0)  # reproducible benchmark data
# Benchmark dimensions: d rows, f columns, n distinct group labels.
# NOTE: A is d*f = 2.5e8 floats (~2 GB) -- sized for timing, not for tests.
d = 500000
f = 500
n = 500
# Guarantee every label 0..n-1 occurs at least once, then pad with random labels.
I = np.hstack((np.arange(n), np.random.randint(n, size=(d - n,))))
np.random.shuffle(I)  # mix the guaranteed labels in with the random ones
A = np.random.rand(d, f)
def reduce_naive(A, I, op="avg"):
    """Group-wise reduction of A's rows by the labels in I (slow reference).

    One boolean-masked reduction per distinct label.

    Parameters
    ----------
    A : (d, f) array
    I : (d,) int array of non-negative group labels
    op : "avg", "sum" or "max"

    Returns
    -------
    (I.max() + 1, f) array; rows whose label never occurs in I stay zero.
    """
    # Fix: `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `float` is the documented replacement. Means need a
    # float dtype even when A is integer-valued.
    target_dtype = (float if op == "avg" else A.dtype)
    results = np.zeros((I.max() + 1, A.shape[1]), dtype=target_dtype)
    npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op)
    for i in np.unique(I):
        results[i, :] = npop(A[I == i, :], axis=0)
    return results
def reduce_reduceat(A, I, op="avg"):
    """Group-wise reduction via ufunc.reduceat on a label-sorted copy of A.

    Output row k corresponds to the k-th smallest distinct label in I.
    """
    order = I.argsort()
    labels = I[order]
    data = A[order]
    # Start index of each run of equal labels in the sorted order.
    starts = np.r_[0, np.flatnonzero(labels[1:] != labels[:-1]) + 1]
    if op == "max":
        return np.maximum.reduceat(data, starts, axis=0)
    totals = np.add.reduceat(data, starts, axis=0)
    if op == "sum":
        return totals
    if op == "avg":
        # Run lengths: gaps between successive starts, plus the final tail.
        sizes = np.r_[starts[1:] - starts[:-1], A.shape[0] - starts[-1]]
        return totals / sizes.astype(float)[:, None]
def reduce_bincount(A, I, op="avg"):
    """Group sum/mean via np.bincount, with no sorting step.

    Each (row-label, column) pair is encoded as a unique integer id so a
    single weighted bincount accumulates all columns at once. "max" is not
    supported by bincount; that op falls through and returns None.
    """
    n_groups = I.max() + 1
    n_cols = A.shape[1]
    flat_ids = (I[:, None] + n_groups * np.arange(n_cols)).ravel()
    per_group = np.bincount(flat_ids, A.ravel()).reshape(n_cols, -1).T
    if op == "sum":
        return per_group
    if op == "avg":
        return per_group / np.bincount(flat_ids).reshape(n_cols, -1).T
def reduce_pandas(A, I, op="avg"):
    """Group-wise reduction delegated to a pandas groupby.

    Builds a DataFrame from A, attaches the labels as column 'i', and lets
    pandas aggregate; the grouping column itself is excluded from the result.
    """
    frame = pd.DataFrame(A)
    frame['i'] = I
    grouped = frame.groupby('i')
    if op == "sum":
        return grouped.sum().values
    if op == "avg":
        return grouped.mean().values
    if op == "max":
        return grouped.max().values
def reduce_hybrid(A, I, op="avg"):
    """Hybrid group-wise reduction: sort once, then reduce each slice.

    Sorts A's rows by label so each group is a contiguous slice (a view),
    then applies the NumPy reduction per slice.

    Parameters
    ----------
    A : (d, f) array
    I : (d,) int array of non-negative group labels
    op : "avg", "sum" or "max"

    Returns
    -------
    (I.max() + 1, f) array; rows for labels absent from I stay zero.
    """
    sidx = I.argsort()
    sI = I[sidx]
    sortedA = A[sidx]
    # First index of each run of equal labels in the sorted order.
    idx = np.r_[0, np.flatnonzero(sI[1:] != sI[:-1]) + 1]
    unq_sI = sI[idx]
    m = I.max() + 1
    N = A.shape[1]
    # Fix: `np.float` was removed in NumPy 1.24; builtin `float` replaces it.
    target_dtype = (float if op == "avg" else A.dtype)
    out = np.zeros((m, N), dtype=target_dtype)
    # Append the total row count so slice i is ss_idx[i]:ss_idx[i+1].
    ss_idx = np.r_[idx, A.shape[0]]
    npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op)
    for i in range(len(idx)):
        out[unq_sI[i]] = npop(sortedA[ss_idx[i]:ss_idx[i + 1]], axis=0)
    return out
# Benchmark driver: time every reducer for each op, best of 3 runs.
# Fixes: the original was missing a comma after ("hybrid ", reduce_hybrid),
# which made Python *call* that tuple with ("numba ", reduce_numba) as
# arguments -- a TypeError. reduce_numba is defined in a separate answer,
# so it is included only when it actually exists in this module.
for op in ("sum", "avg", "max"):
    methods = [("naive ", reduce_naive),
               ("reduceat", reduce_reduceat),
               ("pandas ", reduce_pandas),
               ("bincount", reduce_bincount),
               ("hybrid ", reduce_hybrid)]
    if "reduce_numba" in globals():
        methods.append(("numba ", reduce_numba))
    for name, method in methods:
        # bincount cannot compute a max, so skip that combination.
        if op == "max" and name == "bincount":
            continue
        # if name != "naive ":
        #     assert np.allclose(method(A, I, op), reduce_naive(A, I, op))
        times = []
        for tries in range(3):
            time0 = time()
            method(A, I, op)
            times.append(time() - time0)
        print(name, op, "{:.2f}".format(np.min(times)))
    print()
时间:
naive sum 1.10
reduceat sum 4.62
pandas sum 5.29
bincount sum 1.54
hybrid sum 0.62
numba sum 0.31
naive avg 1.12
reduceat avg 4.45
pandas avg 5.23
bincount avg 2.43
hybrid avg 0.61
numba avg 0.33
naive max 1.19
reduceat max 3.18
pandas max 5.24
hybrid max 0.72
numba max 0.34
(我选择 d
和 n
作为我的用例的典型值 - 我在我的答案中添加了 numba-versions 的代码)。
最佳答案
方法 #1:使用 NumPy ufunc reduceat
对于这三种归约操作我们都有对应的 ufunc,而且幸运的是还有 ufunc.reduceat,它可以沿某个轴在指定区间内执行这些归约。借助它们,我们可以像下面这样计算这三种操作——
# Gives us sorted array based on input indices I and indices at which the
# sorted array should be interval-limited for reduceat operations to be
# applied later on using those results
def sorted_array_intervals(A, I):
    """Sort A's rows by their group label and locate the group boundaries.

    Returns the label-sorted copy of A together with the start index of
    every run of equal labels -- exactly the interval indices that
    ufunc.reduceat expects.
    """
    order = I.argsort()
    labels = I[order]
    data = A[order]
    # A new interval begins wherever the sorted label changes.
    boundaries = np.flatnonzero(labels[1:] != labels[:-1]) + 1
    starts = np.r_[0, boundaries]
    return data, starts
# Groupby sum reduction using the interval indices
# to perform interval-limited ufunc reductions
def groupby_sum(A, I):
    """Group-wise sum of A's rows, grouped by the labels in I."""
    data, starts = sorted_array_intervals(A, I)
    return np.add.reduceat(data, starts, axis=0)
# Groupby mean reduction
def groupby_mean(A, I):
    """Group-wise mean of A's rows, grouped by the labels in I."""
    data, starts = sorted_array_intervals(A, I)
    totals = np.add.reduceat(data, starts, axis=0)
    # Run lengths: gaps between successive starts, plus the final tail.
    sizes = np.r_[starts[1:] - starts[:-1], A.shape[0] - starts[-1]]
    return totals / sizes.astype(float)[:, None]
# Groupby max reduction
def groupby_max(A, I):
    """Group-wise max of A's rows, grouped by the labels in I."""
    data, starts = sorted_array_intervals(A, I)
    return np.maximum.reduceat(data, starts, axis=0)
因此,如果我们需要所有这些操作,我们可以重用 sorted_array_intervals
的一个实例,就像这样 -
def groupby_sum_mean_max(A, I):
    """Compute group-wise sum, mean and max together, reusing a single sort.

    Returns (sums, means, maxima), each of shape (number of groups, f).
    """
    data, starts = sorted_array_intervals(A, I)
    totals = np.add.reduceat(data, starts, axis=0)
    sizes = np.r_[starts[1:] - starts[:-1], A.shape[0] - starts[-1]]
    means = totals / sizes.astype(float)[:, None]
    maxima = np.maximum.reduceat(data, starts, axis=0)
    return totals, means, maxima
方法 #1-B:混合版本(排序 + 切片 + 归约)
这是一个混合版本,它确实需要 sorted_array_intervals
的帮助来获取排序数组和间隔变为下一组的索引,但在最后阶段使用切片对每个间隔求和,对每个组重复执行此操作。当我们使用 views
时,切片在这里很有用。
实现看起来像这样-
def reduce_hybrid(A, I, op="avg"):
    """Hybrid group-wise reduction: sort once, then reduce contiguous slices.

    After sorting A's rows by label, each group occupies a contiguous slice
    (a cheap view), and the requested NumPy reduction is applied per slice.

    Parameters
    ----------
    A : (d, f) array
    I : (d,) int array of non-negative group labels
    op : "avg", "sum" or "max"

    Returns
    -------
    (I.max() + 1, f) array; rows for labels absent from I stay zero.
    """
    sidx = I.argsort()
    sI = I[sidx]
    sortedA = A[sidx]
    # Get indices at which intervals change (first row of each label run).
    idx = np.r_[0, np.flatnonzero(sI[1:] != sI[:-1]) + 1]
    unq_sI = sI[idx]
    m = I.max() + 1
    N = A.shape[1]
    # Fix: `np.float` was removed in NumPy 1.24; the builtin `float`
    # is the documented replacement.
    target_dtype = (float if op == "avg" else A.dtype)
    out = np.zeros((m, N), dtype=target_dtype)
    # Append the total row count so slice i is ss_idx[i]:ss_idx[i+1].
    ss_idx = np.r_[idx, A.shape[0]]
    npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op)
    for i in range(len(idx)):
        out[unq_sI[i]] = npop(sortedA[ss_idx[i]:ss_idx[i + 1]], axis=0)
    return out
运行时测试(使用问题中发布的基准测试设置)-
In [432]: d = 500000
...: f = 500
...: n = 500
...: I = np.hstack((np.arange(n), np.random.randint(n, size=(d - n,))))
...: np.random.shuffle(I)
...: A = np.random.rand(d, f)
...:
In [433]: %timeit reduce_naive(A, I, op="sum")
...: %timeit reduce_hybrid(A, I, op="sum")
...:
1 loops, best of 3: 1.03 s per loop
1 loops, best of 3: 549 ms per loop
In [434]: %timeit reduce_naive(A, I, op="avg")
...: %timeit reduce_hybrid(A, I, op="avg")
...:
1 loops, best of 3: 1.04 s per loop
1 loops, best of 3: 550 ms per loop
In [435]: %timeit reduce_naive(A, I, op="max")
...: %timeit reduce_hybrid(A, I, op="max")
...:
1 loops, best of 3: 1.14 s per loop
1 loops, best of 3: 631 ms per loop
方法 #2:使用 NumPy bincount
这是另一种方法:使用 np.bincount 进行基于分箱(bin)的求和。借助它,我们可以计算总和与平均值,并且在过程中还能避免排序,像这样——
# Encode each (row-label, column) pair as a unique integer id, then let a
# single weighted bincount accumulate every column at once -- no sorting.
n_groups = I.max() + 1
n_cols = A.shape[1]
ids = (I[:, None] + n_groups * np.arange(n_cols)).ravel()
counts = np.bincount(ids)
sums = np.bincount(ids, A.ravel()).reshape(n_cols, -1).T
avgs = sums / counts.reshape(n_cols, -1).T
关于python - 对连续的非连续切片进行 Numpy 缩减,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/42863146/