给定一个 pandas 时间序列(也可以是 numpy 数组或普通的 Python 列表,哪种方便用哪种),我想为序列中的每个点求出等待时间,即该序列下一次回到该点所在水平所需的天数。例如,如果第 T 天的值为 0 且第 T+1 天为正数,我想求出序列再次回到 0 或以下所需等待的天数;如果第 T+1 天为负数,则求序列再次回到 0 或以上所需等待的天数。
import random
import pandas as pd
import numpy as np
# Fixed seed so the random series is reproducible from run to run.
np.random.seed(1234567)
N = 10  # number of observations in the sample series

# pd.util.testing.makeTimeSeries is no longer available in pandas 2.x, so
# build the sample explicitly: standard-normal draws on a business-day index
# (first business day on/after 2000-01-01), accumulated into a random walk.
ts = pd.Series(
    np.random.randn(N),
    index=pd.bdate_range("2000-01-01", periods=N),
).cumsum()
我可以用双循环来做
def min2(x):
    """Return the smallest element of ``x``, or NaN when ``x`` is empty.

    Plain ``min`` raises ``ValueError`` on an empty sequence; returning NaN
    instead lets callers propagate "no match found" through arithmetic.
    """
    # len() rather than truthiness so that array-like inputs also work.
    return min(x) if len(x) > 0 else np.nan
# Brute-force waiting times: for every point, scan forward for the first
# later position at which the series is back at (or beyond) the current level.
out = ts * np.nan  # same index as ts; stays NaN where no return is found
values = ts.values  # positional access, independent of the datetime index
n_points = len(ts)
for idx in range(n_points - 1):  # the last point has no successor
    level = values[idx]
    if values[idx + 1] > level:
        # Series moved up: wait until it is at or below the current level.
        out.iloc[idx] = min2(
            [k for k in range(idx + 2, n_points) if values[k] <= level]) - idx
    elif values[idx + 1] < level:
        # Series moved down: wait until it is at or above the current level.
        out.iloc[idx] = min2(
            [k for k in range(idx + 2, n_points) if values[k] >= level]) - idx
    else:
        # Next value is already at the same level.
        out.iloc[idx] = 1
print(ts)
2000-01-03 -0.514625
2000-01-04 -0.964179
2000-01-05 0.770442
2000-01-06 1.413822
2000-01-07 1.439962
2000-01-10 1.520343
2000-01-11 0.722954
2000-01-12 0.094867
2000-01-13 -0.251360
2000-01-14 0.716725
Freq: B, dtype: float64
print(out)
2000-01-03 2.0
2000-01-04 NaN
2000-01-05 4.0
2000-01-06 3.0
2000-01-07 2.0
2000-01-10 NaN
2000-01-11 NaN
2000-01-12 2.0
2000-01-13 NaN
2000-01-14 NaN
Freq: B, dtype: float64
但是有没有一种有效的方法(对于大 N)?
最佳答案
这是一种使用两个堆栈的方法。遗憾的是,它很难矢量化。尽管如此,在我用 10,000 个样本所做的测试中,它的运行速度比原始的循环实现快了几百倍:
import numpy as np
# Synthetic benchmark signal: the sum of 20 sinusoids with random frequencies.
freqs = np.random.randn(20) / 10
N = 10**4
data = np.sin(np.arange(N)[:, None] * freqs).sum(axis=-1)

# Set to True to run on the ten-point sample from the question instead.
test = False
if test:
    data = """
2000-01-03 -0.514625
2000-01-04 -0.964179
2000-01-05 0.770442
2000-01-06 1.413822
2000-01-07 1.439962
2000-01-10 1.520343
2000-01-11 0.722954
2000-01-12 0.094867
2000-01-13 -0.251360
2000-01-14 0.716725
"""
    # Keep only the numeric column of each "date value" line.
    data = np.array([float(d.strip().split()[1])
                     for d in data.strip().split('\n')])
    # Keep N in step with the data actually in use.
    N = data.size
def min2(x):
    """Return ``min(x)``, or NaN if ``x`` has no elements.

    Avoids the ``ValueError`` that ``min`` raises on an empty sequence so the
    result can be used directly in arithmetic (NaN propagates).
    """
    # len() rather than truthiness so that array-like inputs also work.
    return min(x) if len(x) > 0 else np.nan
def OP(data):
    """Brute-force waiting times (the question's double loop).

    For each position ``i``:

    * if ``data[i+1] > data[i]``, the wait is ``k - i`` for the first
      ``k >= i + 2`` with ``data[k] <= data[i]``;
    * if ``data[i+1] < data[i]``, the wait is ``k - i`` for the first
      ``k >= i + 2`` with ``data[k] >= data[i]``;
    * if ``data[i+1] == data[i]``, the wait is 1.

    Positions with no such ``k`` (including the last one) are NaN.

    Parameters
    ----------
    data : numpy.ndarray
        One-dimensional array of values.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``.
    """
    # Use the length of the argument, not a module-level constant, so the
    # function works for input of any size.
    n = len(data)
    out = data * np.nan
    for idx in range(n - 1):  # the last element has no successor
        level = data[idx]
        following = data[idx + 1]
        if following > level:
            hits = [k for k in range(idx + 2, n) if data[k] <= level]
        elif following < level:
            hits = [k for k in range(idx + 2, n) if data[k] >= level]
        else:
            out[idx] = 1
            continue
        # hits is in ascending order; an empty list means "never returns".
        out[idx] = (min(hits) if hits else np.nan) - idx
    return out
def PP(data):
    """Compute waiting times in a single pass using two stacks.

    Produces the same result as the brute-force double loop: for each
    position ``i``, the number of steps until the series is back at or
    beyond ``data[i]`` on the opposite side from the move at ``i + 1``
    (1 if ``data[i+1] == data[i]``, NaN if it never returns).

    Parameters
    ----------
    data : array_like
        One-dimensional sequence of values.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``.
    """
    data = np.asarray(data)
    wait = np.full(data.shape, np.nan)
    # Indices whose next step went up; each waits for a value <= its level.
    # Levels increase towards the top of the stack.
    after_rise = []
    # Indices whose next step went down; each waits for a value >= its level.
    # Levels decrease towards the top of the stack.
    after_fall = []
    # Walk over consecutive (current, next) pairs; j indexes the current one.
    for j, (do, dn) in enumerate(zip(data[:-1], data[1:])):
        if dn > do:
            after_rise.append(j)
            # An upward move can only resolve indices waiting for a rise.
            while after_fall and dn >= data[after_fall[-1]]:
                i = after_fall.pop()
                wait[i] = j - i + 1
        elif dn < do:
            after_fall.append(j)
            # A downward move can only resolve indices waiting for a fall.
            while after_rise and dn <= data[after_rise[-1]]:
                i = after_rise.pop()
                wait[i] = j - i + 1
        else:
            wait[j] = 1
    return wait
def check(data, wait):
    """Sanity-check computed waiting times against the series.

    Only the non-NaN entries of ``wait`` are examined. For each such
    position ``w`` the function asserts that

    * at ``w + wait[w]`` the series is at ``data[w]`` or on the opposite
      side of it from ``data[w + 1]``, and
    * one step earlier it is still on the same side as ``data[w + 1]``
      (or exactly at ``data[w]``).

    Raises ``AssertionError`` if either condition fails; prints
    ``test passed`` otherwise.
    """
    w = np.where(~np.isnan(wait))[0]
    first_move = data[w + 1] - data[w]
    hops = wait[w].astype(int)
    # A sign product <= 0 means the two moves point in opposite directions
    # (or one of them is zero).
    assert np.all(first_move * (data[w + hops] - data[w]) <= 0)
    # A sign product >= 0 means the series had not yet come back one step
    # earlier.
    assert np.all(first_move * (data[w + hops - 1] - data[w]) >= 0)
    print('test passed')
# Run both implementations, validate the fast one, and compare the results
# (NaN entries count as equal when both outputs are NaN at that position).
waito = OP(data)
wait = PP(data)
check(data, wait)
print('outputs equal',
      np.all((wait == waito) | (np.isnan(wait) & np.isnan(waito))))

from timeit import timeit
print('\nTimings:')
for f in OP, PP:
    # timeit returns the total seconds for number=10 calls; multiplying by
    # 100 converts that to milliseconds per call.
    print('{:16s} {:10.6f} ms'.format(f.__name__, timeit(
        lambda: f(data), number=10) * 100))
示例输出:
test passed
('outputs equal', True)
Timings:
OP 12099.189901 ms
PP 26.705098 ms
关于python,找到连续值时间序列直到下一次出现的等待时间,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43397160/