我正在使用 numpy.genfromtxt() 读取一个大文件。其中一列的格式为 string
,其他列的格式为 floats
(示例:
76920 1995-12-31-00:00 - - - - - - - - - - - - - - - - - - 108 - - - - - - 14.3 - 17.4 - - - - 14.3 - - - - 1005.8 - - - - - - - - - - 2.6 - - - - - - - - - 7.4 - - 54 3.2 5.1 - - - - - - - 7.4 - - 5.7 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 3.7 - - - - -
)。在这种情况下,日期 (1995-12-31-00:00) 应读取为 string
,其他所有内容应读取为 float
(或不存在)。根据manual我可以单独为每一列设置数据类型,但我没有找到一种方法只为一列(在本例中为第二列)设置数据类型,而让其余的随意猜测。这可能吗?或者我是否必须在想要指定其中一种数据类型时立即写下所有数据类型?
此外,我想设置一个“后备”数据类型,如果数据类型无法识别,则使用该数据类型。也就是说,我当前的输出是:
[('Stno', '<i8'), ('DateHourNMT', 'S16'), ('CD', '<i8'), ('CD147', '?'), ('CD15', '?'), ('CD18', '?'), ('CD30', '?'), ('CD40', '?'), ('CD69', '?'), ('CD75', '?'), ('CD98', '?'), ('CV', '<f8'), ('CV147', '?'), ('CV15', '?'), ('CV18', '?'), ('CV30', '?'), ('CV40', '?'), ('CV69', '?'), ('CV75', '?'), ('CV98', '?'), ('DD', '<i8'), ('DG', '?'), ('DG_010', '?'), ('DM1', '?'), ('DMT', '?'), ('DP1', '?'), ('DX', '?'), ('FF', '<f8'), ('FG', '?'), ('FG_010', '<f8'), ('FG_020', '?'), ('FG_1', '?'), ('FX', '<f8'), ('FX_020', '?'), ('FX_1', '?'), ('H13', '?'), ('HM0LF', '?'), ('HMEAN', '?'), ('HS1MAX', '?'), ('PR', '<f8'), ('S1HMAX', '?'), ('S1MAX', '?'), ('S1MEAN', '?'), ('SDP1', '?'), ('SFP', '?'), ('SIGS1', '?'), ('SPR1', '?'), ('SPRT', '?'), ('SPRTP', '?'), ('T13', '?'), ('TA', '<f8'), ('TC', '?'), ('TD', '<f8'), ('THHF', '?'), ('THLF', '?'), ('THTP', '?'), ('TL', '?'), ('TM24', '?'), ('TMAX', '?'), ('TU', '?'), ('TW', '<f8'), ('TWB', '?'), ('UI', '?'), ('UU', '<i8'), ('WHM0', '<f8'), ('WHMAX', '<f8'), ('WL', '?'), ('WL00', '?'), ('WL10', '?'), ('WL20', '?'), ('WL30', '?'), ('WL40', '?'), ('WL50', '?'), ('WTHMAX', '<f8'), ('WTP', '<f8'), ('WTP2', '<f8'), ('WTZ', '<f8'), ('X1CD147', '?'), ('X1CD30', '?'), ('X1CD75', '?'), ('X1CV147', '?'), ('X1CV30', '?'), ('X1CV75', '?'), ('X1DD', '<i8'), ('X1DG_010', '?'), ('X1FF', '<f8'), ('X1FG_010', '<f8'), ('X1FG_020', '?'), ('X1FX', '<f8'), ('X1FX_020', '?'), ('X1HMEAN', '?'), ('X1PR', '<f8'), ('X1SDP1', '?'), ('X1TA', '<f8'), ('X1TD', '<f8'), ('X1TM24', '?'), ('X1TS', '?'), ('X1TW', '?'), ('X1UU', '<i8'), ('X1WHM0', '<f8'), ('X1WHMAX', '<f8'), ('X1WL', '<f8'), ('X1WTHMAX', '<f8'), ('X1WTP', '<f8'), ('X1WTP2', '<f8'), ('X1WTZ', '<f8'), ('X2DD', '?'), ('X2FF', '?'), ('X2FG_010', '?'), ('X2FG_020', '?'), ('X2FX', '?'), ('X2FX_020', '?'), ('X2SDP1', '?'), ('X2TM24', '?'), ('X2TW', '?'), ('X2WHM0', '<f8'), ('X2WHMAX', '<f8'), ('X2WL', '<f8'), ('X2WTHMAX', '<f8'), ('X2WTP', '<f8'), ('X2WTP2', '<f8'), ('X2WTZ', '<f8')]
但我想要这样的东西:
[('Stno', '<i8'), ('DateHourNMT', 'S16'), ('CD', '<i8'), ('CD147', '<f8'), ('CD15', '<f8'), ('CD18', '<f8'), ('CD30', '<f8'), ('CD40', '<f8'), ('CD69', '<f8'), ('CD75', '<f8'), ('CD98', '<f8'), ('CV', '<f8'), ('CV147', '<f8'), ('CV15', '<f8'), ('CV18', '<f8'), ('CV30', '<f8'), ('CV40', '<f8'), ('CV69', '<f8'), ('CV75', '<f8'), ('CV98', '<f8'), ('DD', '<i8'), ('DG', '<f8'), ('DG_010', '<f8'), ('DM1', '<f8'), ('DMT', '<f8'), ('DP1', '<f8'), ('DX', '<f8'), ('FF', '<f8'), ('FG', '<f8'), ('FG_010', '<f8'), ('FG_020', '<f8'), ('FG_1', '<f8'), ('FX', '<f8'), ('FX_020', '<f8'), ('FX_1', '<f8'), ('H13', '<f8'), ('HM0LF', '<f8'), ('HMEAN', '<f8'), ('HS1MAX', '<f8'), ('PR', '<f8'), ('S1HMAX', '<f8'), ('S1MAX', '<f8'), ('S1MEAN', '<f8'), ('SDP1', '<f8'), ('SFP', '<f8'), ('SIGS1', '<f8'), ('SPR1', '<f8'), ('SPRT', '<f8'), ('SPRTP', '<f8'), ('T13', '<f8'), ('TA', '<f8'), ('TC', '<f8'), ('TD', '<f8'), ('THHF', '<f8'), ('THLF', '<f8'), ('THTP', '<f8'), ('TL', '<f8'), ('TM24', '<f8'), ('TMAX', '<f8'), ('TU', '<f8'), ('TW', '<f8'), ('TWB', '<f8'), ('UI', '<f8'), ('UU', '<i8'), ('WHM0', '<f8'), ('WHMAX', '<f8'), ('WL', '<f8'), ('WL00', '<f8'), ('WL10', '<f8'), ('WL20', '<f8'), ('WL30', '<f8'), ('WL40', '<f8'), ('WL50', '<f8'), ('WTHMAX', '<f8'), ('WTP', '<f8'), ('WTP2', '<f8'), ('WTZ', '<f8'), ('X1CD147', '<f8'), ('X1CD30', '<f8'), ('X1CD75', '<f8'), ('X1CV147', '<f8'), ('X1CV30', '<f8'), ('X1CV75', '<f8'), ('X1DD', '<i8'), ('X1DG_010', '<f8'), ('X1FF', '<f8'), ('X1FG_010', '<f8'), ('X1FG_020', '?'), ('X1FX', '<f8'), ('X1FX_020', '<f8'), ('X1HMEAN', '<f8'), ('X1PR', '<f8'), ('X1SDP1', '<f8'), ('X1TA', '<f8'), ('X1TD', '<f8'), ('X1TM24', '<f8'), ('X1TS', '<f8'), ('X1TW', '<f8'), ('X1UU', '<i8'), ('X1WHM0', '<f8'), ('X1WHMAX', '<f8'), ('X1WL', '<f8'), ('X1WTHMAX', '<f8'), ('X1WTP', '<f8'), ('X1WTP2', '<f8'), ('X1WTZ', '<f8'), ('X2DD', '<f8'), ('X2FF', '<f8'), ('X2FG_010', '<f8'), ('X2FG_020', '<f8'), ('X2FX', '<f8'), ('X2FX_020', '<f8'), ('X2SDP1', '<f8'), ('X2TM24', '<f8'), ('X2TW', 
'<f8'), ('X2WHM0', '<f8'), ('X2WHMAX', '<f8'), ('X2WL', '<f8'), ('X2WTHMAX', '<f8'), ('X2WTP', '<f8'), ('X2WTP2', '<f8'), ('X2WTZ', '<f8')]
如果我将默认数据类型设置为float
。这可能吗?
编辑2:错误行:
76920 2005-01-01-00:00 52 - - - - - - - - 0.06 - - - - - - - - 230 - - - - - - 10.1 - 12.3 - - 8.5 - - - - - - 1016.7 - - - - - - - - - - 7.0 - 4.7 - - - - - - - 7.8 - - 85 1.5 2.5 - - - - - - - 6.1 12.8 8.5 4.9 - - - - - - 229 - 10.3 12.3 - 8.6 - - 1016.8 - 7.1 4.7 - - - 84 - 2.8 19.88 6.8 - - - - - - - - - - - - 1.5 2.4 17.40 - 12.6 5.8 4.9
数据获取方式
self.data = numpy.genfromtxt(self.file, skip_footer=0, dtype = float, missing_values = '-', filling_values = -1000, names = True, invalid_raise = False, usemask = True, converters={0: int, 1: str})
最佳答案
通常的解决方案是使用 dtype=None,它告诉 genfromtxt 对每列的数据类型进行有根据的猜测。如果这没有产生所需的数据类型,那么这里有三个选项:
import numpy as np
def using_explicit_dtype(filename):
    """Load a whitespace-delimited table using an explicitly built dtype.

    The first line of the file is taken as the header and supplies the
    field names. The first column is read as a 64-bit integer, the second
    as a 16-byte string and every remaining column as a 64-bit float.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    numpy.ma.MaskedArray
        Structured masked array (``usemask=True``) with one field per
        header token.
    """
    # Text mode keeps the header tokens as ``str``; on Python 3 NumPy
    # rejects ``bytes`` objects as structured-dtype field names.
    with open(filename, 'r') as f:
        header = next(f).split()
        formats = ['<i8', 'S16'] + ['<f8'] * (len(header) - 2)
        # ``zip`` is a lazy iterator on Python 3, and genfromtxt needs a
        # real list of (name, format) pairs, so materialise it.
        dtype = list(zip(header, formats))
        # The header line is already consumed, so genfromtxt only sees data.
        data = np.genfromtxt(f,
                             dtype=dtype,
                             usemask=True)
    return data
def using_default_dtype(filename):
    """Load a whitespace-delimited table with a float default dtype.

    Every column defaults to ``float``; converters override the first
    column (integer) and the second column (text). The first line of the
    file is taken as the header and its tokens become the field names.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Structured array with one field per header token.
    """
    def _as_text(value):
        # Depending on the NumPy version and its ``encoding`` default, a
        # user converter receives either bytes or str. Calling ``str`` on
        # bytes would store the repr ("b'...'"), so decode instead.
        if isinstance(value, bytes):
            return value.decode('latin1')
        return value

    # Text mode keeps the header tokens as ``str``; ``dtype.names`` cannot
    # be assigned ``bytes`` objects on Python 3.
    with open(filename, 'r') as f:
        header = next(f).split()
        data = np.genfromtxt(f,
                             dtype=float,
                             converters={0: int, 1: _as_text})
    # genfromtxt auto-generates field names here; replace them with the
    # header tokens.
    data.dtype.names = tuple(header)
    return data
def using_pandas(filename):
    """Load a whitespace-delimited table into a pandas DataFrame.

    The first line of the file supplies the column names, and a lone
    ``-`` in any cell is treated as a missing value.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header token.
    """
    import pandas as pd

    # Read the file the caller asked for rather than a hard-coded path.
    # ``sep=r'\s+'`` is the supported spelling of the deprecated
    # ``delim_whitespace=True``.
    df = pd.read_table(filename, sep=r'\s+', na_values='-')
    return df
using_explicit_dtype
读取文件的第一行,据此显式地构建所需的 dtype。这样做的一个优点是您可以使用
usemask=True
返回结构化掩码数组。然而,它的缺点是您必须提前显式指定字符串列的长度。
using_default_dtype
使用 dtype=float
设置默认 dtype,并使用转换器(converters)处理第一列和第二列。不幸的是,这与其他一些参数不兼容,例如 filling_values=-1000
、names=True
和
usemask=True
。
using_pandas
使用 pandas 将数据加载到
pandas.DataFrame
中。与 NumPy 结构化数组相比,Pandas 对带有行标签和列标签的表状数组有更好的支持。您可能会发现,这是比处理 NumPy 结构化数组更方便的解决方案。
# Path of the sample data file used by the demonstrations below.
filename = 'data'
# Load the table with the explicitly constructed dtype and display it.
data = using_explicit_dtype(filename)
print(data)
# [ (76920, '1995-12-31-00:00', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 108.0, nan, nan, nan, nan, nan, nan, 14.3, nan, 17.4, nan, nan, nan, nan, 14.3, nan, nan, nan, nan, 1005.8, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 2.6, nan, nan, nan, nan, nan, nan, nan, nan, nan, 7.4, nan, nan, 54.0, 3.2, 5.1, nan, nan, nan, nan, nan, nan, nan, 7.4, nan, nan, 5.7, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 3.7, nan, nan, nan, nan, nan)
# (76920, '2005-01-01-00:00', 52.0, nan, nan, nan, nan, nan, nan, nan, nan, 0.06, nan, nan, nan, nan, nan, nan, nan, nan, 230.0, nan, nan, nan, nan, nan, nan, 10.1, nan, 12.3, nan, nan, 8.5, nan, nan, nan, nan, nan, nan, 1016.7, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 7.0, nan, 4.7, nan, nan, nan, nan, nan, nan, nan, 7.8, nan, nan, 85.0, 1.5, 2.5, nan, nan, nan, nan, nan, nan, nan, 6.1, 12.8, 8.5, 4.9, nan, nan, nan, nan, nan, nan, 229.0, nan, 10.3, 12.3, nan, 8.6, nan, nan, 1016.8, nan, 7.1, 4.7, nan, nan, nan, 84.0, nan, 2.8, 19.88, 6.8, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 1.5, 2.4, 17.4, nan, 12.6, 5.8, 4.9)]
# Show the first five (name, format) pairs of the structured dtype.
print(data.dtype.descr[:5])
# [('Stno', '<i8'), ('DateHourNMT', '|S16'), ('CD', '<f8'), ('CD147', '<f8'), ('CD15', '<f8')]
# using_explicit_dtype passes usemask=True, so the result is a masked array.
print(type(data))
# <class 'numpy.ma.core.MaskedArray'>
<hr/>
# Load the file with the float-default dtype plus per-column converters
# and display it.
data2 = using_default_dtype(filename)
print(data2)
# [ (76920, '1995-12-31-00:00', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 108.0, nan, nan, nan, nan, nan, nan, 14.3, nan, 17.4, nan, nan, nan, nan, 14.3, nan, nan, nan, nan, 1005.8, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 2.6, nan, nan, nan, nan, nan, nan, nan, nan, nan, 7.4, nan, nan, 54.0, 3.2, 5.1, nan, nan, nan, nan, nan, nan, nan, 7.4, nan, nan, 5.7, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 3.7, nan, nan, nan, nan, nan)
# (76920, '2005-01-01-00:00', 52.0, nan, nan, nan, nan, nan, nan, nan, nan, 0.06, nan, nan, nan, nan, nan, nan, nan, nan, 230.0, nan, nan, nan, nan, nan, nan, 10.1, nan, 12.3, nan, nan, 8.5, nan, nan, nan, nan, nan, nan, 1016.7, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 7.0, nan, 4.7, nan, nan, nan, nan, nan, nan, nan, 7.8, nan, nan, 85.0, 1.5, 2.5, nan, nan, nan, nan, nan, nan, nan, 6.1, 12.8, 8.5, 4.9, nan, nan, nan, nan, nan, nan, 229.0, nan, 10.3, 12.3, nan, 8.6, nan, nan, 1016.8, nan, 7.1, 4.7, nan, nan, nan, 84.0, nan, 2.8, 19.88, 6.8, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 1.5, 2.4, 17.4, nan, 12.6, 5.8, 4.9)]
# Show the first five (name, format) pairs of the structured dtype.
print(data2.dtype.descr[:5])
# [('Stno', '<i8'), ('DateHourNMT', '|S16'), ('CD', '<f8'), ('CD147', '<f8'), ('CD15', '<f8')]
# using_default_dtype does not request a mask, so this is a plain ndarray.
print(type(data2))
# <type 'numpy.ndarray'>
# (Python 2 repr shown above; Python 3 prints <class 'numpy.ndarray'>.)
<hr/>
# Load the file into a pandas DataFrame and display it.
df = using_pandas(filename)
print(df)
# Stno DateHourNMT CD CD147 CD15 CD18 CD30 CD40 CD69 CD75 \
# 0 76920 1995-12-31-00:00 NaN NaN NaN NaN NaN NaN NaN NaN
# 1 76920 2005-01-01-00:00 52 NaN NaN NaN NaN NaN NaN NaN
# ... X2SDP1 X2TM24 X2TW X2WHM0 X2WHMAX X2WL X2WTHMAX X2WTP \
# 0 ... NaN NaN NaN NaN 3.7 NaN NaN NaN
# 1 ... NaN NaN NaN 1.5 2.4 17.4 NaN 12.6
# X2WTP2 X2WTZ
# 0 NaN NaN
# 1 5.8 4.9
# Show the dtype of each DataFrame column.
print(df.dtypes)
# Sample output (middle rows elided):
# Stno int64
# DateHourNMT object
# CD float64
# ...
# X2WTP2 float64
# X2WTZ float64
# Length: 122, dtype: object
关于python - 设置一种特定的日期类型以在 genfromtxt 中读取,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28985745/