python - 预期的二进制或 unicode 字符串，得到 nan - tensorflow/pandas

我对 TensorFlow/机器学习还很陌生，因此遇到了一些困难。我有一个 csv 格式的数据集（here），并想像这个示例（here）那样用 Pandas 来读取它。这种做法在另一个数据集上可以正常工作，但在我进行了修改和扩展之后就出错了，我想我在这里遗漏了一些重要的东西。基本上，我要做的就是根据给定的数据集预测 “overall”（总体）评分。这是我的代码和我得到的回溯:

import pandas as pd
import tensorflow as tf
import tempfile


# Column names applied to both CSV files, in file order (the files' own header
# row is skipped via skiprows=1 below).
COLUMNS = ["reviewerID", "asin", "reviewerName", "helpful_0", "helpful_1", "reviewText",
           "overall", "summary", "unixReviewTime"]

# Columns that input_fn feeds as string-valued sparse features / numeric dense
# features respectively.
CATEGORICAL_COLUMNS = ["reviewerID", "reviewerName", "reviewText", "summary"]
CONTINUOUS_COLUMNS = ["helpful_0", "helpful_1", "unixReviewTime"]

df_train = pd.read_csv('Digital_Music_5.csv', names=COLUMNS, skipinitialspace=True,
                       low_memory=False, skiprows=1)
df_test = pd.read_csv('Digital_Music_5_test.csv', names=COLUMNS,
                      skipinitialspace=True, skiprows=1)

LABEL_COLUMN = "label"


# The label is a copy of the "overall" column of the SAME frame. The test
# labels previously came from df_train, which pairs test rows with unrelated
# training ratings (aligned by index, NaN wherever the index is missing).
df_train[LABEL_COLUMN] = df_train["overall"]
df_test[LABEL_COLUMN] = df_test["overall"]

print(df_train)


def input_fn(df):
    """Build the feature dictionary and label tensor for one DataFrame.

    Args:
        df: DataFrame that is indexed by every name in CONTINUOUS_COLUMNS and
            CATEGORICAL_COLUMNS, and by LABEL_COLUMN.

    Returns:
        A ``(feature_cols, label)`` tuple: ``feature_cols`` maps each column
        name to a ``tf.constant`` (continuous columns) or a
        ``tf.SparseTensor`` (categorical columns); ``label`` is a
        ``tf.constant`` of the LABEL_COLUMN values.
    """
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name
    # (k) to the values of that column stored in a tf.SparseTensor.
    # Each row i contributes exactly one entry at index [i, 0], so the sparse
    # tensor has shape [number of rows, 1].
    # NOTE: the column values are passed through unchanged; a missing (NaN)
    # entry makes the tensor conversion fail with
    # "TypeError: Expected binary or unicode string, got nan".
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label


def train_input_fn():
    """Return the (features, label) pair built from the training DataFrame."""
    return input_fn(df_train)


def eval_input_fn():
    """Return the (features, label) pair built from the test DataFrame."""
    return input_fn(df_test)


# Sparse (string) feature columns, each hashed into 100000 buckets.
# NOTE(review): reviewerID is defined here but is not in the feature_columns
# list passed to the classifier below -- confirm whether that is intended.
reviewText = tf.contrib.layers.sparse_column_with_hash_bucket("reviewText", hash_bucket_size=100000)
reviewerID = tf.contrib.layers.sparse_column_with_hash_bucket("reviewerID", hash_bucket_size=100000)
reviewerName = tf.contrib.layers.sparse_column_with_hash_bucket("reviewerName", hash_bucket_size=100000)
summary = tf.contrib.layers.sparse_column_with_hash_bucket("summary", hash_bucket_size=100000)


# Real-valued (numeric) feature columns.
# NOTE(review): "asin" is in neither CONTINUOUS_COLUMNS nor
# CATEGORICAL_COLUMNS, so the dict returned by input_fn has no "asin" key;
# presumably the model cannot use this column as written -- verify.
asin = tf.contrib.layers.real_valued_column("asin")
helpful_0 = tf.contrib.layers.real_valued_column("helpful_0")
helpful_1 = tf.contrib.layers.real_valued_column("helpful_1")
unixReviewTime = tf.contrib.layers.real_valued_column("unixReviewTime")

# Crossed feature columns, currently disabled.
# reviewText_x_summary = tf.contrib.layers.crossed_column([reviewText, summary], hash_bucket_size=100000)
# reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewerID, reviewerName], hash_bucket_size=100000)
# reviewText_x_reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewText, reviewerID, reviewerName], hash_bucket_size=100000)


# Linear classifier trained with FTRL (L1 and L2 regularization), writing its
# model files to a fresh temporary directory.
# NOTE(review): n_classes is not passed, so the estimator's default is used;
# the label is a copy of the "overall" rating column -- confirm the default
# number of classes matches the set of rating values.
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(feature_columns=[reviewText, reviewerName, summary,
                                                       asin, helpful_0, helpful_1, unixReviewTime], optimizer=tf.train.FtrlOptimizer(
                                                                     learning_rate=0.1,
                                                                     l1_regularization_strength=1.0,
                                                                     l2_regularization_strength=1.0),
                                                       model_dir=model_dir)

# Train for 200 steps; the evaluation pass below is currently disabled.
m.fit(input_fn=train_input_fn, steps=200)
# results = m.evaluate(input_fn=eval_input_fn, steps=1)
# for key in sorted(results):
#     print("{}: {}".format(key, results[key]))

回溯:

Traceback (most recent call last):
  File "amazon_reviews.py", line 78, in <module>
    m.fit(input_fn=train_input_fn, steps=200)
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 280, in new_func
    return func(*args, **kwargs)
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 426, in fit
    loss = self._train_model(input_fn=input_fn, hooks=hooks)
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 932, in _train_model
    features, labels = input_fn()
  File "amazon_reviews.py", line 47, in train_input_fn
    return input_fn(df_train)
  File "amazon_reviews.py", line 36, in input_fn
    dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS}
  File "amazon_reviews.py", line 36, in <dictcomp>
    dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS}
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/sparse_tensor.py", line 125, in __init__
    values, name="values", as_ref=True)
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 702, in internal_convert_to_tensor
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 110, in _constant_tensor_conversion_function
    return constant(v, dtype=dtype, name=name)
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 99, in constant
    tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 451, in make_tensor_proto
    append_fn(tensor_proto, proto_values)
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 109, in SlowAppendObjectArrayToTensorProto
    tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 109, in <listcomp>
    tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
  File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/util/compat.py", line 65, in as_bytes
    (bytes_or_text,))
TypeError: Expected binary or unicode string, got nan

最佳答案

您的输入 DataFrame 包含空的评论者姓名和评论文本，它们通过 pd.read_csv() 映射到 NaN，但是 TensorFlow 需要一个字符串而不是 NaN。

使用此命令检查空单元格:

# Select every row of df_train that has a null (NaN) value in at least one column.
df_train[df_train.isnull().any(axis=1)]

您可以用下面的语句简单地将这些 NaN 转换为空字符串:

# Replace every NaN in df_train with an empty string, modifying the frame in place.
# NOTE(review): this applies to all columns, numeric ones included -- confirm
# that the continuous columns contain no NaN, or restrict the fill to the
# string columns.
df_train.fillna('', inplace=True)

或者让 pd.read_csv() 直接使用 na_values=[] 创建空字符串而不是 NaN:

# na_values only ADDS markers to pandas' built-in NA list, so na_values=[] on
# its own still turns empty cells into NaN. keep_default_na=False switches the
# built-in list off, leaving empty cells as '' strings.
# NOTE(review): with NA detection disabled, a blank cell in a numeric column
# would presumably be kept as text as well -- confirm the continuous columns
# have no gaps.
df_train = pd.read_csv('Digital_Music_5.csv', names=COLUMNS,
                       skipinitialspace=True, low_memory=False,
                       skiprows=1, na_values=[], keep_default_na=False)

关于python - 预期的二进制或 unicode 字符串，得到 nan - tensorflow/pandas，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/43183661/

python - 预期的二进制或 unicode 字符串，得到 nan - tensorflow/pandas

上一篇：python - 如何按长度对单词列表进行排序

下一篇：python - 如何使用 BeautifulSoup 提取 div 的属性值