tensorflow - 使用 tf.py_func 生成输入数据

标签 tensorflow keras generator pipeline tensorflow-datasets

Python 版本 = 3.6.3 Tensorflow 版本 = 1.3.0

我以前一直使用 Keras,现在正尝试直接使用 TensorFlow。我想实现一个与 Keras 的 fit_generator 等价的功能,这样就不必在一开始把所有训练数据加载到内存中,而是可以在训练需要时再把数据送入网络。下面的代码是我朝这个方向所做的尝试;如果我的做法完全错了,希望有人能告诉我应该查看文档的哪个部分,以及应该用哪些关键字来搜索。

我的系统目前基于一个生成器,该生成器读取 sqlite 数据库文件以提取 np.arrays,然后将它们转换成我想要的数据形状(一个时间序列和一个前向预测)。我现在正在尝试迁移该系统以使用 Tensorflow Dataset 并在应用 tf.py_func 时遇到困难。这是我现在尝试的工作方式

import tensorflow as tf
import os
from tensorflow.contrib.data import Dataset, Iterator

import sqlite3
import pandas as pd
import numpy as np

# Number of consecutive rows that make up one input window in data_from_files.
LOOKBACK_ROWS = 600 
# Directory whose entries are listed below and opened by data_from_files.
# NOTE(review): there is no trailing path separator here, while
# data_from_files builds the path with `DATA_DIR + f` -- confirm.
DATA_DIR = '/mnt/derived_data/processedData'

# Entry names inside DATA_DIR (os.listdir returns names without the
# directory part).
files = os.listdir(DATA_DIR)

def data_from_files(f):
    """Build look-back windows and targets from one SQLite file.

    Opens the database at ``DATA_DIR + f``, reads two columns from table
    ``tbl`` and turns them into an input array ``X`` of shape
    ``(num_obs, LOOKBACK_ROWS, 1)`` and a target array ``Y`` of shape
    ``(num_obs, 1)``, both float32. Window ``i`` holds rows
    ``i .. i + LOOKBACK_ROWS - 1`` of the first column; its target is the
    second column on the window's last row.

    Returns:
        ``(X, Y)`` converted to ``tf.Tensor`` objects.

    NOTE(review): this function is invoked through ``tf.py_func``; the run
    log reports ``TypeError: must be str, not bytes`` from the PyFunc node,
    so ``f`` presumably arrives as ``bytes`` and ``DATA_DIR + f`` fails --
    confirm.
    """
    with sqlite3.connect(DATA_DIR + f) as conn:
        # NOTE(review): the comma right before FROM looks like a SQL syntax
        # error -- confirm against the real query.
        results = conn.execute("SELECT col1, col2, FROM tbl")
        # Column names from the cursor description; not used afterwards.
        col_names = [d[0] for d in results.description]
        arr = np.array(results.fetchall())

    # Number of full windows of LOOKBACK_ROWS rows that fit in the data.
    num_obs = arr.shape[0] - LOOKBACK_ROWS + 1

    X = np.zeros((num_obs, LOOKBACK_ROWS, 1), dtype = np.float32)
    Y = np.zeros((num_obs, 1), dtype = np.float32)

    for i in range(num_obs):
        # idx is the last row of window i.
        idx = i + LOOKBACK_ROWS - 1
        X[i , :, 0] = arr[(idx - LOOKBACK_ROWS + 1):(idx + 1), 0]
        Y[i, 0] = arr[idx, 1]

    # The arrays are wrapped in TensorFlow tensors before being returned.
    return tf.convert_to_tensor(X, name = 'X'), tf.convert_to_tensor(Y, name = 'Y')

# Wrap the file names in a constant tensor so they can be sliced into a dataset.
filenames = tf.constant(files)

# One dataset element per file name.
dataset = Dataset.from_tensor_slices((filenames))

# Call data_from_files through tf.py_func for every file name; the op is
# declared to produce two float32 outputs, which are wrapped in a tuple.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files,
    [filename],
    [tf.float32, tf.float32])))


# Iterator defined by the dataset's types and shapes; dataset_init_op binds
# it to `dataset` when run.
iterator     = Iterator.from_structure(dataset.output_types, dataset.output_shapes)
next_element = iterator.get_next()
dataset_init_op = iterator.make_initializer(dataset)

with tf.Session() as sess:
    sess.run(dataset_init_op)

    # Pull elements until the iterator reports that it is exhausted.
    while True:
        try:
            elem = sess.run(next_element)
            print('Success')
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            break

初始化运行良好,但是当我启动 session 并运行时出现以下错误:

  2017-10-16 16:58:45.227612: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
    2017-10-16 16:58:45.227615: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0:   Y 
    2017-10-16 16:58:45.227620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0)
    2017-10-16 16:58:45.276138: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes
    2017-10-16 16:58:45.276306: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
    Traceback (most recent call last):
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
        return fn(*args)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn
        status, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__
        next(self.gen)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
        pywrap_tensorflow.TF_GetCode(status))
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
         [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]

    During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/home/usr/code/nn/data_folder/pipeline.py", line 51, in <module>
        elem = sess.run(next_element)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run
        run_metadata_ptr)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run
        feed_dict_tensor, options, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
        options, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
        raise type(e)(node_def, op, message)
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
         [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]
    >>> python.el: native completion setup loaded
    >>> 

问题

(1) 这似乎正是 py_func 的用例,但我错了吗?如果没有,谁能指出一些比 Tensorflow 文档更深入的资源? (我确实在 git 上注意到一个潜在的相关问题:https://github.com/tensorflow/tensorflow/issues/12396 但用 tuple 包装所有内容的修复对我没有帮助)。

(2) 我应该遵循的一般流程是什么,特别是我想从一堆文件名开始并为每个文件名输出多个训练 Example 的地方?

谢谢。

下面我重写了我的脚本,以便它可以成为一个独立的可运行示例。我相信问题仍然与上面的代码相同,但我也重新粘贴错误以确认。

自包含的可运行代码示例合并了@mrry 的回答中的更改:

import tensorflow as tf
import os
import numpy as np

# Number of consecutive rows that make up one input window in data_from_files.
LOOKBACK_ROWS = 600 

# Fixture for the example: 2000 rows x 2 columns of random floats, written
# to "npfile.npy" in the current working directory.
arr = np.random.random_sample((2000, 2))
np.save("npfile.npy", arr)

def data_from_files(f):
    """Load a saved array and cut it into look-back windows with targets.

    ``f`` is handed to ``np.load`` unchanged. Each window is
    ``LOOKBACK_ROWS`` consecutive values of column 0; its target is the
    column 1 value on the window's last row.

    Returns:
        ``(X, Y)`` as float32 NumPy arrays of shape
        ``(num_windows, LOOKBACK_ROWS, 1)`` and ``(num_windows, 1)``.
    """
    series = np.load(f)
    window_count = series.shape[0] - LOOKBACK_ROWS + 1

    X = np.zeros((window_count, LOOKBACK_ROWS, 1), dtype=np.float32)
    Y = np.zeros((window_count, 1), dtype=np.float32)

    for start in range(window_count):
        stop = start + LOOKBACK_ROWS  # one past the window's last row
        X[start, :, 0] = series[start:stop, 0]
        Y[start, 0] = series[stop - 1, 1]

    return X, Y

# The single fixture file written by np.save above.
files = ["npfile.npy"]
filenames = tf.constant(files)


# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`.
# One dataset element per file name.
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)

# NOTE: In TensorFlow 1.4, the `tuple` is no longer needed.
# data_from_files runs as a Python callback for every file name and is
# declared to produce two float32 outputs.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files,
    [filename],
    [tf.float32, tf.float32])))

# NOTE: If you only have one `Dataset`, you do not need to use
# `Iterator.from_structure()`.
iterator     = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # The iterator must be initialized before the first get_next() is run.
    sess.run(iterator.initializer)

    # Pull elements until the iterator reports that it is exhausted.
    while True:
        try:
            elem = sess.run(next_element)
            print('Success')
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            break

错误:

2017-10-16 18:30:44.143668: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
2017-10-16 18:30:44.143672: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0:   Y 
2017-10-16 18:30:44.143679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0)
2017-10-16 18:30:44.190852: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read'
2017-10-16 18:30:44.190959: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
Traceback (most recent call last):
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
    return fn(*args)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn
    status, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__
    next(self.gen)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "demo.py", line 48, in <module>
    elem = sess.run(next_element)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run
    run_metadata_ptr)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run
    feed_dict_tensor, options, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
    options, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]

最佳答案

以相反的顺序回答你的问题:

What is the general flow I should be following, particularly where I want to start with something like a bunch of filenames and output more than one training Example per file name?

要将一个元素转换为多个元素,请使用 Dataset.flat_map(f) 转换。该转换允许您定义一个函数 f(x),它将单个元素 x 映射到一个嵌套的 Dataset 对象,然后由 flat_map 负责将这些嵌套的数据集展平。

This seems like exactly a use case for py_func but am I wrong about that?

这确实是 tf.py_func() 的用例,但是您的程序有一个小错误:tf.py_func() 操作期望您的函数 (data_from_files()) 返回 NumPy 数组,而不是 tf.Tensor 对象。只需返回 X, Y 即可。


回答完这两点后,让我们看看如何重写代码:

import tensorflow as tf
import os

import sqlite3
import pandas as pd
import numpy as np

# Number of consecutive rows that make up one input window in data_from_files.
LOOKBACK_ROWS = 600 
# Directory whose entries are listed below and opened by data_from_files.
DATA_DIR = '/mnt/derived_data/processedData'

# Entry names inside DATA_DIR (os.listdir returns names without the
# directory part).
files = os.listdir(DATA_DIR)

def data_from_files(f):
    """Build look-back windows and targets from one SQLite file.

    Reads ``col1`` and ``col2`` from table ``tbl`` of the database named
    ``f`` inside ``DATA_DIR``. Every run of ``LOOKBACK_ROWS`` consecutive
    ``col1`` values becomes one input window, and the ``col2`` value on the
    window's last row becomes its target.

    Args:
        f: File name relative to ``DATA_DIR``. When this function is called
            through ``tf.py_func`` on Python 3 the name arrives as ``bytes``,
            so both ``bytes`` and ``str`` are accepted.

    Returns:
        A tuple ``(X, Y)`` of float32 NumPy arrays (``tf.py_func`` expects
        NumPy arrays, not tensors) with shapes
        ``(num_obs, LOOKBACK_ROWS, 1)`` and ``(num_obs, 1)``, where
        ``num_obs = max(row_count - LOOKBACK_ROWS + 1, 0)``. A file with
        fewer than ``LOOKBACK_ROWS`` rows therefore yields empty arrays
        instead of raising.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    # String tensors are handed to the Python callback as bytes; decode so
    # the name can be joined with the (str) directory.
    if isinstance(f, bytes):
        f = f.decode('utf-8')

    # `with sqlite3.connect(...)` only manages the transaction and leaves the
    # connection open, so close it explicitly.
    conn = sqlite3.connect(os.path.join(DATA_DIR, f))
    try:
        rows = conn.execute("SELECT col1, col2 FROM tbl").fetchall()
    finally:
        conn.close()

    # Number of full windows of LOOKBACK_ROWS rows that fit in the data.
    num_obs = max(len(rows) - LOOKBACK_ROWS + 1, 0)

    X = np.zeros((num_obs, LOOKBACK_ROWS, 1), dtype=np.float32)
    Y = np.zeros((num_obs, 1), dtype=np.float32)
    if num_obs == 0:
        return X, Y

    arr = np.array(rows)
    for i in range(num_obs):
        # Window i covers rows i .. i + LOOKBACK_ROWS - 1 of the first column.
        X[i, :, 0] = arr[i:i + LOOKBACK_ROWS, 0]
    # Target of window i is the second column on the window's last row.
    Y[:, 0] = arr[LOOKBACK_ROWS - 1:, 1]

    return X, Y

# Wrap the file names in a constant tensor so they can be sliced into a dataset.
filenames = tf.constant(files)

# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`.
# One dataset element per file name.
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)

# NOTE: In TensorFlow 1.4, the `tuple` is no longer needed.
# data_from_files runs as a Python callback for every file name and is
# declared to produce two float32 outputs.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files,
    [filename],
    [tf.float32, tf.float32])))

# NOTE: If you only have one `Dataset`, you do not need to use
# `Iterator.from_structure()`.
iterator     = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # The iterator must be initialized before the first get_next() is run.
    sess.run(iterator.initializer)

    # Pull elements until the iterator reports that it is exhausted.
    while True:
        try:
            elem = sess.run(next_element)
            print('Success')
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            break

关于tensorflow - 使用 tf.py_func 生成输入数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/46779211/

相关文章:

python - 'tensorflow' 没有属性 'to_int32'

tensorflow - Keras:下载 Fashion_MNIST 数据时出错

python - 如何使用tensorflow数据集读取多个.mat文件(太大而无法放入内存)

python - Keras Transfer-Learning 设置 layers.trainable 为 True 无效

python - 如何在 .map 函数中访问张量形状?

tensorflow - keras + scikit-learn 包装器,当 GridSearchCV 与 n_jobs >1 时似乎挂起

python - Keras:在生成器中加载图像与在生成器外部加载图像

python - 从调用生成器的函数返回或产生?

具有递归调用的Python生成器

cuda - 没有root安装Cuda