tensorflow - 使用 tf.py_func 生成输入数据

Python 版本 = 3.6.3 Tensorflow 版本 = 1.3.0

我以前一直使用 Keras,现在想直接使用 TensorFlow。我想实现与 Keras 的 fit_generator 等价的功能,这样就不必在一开始把所有训练数据都加载到内存中,而是可以在训练过程中按需把数据送入网络。下面的代码是我的初步尝试;如果我的做法完全错了,希望有人能告诉我应该查看文档的哪些部分,以及应该用哪些关键字来搜索。

我的系统目前基于一个生成器:它读取 sqlite 数据库文件并提取出 np.array,然后把它们转换成我需要的数据形状(一段时间序列和一个前向预测值)。我现在正尝试把该系统迁移到 TensorFlow 的 Dataset 上,但在使用 tf.py_func 时遇到了困难。下面是我目前的做法:

import tensorflow as tf
import os
from tensorflow.contrib.data import Dataset, Iterator

import sqlite3
import pandas as pd
import numpy as np

# Directory that holds the per-series SQLite files.
DATA_DIR = '/mnt/derived_data/processedData'

# Number of consecutive rows in each look-back window built by
# data_from_files(). The function reads this name but the snippet never
# defined it; 10 is only an example value -- set it to the history length
# your model needs.
LOOKBACK_ROWS = 10

# One dataset element is produced per entry found in DATA_DIR.
files = os.listdir(DATA_DIR)

def data_from_files(f):
    """Read one SQLite file and build look-back windows from its rows.

    The rows returned by the query are turned into a 2-D array. Column 0
    supplies a sliding window of LOOKBACK_ROWS values for every example and
    column 1 supplies the target taken at the last row of that window.

    Args:
        f: File name that is appended to DATA_DIR to form the database path.

    Returns:
        A pair of float32 tensors named 'X' and 'Y', with shapes
        (num_windows, LOOKBACK_ROWS, 1) and (num_windows, 1).

    NOTE(review): when this runs inside tf.py_func, `f` arrives as `bytes`,
    so `DATA_DIR + f` raises "TypeError: must be str, not bytes".
    NOTE(review): tf.py_func expects NumPy arrays back, not tf.Tensor objects.
    """
    db_path = DATA_DIR + f
    with sqlite3.connect(db_path) as connection:
        cursor = connection.execute("SELECT col1, col2, FROM tbl")
        column_names = [column[0] for column in cursor.description]
        table = np.array(cursor.fetchall())

    window = LOOKBACK_ROWS
    n_windows = table.shape[0] - window + 1

    features = np.zeros((n_windows, window, 1), dtype=np.float32)
    targets = np.zeros((n_windows, 1), dtype=np.float32)

    for start in range(n_windows):
        stop = start + window
        # Rows [start, stop) of column 0 form the window; the target is
        # column 1 of the window's last row.
        features[start, :, 0] = table[start:stop, 0]
        targets[start, 0] = table[stop - 1, 1]

    x_tensor = tf.convert_to_tensor(features, name='X')
    y_tensor = tf.convert_to_tensor(targets, name='Y')
    return x_tensor, y_tensor

# One string element per file name.
filenames = tf.constant(files)

dataset = Dataset.from_tensor_slices((filenames))

# tf.py_func takes the Python callable, the list of input tensors and the
# list of output dtypes. Its result is wrapped in a tuple so that
# Dataset.map sees a (X, Y) structure.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files, [filename], [tf.float32, tf.float32])))

iterator     = Iterator.from_structure(dataset.output_types, dataset.output_shapes)
next_element = iterator.get_next()
dataset_init_op = iterator.make_initializer(dataset)

with tf.Session() as sess:
    # An iterator built with from_structure() is not bound to any dataset
    # until its initializer op has been run.
    sess.run(dataset_init_op)

    while True:
        try:
            elem = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            # Leave the loop once every file has been consumed.
            break

初始化运行良好,但是当我启动 session 并运行时出现以下错误:

  2017-10-16 16:58:45.227612: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
    2017-10-16 16:58:45.227615: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0:   Y 
    2017-10-16 16:58:45.227620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0)
    2017-10-16 16:58:45.276138: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes
    2017-10-16 16:58:45.276306: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
    Traceback (most recent call last):
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
        return fn(*args)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn
        status, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
         [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]

    During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/home/usr/code/nn/data_folder/pipeline.py", line 51, in <module>
        elem = sess.run(next_element)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run
        feed_dict_tensor, options, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
        options, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
        raise type(e)(node_def, op, message)
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
         [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]
    >>> python.el: native completion setup loaded


(1) 这看起来正是 py_func 的典型用例,我的理解对吗?如果没错,有没有人能推荐一些比 TensorFlow 文档更深入的资料?(我确实在 GitHub 上注意到一个可能相关的 issue:https://github.com/tensorflow/tensorflow/issues/12396 ,但其中“用 tuple 包装所有返回值”的修复方法对我没有帮助。)

(2) 一般应该遵循怎样的流程?尤其是当我想从一批文件名出发、并为每个文件名输出多个训练样本(Example)的时候。



下面是一个自包含的可运行示例,其中已经合并了 @mrry 回答中的更改:

import tensorflow as tf
import os
import numpy as np


# Number of consecutive rows in each look-back window built by
# data_from_files(). The function reads this name but the example never
# defined it; 10 is only an example value.
LOOKBACK_ROWS = 10

# Write a small random two-column array to disk so the example has a file
# to read back through the input pipeline.
arr = np.random.random_sample((2000, 2))
np.save("npfile.npy", arr)

def data_from_files(f):
    """Load a saved NumPy array and build look-back windows from it.

    Column 0 of the loaded array supplies a sliding window of LOOKBACK_ROWS
    values for every example; column 1 supplies the target taken at the
    last row of that window.

    Args:
        f: Path handed straight to np.load.

    Returns:
        A pair of float32 NumPy arrays (X, Y) with shapes
        (num_windows, LOOKBACK_ROWS, 1) and (num_windows, 1).

    NOTE(review): when this runs inside tf.py_func, `f` arrives as `bytes`,
    and np.load(bytes) fails with "'bytes' object has no attribute 'read'".
    """
    raw = np.load(f)

    window = LOOKBACK_ROWS
    n_windows = raw.shape[0] - window + 1

    features = np.zeros((n_windows, window, 1), dtype=np.float32)
    targets = np.zeros((n_windows, 1), dtype=np.float32)

    for start in range(n_windows):
        stop = start + window
        # Rows [start, stop) of column 0 form the window; the target is
        # column 1 of the window's last row.
        features[start, :, 0] = raw[start:stop, 0]
        targets[start, 0] = raw[stop - 1, 1]

    return features, targets

files = ["npfile.npy"]
filenames = tf.constant(files)

# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`.
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)

# tf.py_func takes the Python callable, the list of input tensors and the
# list of output dtypes.
# NOTE: In TensorFlow 1.4, the `tuple` is no longer needed.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files, [filename], [tf.float32, tf.float32])))

# NOTE: If you only have one `Dataset`, you do not need to use
# `Iterator.from_structure()`.
iterator     = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # An initializable iterator must be initialized before get_next() runs.
    sess.run(iterator.initializer)

    while True:
        try:
            elem = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            # Leave the loop once every file has been consumed.
            break


2017-10-16 18:30:44.143668: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
2017-10-16 18:30:44.143672: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0:   Y 
2017-10-16 18:30:44.143679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0)
2017-10-16 18:30:44.190852: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read'
2017-10-16 18:30:44.190959: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
Traceback (most recent call last):
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
    return fn(*args)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn
    status, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "demo.py", line 48, in <module>
    elem = sess.run(next_element)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run
    feed_dict_tensor, options, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
    options, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]



> 一般应该遵循怎样的流程?尤其是当我想从一批文件名出发、并为每个文件名输出多个训练样本(Example)的时候。

要把一个元素转换为多个元素,请使用 Dataset.flat_map(f) 转换。该转换允许您定义一个函数 f(x),把单个元素 x 映射为一个嵌套的 Dataset 对象,然后由它负责把嵌套的数据集展平。

> 这看起来正是 py_func 的典型用例,我的理解对吗?

这确实是 tf.py_func() 的用例,但是您的程序有一个小错误:tf.py_func() 操作期望您的函数 (data_from_files()) 返回 NumPy 数组,而不是 tf.Tensor 对象。只需直接返回 X, Y 即可。


import tensorflow as tf
import os

import sqlite3
import pandas as pd
import numpy as np

# Directory that holds the per-series SQLite files.
DATA_DIR = '/mnt/derived_data/processedData'

# Number of consecutive rows in each look-back window built by
# data_from_files(). The function reads this name but the snippet never
# defined it; 10 is only an example value -- set it to the history length
# your model needs.
LOOKBACK_ROWS = 10

# One dataset element is produced per entry found in DATA_DIR.
files = os.listdir(DATA_DIR)

def data_from_files(f, lookback_rows=None, data_dir=None):
    """Read one SQLite file and build look-back windows as NumPy arrays.

    Column ``col1`` of table ``tbl`` supplies a sliding window of
    ``lookback_rows`` values for every example; column ``col2`` supplies the
    target taken at the last row of that window.

    Args:
        f: Database file name, as ``str`` or ``bytes``. tf.py_func passes
            string tensors to Python as ``bytes``, so both are accepted.
        lookback_rows: Window length. Defaults to the module-level
            LOOKBACK_ROWS.
        data_dir: Directory containing the file. Defaults to the
            module-level DATA_DIR.

    Returns:
        A pair of float32 NumPy arrays (X, Y) with shapes
        (num_obs, lookback_rows, 1) and (num_obs, 1). Both are empty along
        the first axis when the file has fewer rows than ``lookback_rows``.
    """
    if isinstance(f, bytes):
        f = f.decode('utf-8')
    if lookback_rows is None:
        lookback_rows = LOOKBACK_ROWS
    if data_dir is None:
        data_dir = DATA_DIR

    # Using a sqlite3 connection as a context manager only commits or rolls
    # back; it does not close the connection, so close it explicitly.
    conn = sqlite3.connect(os.path.join(data_dir, f))
    try:
        rows = conn.execute("SELECT col1, col2 FROM tbl").fetchall()
    finally:
        conn.close()

    # reshape keeps the array 2-D even when the table is empty.
    arr = np.array(rows, dtype=np.float32).reshape(-1, 2)

    # Too few rows for a single window means zero examples, not an error.
    num_obs = max(arr.shape[0] - lookback_rows + 1, 0)

    X = np.zeros((num_obs, lookback_rows, 1), dtype=np.float32)
    Y = np.zeros((num_obs, 1), dtype=np.float32)

    for i in range(num_obs):
        # Window covers rows [i, i + lookback_rows); target is its last row.
        X[i, :, 0] = arr[i:i + lookback_rows, 0]
        Y[i, 0] = arr[i + lookback_rows - 1, 1]

    return X, Y

# One string element per file name.
filenames = tf.constant(files)

# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`.
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)

# tf.py_func takes the Python callable, the list of input tensors and the
# list of output dtypes.
# NOTE: In TensorFlow 1.4, the `tuple` is no longer needed.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files, [filename], [tf.float32, tf.float32])))

# NOTE: If you only have one `Dataset`, you do not need to use
# `Iterator.from_structure()`.
iterator     = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # An initializable iterator must be initialized before get_next() runs.
    sess.run(iterator.initializer)

    while True:
        try:
            elem = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            # Leave the loop once every file has been consumed.
            break

