python-3.x - Unsupported data type for TPU: double, caused by output IteratorGetNext:0

Tags: python-3.x tensorflow keras google-colaboratory tpu

I am building a simple deep learning model with tf.keras, with the following structure:


def build_simple_model(timestep, dim1, show_model_structure=False):

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.LSTM(units=64, input_shape=(timestep, dim1), activation="relu", return_sequences=True, dtype="float64"))
    model.add(tf.keras.layers.LSTM(units=64, input_shape=(timestep, dim1), activation="relu", return_sequences=True, dtype="float64"))
    model.add(tf.keras.layers.LSTM(units=64, input_shape=(timestep, dim1), activation="relu", dtype="float64"))
    model.add(tf.keras.layers.Dense(32, activation="relu", dtype="float64"))
    model.add(tf.keras.layers.Dense(1, dtype="float64"))

    # adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(loss='mse', 
                  optimizer='adam', 
                  metrics=['mape'])

    if show_model_structure:
        model.summary()
        draw_model_graph(model)
    return model 

and set up a TPU on Colab:

resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)

I then build and fit the model using the new TPU API:

with strategy.scope():
    test_model = build_simple_model(23, x_t.shape[2])
    test_model.fit(x_t.astype('float64'), y_train.astype('float64'), epochs=100, batch_size=64)#, callbacks=[callback])

However, a strange error keeps occurring, complaining about a wrong data type.

Full stack trace:

---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _do_call(self, fn, *args)
   1364     try:
-> 1365       return fn(*args)
   1366     except errors.OpError as e:

11 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
   1347       # Ensure any changes to the graph are reflected in the runtime.
-> 1348       self._extend_graph()
   1349       return self._call_tf_sessionrun(options, feed_dict, fetch_list,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _extend_graph(self)
   1387     with self._graph._session_run_lock():  # pylint: disable=protected-access
-> 1388       tf_session.ExtendSession(self._session)
   1389 

InvalidArgumentError: Unsupported data type for TPU: double, caused by output IteratorGetNext:0

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-150-8f5f7911c182> in <module>()
      1 with strategy.scope():
      2     test_model = build_simple_model(23, x_t.shape[2])
----> 3     test_model.fit(x_t.astype('float64'), y_train.astype('float64'), epochs=100, batch_size=64)#, callbacks=[callback])

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    725         max_queue_size=max_queue_size,
    726         workers=workers,
--> 727         use_multiprocessing=use_multiprocessing)
    728 
    729   def evaluate(self,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_distributed.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    617         validation_split=validation_split,
    618         shuffle=shuffle,
--> 619         epochs=epochs)
    620     if not dist_utils.is_distributing_by_cloning(model):
    621       with model._distribution_strategy.scope():

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in _distribution_standardize_user_data(self, x, y, sample_weight, class_weight, batch_size, validation_split, shuffle, epochs, allow_partial_batch)
   2270         session = None
   2271       else:
-> 2272         session = K.get_session()
   2273 
   2274       first_x_value = nest.flatten(x)[0]

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py in get_session(op_input_list)
    484   if not _MANUAL_VAR_INIT:
    485     with session.graph.as_default():
--> 486       _initialize_variables(session)
    487   return session
    488 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py in _initialize_variables(session)
    901     # marked as initialized.
    902     is_initialized = session.run(
--> 903         [variables_module.is_variable_initialized(v) for v in candidate_vars])
    904     uninitialized_vars = []
    905     for flag, v in zip(is_initialized, candidate_vars):

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    954     try:
    955       result = self._run(None, fetches, feed_dict, options_ptr,
--> 956                          run_metadata_ptr)
    957       if run_metadata:
    958         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1178     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1179       results = self._do_run(handle, final_targets, final_fetches,
-> 1180                              feed_dict_tensor, options, run_metadata)
   1181     else:
   1182       results = []

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1357     if handle is None:
   1358       return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1359                            run_metadata)
   1360     else:
   1361       return self._do_call(_prun_fn, handle, feeds, fetches)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in _do_call(self, fn, *args)
   1382                     '\nsession_config.graph_options.rewrite_options.'
   1383                     'disable_meta_optimizer = True')
-> 1384       raise type(e)(node_def, op, message)
   1385 
   1386   def _extend_graph(self):

InvalidArgumentError: Unsupported data type for TPU: double, caused by output IteratorGetNext:0

Does anyone know what is going on here?
Thanks in advance.

Best answer

I ran into the same issue when running a Keras model on a TPU.

I found that it is related to the data type of the y labels (the second argument) passed to .fit. The only dtype that worked smoothly for me was int64; any float type produced the same exception.

In my case, although I had planned to train the model with binary crossentropy, which requires float labels, I was able to work around it by using categorical crossentropy with int labels, as sketched below.
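
To make that concrete, here is a minimal sketch of the int-label workaround, assuming a classification setup like mine (the names train_x and train_labels are placeholders, and I use the sparse variant of categorical crossentropy here so the integer labels can be passed to .fit directly):

# Sketch only: integer labels avoid the float-dtype issue described above.
# Assumes the model ends in a softmax layer with one unit per class.
int_labels = train_labels.astype('int64')  # int64 was the only dtype that worked for me

model.compile(loss='sparse_categorical_crossentropy',  # accepts integer class labels
              optimizer='adam',
              metrics=['accuracy'])
model.fit(train_x, int_labels, epochs=100, batch_size=64)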

In your case I see your loss is mse, so I am not sure whether you can get around it with int labels.

Another approach that got past this exception, but then ran into other exceptions, is the following:

Define the y label vector as a tf.Variable:

from tensorflow.keras import backend as K  # backend import for get_session (graph-mode, TF 1.x-style API)

sess = K.get_session()

...

y = tf.Variable(my_baseline_y_vector, dtype='bfloat16') # any float dtype EXCEPT float64

Then, before calling .fit:

train_y = y.eval(session=sess)

which converts it back into an array that can be passed to .fit:

model.fit(train_x, train_y, ...)
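
Putting those fragments together, a rough end-to-end sketch of this second workaround could look like the following (my_baseline_y_vector, train_x, and model are placeholders from above, and it assumes a TF version where the graph-mode K.get_session() backend API is available, as in the traceback):

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

sess = K.get_session()                      # TF 1.x-style Keras backend session

# Placeholder labels; in practice this is your real y vector.
my_baseline_y_vector = np.random.rand(1000, 1)

# Wrap the labels in a Variable with any float dtype EXCEPT float64.
y = tf.Variable(my_baseline_y_vector, dtype='bfloat16')
sess.run(y.initializer)                     # the Variable must be initialized before eval in graph mode

train_y = y.eval(session=sess)              # back to an array that .fit accepts
model.fit(train_x, train_y, epochs=100, batch_size=64)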

As mentioned, this got past the dtype exception discussed here, but then hit other exceptions :(. I gave up and am using categorical crossentropy with int labels. If that is not an option for you, I would be happy to hear how you handled the other exceptions :).

Regarding "python-3.x - Unsupported data type for TPU: double, caused by output IteratorGetNext:0", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/58535392/
