tensorflow - How can the same model converge in Keras but fail to converge in TensorFlow?

Tags: tensorflow deep-learning keras lstm

I'm trying to use an LSTM in TensorFlow, but I've found that I can't get even a simple IMDB sentiment model to converge.

I took a Keras model and tried to replicate exactly the same model in TensorFlow. In Keras it trains and converges, but in TensorFlow it just gets stuck at a certain point (a loss of 0.69, roughly ln 2, i.e. chance level for a binary task).

I tried to make them as similar as possible. The only difference I can see is that in Keras the padding comes before the sequence, while in TensorFlow I use 'post' padding because of TensorFlow's convention.
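For reference, the difference between the two padding conventions looks like this (a minimal sketch using the same Keras sequence utilities):

from tensorflow.contrib.keras.python.keras.preprocessing import sequence

seqs = [[1, 2, 3]]
# 'pre' puts the zeros before the data (the Keras default), 'post' after it.
print(sequence.pad_sequences(seqs, maxlen=5, padding='pre'))   # [[0 0 1 2 3]]
print(sequence.pad_sequences(seqs, maxlen=5, padding='post'))  # [[1 2 3 0 0]]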

Any idea what is wrong with my TensorFlow model?

from __future__ import print_function

import random
import numpy as np

from tensorflow.contrib.keras.python.keras.preprocessing import sequence
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense, Dropout, Activation
from tensorflow.contrib.keras.python.keras.layers import Embedding
from tensorflow.contrib.keras.python.keras.layers import LSTM
from tensorflow.contrib.keras.python.keras.layers import Conv1D, MaxPooling1D
from tensorflow.contrib.keras.python.keras.datasets import imdb

import tensorflow as tf

# Embedding
max_features = 30000
maxlen = 2494
embedding_size = 128

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
batch_size = 30
epochs = 2


class TrainData:
    def __init__(self, batch_sz=batch_size):
        (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)

        # Convert integer labels to one-hot pairs: label 1 -> [1, 0], otherwise [0, 1].
        y_train = [[int(x == 1), int(x != 1)] for x in y_train]
        self._batch_size = batch_sz

        # Without an explicit maxlen, pad_sequences pads every review to the
        # length of the longest one (2494 for this dataset).
        self._train_data = sequence.pad_sequences(x_train, padding='pre')

        self._train_labels = y_train

    def next_batch(self):
        # Reload the dataset once fewer than batch_size samples remain.
        if len(self._train_data) < self._batch_size:
            self.__init__()

        batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
        self._train_data = self._train_data[self._batch_size:]
        self._train_labels = self._train_labels[self._batch_size:]

        return batch_x, batch_y

    def batch_generator(self):
        while True:
            if len(self._train_data) < self._batch_size:
                self.__init__()

            batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
            self._train_data = self._train_data[self._batch_size:]
            self._train_labels = self._train_labels[self._batch_size:]

            yield batch_x, batch_y

    def get_num_batches(self):
        return int(len(self._train_data) / self._batch_size)

def length(sequence):
    # Per-example sequence length: count the non-zero (non-padding) tokens,
    # assuming 0 is the padding id.
    used = tf.sign(tf.abs(sequence))
    length = tf.reduce_sum(used, axis=1)
    length = tf.cast(length, tf.int32)
    return length


def get_model(x, y):
    embedding = tf.get_variable("embedding", [max_features, embedding_size], dtype=tf.float32)
    embedded_x = tf.nn.embedding_lookup(embedding, x)
    # Debug output: inspect the tensor shapes.
    print(x)
    print(embedded_x)
    print(length(x))

    cell_1 = tf.contrib.rnn.BasicLSTMCell(lstm_output_size)
    output_1, state_1 = tf.nn.dynamic_rnn(cell_1, embedded_x, dtype=tf.float32, scope="rnn_layer1",
                                          sequence_length=length(x))

    # Select the output at the last time step.
    last_index = tf.shape(output_1)[1] - 1
    # Transpose to [seq_length, batch_size, num_units] so tf.gather can pick a time step.
    output = tf.transpose(output_1, [1, 0, 2])

    last = tf.gather(output, last_index)

    # Softmax layer
    with tf.name_scope('fc_layer'):
        weight = tf.get_variable(name="weights", shape=[lstm_output_size, 2])
        bias = tf.get_variable(shape=[2], name="bias")

    logits = tf.matmul(last, weight) + bias

    loss = tf.losses.softmax_cross_entropy(y, logits=logits)

    optimizer = tf.train.AdamOptimizer()
    optimize_step = optimizer.minimize(loss=loss)

    return loss, optimize_step


def tf_model():
    x_holder = tf.placeholder(tf.int32, shape=[None, maxlen])
    y_holder = tf.placeholder(tf.int32, shape=[None, 2])
    loss, opt_step = get_model(x_holder, y_holder)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        step = 0
        for epoch in range(10):
            cost_epochs = []
            train_data = TrainData()
            cost_batch = 0
            for batch in range(train_data.get_num_batches()):
                x_train, y_train = train_data.next_batch()
                _, cost_batch = sess.run([opt_step, loss],
                                         feed_dict={x_holder: x_train,
                                                    y_holder: y_train})

                cost_epochs.append(cost_batch)
                step += 1
                # if step % 100 == 0:
                # Note: with the interval guard commented out, this prints after every batch.
                print("Epoch: " + str(epoch))
                print("\tcost: " + str(np.mean(cost_epochs)))



def keras_model():
    # print('Loading data...')
    # Only the test split is used here; training batches come from TrainData below.
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

    y_test = [[int(x == 1), int(x != 1)] for x in y_test]

    x_test = sequence.pad_sequences(x_test, maxlen=maxlen, padding='pre')

    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=maxlen))

    model.add(LSTM(lstm_output_size))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train...')
    data = TrainData()
    model.fit_generator(data.batch_generator(), steps_per_epoch=data.get_num_batches(),
                        epochs=epochs,
                        validation_data=(x_test, y_test))


if __name__ == '__main__':
    # keras_model()
    tf_model()

Edit

When I limit the sequence length to 100, both models converge, so I assume there is something different in the LSTM layer.
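Concretely, "limiting the sequence length" means truncating or padding every review to 100 tokens. A minimal sketch of that change, assuming it is applied in TrainData.__init__ and that the maxlen constant is shrunk to match:

maxlen = 100  # also shrinks the x placeholder, which is shaped [None, maxlen]
self._train_data = sequence.pad_sequences(x_train, maxlen=maxlen, padding='pre')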

Best Answer

Check the initial (default) values of your operations. In my case, the Adadelta optimizer in Keras had an initial learning rate of 1.0, while in tf.keras it was 0.001, so on the MNIST dataset it converged very slowly.
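To rule out this kind of mismatch, pass the learning rate explicitly on both sides instead of relying on the defaults; a minimal sketch (0.001 is the documented default of tf.train.AdamOptimizer):

# TensorFlow side: make the learning rate explicit.
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

# Keras side: construct the optimizer object instead of passing the string 'adam'.
from tensorflow.contrib.keras.python.keras.optimizers import Adam
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.001),
              metrics=['accuracy'])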

Regarding "tensorflow - How can the same model converge in Keras but fail to converge in TensorFlow?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/44225529/
