theano - Strange error in Theano when using updates

Tags: theano

I designed a variable-structure network, but ran into some problems with Theano. The general idea is that different inputs get different networks that share the same parameters, similar to a recursive neural network with an autoencoder. There are two cases in my code: if c > 1 it runs combine_feat_gt1_1(), otherwise it runs combine_feat_gt1_0().

The strange thing is that if I comment out updates=updates, the code runs without any error, which is not what I expected (the train_test Theano function in the code). But if I leave updates=updates in, an error occurs (the train_test_bug Theano function in the code). The latter is what I actually want to implement.

I have already spent several days on this error. Can anyone help me? I would be very grateful.

import os
import sys
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from theano.ifelse import ifelse

class Test(object):

    def __init__(
        self,
        numpy_rng,
        input=None,
        output=None,
        n_output=6,
        n_input=3,
        n_group=2,
        W_r=None,
        b_r=None
    ):

        self.n_output = n_output
        self.n_input = n_input
        self.n_group = n_group

        if not W_r:
            initial_W_r = numpy.asarray(
                numpy_rng.uniform(
                    low=-4 * numpy.sqrt(6. / (n_input + n_input)),
                    high=4 * numpy.sqrt(6. / (n_input + n_input)),
                    size=(n_input, n_input)
                ),
                dtype=theano.config.floatX
            )
            W_r = theano.shared(value=initial_W_r, name='W_r', borrow=True)

        if not b_r:
            b_r = theano.shared(
                value=numpy.zeros(
                    n_input,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )

        self.W_r = W_r
        self.b_r = b_r

        if input is None:
            self.x = T.tensor4(name='input', dtype=theano.config.floatX)
        else:
            self.x = input

        if output is None:
            self.y = T.matrix(name='output', dtype=theano.config.floatX)
        else:
            self.y = output

        self.params = [self.W_r, self.b_r]

    def get_output_values(self, input):
        a, b, c, d = input.shape

        def recusive(x_t, h_tm1, wr, hr):
            h_t = T.dot(h_tm1, wr) + T.dot(x_t, wr) +  hr
            return h_t

        def combine_recusive(data):
            hidden, _ = theano.scan(fn=recusive,
                               sequences=data[1:],
                               outputs_info=data[0],
                               non_sequences=[self.W_r, self.b_r],
                               n_steps=data[1:].shape[0],
                               strict=True)
            return hidden[-1]

        def combine_feat_gt1_1(input):
            feats, _ = theano.scan(fn=combine_recusive,
                                   sequences=input[0],
                                   outputs_info=None,
                                   n_steps=input[0].shape[0])
            recusive_flag = T.ones(1)
            return T.reshape(feats, (1,-1)) # concatenation

        def combine_feat_gt1_0(input):
            feats = input[0]
            recusive_flag = T.zeros(1)
            return T.reshape(feats, (1,-1)) # concatenation

        feat = ifelse(T.gt(c, 1), combine_feat_gt1_1(input), combine_feat_gt1_0(input))

        # debug code snippet
        self.debug_ifelse = theano.function([input], T.gt(c, 1))
        self.debug_1_0 = theano.function([input], ifelse(T.gt(c, 1), 1, 0))

        return feat

    def get_cost_updates(self):

        learning_rate = 0.1
        self.y_given_x = self.get_output_values(self.x)
        cost = T.sum(( self.y_given_x - self.y) ** 2)

        gparams = T.grad(cost, self.params)
        updates = [
             (param, param - learning_rate * gparam)
             for param, gparam in zip(self.params, gparams)
         ]

        return (cost, updates)


if __name__ == "__main__":

    toy_data = numpy.array([[[[1,1,1],[2,2,2]], [[3, 4,5],[4,5,6]]]],dtype=theano.config.floatX)
    lable = numpy.array([[1,2,3,4,5,6]],dtype=theano.config.floatX)
    toy_data2 = numpy.array([[[[1,1,1]], [[3,4,5]]]],dtype=theano.config.floatX)
    lable2 = numpy.array([[6,5,4,3,2,1]],dtype=theano.config.floatX)

    x = T.tensor4('x', dtype=theano.config.floatX)
    y = T.matrix('y', dtype=theano.config.floatX)
    newX = T.tensor4(dtype=x.dtype)
    newY = T.matrix(dtype=y.dtype)

    rng = numpy.random.RandomState(123)
    test = Test(
        numpy_rng=rng,
        input=x,
        output=y,
        n_group=2,
        n_input=3,
        n_output=6
    )

    cost, updates= test.get_cost_updates()

    train_test = theano.function(
        [newX, newY],
        cost,
        # updates=updates,
        givens={
            x : newX,
            y : newY
        }
    )

    train_test_bug = theano.function(
        [newX, newY],
        cost,
        updates=updates,
        givens={
            x : newX,
            y : newY
        }
    )


    print train_test(toy_data, lable)
    print train_test(toy_data2, lable2)

    # code with bug
    # print train_test_bug(toy_data, lable)
    # print train_test_bug(toy_data2, lable2)
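
A plausible reading of why commenting out updates=updates hides the error (my interpretation, not stated in the post): the gradient expressions are built either way, since T.grad is called inside get_cost_updates, but they are only optimized and compiled when they appear in the function's updates, and that compilation step appears to be where the gradient of the doubly-nested scan fails. A minimal sketch of the difference, using a toy cost unrelated to the question's model:

import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.ones(3, dtype=theano.config.floatX), name='w')
x = T.vector('x')
cost = T.sum((x * w) ** 2)
gw = T.grad(cost, w)   # gradient graph is built symbolically here in both cases

# Only the forward cost graph gets compiled; gw is never pulled in.
f_forward = theano.function([x], cost)

# Adding updates pulls the gradient graph into the compiled function as well.
f_train = theano.function([x], cost, updates=[(w, w - 0.1 * gw)])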

EDIT (@danielrenshaw)

I've cut the code down to a simpler demonstration of the problem.

The cause lies in computing the gradient of the doubly-nested scan expression. The problem goes away when a modified inner-most recursive expression is used (see the comments in the first function below).

import numpy
import theano
import theano.tensor as tt
import theano.ifelse


def inner_scan_step(x_t_t, h_tm1, w):
    # Fails when using this recursive expression
    h_t = tt.dot(h_tm1, w) + x_t_t

    # No failure when using this recursive expression
    # h_t = h_tm1 + tt.dot(x_t_t, w)

    return h_t


def outer_scan_step(x_t, w):
    h, _ = theano.scan(inner_scan_step,
                       sequences=[x_t[1:]],
                       outputs_info=[x_t[0]],
                       non_sequences=[w],
                       strict=True)
    return h[-1]


def get_outputs(x, w):
    features, _ = theano.scan(outer_scan_step,
                              sequences=[x],
                              non_sequences=[w],
                              strict=True)
    return tt.grad(features.sum(), w)


def main():
    theano.config.compute_test_value = 'raise'

    x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))

    x = tt.tensor3()
    x.tag.test_value = x_value

    w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)

    f = theano.function(inputs=[x], outputs=get_outputs(x, w))

    print f(x_value)


if __name__ == "__main__":
    main()

Best Answer

I solved the problem in danielrenshaw's edited version: it works once h0 is added as the outputs_info. Before that I used the first element of the sequence as outputs_info, which I believe caused the error. However, I still haven't been able to solve my original problem.

import numpy
import theano
import theano.tensor as tt
import theano.ifelse


def inner_scan_step(x_t_t, h_tm1, w):
    # Fails when using this recursive expression
    h_t = tt.dot(h_tm1, w) + x_t_t

    # No failure when using this recursive expression
    # h_t = h_tm1 + tt.dot(x_t_t, w)

    return h_t


def outer_scan_step(x_t, w, h0):
    h, _ = theano.scan(inner_scan_step,
                       sequences=[x_t],
                       outputs_info=[h0],
                       non_sequences=[w],
                       strict=True)
    return h[-1]


def get_outputs(x, w, h0):
    features, _ = theano.scan(outer_scan_step,
                              sequences=[x],
                              non_sequences=[w, h0],
                              strict=True)
    return tt.grad(features.sum(), w)


def main():
    theano.config.compute_test_value = 'raise'

    x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))

    x = tt.tensor3()
    x.tag.test_value = x_value

    w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
    h0 = theano.shared(value=numpy.zeros(3, dtype=theano.config.floatX), borrow=True)

    f = theano.function(inputs=[x], outputs=get_outputs(x, w, h0))

    print f(x_value)


if __name__ == "__main__":
    main()
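
For reference, here is an untested sketch (not part of the original answer) that extends the same workaround to the full recursion from the question, i.e. both terms go through W_r and the bias b_r is added, while a separate shared h0 (rather than the first sequence element) serves as the inner scan's initial state. Whether this also clears the original updates error is not confirmed; the post notes that the original problem remained unsolved.

import numpy
import theano
import theano.tensor as tt


def inner_scan_step(x_t_t, h_tm1, w, b):
    # full recursion from the question: both terms through w, plus a bias
    return tt.dot(h_tm1, w) + tt.dot(x_t_t, w) + b


def outer_scan_step(x_t, w, b, h0):
    # inner scan consumes the whole sequence and starts from a shared h0,
    # instead of seeding outputs_info with x_t[0]
    h, _ = theano.scan(inner_scan_step,
                       sequences=[x_t],
                       outputs_info=[h0],
                       non_sequences=[w, b],
                       strict=True)
    return h[-1]


def get_grads(x, w, b, h0):
    feats, _ = theano.scan(outer_scan_step,
                           sequences=[x],
                           non_sequences=[w, b, h0],
                           strict=True)
    return tt.grad(feats.sum(), [w, b])


x = tt.tensor3()
w = theano.shared(numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
b = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), borrow=True)
h0 = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), borrow=True)

f = theano.function([x], get_grads(x, w, b, h0))
print f(numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3)))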

Regarding "theano - Strange error in Theano when using updates", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/31804340/
