java - 反向传播的神经网络不收敛

基本上我正在尝试在网络中实现反向传播。我知道反向传播算法是硬编码的，但我试图首先使其发挥作用。

它适用于一组输入和输出，但超出一个训练集，网络会收敛于一个解决方案，而另一输出收敛于 0.5。

即一次试验的输出是: [0.9969527919933012，0.003043774988797313]

[0.5000438200377985，0.49995612243030635]

Network.java

private ArrayList<ArrayList<ArrayList<Double>>> weights;
private ArrayList<ArrayList<Double>> nodes;

private final double LEARNING_RATE = -0.25;
private final double DEFAULT_NODE_VALUE = 0.0;

private double momentum = 1.0;

public Network() {
    weights = new ArrayList<ArrayList<ArrayList<Double>>>();
    nodes = new ArrayList<ArrayList<Double>>();
}

/**
 * This method is used to add a layer with {@link n} nodes to the network.
 * @param n number of nodes for the layer
 */
public void addLayer(int n) {
    nodes.add(new ArrayList<Double>());
    for (int i = 0;i < n;i++)
        nodes.get(nodes.size()-1).add(DEFAULT_NODE_VALUE);
}

/**
 * This method generates the weights used to link layers together.
 */
public void createWeights() {
    // there are only weights between layers, so we have one less weight layer than node layer
    for (int i = 0;i < nodes.size()-1;i++) {
        weights.add(new ArrayList<ArrayList<Double>>());

        // for each node above the weight
        for (int j = 0;j < nodes.get(i).size();j++) {
            weights.get(i).add(new ArrayList<Double>());

            // for each node below the weight
            for (int k = 0;k < nodes.get(i+1).size();k++)
                weights.get(i).get(j).add(Math.random()*2-1);
        }
    }
}

/**
 * Utilizes the differentiated sigmoid function to change weights in the network
 * @param out   The desired output pattern for the network
 */
private void propogateBackward(double[] out) {
    /*
     * Error calculation using squared error formula and the sigmoid derivative
     * 
     * Output Node : dk = Ok(1-Ok)(Ok-Tk)
     * Hidden Node : dj = Oj(1-Oj)SummationkEK(dkWjk)
     * 
     * k is an output node
     * j is a hidden node
     * 
     * dw = LEARNING_RATE*d*outputOfpreviousLayer(not weighted)
     * W = dW + W
     */

    // update the last layer of weights first because it is a special case

    double dkW = 0;

    for (int i = 0;i < nodes.get(nodes.size()-1).size();i++) {

        double outputK = nodes.get(nodes.size()-1).get(i);
        double deltaK = outputK*(1-outputK)*(outputK-out[i]);

        for (int j = 0;j < nodes.get(nodes.size()-2).size();j++) {
            weights.get(1).get(j).set(i, weights.get(1).get(j).get(i) + LEARNING_RATE*deltaK*nodes.get(nodes.size()-2).get(j) );
            dkW += deltaK*weights.get(1).get(j).get(i);
        }
    }

    for (int i = 0;i < nodes.get(nodes.size()-2).size();i++) {

        //Hidden Node : dj = Oj(1-Oj)SummationkEK(dkWjk)
        double outputJ = nodes.get(1).get(i);
        double deltaJ = outputJ*(1-outputJ)*dkW*LEARNING_RATE;

        for (int j = 0;j < nodes.get(0).size();j++) {
            weights.get(0).get(j).set(i, weights.get(0).get(j).get(i) + deltaJ*nodes.get(0).get(j) );
        }


    }

}

/**
 * Propogates an array of input values through the network
 * @param in    an array of inputs
 */
private void propogateForward(double[] in) {
    // pass the weights to the input layer
    for (int i = 0;i < in.length;i++)
        nodes.get(0).set(i, in[i]);

    // propagate through the rest of the network
    // for each layer after the first layer
    for (int i = 1;i < nodes.size();i++)

        // for each node in the layer
        for (int j = 0;j < nodes.get(i).size();j++) {

            // for each node in the previous layer
            for (int k = 0;k < nodes.get(i-1).size();k++)

                // add to the node the weighted output from k to j
                nodes.get(i).set(j, nodes.get(i).get(j)+weightedNode(i-1, k, j));

            // once the node has received all of its inputs we can apply the activation function
            nodes.get(i).set(j, activation(nodes.get(i).get(j)));

        }   
}

/**
 * This method returns the activation value of an input
 * @param   in the total input of a node
 * @return  the sigmoid function at the input
 */
private double activation(double in) {
    return 1/(1+Math.pow(Math.E,-in));
}

/**
 * Weighted output for a node.
 * @param layer the layer which the transmitting node is on
 * @param node  the index of the transmitting node
 * @param previousNode  the index of the receiving node
 * @return  the output of the transmitting node times the weight between the two nodes
 */
private double weightedNode(int layer, int node, int nextNode) {
    return nodes.get(layer).get(node)*weights.get(layer).get(node).get(nextNode);
}

/**
 * This method resets all of the nodes to their default value
 */
private void resetNodes() {
    for (int i = 0;i < nodes.size();i++)
        for (int j = 0;j < nodes.get(i).size();j++)
            nodes.get(i).set(j, DEFAULT_NODE_VALUE);
}

/**
 * Teach the network correct responses for certain input values.
 * @param in    an array of input values
 * @param out   an array of desired output values
 * @param n     number of iterations to perform
 */
public void train(double[] in, double[] out, int n) {
    for (int i = 0;i < n;i++) {
        propogateForward(in);
        propogateBackward(out);
        resetNodes();
    }
}

public void getResult(double[] in) {
    propogateForward(in);
    System.out.println(nodes.get(2));
    resetNodes();
}

SnapSolve.java

public SnapSolve() {

    Network net = new Network();
    net.addLayer(2);
    net.addLayer(4);
    net.addLayer(2);
    net.createWeights();

    double[] l = {0, 1};
    double[] p = {1, 0};

    double[] n = {1, 0};
    double[] r = {0, 1};

    for(int i = 0;i < 100000;i++) {
        net.train(l, p, 1);
        net.train(n, r, 1);
    }

    net.getResult(l);
    net.getResult(n);

}

public static void main(String[] args) {
    new SnapSolve();
}

最佳答案

建议

您在网络中使用的初始权重非常大。通常，您希望按照与单元扇入平方根的倒数成比例的方式初始化 sigmoid 激活神经网络中的权重。因此，对于网络第 i 层中的单元，选择正负 n^{-1/2} 之间的初始权重，其中 n 是第 i-1 层中的单元数量。 (有关详细信息，请参阅 http://www.willamette.edu/~gorr/classes/cs449/precond.html。)
您似乎使用的学习率参数也相当大，这可能会导致您的网络在训练期间“反弹”。我会在对数刻度上尝试不同的值:0.2、0.1、0.05、0.02、0.01、0.005，...，直到找到一个效果更好的值。
您实际上只训练了两个示例(尽管您使用的网络应该能够轻松地对这两点进行建模)。您可以通过向现有输入添加噪声并期望网络产生正确的输出来增加训练数据集的多样性。我发现，当使用平方误差损失(就像您正在使用的那样)并尝试学习像 XOR 这样的二元 boolean 运算符时，这有时会有所帮助，因为真正的函数域中很少有输入输出对可以训练.

监控

此外，我想提出一个可能有助于您解决此类问题的一般性建议:添加一些代码，以便您在给定已知输入输出时监视网络的当前错误对(或整个“验证”数据集)。

如果您可以在训练期间监控网络的误差，它将帮助您更清楚地看到网络何时收敛——当您训练网络时，误差应该稳步下降。如果它到处反弹，您就会知道您使用的学习率太大，或者需要以其他方式调整您的训练数据集。如果误差增加，则说明您的梯度计算出现问题。

关于java - 反向传播的神经网络不收敛，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/19229650/

java - 反向传播的神经网络不收敛

建议

监控

上一篇：java - log4j maven 依赖项下载超时

下一篇：java - h2 留下锁定文件，即使只创建一个连接并立即断开连接