java - Why does the output always converge to 0.5?

Tags: java, neural-network, processing

I am trying to solve the XOR problem, but the output always converges to 0.5, so I tried a simpler problem, such as NOT, and the same thing happened.

I really don't know what is going on. I have checked the code a million times and everything seems correct. When I debug it and save the neural network's state, I can see the weight or bias values growing really, really large. To write it I followed the 3Blue1Brown YouTube series on neural networks, along with some other videos.

Here is my code:

PS: I am posting the whole code here, but I think the main problem is inside the backPropag function.

class NeuralNetwork {
  int inNum, hiddenLayersNum, outNum, netSize;
  int[] hiddenLayerSize;
  Matrix[] weights;
  Matrix[] biases;
  Matrix[] sums;
  Matrix[] activations;
  Matrix[] error;
  Matrix inputs;
  long samples = 0;
  float learningRate;

  //Constructor------------------------------------------------------------------------------------------------------

  NeuralNetwork(int inNum, int hiddenLayersNum, int[] hiddenLayerSize, int outNum, float learningRate) {
    this.inNum = inNum;
    this.hiddenLayersNum = hiddenLayersNum;
    this.hiddenLayerSize = hiddenLayerSize;
    this.outNum = outNum;
    this.netSize = hiddenLayersNum + 1;
    this.learningRate = learningRate;
    //netSize = the hidden layers plus the output layer
    //Note: I'm not adding the input layer because it doesn't have weights
    weights = new Matrix[netSize];
    //no biases added to the output layer
    biases = new Matrix[netSize - 1];
    sums = new Matrix[netSize];
    activations = new Matrix[netSize];
    error = new Matrix[netSize];

    initializeHiddenLayer();
    initializeOutputLayer();
  }

  //Initializing Algorithms------------------------------------------------------------------------------------------

  void initializeHiddenLayer() {
    for (int i = 0; i < hiddenLayersNum; i++) {
      if (i == 0) {//only the first hidden layer takes the inputs
        weights[i] = new Matrix(hiddenLayerSize[i], inNum);
      } else {
        weights[i] = new Matrix(hiddenLayerSize[i], hiddenLayerSize[i - 1]);
      }
      biases[i] = new Matrix(hiddenLayerSize[i], 1);
      sums[i] = new Matrix(hiddenLayerSize[i], 1);
      activations[i] = new Matrix(hiddenLayerSize[i], 1);
      error[i] = new Matrix(hiddenLayerSize[i], 1);
    }
  }

  void initializeOutputLayer() {
    //the output layer takes the last hidden layer activation values
    weights[netSize - 1] = new Matrix(outNum, hiddenLayerSize[hiddenLayerSize.length - 1]);
    activations[netSize - 1] = new Matrix(outNum, 1);
    sums[netSize - 1] = new Matrix(outNum, 1);
    error[netSize - 1] = new Matrix(outNum, 1);



    for (Matrix m : weights) {
      for (int i = 0; i < m.i; i++) {
        for (int j = 0; j < m.j; j++) {
          m.values[i][j] = random(-1, 1);
        }
      }
    }
    for (Matrix m : biases) {
      for (int i = 0; i < m.i; i++) {
        for (int j = 0; j < m.j; j++) {
          m.values[i][j] = 1;
        }
      }
    }
    for (Matrix m : sums) {
      for (int i = 0; i < m.i; i++) {
        for (int j = 0; j < m.j; j++) {
          m.values[i][j] = 0;
        }
      }
    }
  }

  //Calculation------------------------------------------------------------------------------------------------------

  void calculate(float[] inputs) {
    this.inputs = new Matrix(0, 0);
    this.inputs = this.inputs.arrayToCollumn(inputs);

    sums[0] = (weights[0].matrixMult(this.inputs)).sum(biases[0]);
    activations[0] = sigM(sums[0]);

    for (int i = 1; i < netSize - 1; i++) {
      sums[i] = weights[i].matrixMult(activations[i - 1]);
      activations[i] = sigM(sums[i]).sum(biases[i]);
    }
    //there are no biases in the output layer
    //and the output layer uses the sigmoid function
    sums[netSize - 1] = weights[netSize - 1].matrixMult(activations[netSize - 1 - 1]);
    activations[netSize - 1] = sigM(sums[netSize - 1]);
  }

  //Sending outputs--------------------------------------------------------------------------------------------------

  Matrix getOuts() {
    return activations[netSize - 1];
  }

  //Backpropagation--------------------------------------------------------------------------------------------------
  void calcError(float[] exp) {
    Matrix expected = new Matrix(0, 0);
    expected = expected.arrayToCollumn(exp);
    //E = (output - expected)
    error[netSize - 1] = this.getOuts().diff(expected);
    samples++;
  }

  void backPropag(int layer) {
    if (layer == netSize - 1) {
      error[layer].scalarDiv(samples);
      for (int i = layer - 1; i >= 0; i--) {
        prevLayerCost(i);
      }
      weightError(layer);
      backPropag(layer - 1);
    } else {
      weightError(layer);
      biasError(layer);
      if (layer != 0)
        backPropag(layer - 1);
    }
  }

  void weightError(int layer) {
    if (layer != 0) {
      for (int i = 0; i < weights[layer].i; i++) {
        for (int j = 0; j < weights[layer].j; j++) {
          float changeWeight = 0;
          if (layer != netSize - 1)
            changeWeight = activations[layer - 1].values[j][0] * deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];
          else
            changeWeight = activations[layer - 1].values[j][0] * deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];

          weights[layer].values[i][j] += -learningRate * changeWeight;
        }
      }
    } else {
      for (int i = 0; i < weights[layer].i; i++) {
        for (int j = 0; j < weights[layer].j; j++) {
          float changeWeight = this.inputs.values[j][0] * deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];
          weights[layer].values[i][j] += -learningRate * changeWeight;
        }
      }
    }
  }


  void biasError(int layer) {
    for (int i = 0; i < biases[layer].i; i++) {
      for (int j = 0; j < biases[layer].j; j++) {
        float changeBias = 0;
        if (layer != netSize - 1)
          changeBias = deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];
        biases[layer].values[i][j] += -learningRate * changeBias;
      }
    }
  }

  void prevLayerCost(int layer) {
    for (int i = 0; i < activations[layer].i; i++) {
      for (int j = 0; j < activations[layer + 1].j; j++) {//for all connections of that neuron to the next layer
        if (layer != netSize - 1)
          error[layer].values[i][0] += weights[layer + 1].values[j][i] * deriSig(sums[layer + 1].values[j][0]) * error[layer + 1].values[j][0];
        else
          error[layer].values[i][0] += weights[layer + 1].values[j][i] * deriSig(sums[layer + 1].values[j][0]) * error[layer + 1].values[j][0];
      }
    }
  } 
  //Activation Functions---------------------------------------------------------------------------------------------

  Matrix reLUM(Matrix m) {
    Matrix temp = m.copyM();
    for (int i = 0; i < temp.i; i++) {
      for (int j = 0; j < temp.j; j++) {
        temp.values[i][j] = ReLU(m.values[i][j]);
      }
    }
    return temp;
  }

  float ReLU(float x) {
    return max(0, x);
  }

  float deriReLU(float x) {
    if (x <= 0)
      return 0;
    else
      return 1;
  }

  Matrix sigM(Matrix m) {
    Matrix temp = m.copyM();
    for (int i = 0; i < temp.i; i++) {
      for (int j = 0; j < temp.j; j++) {
        temp.values[i][j] = sig(m.values[i][j]);
      }
    }
    return temp;
  }

  float sig(float x) {
    return 1 / (1 + exp(-x));
  }

  float deriSig(float x) {
    return sig(x) * (1 - sig(x));
  }
  //Saving Files-----------------------------------------------------------------------------------------------------
  void SaveNeuNet() {
    for (int i = 0; i < weights.length; i++) {
      weights[i].saveM("weights\\weightLayer" + i);
    }
    for (int i = 0; i < biases.length; i++) {
      biases[i].saveM("biases\\biasLayer" + i);
    }
    for (int i = 0; i < activations.length; i++) {
      activations[i].saveM("activations\\activationLayer" + i);
    }
    for (int i = 0; i < error.length; i++) {
      error[i].saveM("errors\\errorLayer" + i);
    }
  }
}  

Here is the Matrix code:

class Matrix {
  int i, j, size;
  float[][] values;

  Matrix(int i, int j) {
    this.i = i;
    this.j = j;
    this.size = i * j;
    values = new float[i][j];
  }

  Matrix sum (Matrix other) {
    if (other.i == this.i && other.j == this.j) {
      for (int x = 0; x < this.i; x++) {
        for (int z = 0; z < this.j; z++) {
          values[x][z] += other.values[x][z];
        }
      }
      return this;
    }
    return null;
  }

  Matrix diff(Matrix other) {
    if (other.i == this.i && other.j == this.j) {
      for (int x = 0; x < this.i; x++) {
        for (int z = 0; z < this.j; z++) {
          values[x][z] -= other.values[x][z];
        }
      }
      return this;
    }
    return null;
  }

  Matrix scalarMult(float k) {
    for (int i = 0; i < this.i; i++) {
      for (int j = 0; j < this.j; j++) {
        values[i][j] *= k;
      }
    }
    return this;
  }

  Matrix scalarDiv(float k) {
    if (k != 0) {
      for (int i = 0; i < this.i; i++) {
        for (int j = 0; j < this.j; j++) {
          values[i][j] /= k;
        }
      }
      return this;
    } else
      return null;
  }

  Matrix matrixMult(Matrix other) {
    if (this.j != other.i)
      return null;
    else {
      Matrix temp = new Matrix(this.i, other.j);

      for (int i = 0; i < temp.i; i++) {
        for (int j = 0; j < temp.j; j++) {
          for (int k = 0; k < this.j; k++) {
            temp.values[i][j] += this.values[i][k] * other.values[k][j];
          }
        }
      }

      return temp;
    }
  }

  Matrix squaredValues(){
     for (int i = 0; i < this.i; i++){
       for (int j = 0; j < this.j; j++){
         values[i][j] = sq(values[i][j]);
       }
     }
     return this;
  }


  void printM() {
    for (int x = 0; x < this.i; x++) {
      print("| ");
      for (int z = 0; z < this.j; z++) {
        print(values[x][z] + " | ");
      }
      println();
    }
  }

  void saveM(String name) {
    String out = "";
    for (int x = 0; x < this.i; x++) {
      out += "| ";
      for (int z = 0; z < this.j; z++) {
        out += values[x][z] + " | ";
      }
      out += "\n";
    }
    saveStrings("outputs\\" + name + ".txt", new String[] {out});
  }

  Matrix arrayToCollumn(float[] array) {
    Matrix temp = new Matrix(array.length, 1);
    for (int i = 0; i < array.length; i++)
      temp.values[i][0] = array[i];
    return temp;
  }

  Matrix arrayToLine(float[] array) {
    Matrix temp = new Matrix(1, array.length);
    for (int j = 0; j < array.length; j++)
      temp.values[0][j] = array[j];
    return temp;
  }
  Matrix copyM(){
    Matrix temp = new Matrix(i, j);
    for (int i = 0; i < this.i; i++){
      for (int j = 0; j < this.j; j++){
        temp.values[i][j] = this.values[i][j];
      }
    }
    return temp;
  }
}

As I said, the output always converges to 0.5 instead of the actual values 1 or 0.

Best answer

I rewrote the code and now it works! I don't know what was wrong with the old code, but this one works:

class NeuralNetwork {
    int netSize;
    float learningRate;
    Matrix[] weights;
    Matrix[] biases;
    Matrix[] activations;
    Matrix[] sums;
    Matrix[] errors;

    NeuralNetwork(int inNum, int hiddenNum, int[] hiddenLayerSize, int outNum, float learningRate) {
        netSize = hiddenNum + 1;
        this.learningRate = learningRate;

        weights = new Matrix[netSize];
        biases = new Matrix[netSize - 1];
        activations = new Matrix[netSize];
        sums = new Matrix[netSize];
        errors = new Matrix[netSize];

        initializeMatrices(inNum, hiddenNum, hiddenLayerSize, outNum);
    }

    //INITIALIZING MATRICES
    void initializeMatrices(int inNum, int hiddenNum, int[] layerSize, int outNum) {

        for (int i = 0; i < hiddenNum; i++) {
            if (i == 0)
                weights[i] = new Matrix(layerSize[0], inNum);
            else
                weights[i] = new Matrix(layerSize[i], layerSize[i - 1]);

            biases[i] = new Matrix(layerSize[i], 1);
            activations[i] = new Matrix(layerSize[i], 1);
            errors[i] = new Matrix(layerSize[i], 1);
            sums[i] = new Matrix(layerSize[i], 1);

            weights[i].randomize(-1, 1);
            biases[i].randomize(-1, 1);
            activations[i].randomize(-1, 1);
        }

        weights[netSize - 1] = new Matrix(outNum, layerSize[layerSize.length - 1]);
        activations[netSize - 1] = new Matrix(outNum, 1);
        errors[netSize - 1] = new Matrix(outNum, 1);
        sums[netSize - 1] = new Matrix(outNum, 1);

        weights[netSize - 1].randomize(-1, 1);
        activations[netSize - 1].randomize(-1, 1);
    }

    //---------------------------------------------------------------------------------------------------------------

    void forwardPropag(float[] ins) {
        Matrix inputs = new Matrix(0, 0);
        inputs = inputs.arrayToCollumn(ins);

        sums[0] = (weights[0].matrixMult(inputs)).sum(biases[0]);
        activations[0] = sigM(sums[0]);

        for (int i = 1; i < netSize - 1; i++) {
            sums[i] = (weights[i].matrixMult(activations[i - 1])).sum(biases[i]);
            activations[i] = sigM(sums[i]);
        }

        //output layer does not have biases
        sums[netSize - 1] = weights[netSize - 1].matrixMult(activations[netSize - 2]);
        activations[netSize - 1] = sigM(sums[netSize - 1]);
    }

    Matrix predict(float[] inputs) {
        forwardPropag(inputs);
        return activations[netSize - 1].copyM();
    }

    //SUPERVISED LEARNING - BACKPROPAGATION
    void train(float[] inps, float[] expec) {
        Matrix expected = new Matrix(0, 0);
        expected = expected.arrayToCollumn(expec);

        errors[netSize - 1] = predict(inps).diff(expected);

        calcErorrPrevLayers();

        adjustWeights(inps);
        adjustBiases();

        for (Matrix m : errors){
            m.reset();
        }
    }

    void calcErorrPrevLayers() {
        for (int l = netSize - 2; l >= 0; l--) {
            for (int i = 0; i < activations[l].i; i++) {
                for (int j = 0; j < activations[l + 1].i; j++) {
                    errors[l].values[i][0] += weights[l + 1].values[j][i] * dSig(sums[l + 1].values[j][0]) * errors[l + 1].values[j][0];
                }
            }
        }
    }

    void adjustWeights(float[] inputs) {
        for (int l = 0; l < netSize; l++) {
            if (l == 0) {
                //for every neuron n in the first layer
                for (int n = 0; n < activations[l].i; n++) {
                    //for every weight w of the first layer
                    for (int w = 0; w < inputs.length; w++) {
                        float weightChange = inputs[w] * dSig(sums[l].values[n][0]) * errors[l].values[n][0];
                        weights[l].values[n][w] += -learningRate * weightChange;
                    }
                }
            } else {
                //for every neuron n in layer l
                for (int n = 0; n < activations[l].i; n++) {
                    //for every weight w of layer l
                    for (int w = 0; w < activations[l - 1].i; w++) {
                        float weightChange = activations[l - 1].values[w][0] * dSig(sums[l].values[n][0]) * errors[l].values[n][0];
                        weights[l].values[n][w] += -learningRate * weightChange;
                    }
                }
            }
        }
    }

    void adjustBiases() {
        for (int l = 0; l < netSize - 1; l++) {
            //for every neuron n in layer l
            for (int n = 0; n < activations[l].i; n++) {
                float biasChange = dSig(sums[l].values[n][0]) * errors[l].values[n][0];
                biases[l].values[n][0] += -learningRate * biasChange;
            }
        }
    }

    //ACTIVATION FUNCTION
    float sig(float x) {
        return 1 / (1 + exp(-x));
    }

    float dSig(float x) {
        return sig(x) * (1 - sig(x));
    }

    Matrix sigM(Matrix m) {
        Matrix temp = m.copyM();
        for (int i = 0; i < m.i; i++) {
            for (int j = 0; j < m.j; j++) {
                temp.values[i][j] = sig(m.values[i][j]);
            }
        }

        return temp;
    }
}
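
Note: the rewritten class calls two Matrix helper methods, randomize() and reset(), that do not exist in the Matrix class posted in the question and are not shown in the answer. A minimal sketch of what they presumably look like (added to the Matrix class, using Processing's random()):

  Matrix randomize(float lo, float hi) {
    //fill every entry with a uniform random value in [lo, hi)
    for (int x = 0; x < this.i; x++) {
      for (int z = 0; z < this.j; z++) {
        values[x][z] = random(lo, hi);
      }
    }
    return this;
  }

  Matrix reset() {
    //zero every entry; train() uses this to clear the accumulated errors after each step
    for (int x = 0; x < this.i; x++) {
      for (int z = 0; z < this.j; z++) {
        values[x][z] = 0;
      }
    }
    return this;
  }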

Regarding "java - Why does the output always converge to 0.5?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/54048302/
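
For completeness, a minimal Processing sketch that trains the rewritten network on XOR; the layer sizes, epoch count, and learning rate below are illustrative guesses, not values from the answer:

float[][] ins  = { {0, 0}, {0, 1}, {1, 0}, {1, 1} };
float[][] outs = { {0}, {1}, {1}, {0} };

void setup() {
  //2 inputs, one hidden layer of 4 neurons, 1 output, learning rate 0.5 (all illustrative)
  NeuralNetwork nn = new NeuralNetwork(2, 1, new int[] {4}, 1, 0.5f);

  for (int epoch = 0; epoch < 10000; epoch++) {
    for (int s = 0; s < ins.length; s++) {
      nn.train(ins[s], outs[s]);
    }
  }

  //the printed outputs should now be close to 0, 1, 1, 0
  for (int s = 0; s < ins.length; s++) {
    nn.predict(ins[s]).printM();
  }
}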
