我有以下神经网络代码,我只是试图从基本问题(例如 XOR 问题)开始,同时构建代码库。这是一个业余爱好项目。
#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
// Scalar type used for every weight, input, output and error term in this file.
using DataType = float;

// Signature every activation function handed to a Neuron must have.
using ActivationFuncPtr = DataType (*)(const DataType&);

// Step size applied to each gradient component in Neuron::AdjustWeights.
static DataType learningRate = DataType(0.02);

// Upper bound on the number of training epochs run by main().
static std::size_t numberEpochs = 1000000;
/// Logistic sigmoid activation.
///
/// @param x net input of a neuron.
/// @return 1 / (1 + e^(-x)).
DataType sigmoid(const DataType& x)
{
    const DataType one(1);
    const DataType denominator = one + std::exp(-x);
    return one / denominator;
}
/// Stateless functor that hands out uniformly distributed values of type T.
///
/// All instances of Random<T> for the same T share one engine and one
/// distribution (both are static members), so constructing a Random<T>
/// is free and does not restart the sequence.
template<typename T>
class Random
{
public:
    /// @return the next value drawn from the shared distribution.
    T operator()()
    {
        return m_dis(m_mt);
    }

protected:
    static std::mt19937 m_mt;                        // shared Mersenne Twister engine
    static std::uniform_real_distribution<T> m_dis;  // shared distribution, constructed with bounds (0, 1)
};

// The engine is seeded once per T with the number of milliseconds elapsed
// since the system clock's epoch, so separate program runs normally start
// from different weights.
template<typename T>
std::mt19937 Random<T>::m_mt(
    static_cast<std::mt19937::result_type>(
        std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::system_clock::now().time_since_epoch()).count()));

template<typename T>
std::uniform_real_distribution<T> Random<T>::m_dis(T(0), T(1));
template<std::size_t NumInputs>
class Neuron
{
public:
Neuron(ActivationFuncPtr activationFunction)
:
m_activationFunction(activationFunction)
{
Random<DataType> r;
std::generate(m_weights.begin(),m_weights.end(),[&]()
{
return r();
});
m_biasWeight = r();
}
void FeedForward(const std::array<DataType,NumInputs>& inputValues)
{
DataType sum = m_biasWeight;
for(std::size_t i = 0; i < inputValues.size(); ++i)
sum += inputValues[i] * m_weights[i];
m_output = m_activationFunction(sum);
m_netInput = sum;
}
DataType GetOutput() const
{
return m_output;
}
DataType GetNetInput() const
{
return m_netInput;
}
std::array<DataType,NumInputs> Backpropagate(const DataType& error,
const std::array<DataType,NumInputs>& inputValues,
std::array<DataType,NumInputs+1>& weightAdjustments)
{
DataType errorOverOutput = error;
DataType outputOverNetInput = m_output * (DataType(1) - m_output); // sigmoid derivative
std::array<DataType,NumInputs> netInputOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
netInputOverWeight[i] = inputValues[i];
}
DataType netInputOverBias = DataType(1);
std::array<DataType,NumInputs> errorOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorOverWeight[i] = errorOverOutput * outputOverNetInput * netInputOverWeight[i];
}
DataType errorOverBias = errorOverOutput * outputOverNetInput * netInputOverBias;
for(std::size_t i = 0; i < NumInputs; ++i)
{
weightAdjustments[i] = errorOverWeight[i];
}
weightAdjustments[NumInputs] = errorOverBias;
DataType errorOverNetInput = errorOverOutput * outputOverNetInput;
std::array<DataType,NumInputs> errorWeights;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorWeights[i] = errorOverNetInput * m_weights[i];
}
return errorWeights;
}
void AdjustWeights(const std::array<DataType,NumInputs+1>& adjustments)
{
for(std::size_t i = 0; i < NumInputs; ++i)
m_weights[i] = m_weights[i] - learningRate * adjustments[i];
m_biasWeight = m_biasWeight - learningRate * adjustments[NumInputs];
}
const std::array<DataType,NumInputs> GetWeights() const {return m_weights;}
const DataType& GetBiasWeight() const { return m_biasWeight; }
protected:
std::array<DataType,NumInputs> m_weights;
DataType m_biasWeight;
ActivationFuncPtr m_activationFunction;
DataType m_output;
DataType m_netInput;
};
main()
{
std::array<std::array<DataType,2>,4> inputData = {{{0,0},{0,1},{1,0},{1,1}}};
std::array<std::array<DataType,1>,4> desiredOutputs = {{{0},{1},{1},{0}}};
std::array<Neuron<2>*,2> hiddenLayer1 = {{ new Neuron<2>(sigmoid), new Neuron<2>(sigmoid) }};
std::array<Neuron<2>*,1> outputLayer = {{ new Neuron<2>(sigmoid) }};
std::cout << std::fixed << std::setprecision(80);
DataType minError = std::numeric_limits<DataType>::max();
bool minErrorFound = false;
std::size_t epochNumber = 0;
while(epochNumber < numberEpochs && !minErrorFound)
{
DataType epochMSE = 0;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
// if there was more than 1 output neuron these errors need to be summed together first to create total error
DataType totalError = 0.5 * std::pow(outputRow[0] - finalOutput0,2.f);
epochMSE += totalError * totalError;
DataType propagateError = -(outputRow[0] - finalOutput0);
std::array<DataType,3> weightAdjustmentsOutput;
std::array<DataType,2> outputError = outputLayer[0]->Backpropagate(propagateError,
{output0,output1},
weightAdjustmentsOutput);
std::array<DataType,3> weightAdjustmentsHidden1;
hiddenLayer1[0]->Backpropagate(outputError[0],dataRow,weightAdjustmentsHidden1);
std::array<DataType,3> weightAdjustmentsHidden2;
hiddenLayer1[1]->Backpropagate(outputError[1],dataRow,weightAdjustmentsHidden2);
outputLayer[0]->AdjustWeights(weightAdjustmentsOutput);
hiddenLayer1[0]->AdjustWeights(weightAdjustmentsHidden1);
hiddenLayer1[1]->AdjustWeights(weightAdjustmentsHidden2);
}
epochMSE *= DataType(1) / inputData.size();
if(epochMSE >= minError + 0.00000001)
{
minErrorFound = true;
}
else
minError = epochMSE;
++epochNumber;
}
std::cout << std::fixed << std::setprecision(80)
<< "\n\n====================================\n"
<< " TRAINING COMPLETE"
<< "\n\n====================================" << std::endl;
std::cout << "Minimum error: " << minError << std::endl;
std::cout << "Number epochs: " << epochNumber << "/" << numberEpochs << std::endl;
// output tests
std::cout << std::fixed << std::setprecision(2)
<< "\n\n====================================\n"
<< " FINAL TESTS"
<< "\n\n====================================" << std::endl;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
std::cout << dataRow[0] << "," << dataRow[1] << " (" << outputRow[0] << ") : ";
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
std::cout << finalOutput0 << std::endl;
}
return 0;
}
大多数时候,输出看起来像这样,我想“太棒了!成功了!”
====================================
TRAINING COMPLETE
====================================
Minimum error: 0.00000000106923325748908837340422905981540679931640625000000000000000000000000000
Number epochs: 1000000/1000000
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.01
0.00,1.00 (1.00) : 0.99
1.00,0.00 (1.00) : 0.99
1.00,1.00 (0.00) : 0.01
Process returned 0 (0x0) execution time : 0.992 s
Press any key to continue.
但是下面是偶尔的输出,我想了解一下,这是过拟合,还是欠拟合,还是我哪里做错了?我怎样才能避免这种情况?
====================================
TRAINING COMPLETE
====================================
Minimum error: 0.00787912402302026748657226562500000000000000000000000000000000000000000000000000
Number epochs: 1000000/1000000
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.01
0.00,1.00 (1.00) : 0.50
1.00,0.00 (1.00) : 0.99
1.00,1.00 (0.00) : 0.50
Process returned 0 (0x0) execution time : 1.024 s
Press any key to continue.
我尝试过使用更多或更少的 epoch 以及更高或更低的学习率,但仍然偶尔会得到类似上面的结果(不一定完全相同,但很相似)。例如,在学习率为 0.002、epoch 数为 1000000 时,我偶尔会得到如下输出:
====================================
TRAINING COMPLETE
====================================
Minimum error: 0.01417684461921453475952148437500000000000000000000000000000000000000000000000000
Number epochs: 176477/1000000
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.29
0.00,1.00 (1.00) : 0.59
1.00,0.00 (1.00) : 0.59
1.00,1.00 (0.00) : 0.63
Process returned 0 (0x0) execution time : 0.225 s
Press any key to continue.
我能看出它提前退出了,因为误差不降反升,但这是不是说明我在不该退出的时候过早退出了?
最佳答案
你没有做错任何事。请注意,即使使用相同的 epoch 数和相同的训练数据来训练网络,每次得到的结果也会不同(因为权重是随机初始化的)。只有当你使用了过多的训练数据和/或有误的训练数据时,过拟合才可能是原因;欠拟合则与此相反。你既没有欠拟合,也没有过拟合。你可以尝试把学习率降低一个数量级或至少减半、提高学习率、更换训练函数,或者加入动量。重要的是要明白,神经网络的训练是一个非常依赖经验的过程:如果训练出的网络通过了验证,那就没问题;如果没有,就稍作调整后重新训练,或者直接重新训练。神经网络的设计没有封闭形式的公式、通用解法或固定配方。
关于c++ - 这是 XOR 测试中的过度拟合还是欠拟合?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58312806/