diff --git a/mlp.py b/mlp.py
index 5a91c0d638f9782f7250c999bfc3817d56bff3d2..aa626d038348f6c33a0f76efee44b95ca5d0470b 100644
--- a/mlp.py
+++ b/mlp.py
@@ -3,25 +3,29 @@ import pandas as pd
 def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
-    """Take the arrays w1,b1,w2,b2 of a 2-layers neural network
-    ,update them with a gradient descent
-    and calculate the average lost the MSE method """
-
-    d_in , d_h = w1.shape # extracts the dimensions of the variables to define future np.arrays
-    N , d_out = targets.shape
+
+    """Take the arrays w1, b1, w2, b2 of a 2-layer neural network,
+    update them with one gradient descent step,
+    and compute the average loss with the MSE method"""
+    (
+        d_in,
+        d_h,
+    ) = w1.shape  # extracts the dimensions of the variables to define future np.arrays
+    N, d_out = targets.shape
 
     a0 = data  # the data are the input of the first layer
     z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
     a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer
     z2 = np.matmul(a1, w2) + b2  # input of the output layer
     a2 = 1 / (1 + np.exp(-z2))  # output of the output layer
     predictions = a2  # the predicted values are the outputs of the output layer
 
-    #Create the gradient for the variables w2,b2,w1,b1
+    # Create the gradient for the variables w2,b2,w1,b1
+    loss = np.mean(np.square(predictions - targets))
     dCdw2 = np.zeros((d_h, d_out))
-    dCdb2 = np.zeros((1, d_out))
+    dCdb2 = np.zeros(d_out)
     dCdw1 = np.zeros((d_in, d_h))
-    dCdb1 = np.zeros((1, d_h))
-
-    #take each data with its respective labels
+    dCdb1 = np.zeros(d_h)
+
+    # take each data with its respective labels
     for dataRow, targetsRow in zip(data, targets):
 
         a0 = dataRow  # the data are the input of the first layer
@@ -31,56 +35,42 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
         a2 = 1 / (1 + np.exp(-z2))  # output of the output layer
         predictionsRow = a2  # the predicted values are the outputs of the output layer
 
-        # Calculate the partial derivative of the cost in relaltion to each network output
-        dCda = 2 * (predictionsRow - targetsRow)
-        # sum the contribution of each data for the w2 updating
-        for l in range( d_h ):
-            for m in range( d_out ):
-                dCdw2[l][m] += (
-                    dCda[l]
-                    * a2[l]
-                    * (1 - a2[l])
-                    * a1[m]
-                )
-        # sum the contribution of each data for the b2 updating
-        for l in range( d_out ):
-            dCdb2[0][l] += (
-                dCda[l]
-                * a2[l]
-                * (1 - a2[l])
-            )
-        # sum the contribution of each data for the w1 updating
-        for l in range( d_in ):
-            for m in range( d_h ):
-                for j in range( d_out ):
-                    dCdw1[l][m] += (
+        # Calculate the partial derivative of the cost in relation to each network output
+        dCda = (2 / d_out) * (predictionsRow - targetsRow)
+        # sum the contribution of each data for the w2 updating
+        for r in range(d_h):
+            for c in range(d_out):
+                dCdw2[r][c] += dCda[c] * a2[c] * (1 - a2[c]) * a1[r]
+        # sum the contribution of each data for the b2 updating
+        for r in range(d_out):
+            dCdb2[r] += dCda[r] * a2[r] * (1 - a2[r])
+        # sum the contribution of each data for the w1 updating
+        for r in range(d_in):
+            for c in range(d_h):
+                for j in range(d_out):
+                    dCdw1[r][c] += (
                         dCda[j]
                         * a2[j]
                         * (1 - a2[j])
-                        * w2[j][l]
-                        * a1[l]
-                        * (1 - a1[l])
-                        * a0[m]
+                        * w2[c][j]
+                        * a1[c]
+                        * (1 - a1[c])
+                        * a0[r]
                     )
-        # sum the contribution of each data for the b1 updating
-        for l in range( d_h ):
-            for j in range( d_out ):
-                dCdb1[0][l] += (
-                    dCda[j]
-                    * a2[j]
-                    * (1 - a2[j])
-                    * w2[j][l]
-                    * a1[l]
-                    * (1 - a1[l])
+        # sum the contribution of each data for the b1 updating
+        for l in range(d_h):
+            for j in range(d_out):
+                dCdb1[l] += (
+                    dCda[j] * a2[j] * (1 - a2[j]) * w2[l][j] * a1[l] * (1 - a1[l])
                 )
-
-        #Average value of each data contribution
+
+    # Average value of each data contribution
     dCdw1 = dCdw1 / N
     dCdb1 = dCdb1 / N
     dCdw2 = dCdw2 / N
     dCdb2 = dCdb2 / N
-
-    #Arrays update
+
+    # Arrays update
     w1 -= learning_rate * dCdw1
     b1 -= learning_rate * dCdb1
     w2 -= learning_rate * dCdw2
@@ -88,9 +78,9 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
 
     # realizing a new network interaction with new values
     a0 = data  # the data are the input of the first layer
-    new_z1 = np.matmul(a0, new_w1) + new_b1  # input of the hidden layer
-    new_a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer
-    z2 = np.matmul(new_a1, new_w2) + new_b2  # input of the output layer
+    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer
+    z2 = np.matmul(a1, w2) + b2  # input of the output layer
     a2 = 1 / (1 + np.exp(-z2))  # output of the output layer
     predictions = a2  # the predicted values are the outputs of the output layer
 
@@ -98,89 +88,85 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
     loss = np.mean(np.square(predictions - targets))
 
     return w1, b1, w2, b2, loss
-
+
+
 def one_hot(labels):
     """Returns the 2d array with binary vectors with the 1's in the respective position of the sort matrix"""
-    oneHotMat = np.zeros((labels.size, labels.size), dtype=int)
-
+    oneHotMat = np.zeros((labels.size, 10), dtype=int)
+
     for index, values in enumerate(labels):
-        oneHotMat[index, values] = 1
+        oneHotMat[index][values] = 1
 
     return oneHotMat
 
+
 def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
-    """Take the arrays w1,b1,w2,b2 of a 2-layers neural network
-    ,update them with a gradient descent
-    and calculate the average lost the cross - entropy method """
+    """Take the arrays w1, b1, w2, b2 of a 2-layer neural network,
+    update them with one gradient descent step,
+    and compute the average loss with the cross-entropy method"""
+
+    (
+        d_in,
+        d_h,
+    ) = w1.shape  # extracts the dimensions of the variables to define future np.arrays
+    N = labels_train.shape[0]
+    d_out = b2.shape[0]
 
-    d_in , d_h = w1.shape # extracts the dimensions of the variables to define future np.arrays
-    N , d_out = targets.shape
     a0 = data  # the data are the input of the first layer
     z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
     a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer
     z2 = np.matmul(a1, w2) + b2  # input of the output layer
     a2 = 1 / (1 + np.exp(-z2))  # output of the output layer
     predictions = a2  # the predicted values are the outputs of the output layer
 
-    oneHot = one_hot(labels_train)
-
-    #Create the gradient for the variables w2,b2,w1,b1
+    oneHot = one_hot(labels_train)
+    loss = np.mean(
+        (-oneHot * np.log(predictions)) - (1 - oneHot) * np.log(1 - predictions)
+    )
+    print(loss)
+    # Create the gradient for the variables w2,b2,w1,b1
    dCdw2 = np.zeros((d_h, d_out))
-    dCdb2 = np.zeros((1, d_out))
+    dCdb2 = np.zeros(d_out)
     dCdw1 = np.zeros((d_in, d_h))
-    dCdb1 = np.zeros((1, d_h))
-
-    #take each data with its respective labels
-    for dataRow, oneHotLabel in zip(data, oneHot):
+    dCdb1 = np.zeros(d_h)
+    # take each data with its respective labels
+    for dataRow, oneHotRow in zip(data, oneHot):
 
         a0 = dataRow  # the data are the input of the first layer
         z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
         a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer
         z2 = np.matmul(a1, w2) + b2  # input of the output layer
         a2 = 1 / (1 + np.exp(-z2))  # output of the output layer
-        predictionsRow = a2 # the predicted values are the outputs of the output layer
-        dCdz2 = predictionsRow - oneHotLabel
-
-        # sum the contribution of each data for the w2 updating
-        for l in range( d_h ):
-            for m in range( d_out ):
-                dCdw2[l][m] += (
-                    dCdz2[l]
-                    * a1[m] )
-
-        # sum the contribution of each data for the b2 updating
-        for l in range( d_out ):
-            dCdb2[0][l] += (
-                dCdz2[l]
-            )
-        # sum the contribution of each data for the w1 updating
-        for l in range( d_in ):
-            for m in range( d_h ) :
-                for j in range( d_out ):
-                    dCdw1[l][m] += (
-                        dCdz2[j]
-                        * w2[j][l]
-                        * a1[l]
-                        * (1 - a1[l])
-                        * a0[m]
-                    )
-        # sum the contribution of each data for the b1 updating
-        for l in range( d_h ):
-            for j in range( d_out ):
-                dCdb1[0][l] += (
-                    dCdz2[j]
-                    * w2[j][l]
-                    * a1[l]
-                    * (1 - a1[l])
-                )
-
-        #Average value of each data contribution
+        # the predicted values are the outputs of the output layer
+        dCdz2 = a2 - oneHotRow
+
+        # sum the contribution of each data for the w2 updating
+        for r in range(d_h):
+            for c in range(d_out):
+                dCdw2[r][c] += dCdz2[c] * a1[r]
+
+        # sum the contribution of each data for the b2 updating
+        for r in range(d_out):
+            dCdb2[r] += dCdz2[r]
+
+        # sum the contribution of each data for the w1 updating
+        for r in range(d_in):
+            for c in range(d_h):
+                for j in range(d_out):
+                    dCdw1[r][c] += dCdz2[j] * w2[c][j] * a1[c] * (1 - a1[c]) * a0[r]
+
+        # sum the contribution of each data for the b1 updating
+        for r in range(d_h):
+            for j in range(d_out):
+                dCdb1[r] += dCdz2[j] * w2[r][j] * a1[r] * (1 - a1[r])
+
+    # Average value of each data contribution
     dCdw1 = dCdw1 / N
     dCdb1 = dCdb1 / N
     dCdw2 = dCdw2 / N
     dCdb2 = dCdb2 / N
-
-    #Arrays update
+
+    # Arrays update
     w1 -= learning_rate * dCdw1
     b1 -= learning_rate * dCdb1
     w2 -= learning_rate * dCdw2
@@ -188,14 +174,53 @@ def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
 
     # realizing a new network interaction with new values
     a0 = data  # the data are the input of the first layer
-    new_z1 = np.matmul(a0, new_w1) + new_b1  # input of the hidden layer
-    new_a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer
-    z2 = np.matmul(new_a1, new_w2) + new_b2  # input of the output layer
+    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer
+    z2 = np.matmul(a1, w2) + b2  # input of the output layer
     a2 = 1 / (1 + np.exp(-z2))  # output of the output layer
     predictions = a2  # the predicted values are the outputs of the output layer
 
     # Compute loss (Entropy Loss)
-
-    loss = np.mean( ( -1 * oneHot * np.log( predictions ) ) - ( 1 - oneHot ) * np.log( 1 - predictions ) )
+
+    loss = np.mean(
+        (-oneHot * np.log(predictions)) - (1 - oneHot) * np.log(1 - predictions)
+    )
+    print(loss)
 
     return w1, b1, w2, b2, loss
+
+
+def train_mlp(w1, b1, w2, b2, data, labels_train, learning_rate, num_epoch):
+
+    train_accuracies = []
+    for i in range(num_epoch):
+        w1, b1, w2, b2, loss = learn_once_cross_entropy(
+            w1, b1, w2, b2, data, labels_train, learning_rate
+        )
+        train_accuracies.append(loss)
+    return w1, b1, w2, b2, train_accuracies
+
+
+################################################################################
+# RANDOM TEST
+######################################################################################
+N = 30  # number of input data
+d_in = 3  # input dimension
+d_h = 5  # number of neurons in the hidden layer
+d_out = 10  # output dimension (number of neurons of the output layer)
+
+# Random initialization of the network weights and biases
+w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
+b1 = np.zeros(d_h)  # first layer biases
+w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
+b2 = np.zeros(d_out)  # second layer biases
+
+data = np.random.rand(N, d_in)  # create random data
+# targets = np.random.rand(N, d_out)  # create random targets
+labels_train = np.random.randint(3, size=N)  # create random labels
+# print(w1, b1, w2, b2)
+# print('NEW VALUES')
+w1, b1, w2, b2, loss = learn_once_cross_entropy(
+    w1, b1, w2, b2, data, labels_train, 0.01
+)
+# print(w1, b1, w2, b2, loss)
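
For reference, below is a minimal vectorized sketch of the same cross-entropy gradients that the per-sample loops in learn_once_cross_entropy accumulate. It is illustrative only and not part of the patch: it assumes the same sigmoid activations and the one_hot encoding used in mlp.py, and the helper name grads_vectorized is hypothetical.

import numpy as np

def grads_vectorized(w1, b1, w2, b2, data, one_hot_targets):
    # forward pass with the same sigmoid activations as the patch
    a1 = 1 / (1 + np.exp(-(np.matmul(data, w1) + b1)))  # hidden activations, shape (N, d_h)
    a2 = 1 / (1 + np.exp(-(np.matmul(a1, w2) + b2)))    # output activations, shape (N, d_out)
    n = data.shape[0]
    dz2 = a2 - one_hot_targets                          # same quantity as dCdz2 in the loop
    dw2 = np.matmul(a1.T, dz2) / n                      # averaged dC/dw2
    db2 = dz2.mean(axis=0)                              # averaged dC/db2
    dz1 = np.matmul(dz2, w2.T) * a1 * (1 - a1)          # back-propagated through the hidden sigmoid
    dw1 = np.matmul(data.T, dz1) / n                    # averaged dC/dw1
    db1 = dz1.mean(axis=0)                              # averaged dC/db1
    return dw1, db1, dw2, db2

The triple loops and this matrix form compute the same per-sample averages; the vectorized version simply folds the sums over samples and over the output index j into matrix products.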