@@ -139,7 +139,7 @@ public static unsafe void ConvolutionBackwardData(
     if (imgSize < kSize) throw new ArgumentException("Each subdivided tensor must at least have the size of the kernels");
     if (dyInfo.Channels != nKernels) throw new ArgumentException("The source depth must be equal to the number of kernels");
 
-    // Traanspose the layer kernels
+    // Rotate the layer kernels
     Rotate180(w, wInfo.Channels, out Tensor w180);
 
     /* ============================
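The corrected comment matters because the layer kernels are rotated by 180 degrees before the backward data pass, presumably by reversing each 2D slice end to end, rather than transposed. A minimal sketch of such a rotation, assuming a flat row-major float buffer with one slice per channel; the library's actual Rotate180 helper and Tensor types may differ:

// Hypothetical sketch: rotate every 2D slice of every kernel by 180 degrees.
// Assumes one kernel per row, 'slices' channels per row, each slice stored as
// sliceSize contiguous row-major values; not the library's real implementation.
static float[] Rotate180Sketch(float[] w, int rows, int slices, int sliceSize)
{
    float[] result = new float[w.Length];
    int rowLength = slices * sliceSize;
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < slices; c++)
        {
            int offset = r * rowLength + c * sliceSize;
            for (int i = 0; i < sliceSize; i++)
                result[offset + i] = w[offset + sliceSize - 1 - i]; // Reversing a row-major slice is a 180° rotation
        }
    }
    return result;
}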
@@ -212,11 +212,12 @@ void BackwardsKernel(int index)
 /// <param name="dy">The output error <see cref="Tensor"/></param>
 /// <param name="dyInfo">The output error volume info (depth and 2D slices size)</param>
 /// <param name="dw">The resulting weights gradient</param>
+/// <param name="wInfo">The info on the layer kernels</param>
 /// <exception cref="ArgumentException">The size of one of the input <see cref="Tensor"/> instances isn't valid</exception>
 public static unsafe void ConvolutionBackwardFilter(
     in Tensor x, in TensorInfo xInfo,
     in Tensor dy, in TensorInfo dyInfo,
-    in Tensor dw)
+    in Tensor dw, in TensorInfo wInfo)
 {
     // Checks and local parameters
     int
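With the extra wInfo parameter, existing call sites of ConvolutionBackwardFilter need one more argument describing the kernel volume. A hypothetical call, assuming the tensors and their TensorInfo descriptors are already set up; the variable names are illustrative, not taken from this diff:

// Hypothetical call site for the updated signature; all names are illustrative
ConvolutionBackwardFilter(x, xInfo, dy, dyInfo, dw, wInfo);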
@@ -244,15 +245,19 @@ public static unsafe void ConvolutionBackwardFilter(
      * Kernels: HK*WK*sourceDepth*kernelsDepth (delta(l + 1) used to calculate the 3D gradient for each kernel)
      * Output: sourceDepth*kernelsDepth slices, where each stack of sourceDepth slices is the gradient for the i-th kernel */
     int
-        hResult = imgHeight - kHeight + 1, // Size of each image edge after the convolution
+        hResult = imgHeight - kHeight + 1,                     // Size of each image edge after the convolution
         wResult = imgWidth - kWidth + 1,
-        convolutionOutputSize = hResult * wResult, // Size of each processed image
-        gradientSize = convolutionOutputSize * xInfo.Channels, // Size of each calculated gradient (one for each original kernel, so for each input delta)
-        finalWidth = gradientSize * dyInfo.Channels, // Final size of each sample row
-        iterationsPerSample = xInfo.Channels * kDepth; // Each sample has its own list of 3D gradients, one for each kernel
+        convolutionOutputSize = hResult * wResult,             // Size of each processed image
+        gradientSize = convolutionOutputSize * xInfo.Channels, // Size of each calculated gradient (one for each original kernel, so for each input delta)
+        finalWidth = gradientSize * dyInfo.Channels,           // Final size of each sample row
+        iterationsPerSample = xInfo.Channels * kDepth;         // Each sample has its own list of 3D gradients, one for each kernel
+
+    // Rotate the inputs and prepare the temporary tensor
+    Rotate180(x, xInfo.Channels, out Tensor xt);
+    Tensor.New(x.Entities, finalWidth, out Tensor dwTemp);
 
     // Process the valid convolution
-    float* px = x, pdy = dy, pdw = dw;
+    float* px = xt, pdy = dy, pdw = dwTemp;
     void GradientKernel(int index)
     {
         // Calculate the current indexes
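The sizing arithmetic in the hunk above is easier to follow with concrete numbers. A worked example with made-up dimensions, not taken from this PR, obtained by substituting directly into the expressions in the diff:

// Made-up values, substituted into the expressions above:
// imgHeight = imgWidth = 28, kHeight = kWidth = 5, xInfo.Channels = 3, dyInfo.Channels = kDepth = 10
int hResult = 28 - 5 + 1;              // 24
int wResult = 28 - 5 + 1;              // 24
int convolutionOutputSize = 24 * 24;   // 576 values per processed slice
int gradientSize = 576 * 3;            // 1728 values per calculated gradient
int finalWidth = 1728 * 10;            // 17280 values per sample row in the temporary tensor
int iterationsPerSample = 3 * 10;      // 30 parallel iterations per sample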
@@ -291,7 +296,17 @@ void GradientKernel(int index)
         }
     }
     Parallel.For(0, n * iterationsPerSample, GradientKernel).AssertCompleted();
-    throw new NotImplementedException("The CPU gradient convolution isn't implemented correctly yet");
+    xt.Free();
+
+    /* ==========================
+     * Gradient compression
+     * ==========================
+     * At this point, the temporary tensor has the series of (p,q) gradients for all the layer
+     * kernels, where p is the input depth and q is the kernel index.
+     * The final weights gradient is the sum for all the samples in the current training batch */
+    dw.Reshape(1, dw.Size, out Tensor wPlane); // The gradient is [q,p]-shaped, flatten to the size of each sample before compressing
+    CpuBlas.CompressVertically(dwTemp, wPlane);
+    dwTemp.Free();
 }
 
 /// <summary>
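The new gradient compression step relies on CpuBlas.CompressVertically to collapse the per-sample rows of the temporary tensor into the single weights gradient. A minimal sketch of that reduction, under the assumption that it is a plain column-wise sum over all rows of a row-major buffer; the actual CpuBlas routine may be vectorized or parallelized and its real signature may differ:

// Hypothetical sketch of a vertical compression (column-wise sum of all rows),
// assuming row-major float data; not the library's real CpuBlas.CompressVertically.
static void CompressVerticallySketch(float[] source, int rows, int width, float[] target)
{
    for (int j = 0; j < width; j++)
    {
        float sum = 0;
        for (int i = 0; i < rows; i++)
            sum += source[i * width + j];
        target[j] = sum; // Each output value is the sum of the per-sample gradient rows
    }
}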