1 files changed, 170 insertions, 0 deletions
diff --git a/ml/dlib/examples/dnn_introduction_ex.cpp b/ml/dlib/examples/dnn_introduction_ex.cpp
new file mode 100644
index 000000000..6ae3ddf73
--- /dev/null
+++ b/ml/dlib/examples/dnn_introduction_ex.cpp
@@ -0,0 +1,170 @@
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+    This is an example illustrating the use of the deep learning tools from the
+    dlib C++ Library.  In it, we will train the venerable LeNet convolutional
+    neural network to recognize hand written digits.  The network will take as
+    input a small image and classify it as one of the 10 numeric digits between
+    0 and 9.
+
+    The specific network we will run is from the paper
+        LeCun, Yann, et al. "Gradient-based learning applied to document recognition."
+        Proceedings of the IEEE 86.11 (1998): 2278-2324.
+    except that we replace the sigmoid non-linearities with rectified linear units. 
+
+    These tools will use CUDA and cuDNN to drastically accelerate network
+    training and testing.  CMake should automatically find them if they are
+    installed and configure things appropriately.  If not, the program will
+    still run but will be much slower to execute.
+*/
+
+
+#include <dlib/dnn.h>
+#include <iostream>
+#include <dlib/data_io.h>
+
+using namespace std;
+using namespace dlib;
+ 
+int main(int argc, char** argv) try
+{
+    // This example is going to run on the MNIST dataset.  
+    if (argc != 2)
+    {
+        cout << "This example needs the MNIST dataset to run!" << endl;
+        cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
+        cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
+        cout << "put them in a folder.  Then give that folder as input to this program." << endl;
+        return 1;
+    }
+
+
+    // MNIST is broken into two parts, a training set of 60000 images and a test set of
+    // 10000 images.  Each image is labeled so that we know what hand written digit is
+    // depicted.  These next statements load the dataset into memory.
+    std::vector<matrix<unsigned char>> training_images;
+    std::vector<unsigned long>         training_labels;
+    std::vector<matrix<unsigned char>> testing_images;
+    std::vector<unsigned long>         testing_labels;
+    load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);
+
+
+    // Now let's define the LeNet.  Broadly speaking, there are 3 parts to a network
+    // definition.  The loss layer, a bunch of computational layers, and then an input
+    // layer.  You can see these components in the network definition below.  
+    // 
+    // The input layer here says the network expects to be given matrix<unsigned char>
+    // objects as input.  In general, you can use any dlib image or matrix type here, or
+    // even define your own types by creating custom input layers.
+    //
+    // Then the middle layers define the computation the network will do to transform the
+    // input into whatever we want.  Here we run the image through multiple convolutions,
+    // ReLU units, max pooling operations, and then finally a fully connected layer that
+    // converts the whole thing into just 10 numbers.  
+    // 
+    // Finally, the loss layer defines the relationship between the network outputs, our 10
+    // numbers, and the labels in our dataset.  Since we selected loss_multiclass_log it
+    // means we want to do multiclass classification with our network.   Moreover, the
+    // number of network outputs (i.e. 10) is the number of possible labels.  Whichever
+    // network output is largest is the predicted label.  So for example, if the first
+    // network output is largest then the predicted digit is 0, if the last network output
+    // is largest then the predicted digit is 9.  
+    using net_type = loss_multiclass_log<
+                                fc<10,        
+                                relu<fc<84,   
+                                relu<fc<120,  
+                                max_pool<2,2,2,2,relu<con<16,5,5,1,1,
+                                max_pool<2,2,2,2,relu<con<6,5,5,1,1,
+                                input<matrix<unsigned char>> 
+                                >>>>>>>>>>>>;
+    // This net_type defines the entire network architecture.  For example, the block
+    // relu<fc<84,SUBNET>> means we take the output from the subnetwork, pass it through a
+    // fully connected layer with 84 outputs, then apply ReLU.  Similarly, a block of
+    // max_pool<2,2,2,2,relu<con<16,5,5,1,1,SUBNET>>> means we apply 16 convolutions with a
+    // 5x5 filter size and 1x1 stride to the output of a subnetwork, then apply ReLU, then
+    // perform max pooling with a 2x2 window and 2x2 stride.  
+
+
+
+    // So with that out of the way, we can make a network instance.
+    net_type net;
+    // And then train it using the MNIST data.  The code below uses mini-batch stochastic
+    // gradient descent with an initial learning rate of 0.01 to accomplish this.
+    dnn_trainer<net_type> trainer(net);
+    trainer.set_learning_rate(0.01);
+    trainer.set_min_learning_rate(0.00001);
+    trainer.set_mini_batch_size(128);
+    trainer.be_verbose();
+    // Since DNN training can take a long time, we can ask the trainer to save its state to
+    // a file named "mnist_sync" every 20 seconds.  This way, if we kill this program and
+    // start it again it will begin where it left off rather than restarting the training
+    // from scratch.  This is because, when the program restarts, this call to
+    // set_synchronization_file() will automatically reload the settings from mnist_sync if
+    // the file exists.
+    trainer.set_synchronization_file("mnist_sync", std::chrono::seconds(20));
+    // Finally, this line begins training.  By default, it runs SGD with our specified
+    // learning rate until the loss stops decreasing.  Then it reduces the learning rate by
+    // a factor of 10 and continues running until the loss stops decreasing again.  It will
+    // keep doing this until the learning rate has dropped below the min learning rate
+    // defined above or the maximum number of epochs as been executed (defaulted to 10000). 
+    trainer.train(training_images, training_labels);
+
+    // At this point our net object should have learned how to classify MNIST images.  But
+    // before we try it out let's save it to disk.  Note that, since the trainer has been
+    // running images through the network, net will have a bunch of state in it related to
+    // the last batch of images it processed (e.g. outputs from each layer).  Since we
+    // don't care about saving that kind of stuff to disk we can tell the network to forget
+    // about that kind of transient data so that our file will be smaller.  We do this by
+    // "cleaning" the network before saving it.
+    net.clean();
+    serialize("mnist_network.dat") << net;
+    // Now if we later wanted to recall the network from disk we can simply say:
+    // deserialize("mnist_network.dat") >> net;
+
+
+    // Now let's run the training images through the network.  This statement runs all the
+    // images through it and asks the loss layer to convert the network's raw output into
+    // labels.  In our case, these labels are the numbers between 0 and 9.
+    std::vector<unsigned long> predicted_labels = net(training_images);
+    int num_right = 0;
+    int num_wrong = 0;
+    // And then let's see if it classified them correctly.
+    for (size_t i = 0; i < training_images.size(); ++i)
+    {
+        if (predicted_labels[i] == training_labels[i])
+            ++num_right;
+        else
+            ++num_wrong;
+        
+    }
+    cout << "training num_right: " << num_right << endl;
+    cout << "training num_wrong: " << num_wrong << endl;
+    cout << "training accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;
+
+    // Let's also see if the network can correctly classify the testing images.  Since
+    // MNIST is an easy dataset, we should see at least 99% accuracy.
+    predicted_labels = net(testing_images);
+    num_right = 0;
+    num_wrong = 0;
+    for (size_t i = 0; i < testing_images.size(); ++i)
+    {
+        if (predicted_labels[i] == testing_labels[i])
+            ++num_right;
+        else
+            ++num_wrong;
+        
+    }
+    cout << "testing num_right: " << num_right << endl;
+    cout << "testing num_wrong: " << num_wrong << endl;
+    cout << "testing accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;
+
+
+    // Finally, you can also save network parameters to XML files if you want to do
+    // something with the network in another tool.  For example, you could use dlib's
+    // tools/convert_dlib_nets_to_caffe to convert the network to a caffe model.
+    net_to_xml(net, "lenet.xml");
+}
+catch(std::exception& e)
+{
+    cout << e.what() << endl;
+}
+