diff --git a/ml/dlib/examples/empirical_kernel_map_ex.cpp b/ml/dlib/examples/empirical_kernel_map_ex.cpp
new file mode 100644
index 000000000..9f7b1a579
--- /dev/null
+++ b/ml/dlib/examples/empirical_kernel_map_ex.cpp
@@ -0,0 +1,355 @@
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+
+ This is an example illustrating the use of the empirical_kernel_map
+ from the dlib C++ Library.
+
+ This example program assumes you are familiar with some general elements of
+ the library. In particular, you should have at least read the svm_ex.cpp
+ and matrix_ex.cpp examples.
+
+
+ Most of the machine learning algorithms in dlib are some flavor of "kernel machine".
+ This means they are all simple linear algorithms that have been formulated such
+ that the only way they look at the data given by a user is via dot products between
+ the data samples. These algorithms are made more useful via the application of the
+ so-called kernel trick. This trick is to replace the dot product with a user
+ supplied function which takes two samples and returns a real number. This function
+ is the kernel that is required by so many algorithms. The most basic kernel is the
+ linear_kernel which is simply a normal dot product. More interesting, however,
+ are kernels which first apply some nonlinear transformation to the user's data samples
+ and then compute a dot product. In this way, a simple algorithm that finds a linear
+ plane to separate data (e.g. the SVM algorithm) can be made to solve complex
+ nonlinear learning problems.
+
+ An important element of the kernel trick is that these kernel functions perform
+ the nonlinear transformation implicitly. That is, if you look at the implementations
+ of these kernel functions you won't see code that transforms two input vectors in
+    some way and then computes their dot product. Instead you will see a simple function
+ that takes two input vectors and just computes a single real number via some simple
+ process. You can basically think of this as an optimization. Imagine that originally
+ we wrote out the entire procedure to perform the nonlinear transformation and then
+ compute the dot product but then noticed we could cancel a few terms here and there
+ and simplify the whole thing down into a more compact and easily evaluated form.
+ The result is a nice function that computes what we want but we no longer get to see
+ what those nonlinearly transformed input vectors are.
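+
+    As a concrete example, the radial_basis_kernel used later in this program computes
+    k(a,b) = exp(-gamma*||a-b||^2). That single number is the dot product between the
+    nonlinearly transformed versions of a and b (which, for this kernel, live in an
+    infinite dimensional space), even though the transformation never appears explicitly
+    in the code.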
+
+ The empirical_kernel_map is a tool that undoes this. It allows you to obtain these
+ nonlinearly transformed vectors. It does this by taking a set of data samples from
+ the user (referred to as basis samples), applying the nonlinear transformation to all
+ of them, and then constructing a set of orthonormal basis vectors which spans the space
+ occupied by those transformed input samples. Then if we wish to obtain the nonlinear
+ version of any data sample we can simply project it onto this orthonormal basis and
+ we obtain a regular vector of real numbers which represents the nonlinearly transformed
+ version of the data sample. The empirical_kernel_map has been formulated to use only
+ dot products between data samples so it is capable of performing this service for any
+ user supplied kernel function.
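+
+    Concretely, if an empirical_kernel_map ekm has been loaded with a kernel k then, for
+    samples x and y lying in the span of the basis, dot(ekm.project(x), ekm.project(y))
+    equals k(x,y) up to rounding error. The test_empirical_kernel_map() function below
+    verifies exactly this property by comparing kernel matrices.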
+
+ The empirical_kernel_map is useful because it is often difficult to formulate an
+ algorithm in a way that uses only dot products. So the empirical_kernel_map lets
+ us easily kernelize any algorithm we like by using this object during a preprocessing
+ step. However, it should be noted that the algorithm is only practical when used
+ with at most a few thousand basis samples. Fortunately, most datasets live in
+ subspaces that are relatively low dimensional. So for these datasets, using the
+ empirical_kernel_map is practical assuming an appropriate set of basis samples can be
+ selected by the user. To help with this dlib supplies the linearly_independent_subset_finder.
+ I also often find that just picking a random subset of the data as a basis works well.
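+    (If you want to go the random subset route, dlib's randomly_subsample() utility is one
+    way to draw such a subset.)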
+
+
+
+ In what follows, we walk through the process of creating an empirical_kernel_map,
+ projecting data to obtain the nonlinearly transformed vectors, and then doing a
+ few interesting things with the data.
+*/
+
+
+
+
+#include <dlib/svm.h>
+#include <dlib/rand.h>
+#include <iostream>
+#include <vector>
+
+
+using namespace std;
+using namespace dlib;
+
+// ----------------------------------------------------------------------------------------
+
+// First let's make a typedef for the kind of samples we will be using.
+typedef matrix<double, 0, 1> sample_type;
+
+// We will be using the radial_basis_kernel in this example program.
+typedef radial_basis_kernel<sample_type> kernel_type;
+
+// ----------------------------------------------------------------------------------------
+
+void generate_concentric_circles (
+ std::vector<sample_type>& samples,
+ std::vector<double>& labels,
+ const int num_points
+);
+/*!
+ requires
+ - num_points > 0
+ ensures
+ - generates two circles centered at the point (0,0), one of radius 1 and
+ the other of radius 5. These points are stored into samples. labels will
+          tell you if a given sample is from the smaller circle (its label will be 1)
+ or from the larger circle (its label will be 2).
+        - each circle will be made up of num_points samples
+!*/
+
+// ----------------------------------------------------------------------------------------
+
+void test_empirical_kernel_map (
+ const std::vector<sample_type>& samples,
+ const std::vector<double>& labels,
+ const empirical_kernel_map<kernel_type>& ekm
+);
+/*!
+ This function computes various interesting things with the empirical_kernel_map.
+ See its implementation below for details.
+!*/
+
+// ----------------------------------------------------------------------------------------
+
+int main()
+{
+ std::vector<sample_type> samples;
+ std::vector<double> labels;
+
+ // Declare an instance of the kernel we will be using.
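+    // The constructor argument below is the kernel's gamma parameter, meaning this kernel
+    // computes kern(a,b) == exp(-0.1*||a-b||^2).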
+ const kernel_type kern(0.1);
+
+ // create a dataset with two concentric circles. There will be 100 points on each circle.
+ generate_concentric_circles(samples, labels, 100);
+
+ empirical_kernel_map<kernel_type> ekm;
+
+
+ // Here we create an empirical_kernel_map using all of our data samples as basis samples.
+ cout << "\n\nBuilding an empirical_kernel_map with " << samples.size() << " basis samples." << endl;
+ ekm.load(kern, samples);
+ cout << "Test the empirical_kernel_map when loaded with every sample." << endl;
+ test_empirical_kernel_map(samples, labels, ekm);
+
+
+
+
+
+
+ // create a new dataset with two concentric circles. There will be 1000 points on each circle.
+ generate_concentric_circles(samples, labels, 1000);
+ // Rather than use all 2000 samples as basis samples we are going to use the
+ // linearly_independent_subset_finder to pick out a good basis set. The idea behind this
+    // object is to try and find the 40 or so samples that best span the subspace containing all the
+ // data.
+ linearly_independent_subset_finder<kernel_type> lisf(kern, 40);
+ // populate lisf with samples. We have configured it to allow at most 40 samples but this function
+ // may determine that fewer samples are necessary to form a good basis. In this example program
+ // it will select only 26.
+ fill_lisf(lisf, samples);
+
+ // Now reload the empirical_kernel_map but this time using only our small basis
+ // selected using the linearly_independent_subset_finder.
+ cout << "\n\nBuilding an empirical_kernel_map with " << lisf.size() << " basis samples." << endl;
+ ekm.load(lisf);
+ cout << "Test the empirical_kernel_map when loaded with samples from the lisf object." << endl;
+ test_empirical_kernel_map(samples, labels, ekm);
+
+
+ cout << endl;
+}
+
+// ----------------------------------------------------------------------------------------
+
+void test_empirical_kernel_map (
+ const std::vector<sample_type>& samples,
+ const std::vector<double>& labels,
+ const empirical_kernel_map<kernel_type>& ekm
+)
+{
+
+ std::vector<sample_type> projected_samples;
+
+ // The first thing we do is compute the nonlinearly projected vectors using the
+ // empirical_kernel_map.
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ projected_samples.push_back(ekm.project(samples[i]));
+ }
+
+ // Note that a kernel matrix is just a matrix M such that M(i,j) == kernel(samples[i],samples[j]).
+ // So below we are computing the normal kernel matrix as given by the radial_basis_kernel and the
+ // input samples. We also compute the kernel matrix for all the projected_samples as given by the
+ // linear_kernel. Note that the linear_kernel just computes normal dot products. So what we want to
+    // see is that the dot products between all the projected_samples are the same as the outputs
+ // of the kernel function for their respective untransformed input samples. If they match then
+ // we know that the empirical_kernel_map is working properly.
+ const matrix<double> normal_kernel_matrix = kernel_matrix(ekm.get_kernel(), samples);
+ const matrix<double> new_kernel_matrix = kernel_matrix(linear_kernel<sample_type>(), projected_samples);
+
+ cout << "Max kernel matrix error: " << max(abs(normal_kernel_matrix - new_kernel_matrix)) << endl;
+ cout << "Mean kernel matrix error: " << mean(abs(normal_kernel_matrix - new_kernel_matrix)) << endl;
+ /*
+ Example outputs from these cout statements.
+ For the case where we use all samples as basis samples:
+ Max kernel matrix error: 7.32747e-15
+ Mean kernel matrix error: 7.47789e-16
+
+ For the case where we use only 26 samples as basis samples:
+ Max kernel matrix error: 0.000953573
+ Mean kernel matrix error: 2.26008e-05
+
+
+ Note that if we use enough basis samples we can perfectly span the space of input samples.
+ In that case we get errors that are essentially just rounding noise (Moreover, using all the
+ samples is always enough since they are always within their own span). Once we start
+ to use fewer basis samples we may begin to get approximation error. In the second case we
+        used 26 and we can see that the data doesn't really lie exactly in a 26 dimensional subspace.
+ But it is pretty close.
+ */
+
+
+
+ // Now let's do something more interesting. The following loop finds the centroids
+ // of the two classes of data.
+ sample_type class1_center;
+ sample_type class2_center;
+ for (unsigned long i = 0; i < projected_samples.size(); ++i)
+ {
+ if (labels[i] == 1)
+ class1_center += projected_samples[i];
+ else
+ class2_center += projected_samples[i];
+ }
+
+ const int points_per_class = samples.size()/2;
+ class1_center /= points_per_class;
+ class2_center /= points_per_class;
+
+
+ // Now classify points by which center they are nearest. Recall that the data
+ // is made up of two concentric circles. Normally you can't separate two concentric
+ // circles by checking which points are nearest to each center since they have the same
+ // centers. However, the kernel trick makes the data separable and the loop below will
+ // perfectly classify each data point.
+ for (unsigned long i = 0; i < projected_samples.size(); ++i)
+ {
+ double distance_to_class1 = length(projected_samples[i] - class1_center);
+ double distance_to_class2 = length(projected_samples[i] - class2_center);
+
+ bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);
+
+ // Now print a message for any misclassified points.
+ if (predicted_as_class_1 == true && labels[i] != 1)
+ cout << "A point was misclassified" << endl;
+
+ if (predicted_as_class_1 == false && labels[i] != 2)
+ cout << "A point was misclassified" << endl;
+ }
+
+
+
+    // Next, note that classifying a point based on its distances to two other
+ // points is the same thing as using the plane that lies between those two points
+ // as a decision boundary. So let's compute that decision plane and use it to classify
+ // all the points.
+
+ sample_type plane_normal_vector = class1_center - class2_center;
+ // The point right in the center of our two classes should be on the deciding plane, not
+ // on one side or the other. This consideration brings us to the formula for the bias.
+ double bias = dot((class1_center+class2_center)/2, plane_normal_vector);
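+    // With this bias, the midpoint (class1_center+class2_center)/2 gives
+    // dot(plane_normal_vector, midpoint) - bias == 0, i.e. it sits exactly on the plane.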
+
+ // Now classify points by which side of the plane they are on.
+ for (unsigned long i = 0; i < projected_samples.size(); ++i)
+ {
+ double side = dot(plane_normal_vector, projected_samples[i]) - bias;
+
+ bool predicted_as_class_1 = (side > 0);
+
+ // Now print a message for any misclassified points.
+ if (predicted_as_class_1 == true && labels[i] != 1)
+ cout << "A point was misclassified" << endl;
+
+ if (predicted_as_class_1 == false && labels[i] != 2)
+ cout << "A point was misclassified" << endl;
+ }
+
+
+ // It would be nice to convert this decision rule into a normal decision_function object and
+ // dispense with the empirical_kernel_map. Happily, it is possible to do so. Consider the
+ // following example code:
+ decision_function<kernel_type> dec_funct = ekm.convert_to_decision_function(plane_normal_vector);
+ // The dec_funct now computes dot products between plane_normal_vector and the projection
+ // of any sample point given to it. All that remains is to account for the bias.
+ dec_funct.b = bias;
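+
+    // Since dec_funct no longer needs the ekm object, it could also be saved to disk and
+    // reused later on raw samples. A minimal sketch, following the serialization pattern
+    // from svm_ex.cpp (the file name is just an example, and this also needs <fstream>):
+    //     ofstream fout("saved_function.dat", ios::binary);
+    //     serialize(dec_funct, fout);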
+
+ // now classify points by which side of the plane they are on.
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ double side = dec_funct(samples[i]);
+
+ // And let's just check that the dec_funct really does compute the same thing as the previous equation.
+ double side_alternate_equation = dot(plane_normal_vector, projected_samples[i]) - bias;
+ if (abs(side-side_alternate_equation) > 1e-14)
+ cout << "dec_funct error: " << abs(side-side_alternate_equation) << endl;
+
+ bool predicted_as_class_1 = (side > 0);
+
+ // Now print a message for any misclassified points.
+ if (predicted_as_class_1 == true && labels[i] != 1)
+ cout << "A point was misclassified" << endl;
+
+ if (predicted_as_class_1 == false && labels[i] != 2)
+ cout << "A point was misclassified" << endl;
+ }
+
+}
+
+// ----------------------------------------------------------------------------------------
+
+void generate_concentric_circles (
+ std::vector<sample_type>& samples,
+ std::vector<double>& labels,
+ const int num
+)
+{
+ sample_type m(2,1);
+ samples.clear();
+ labels.clear();
+
+ dlib::rand rnd;
+
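+    // In both loops below a point is generated by picking x uniformly from [-radius, radius]
+    // and then solving x*x + y*y == radius*radius for y with a random sign, so every sample
+    // lies exactly on its circle.
+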
+ // make some samples near the origin
+ double radius = 1.0;
+ for (long i = 0; i < num; ++i)
+ {
+ double sign = 1;
+ if (rnd.get_random_double() < 0.5)
+ sign = -1;
+ m(0) = 2*radius*rnd.get_random_double()-radius;
+ m(1) = sign*sqrt(radius*radius - m(0)*m(0));
+
+ samples.push_back(m);
+ labels.push_back(1);
+ }
+
+ // make some samples in a circle around the origin but far away
+ radius = 5.0;
+ for (long i = 0; i < num; ++i)
+ {
+ double sign = 1;
+ if (rnd.get_random_double() < 0.5)
+ sign = -1;
+ m(0) = 2*radius*rnd.get_random_double()-radius;
+ m(1) = sign*sqrt(radius*radius - m(0)*m(0));
+
+ samples.push_back(m);
+ labels.push_back(2);
+ }
+}
+
+// ----------------------------------------------------------------------------------------
+