diff --git a/ml/dlib/examples/empirical_kernel_map_ex.cpp b/ml/dlib/examples/empirical_kernel_map_ex.cpp
new file mode 100644
index 000000000..9f7b1a579
--- /dev/null
+++ b/ml/dlib/examples/empirical_kernel_map_ex.cpp
@@ -0,0 +1,355 @@
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+
+ This is an example illustrating the use of the empirical_kernel_map
+ from the dlib C++ Library.
+
+ This example program assumes you are familiar with some general elements of
+ the library. In particular, you should have at least read the svm_ex.cpp
+ and matrix_ex.cpp examples.
+
+
+ Most of the machine learning algorithms in dlib are some flavor of "kernel machine".
+ This means they are all simple linear algorithms that have been formulated such
+ that the only way they look at the data given by a user is via dot products between
+ the data samples. These algorithms are made more useful via the application of the
+ so-called kernel trick. This trick is to replace the dot product with a user
+ supplied function which takes two samples and returns a real number. This function
+ is the kernel that is required by so many algorithms. The most basic kernel is the
+ linear_kernel which is simply a normal dot product. More interesting, however,
+ are kernels which first apply some nonlinear transformation to the user's data samples
+ and then compute a dot product. In this way, a simple algorithm that finds a linear
+ plane to separate data (e.g. the SVM algorithm) can be made to solve complex
+ nonlinear learning problems.
+
+ An important element of the kernel trick is that these kernel functions perform
+ the nonlinear transformation implicitly. That is, if you look at the implementations
+ of these kernel functions you won't see code that transforms two input vectors in
+    some way and then computes their dot product. Instead you will see a simple function
+ that takes two input vectors and just computes a single real number via some simple
+ process. You can basically think of this as an optimization. Imagine that originally
+ we wrote out the entire procedure to perform the nonlinear transformation and then
+ compute the dot product but then noticed we could cancel a few terms here and there
+ and simplify the whole thing down into a more compact and easily evaluated form.
+ The result is a nice function that computes what we want but we no longer get to see
+ what those nonlinearly transformed input vectors are.
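+
+    As a concrete example, the radial_basis_kernel used later in this program computes
+    k(a,b) = exp(-gamma*||a-b||^2). That single number is the dot product between the
+    nonlinearly transformed versions of a and b (which, for this kernel, live in an
+    infinite dimensional space), even though the transformation never appears explicitly
+    in the code.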
+
+ The empirical_kernel_map is a tool that undoes this. It allows you to obtain these
+ nonlinearly transformed vectors. It does this by taking a set of data samples from
+ the user (referred to as basis samples), applying the nonlinear transformation to all
+ of them, and then constructing a set of orthonormal basis vectors which spans the space
+ occupied by those transformed input samples. Then if we wish to obtain the nonlinear
+ version of any data sample we can simply project it onto this orthonormal basis and
+ we obtain a regular vector of real numbers which represents the nonlinearly transformed
+ version of the data sample. The empirical_kernel_map has been formulated to use only
+ dot products between data samples so it is capable of performing this service for any
+ user supplied kernel function.
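+
+    Concretely, if an empirical_kernel_map ekm has been loaded with a kernel k then, for
+    samples x and y lying in the span of the basis, dot(ekm.project(x), ekm.project(y))
+    equals k(x,y) up to rounding error. The test_empirical_kernel_map() function below
+    verifies exactly this property by comparing kernel matrices.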
+
+ The empirical_kernel_map is useful because it is often difficult to formulate an
+ algorithm in a way that uses only dot products. So the empirical_kernel_map lets
+ us easily kernelize any algorithm we like by using this object during a preprocessing
+ step. However, it should be noted that the algorithm is only practical when used
+ with at most a few thousand basis samples. Fortunately, most datasets live in
+ subspaces that are relatively low dimensional. So for these datasets, using the
+ empirical_kernel_map is practical assuming an appropriate set of basis samples can be
+ selected by the user. To help with this dlib supplies the linearly_independent_subset_finder.
+ I also often find that just picking a random subset of the data as a basis works well.
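+    (If you want to go the random subset route, dlib's randomly_subsample() utility is one
+    way to draw such a subset.)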
+
+
+
+ In what follows, we walk through the process of creating an empirical_kernel_map,
+ projecting data to obtain the nonlinearly transformed vectors, and then doing a
+ few interesting things with the data.
+*/
+
+
+
+
+#include <dlib/svm.h>
+#include <dlib/rand.h>
+#include <iostream>
+#include <vector>
+
+
+using namespace std;
+using namespace dlib;
+
+// ----------------------------------------------------------------------------------------
+
+// First let's make a typedef for the kind of samples we will be using.
+typedef matrix<double, 0, 1> sample_type;
+
+// We will be using the radial_basis_kernel in this example program.
+typedef radial_basis_kernel<sample_type> kernel_type;
+
+// ----------------------------------------------------------------------------------------
+
+void generate_concentric_circles (
+ std::vector<sample_type>& samples,
+ std::vector<double>& labels,
+ const int num_points
+);
+/*!
+ requires
+ - num_points > 0
+ ensures
+ - generates two circles centered at the point (0,0), one of radius 1 and
+ the other of radius 5. These points are stored into samples. labels will
+          tell you if a given sample is from the smaller circle (its label will be 1)
+ or from the larger circle (its label will be 2).
+        - each circle will be made up of num_points samples
+!*/
+
+// ----------------------------------------------------------------------------------------
+
+void test_empirical_kernel_map (
+ const std::vector<sample_type>& samples,
+ const std::vector<double>& labels,
+ const empirical_kernel_map<kernel_type>& ekm
+);
+/*!
+ This function computes various interesting things with the empirical_kernel_map.
+ See its implementation below for details.
+!*/
+
+// ----------------------------------------------------------------------------------------
+
+int main()
+{
+ std::vector<sample_type> samples;
+ std::vector<double> labels;
+
+ // Declare an instance of the kernel we will be using.
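+    // The constructor argument below is the kernel's gamma parameter, meaning this kernel
+    // computes kern(a,b) == exp(-0.1*||a-b||^2).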
+ const kernel_type kern(0.1);
+
+ // create a dataset with two concentric circles. There will be 100 points on each circle.
+ generate_concentric_circles(samples, labels, 100);
+
+ empirical_kernel_map<kernel_type> ekm;
+
+
+ // Here we create an empirical_kernel_map using all of our data samples as basis samples.
+ cout << "\n\nBuilding an empirical_kernel_map with " << samples.size() << " basis samples." << endl;
+ ekm.load(kern, samples);
+ cout << "Test the empirical_kernel_map when loaded with every sample." << endl;
+ test_empirical_kernel_map(samples, labels, ekm);
+
+
+
+
+
+
+ // create a new dataset with two concentric circles. There will be 1000 points on each circle.
+ generate_concentric_circles(samples, labels, 1000);
+ // Rather than use all 2000 samples as basis samples we are going to use the
+ // linearly_independent_subset_finder to pick out a good basis set. The idea behind this
+    // object is to try and find the 40 or so samples that best span the subspace containing all the
+ // data.
+ linearly_independent_subset_finder<kernel_type> lisf(kern, 40);
+ // populate lisf with samples. We have configured it to allow at most 40 samples but this function
+ // may determine that fewer samples are necessary to form a good basis. In this example program
+ // it will select only 26.
+ fill_lisf(lisf, samples);
+
+ // Now reload the empirical_kernel_map but this time using only our small basis
+ // selected using the linearly_independent_subset_finder.
+ cout << "\n\nBuilding an empirical_kernel_map with " << lisf.size() << " basis samples." << endl;
+ ekm.load(lisf);
+ cout << "Test the empirical_kernel_map when loaded with samples from the lisf object." << endl;
+ test_empirical_kernel_map(samples, labels, ekm);
+
+
+ cout << endl;
+}
+
+// ----------------------------------------------------------------------------------------
+
+void test_empirical_kernel_map (
+ const std::vector<sample_type>& samples,
+ const std::vector<double>& labels,
+ const empirical_kernel_map<kernel_type>& ekm
+)
+{
+
+ std::vector<sample_type> projected_samples;
+
+ // The first thing we do is compute the nonlinearly projected vectors using the
+ // empirical_kernel_map.
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ projected_samples.push_back(ekm.project(samples[i]));
+ }
+
+ // Note that a kernel matrix is just a matrix M such that M(i,j) == kernel(samples[i],samples[j]).
+ // So below we are computing the normal kernel matrix as given by the radial_basis_kernel and the
+ // input samples. We also compute the kernel matrix for all the projected_samples as given by the
+ // linear_kernel. Note that the linear_kernel just computes normal dot products. So what we want to
+    // see is that the dot products between all the projected_samples are the same as the outputs
+ // of the kernel function for their respective untransformed input samples. If they match then
+ // we know that the empirical_kernel_map is working properly.
+ const matrix<double> normal_kernel_matrix = kernel_matrix(ekm.get_kernel(), samples);
+ const matrix<double> new_kernel_matrix = kernel_matrix(linear_kernel<sample_type>(), projected_samples);
+
+ cout << "Max kernel matrix error: " << max(abs(normal_kernel_matrix - new_kernel_matrix)) << endl;
+ cout << "Mean kernel matrix error: " << mean(abs(normal_kernel_matrix - new_kernel_matrix)) << endl;
+ /*
+ Example outputs from these cout statements.
+ For the case where we use all samples as basis samples:
+ Max kernel matrix error: 7.32747e-15
+ Mean kernel matrix error: 7.47789e-16
+
+ For the case where we use only 26 samples as basis samples:
+ Max kernel matrix error: 0.000953573
+ Mean kernel matrix error: 2.26008e-05
+
+
+ Note that if we use enough basis samples we can perfectly span the space of input samples.
+ In that case we get errors that are essentially just rounding noise (Moreover, using all the
+ samples is always enough since they are always within their own span). Once we start
+ to use fewer basis samples we may begin to get approximation error. In the second case we
+        used 26 and we can see that the data doesn't really lie exactly in a 26 dimensional subspace.
+ But it is pretty close.
+ */
+
+
+
+ // Now let's do something more interesting. The following loop finds the centroids
+ // of the two classes of data.
+ sample_type class1_center;
+ sample_type class2_center;
+ for (unsigned long i = 0; i < projected_samples.size(); ++i)
+ {
+ if (labels[i] == 1)
+ class1_center += projected_samples[i];
+ else
+ class2_center += projected_samples[i];
+ }
+
+ const int points_per_class = samples.size()/2;
+ class1_center /= points_per_class;
+ class2_center /= points_per_class;
+
+
+ // Now classify points by which center they are nearest. Recall that the data
+ // is made up of two concentric circles. Normally you can't separate two concentric
+ // circles by checking which points are nearest to each center since they have the same
+ // centers. However, the kernel trick makes the data separable and the loop below will
+ // perfectly classify each data point.
+ for (unsigned long i = 0; i < projected_samples.size(); ++i)
+ {
+ double distance_to_class1 = length(projected_samples[i] - class1_center);
+ double distance_to_class2 = length(projected_samples[i] - class2_center);
+
+ bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);
+
+ // Now print a message for any misclassified points.
+ if (predicted_as_class_1 == true && labels[i] != 1)
+ cout << "A point was misclassified" << endl;
+
+ if (predicted_as_class_1 == false && labels[i] != 2)
+ cout << "A point was misclassified" << endl;
+ }
+
+
+
+    // Next, note that classifying a point based on its distances to two other
+ // points is the same thing as using the plane that lies between those two points
+ // as a decision boundary. So let's compute that decision plane and use it to classify
+ // all the points.
+
+ sample_type plane_normal_vector = class1_center - class2_center;
+ // The point right in the center of our two classes should be on the deciding plane, not
+ // on one side or the other. This consideration brings us to the formula for the bias.
+ double bias = dot((class1_center+class2_center)/2, plane_normal_vector);
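+    // With this bias, the midpoint (class1_center+class2_center)/2 gives
+    // dot(plane_normal_vector, midpoint) - bias == 0, i.e. it sits exactly on the plane.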
+
+ // Now classify points by which side of the plane they are on.
+ for (unsigned long i = 0; i < projected_samples.size(); ++i)
+ {
+ double side = dot(plane_normal_vector, projected_samples[i]) - bias;
+
+ bool predicted_as_class_1 = (side > 0);
+
+ // Now print a message for any misclassified points.
+ if (predicted_as_class_1 == true && labels[i] != 1)
+ cout << "A point was misclassified" << endl;
+
+ if (predicted_as_class_1 == false && labels[i] != 2)
+ cout << "A point was misclassified" << endl;
+ }
+
+
+ // It would be nice to convert this decision rule into a normal decision_function object and
+ // dispense with the empirical_kernel_map. Happily, it is possible to do so. Consider the
+ // following example code:
+ decision_function<kernel_type> dec_funct = ekm.convert_to_decision_function(plane_normal_vector);
+ // The dec_funct now computes dot products between plane_normal_vector and the projection
+ // of any sample point given to it. All that remains is to account for the bias.
+ dec_funct.b = bias;
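+
+    // Since dec_funct no longer needs the ekm object, it could also be saved to disk and
+    // reused later on raw samples. A minimal sketch, following the serialization pattern
+    // from svm_ex.cpp (the file name is just an example, and this also needs <fstream>):
+    //     ofstream fout("saved_function.dat", ios::binary);
+    //     serialize(dec_funct, fout);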
+
+ // now classify points by which side of the plane they are on.
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ double side = dec_funct(samples[i]);
+
+ // And let's just check that the dec_funct really does compute the same thing as the previous equation.
+ double side_alternate_equation = dot(plane_normal_vector, projected_samples[i]) - bias;
+ if (abs(side-side_alternate_equation) > 1e-14)
+ cout << "dec_funct error: " << abs(side-side_alternate_equation) << endl;
+
+ bool predicted_as_class_1 = (side > 0);
+
+ // Now print a message for any misclassified points.
+ if (predicted_as_class_1 == true && labels[i] != 1)
+ cout << "A point was misclassified" << endl;
+
+ if (predicted_as_class_1 == false && labels[i] != 2)
+ cout << "A point was misclassified" << endl;
+ }
+
+}
+
+// ----------------------------------------------------------------------------------------
+
+void generate_concentric_circles (
+ std::vector<sample_type>& samples,
+ std::vector<double>& labels,
+ const int num
+)
+{
+ sample_type m(2,1);
+ samples.clear();
+ labels.clear();
+
+ dlib::rand rnd;
+
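+    // In both loops below a point is generated by picking x uniformly from [-radius, radius]
+    // and then solving x*x + y*y == radius*radius for y with a random sign, so every sample
+    // lies exactly on its circle.
+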
+ // make some samples near the origin
+ double radius = 1.0;
+ for (long i = 0; i < num; ++i)
+ {
+ double sign = 1;
+ if (rnd.get_random_double() < 0.5)
+ sign = -1;
+ m(0) = 2*radius*rnd.get_random_double()-radius;
+ m(1) = sign*sqrt(radius*radius - m(0)*m(0));
+
+ samples.push_back(m);
+ labels.push_back(1);
+ }
+
+ // make some samples in a circle around the origin but far away
+ radius = 5.0;
+ for (long i = 0; i < num; ++i)
+ {
+ double sign = 1;
+ if (rnd.get_random_double() < 0.5)
+ sign = -1;
+ m(0) = 2*radius*rnd.get_random_double()-radius;
+ m(1) = sign*sqrt(radius*radius - m(0)*m(0));
+
+ samples.push_back(m);
+ labels.push_back(2);
+ }
+}
+
+// ----------------------------------------------------------------------------------------
+