Diffstat (limited to 'ml/dlib/python_examples/svm_struct.py')
-rwxr-xr-x  ml/dlib/python_examples/svm_struct.py  343
1 file changed, 343 insertions, 0 deletions
diff --git a/ml/dlib/python_examples/svm_struct.py b/ml/dlib/python_examples/svm_struct.py
new file mode 100755
index 000000000..7f0004ccd
--- /dev/null
+++ b/ml/dlib/python_examples/svm_struct.py
@@ -0,0 +1,343 @@
+#!/usr/bin/python
+# The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+#
+# This is an example illustrating the use of the structural SVM solver from
+# the dlib C++ Library. It teaches the central ideas needed to set up a
+# structural SVM model for your machine learning problems. To illustrate the
+# process, we use dlib's structural SVM solver to learn the parameters of a
+# simple multi-class classifier. We first discuss the multi-class classifier
+# model and then walk through using the structural SVM tools to find the
+# parameters of this classification model. As an aside, dlib's C++ interface
+# to the structural SVM solver is threaded, so on a multi-core computer it is
+# significantly faster than the Python interface. Consider using the C++
+# interface instead if you find that running it in Python is slow.
+#
+#
+# COMPILING/INSTALLING THE DLIB PYTHON INTERFACE
+# You can install dlib using the command:
+# pip install dlib
+#
+# Alternatively, if you want to compile dlib yourself then go into the dlib
+# root folder and run:
+# python setup.py install
+# or
+# python setup.py install --yes USE_AVX_INSTRUCTIONS
+# if you have a CPU that supports AVX instructions, since this makes some
+# things run faster.
+#
+# Compiling dlib should work on any operating system so long as you have
+# CMake installed. On Ubuntu, this can be done easily by running the
+# command:
+# sudo apt-get install cmake
+#
+
+import dlib
+
+
+def main():
+ # In this example, we have three types of samples: class 0, 1, or 2. That
+ # is, each of our sample vectors falls into one of three classes. To keep
+ # this example very simple, each sample vector is zero everywhere except at
+ # one place. The non-zero dimension of each vector determines the class of
+ # the vector. So for example, the first element of samples has a class of 1
+ # because samples[0][1] is the only non-zero element of samples[0].
+ samples = [[0, 2, 0], [1, 0, 0], [0, 4, 0], [0, 0, 3]]
+ # Since we want to use a machine learning method to learn a 3-class
+ # classifier we need to record the labels of our samples. Here samples[i]
+ # has a class label of labels[i].
+ labels = [1, 0, 1, 2]
+
+ # Now that we have some training data we can tell the structural SVM to
+ # learn the parameters of our 3-class classifier model. The details of this
+ # will be explained later. For now, just note that it finds the weights
+    # (i.e. a vector of real-valued parameters) such that predict_label(weights,
+ # sample) always returns the correct label for a sample vector.
+ problem = ThreeClassClassifierProblem(samples, labels)
+ weights = dlib.solve_structural_svm_problem(problem)
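+    # The solver returns a single flat weight vector with
+    # problem.num_dimensions elements (9 here): one 3-element block of
+    # linear-classifier weights per class.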
+
+ # Print the weights and then evaluate predict_label() on each of our
+ # training samples. Note that the correct label is predicted for each
+ # sample.
+ print(weights)
+ for k, s in enumerate(samples):
+ print("Predicted label for sample[{0}]: {1}".format(
+ k, predict_label(weights, s)))
+
+
+def predict_label(weights, sample):
+ """Given the 9-dimensional weight vector which defines a 3 class classifier,
+ predict the class of the given 3-dimensional sample vector. Therefore, the
+ output of this function is either 0, 1, or 2 (i.e. one of the three possible
+ labels)."""
+
+ # Our 3-class classifier model can be thought of as containing 3 separate
+ # linear classifiers. So to predict the class of a sample vector we
+ # evaluate each of these three classifiers and then whatever classifier has
+ # the largest output "wins" and predicts the label of the sample. This is
+ # the popular one-vs-all multi-class classifier model.
+ # Keeping this in mind, the code below simply pulls the three separate
+ # weight vectors out of weights and then evaluates each against sample. The
+ # individual classifier scores are stored in scores and the highest scoring
+ # index is returned as the label.
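+    # For example, with weights == [0, 0, 0,  0, 1, 0,  0, 0, 0] only the
+    # middle classifier can produce a non-zero score, so sample == [0, 2, 0]
+    # yields scores == [0, 2, 0] and a predicted label of 1.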
+ w0 = weights[0:3]
+ w1 = weights[3:6]
+ w2 = weights[6:9]
+ scores = [dot(w0, sample), dot(w1, sample), dot(w2, sample)]
+ max_scoring_label = scores.index(max(scores))
+ return max_scoring_label
+
+
+def dot(a, b):
+ """Compute the dot product between the two vectors a and b."""
+ return sum(i * j for i, j in zip(a, b))
+
+
+################################################################################
+
+
+class ThreeClassClassifierProblem:
+ # Now we arrive at the meat of this example program. To use the
+ # dlib.solve_structural_svm_problem() routine you need to define an object
+ # which tells the structural SVM solver what to do for your problem. In
+ # this example, this is done by defining the ThreeClassClassifierProblem
+ # object. Before we get into the details, we first discuss some background
+ # information on structural SVMs.
+ #
+ # A structural SVM is a supervised machine learning method for learning to
+ # predict complex outputs. This is contrasted with a binary classifier
+ # which makes only simple yes/no predictions. A structural SVM, on the
+ # other hand, can learn to predict complex outputs such as entire parse
+ # trees or DNA sequence alignments. To do this, it learns a function F(x,y)
+ # which measures how well a particular data sample x matches a label y,
+ # where a label is potentially a complex thing like a parse tree. However,
+ # to keep this example program simple we use only a 3 category label output.
+ #
+ # At test time, the best label for a new x is given by the y which
+ # maximizes F(x,y). To put this into the context of the current example,
+ # F(x,y) computes the score for a given sample and class label. The
+    # predicted class label is therefore whichever value of y makes F(x,y)
+ # the biggest. This is exactly what predict_label() does. That is, it
+ # computes F(x,0), F(x,1), and F(x,2) and then reports which label has the
+ # biggest value.
+ #
+ # At a high level, a structural SVM can be thought of as searching the
+ # parameter space of F(x,y) for the set of parameters that make the
+ # following inequality true as often as possible:
+ # F(x_i,y_i) > max{over all incorrect labels of x_i} F(x_i, y_incorrect)
+ # That is, it seeks to find the parameter vector such that F(x,y) always
+ # gives the highest score to the correct output. To define the structural
+ # SVM optimization problem precisely, we first introduce some notation:
+ # - let PSI(x,y) == the joint feature vector for input x and a label y
+ # - let F(x,y|w) == dot(w,PSI(x,y)).
+ # (we use the | notation to emphasize that F() has the parameter vector
+ # of weights called w)
+ # - let LOSS(idx,y) == the loss incurred for predicting that the
+ # idx-th training sample has a label of y. Note that LOSS()
+ # should always be >= 0 and should become exactly 0 when y is the
+ # correct label for the idx-th sample. Moreover, it should notionally
+    #      indicate how bad it is to predict y for the idx-th sample.
+ # - let x_i == the i-th training sample.
+ # - let y_i == the correct label for the i-th training sample.
+ # - The number of data samples is N.
+ #
+ # Then the optimization problem solved by a structural SVM using
+ # dlib.solve_structural_svm_problem() is the following:
+ # Minimize: h(w) == 0.5*dot(w,w) + C*R(w)
+ #
+    #      where R(w) == sum from i=1 to N: 1/N * sample_risk(i,w),
+    #      sample_risk(i,w) == max over all Y:
+    #          LOSS(i,Y) + F(x_i,Y|w) - F(x_i,y_i|w),
+    #      and C > 0.
+ #
+ # You can think of the sample_risk(i,w) as measuring the degree of error
+ # you would make when predicting the label of the i-th sample using
+ # parameters w. That is, it is zero only when the correct label would be
+ # predicted and grows larger the more "wrong" the predicted output becomes.
+ # Therefore, the objective function is minimizing a balance between making
+ # the weights small (typically this reduces overfitting) and fitting the
+ # training data. The degree to which you try to fit the data is controlled
+ # by the C parameter.
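+    #
+    # As a quick sanity check of these definitions (a hand-worked
+    # illustration): at w == 0 every F(x_i,Y|w) is 0, so sample_risk(i,w)
+    # reduces to max over Y of LOSS(i,Y), which is 1 under this example's
+    # 0/1 loss. Hence R(0) == 1 and h(0) == C, so any w achieving h(w) < C
+    # must fit the training data at least partially.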
+ #
+ # For a more detailed introduction to structured support vector machines
+ # you should consult the following paper:
+ # Predicting Structured Objects with Support Vector Machines by
+ # Thorsten Joachims, Thomas Hofmann, Yisong Yue, and Chun-nam Yu
+ #
+
+    # Finally, we come back to the code. To use
+    # dlib.solve_structural_svm_problem() you need to provide the things
+    # discussed above: the value of C, the number of training samples, the
+    # dimensionality of PSI(), and methods for calculating the loss values
+    # and PSI() vectors. You will also need to write code that can compute
+    #     max over all Y: LOSS(i,Y) + F(x_i,Y|w).
+    # To summarize, the ThreeClassClassifierProblem class is required to have
+    # the following fields:
+ # - C
+ # - num_samples
+ # - num_dimensions
+ # - get_truth_joint_feature_vector()
+ # - separation_oracle()
+
+ C = 1
+
+ # There are also a number of optional arguments:
+ # epsilon is the stopping tolerance. The optimizer will run until R(w) is
+ # within epsilon of its optimal value. If you don't set this then it
+ # defaults to 0.001.
+ # epsilon = 1e-13
+
+    # Uncomment this and the optimizer will print its progress to standard
+    # out. You will be able to see things like the current risk gap. The
+    # optimizer continues until the risk gap is below epsilon.
+ # be_verbose = True
+
+ # If you want to require that the learned weights are all non-negative
+ # then set this field to True.
+ # learns_nonnegative_weights = True
+
+ # The optimizer uses an internal cache to avoid unnecessary calls to your
+ # separation_oracle() routine. This parameter controls the size of that
+ # cache. Bigger values use more RAM and might make the optimizer run
+    # faster. You can also disable it by setting it to 0, which is a good idea
+    # when your separation_oracle is very fast. If you don't set this field,
+    # it defaults to 5.
+ # max_cache_size = 20
+
+ def __init__(self, samples, labels):
+ # dlib.solve_structural_svm_problem() expects the class to have
+ # num_samples and num_dimensions fields. These fields should contain
+ # the number of training samples and the dimensionality of the PSI
+ # feature vector respectively.
+ self.num_samples = len(samples)
+ self.num_dimensions = len(samples[0])*3
+
+ self.samples = samples
+ self.labels = labels
+
+ def make_psi(self, x, label):
+ """Compute PSI(x,label)."""
+ # All we are doing here is taking x, which is a 3 dimensional sample
+ # vector in this example program, and putting it into one of 3 places in
+ # a 9 dimensional PSI vector, which we then return. So this function
+        # returns PSI(x,label). To see why we set up PSI like this, recall how
+ # predict_label() works. It takes in a 9 dimensional weight vector and
+ # breaks the vector into 3 pieces. Each piece then defines a different
+ # classifier and we use them in a one-vs-all manner to predict the
+ # label. So now that we are in the structural SVM code we have to
+ # define the PSI vector to correspond to this usage. That is, we need
+        # to set up PSI so that argmax_y dot(weights,PSI(x,y)) ==
+ # predict_label(weights,x). This is how we tell the structural SVM
+ # solver what kind of problem we are trying to solve.
+ #
+ # It's worth emphasizing that the single biggest step in using a
+ # structural SVM is deciding how you want to represent PSI(x,label). It
+ # is always a vector, but deciding what to put into it to solve your
+ # problem is often not a trivial task. Part of the difficulty is that
+ # you need an efficient method for finding the label that makes
+ # dot(w,PSI(x,label)) the biggest. Sometimes this is easy, but often
+ # finding the max scoring label turns into a difficult combinatorial
+ # optimization problem. So you need to pick a PSI that doesn't make the
+        # label maximization step intractable but that still models your
+        # problem well.
+ #
+        # Create a dense vector object. Note that you can also use unsorted
+        # sparse vectors (i.e. dlib.sparse_vector objects) to represent your
+        # PSI vector. This is useful if you have very high dimensional PSI
+        # vectors that are mostly zeros. In the context of this example, you
+        # would simply return a dlib.sparse_vector at the end of make_psi()
+        # and the rest of the example would still work properly; a sketch of
+        # this sparse variant appears after this function.
+ psi = dlib.vector()
+ # Set it to have 9 dimensions. Note that the elements of the vector
+ # are 0 initialized.
+ psi.resize(self.num_dimensions)
+ dims = len(x)
+ if label == 0:
+ for i in range(0, dims):
+ psi[i] = x[i]
+ elif label == 1:
+ for i in range(dims, 2 * dims):
+ psi[i] = x[i - dims]
+ else: # the label must be 2
+ for i in range(2 * dims, 3 * dims):
+ psi[i] = x[i - 2 * dims]
+ return psi
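+
+    # For instance, make_psi([0, 2, 0], 1) fills the middle block and yields
+    # the 9-dimensional vector [0, 0, 0, 0, 2, 0, 0, 0, 0], which lines up
+    # with how predict_label() slices the weight vector.
+
+    # Below is a sketch of the sparse variant mentioned in make_psi(). It is
+    # an illustrative, unused helper that assumes, per dlib's documentation,
+    # that a dlib.sparse_vector is a list-like container of
+    # dlib.pair(index, value) elements kept sorted by index.
+    def make_psi_sparse(self, x, label):
+        """Sparse variant of make_psi(): returns a dlib.sparse_vector holding
+        only the non-zero entries of PSI(x,label)."""
+        psi = dlib.sparse_vector()
+        offset = len(x) * label
+        for i, value in enumerate(x):
+            if value != 0:
+                # Each pair stores (dimension index, value).
+                psi.append(dlib.pair(offset + i, value))
+        return psi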
+
+ # Now we get to the two member functions that are directly called by
+ # dlib.solve_structural_svm_problem().
+ #
+ # In get_truth_joint_feature_vector(), all you have to do is return the
+ # PSI() vector for the idx-th training sample when it has its true label.
+ # So here it returns
+ # PSI(self.samples[idx], self.labels[idx]).
+ def get_truth_joint_feature_vector(self, idx):
+ return self.make_psi(self.samples[idx], self.labels[idx])
+
+ # separation_oracle() is more interesting.
+ # dlib.solve_structural_svm_problem() will call separation_oracle() many
+ # times during the optimization. Each time it will give it the current
+ # value of the parameter weights and the separation_oracle() is supposed to
+ # find the label that most violates the structural SVM objective function
+ # for the idx-th sample. Then the separation oracle reports the
+ # corresponding PSI vector and loss value. To state this more precisely,
+ # the separation_oracle() member function has the following contract:
+ # requires
+ # - 0 <= idx < self.num_samples
+ # - len(current_solution) == self.num_dimensions
+ # ensures
+ # - runs the separation oracle on the idx-th sample.
+ # We define this as follows:
+ # - let X == the idx-th training sample.
+ # - let PSI(X,y) == the joint feature vector for input X
+ # and an arbitrary label y.
+ # - let F(X,y) == dot(current_solution,PSI(X,y)).
+ # - let LOSS(idx,y) == the loss incurred for predicting that the
+ # idx-th sample has a label of y. Note that LOSS()
+ # should always be >= 0 and should become exactly 0 when y is the
+ # correct label for the idx-th sample.
+ #
+ # Then the separation oracle finds a Y such that:
+ # Y = argmax over all y: LOSS(idx,y) + F(X,y)
+ # (i.e. It finds the label which maximizes the above expression.)
+ #
+    # Finally, separation_oracle() returns LOSS(idx,Y), PSI(X,Y).
+ def separation_oracle(self, idx, current_solution):
+ samp = self.samples[idx]
+ dims = len(samp)
+ scores = [0, 0, 0]
+ # compute scores for each of the three classifiers
+ scores[0] = dot(current_solution[0:dims], samp)
+ scores[1] = dot(current_solution[dims:2*dims], samp)
+ scores[2] = dot(current_solution[2*dims:3*dims], samp)
+
+ # Add in the loss-augmentation. Recall that we maximize
+        # LOSS(idx,y) + F(X,y) in the separation oracle, not just F(X,y) as we
+ # normally would in predict_label(). Therefore, we must add in this
+ # extra amount to account for the loss-augmentation. For our simple
+ # multi-class classifier, we incur a loss of 1 if we don't predict the
+ # correct label and a loss of 0 if we get the right label.
+ if self.labels[idx] != 0:
+ scores[0] += 1
+ if self.labels[idx] != 1:
+ scores[1] += 1
+ if self.labels[idx] != 2:
+ scores[2] += 1
+
+ # Now figure out which classifier has the largest loss-augmented score.
+ max_scoring_label = scores.index(max(scores))
+ # And finally record the loss that was associated with that predicted
+ # label. Again, the loss is 1 if the label is incorrect and 0 otherwise.
+ if max_scoring_label == self.labels[idx]:
+ loss = 0
+ else:
+ loss = 1
+
+ # Finally, return the loss and PSI vector corresponding to the label
+ # we just found.
+ psi = self.make_psi(samp, max_scoring_label)
+ return loss, psi
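+
+    # To trace one call by hand: with current_solution all zeros and idx == 0
+    # (samp == [0, 2, 0], true label 1), the raw scores are [0, 0, 0] and the
+    # loss-augmented scores are [1, 0, 1]. The oracle thus returns an
+    # incorrect label (0, the first maximizer) with loss 1 and
+    # psi == PSI(samp, 0), which is exactly the kind of constraint violation
+    # the solver works to eliminate.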
+
+
+if __name__ == "__main__":
+ main()