diff options
Diffstat (limited to 'src/ml/dlib/dlib/svm/structural_svm_problem.h')
-rw-r--r-- | src/ml/dlib/dlib/svm/structural_svm_problem.h | 649 |
1 files changed, 649 insertions, 0 deletions
// Copyright (C) 2011  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_STRUCTURAL_SVM_PRObLEM_Hh_
#define DLIB_STRUCTURAL_SVM_PRObLEM_Hh_

#include "structural_svm_problem_abstract.h"
#include "../algs.h"
#include <vector>
#include "../optimization/optimization_oca.h"
#include "../matrix.h"
#include "sparse_vector.h"
#include <iostream>

namespace dlib
{

// ----------------------------------------------------------------------------------------

    namespace impl
    {
        // Describes one nuclear norm regularization term added via
        // structural_svm_problem::add_nuclear_norm_regularizer().  The term applies to
        // the nr x nc sub-block of the weight vector that starts at first_dimension.
        struct nuclear_norm_regularizer
        {
            long first_dimension;
            long nr;
            long nc;
            double regularization_strength;
        };
    }

// ----------------------------------------------------------------------------------------

    template <
        typename structural_svm_problem
        >
    class cache_element_structural_svm
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object caches the results of separation oracle calls for a single
                training sample.  The cache lets the optimizer reuse previously computed
                PSI vectors (selecting whichever cached entry currently has the largest
                risk) instead of invoking the potentially expensive user-supplied
                separation_oracle() on every iteration.  Entries are evicted with a
                least-recently-used policy once the cache reaches
                prob->get_max_cache_size() elements.
        !*/
    public:

        cache_element_structural_svm (
        ) : prob(0), sample_idx(0), last_true_risk_computed(std::numeric_limits<double>::infinity()) {}

        typedef typename structural_svm_problem::scalar_type scalar_type;
        typedef typename structural_svm_problem::matrix_type matrix_type;
        typedef typename structural_svm_problem::feature_vector_type feature_vector_type;

        void init (
            const structural_svm_problem* prob_,
            const long idx
        )
        /*!
            ensures
                - This object will be a cache for the idx-th sample in the given
                  structural_svm_problem.
                - Any previously cached oracle results are discarded.
                - If caching is enabled (prob_->get_max_cache_size() != 0) then the
                  sample's truth joint feature vector is computed once here and stored
                  in true_psi for reuse.
        !*/
        {
            prob = prob_;
            sample_idx = idx;

            loss.clear();
            psi.clear();
            lru_count.clear();

            if (prob->get_max_cache_size() != 0)
            {
                prob->get_truth_joint_feature_vector(idx, true_psi);
                compact_sparse_vector(true_psi);
            }
        }

        void get_truth_joint_feature_vector_cached (
            feature_vector_type& psi
        ) const
        /*!
            ensures
                - #psi == the truth joint feature vector for this cache's sample.  The
                  cached copy is returned when caching is enabled; otherwise the user's
                  get_truth_joint_feature_vector() is called directly.
        !*/
        {
            if (prob->get_max_cache_size() != 0)
                psi = true_psi;
            else
                prob->get_truth_joint_feature_vector(sample_idx, psi);

            // Sanity check only possible for dense PSI vectors: a sparse vector's
            // size() is its number of non-zero entries, not its dimensionality.
            if (is_matrix<feature_vector_type>::value)
            {
                DLIB_CASSERT((long)psi.size() == prob->get_num_dimensions(),
                    "The dimensionality of your PSI vector doesn't match get_num_dimensions()");
            }
        }

        void separation_oracle_cached (
            const bool use_only_cache,
            const bool skip_cache,
            const scalar_type& saved_current_risk_gap,
            const matrix_type& current_solution,
            scalar_type& out_loss,
            feature_vector_type& out_psi
        ) const
        /*!
            ensures
                - #out_loss and #out_psi are the separation oracle outputs for this
                  cache's sample at the given current_solution.  They come either from
                  the cache (when a cached entry's risk is still a good enough proxy,
                  or use_only_cache == true) or from a fresh call to the user's
                  separation_oracle() (always when skip_cache == true).  Fresh results
                  are stored into the cache, evicting the least recently used entry if
                  the cache is full.
        !*/
        {
            const bool cache_enabled = prob->get_max_cache_size() != 0;

            // Don't waste time computing this if the cache isn't going to be used.
            const scalar_type dot_true_psi = cache_enabled ? dot(true_psi, current_solution) : 0;

            scalar_type best_risk = -std::numeric_limits<scalar_type>::infinity();
            unsigned long best_idx = 0;
            long max_lru_count = 0;
            if (cache_enabled)
            {
                // figure out which element in the cache is the best (i.e. has the biggest risk)
                for (unsigned long i = 0; i < loss.size(); ++i)
                {
                    const scalar_type risk = loss[i] + dot(psi[i], current_solution) - dot_true_psi;
                    if (risk > best_risk)
                    {
                        best_risk = risk;
                        out_loss = loss[i];
                        best_idx = i;
                    }
                    if (lru_count[i] > max_lru_count)
                        max_lru_count = lru_count[i];
                }

                if (!skip_cache)
                {
                    // Check if the best psi vector in the cache is still good enough to use as
                    // a proxy for the true separation oracle.  If the risk value has dropped
                    // by enough to get into the stopping condition then the best psi isn't
                    // good enough.
                    if ((best_risk + saved_current_risk_gap > last_true_risk_computed &&
                        best_risk >= 0) || use_only_cache)
                    {
                        out_psi = psi[best_idx];
                        // Bump this entry to most-recently-used status.
                        lru_count[best_idx] = max_lru_count + 1;
                        return;
                    }
                }
            }


            // Cache miss (or cache bypass): call the real separation oracle.
            prob->separation_oracle(sample_idx, current_solution, out_loss, out_psi);
            if (is_matrix<feature_vector_type>::value)
            {
                DLIB_CASSERT((long)out_psi.size() == prob->get_num_dimensions(),
                    "The dimensionality of your PSI vector doesn't match get_num_dimensions()");
            }

            if (!cache_enabled)
                return;

            compact_sparse_vector(out_psi);

            last_true_risk_computed = out_loss + dot(out_psi, current_solution) - dot_true_psi;

            // If the separation oracle is only solved approximately then the result might
            // not be as good as just selecting true_psi as the output.  So here we check
            // if that is the case.
            if (last_true_risk_computed < 0 && best_risk < 0)
            {
                out_psi = true_psi;
                out_loss = 0;
            }
            // Alternatively, an approximate separation oracle might not do as well as just
            // selecting from the cache.  So if that is the case when just take the best
            // element from the cache.
            else if (last_true_risk_computed < best_risk)
            {
                out_psi = psi[best_idx];
                out_loss = loss[best_idx];
                lru_count[best_idx] = max_lru_count + 1;
            }
            // if the cache is full
            else if (loss.size() >= prob->get_max_cache_size())
            {
                // find least recently used cache entry for idx-th sample
                const long i = index_of_min(mat(lru_count));

                // save our new data in the cache
                loss[i] = out_loss;
                psi[i] = out_psi;

                const long max_use = max(mat(lru_count));
                // Make sure this new cache entry has the best lru count since we have used
                // it most recently.
                lru_count[i] = max_use + 1;
            }
            else
            {
                // In this case we just append the new psi into the cache.

                loss.push_back(out_loss);
                psi.push_back(out_psi);
                long max_use = 1;
                if (lru_count.size() != 0)
                    max_use = max(mat(lru_count)) + 1;
                lru_count.push_back(max_use);
            }
        }

    private:
        // Do nothing if T isn't actually a sparse vector
        template <typename T> void compact_sparse_vector( T& ) const { }

        template <
            typename T,
            typename U,
            typename alloc
            >
        void compact_sparse_vector (
            std::vector<std::pair<T,U>,alloc>& vect
        ) const
        /*!
            ensures
                - If vect contains duplicate indices then they are merged via
                  make_sparse_vector_inplace() and any excess capacity is released.
        !*/
        {
            // If the sparse vector has more entires than dimensions then it must have some
            // duplicate elements.  So compact them using make_sparse_vector_inplace().
            if (vect.size() > (unsigned long)prob->get_num_dimensions())
            {
                make_sparse_vector_inplace(vect);
                // make sure the vector doesn't use more RAM than is necessary
                // (shrink-to-fit via the copy-and-swap idiom)
                std::vector<std::pair<T,U>,alloc>(vect).swap(vect);
            }
        }

        const structural_svm_problem* prob;

        long sample_idx;

        // These members are mutable because the cache is updated from within const
        // member functions (the oracle interface is logically const).
        mutable feature_vector_type true_psi;                // cached truth PSI for this sample
        mutable std::vector<scalar_type> loss;               // loss[i] for the i-th cached oracle result
        mutable std::vector<feature_vector_type> psi;        // psi[i] for the i-th cached oracle result
        mutable std::vector<long> lru_count;                 // higher == more recently used
        mutable double last_true_risk_computed;              // risk from the last real oracle call
    };

// ----------------------------------------------------------------------------------------

    template <
        typename matrix_type_,
        typename feature_vector_type_ = matrix_type_
        >
    class structural_svm_problem : public oca_problem<matrix_type_>
    {
    public:
        /*!
            CONVENTION
                - C == get_c()
                - eps == get_epsilon()
                - max_iterations == get_max_iterations()
                - if (skip_cache) then
                    - we won't use the oracle cache when we need to evaluate the separation
                      oracle.  Instead, we will directly call the user supplied separation_oracle().

                - get_max_cache_size() == max_cache_size

                - if (cache.size() != 0) then
                    - cache.size() == get_num_samples()
                    - for all i: cache[i] == the cached results of calls to separation_oracle()
                      for the i-th sample.
        !*/

        typedef matrix_type_ matrix_type;
        typedef typename matrix_type::type scalar_type;
        typedef feature_vector_type_ feature_vector_type;

        structural_svm_problem (
        ) :
            saved_current_risk_gap(0),
            eps(0.001),
            max_iterations(10000),
            verbose(false),
            skip_cache(true),
            count_below_eps(0),
            max_cache_size(5),
            converged(false),
            nuclear_norm_part(0),
            cache_based_eps(std::numeric_limits<scalar_type>::infinity()),
            C(1)
        {}

        scalar_type get_cache_based_epsilon (
        ) const
        {
            return cache_based_eps;
        }

        void set_cache_based_epsilon (
            scalar_type eps_
        )
        /*!
            requires
                - eps_ > 0
            ensures
                - #get_cache_based_epsilon() == eps_
        !*/
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(eps_ > 0,
                "\t void structural_svm_problem::set_cache_based_epsilon()"
                << "\n\t eps_ must be greater than 0"
                << "\n\t eps_: " << eps_
                << "\n\t this: " << this
                );

            cache_based_eps = eps_;
        }

        void set_epsilon (
            scalar_type eps_
        )
        /*!
            requires
                - eps_ > 0
            ensures
                - #get_epsilon() == eps_
        !*/
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(eps_ > 0,
                "\t void structural_svm_problem::set_epsilon()"
                << "\n\t eps_ must be greater than 0"
                << "\n\t eps_: " << eps_
                << "\n\t this: " << this
                );

            eps = eps_;
        }

        const scalar_type get_epsilon (
        ) const { return eps; }

        unsigned long get_max_iterations (
        ) const { return max_iterations; }

        void set_max_iterations (
            unsigned long max_iter
        )
        {
            max_iterations = max_iter;
        }

        void set_max_cache_size (
            unsigned long max_size
        )
        {
            // A value of 0 disables the separation oracle cache entirely.
            max_cache_size = max_size;
        }

        unsigned long get_max_cache_size (
        ) const { return max_cache_size; }

        void be_verbose (
        )
        {
            verbose = true;
        }

        void be_quiet(
        )
        {
            verbose = false;
        }

        scalar_type get_c (
        ) const { return C; }

        void set_c (
            scalar_type C_
        )
        /*!
            requires
                - C_ > 0
            ensures
                - #get_c() == C_
        !*/
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(C_ > 0,
                "\t void structural_svm_problem::set_c()"
                << "\n\t C_ must be greater than 0"
                << "\n\t C_: " << C_
                << "\n\t this: " << this
                );

            C = C_;
        }

        void add_nuclear_norm_regularizer (
            long first_dimension,
            long rows,
            long cols,
            double regularization_strength
        )
        /*!
            requires
                - 0 <= first_dimension < get_num_dimensions()
                - 0 <= rows and 0 <= cols
                - rows*cols+first_dimension <= get_num_dimensions()
                - 0 < regularization_strength
            ensures
                - Adds a nuclear norm regularization term over the rows x cols sub-block
                  of the weight vector beginning at first_dimension.
                - #num_nuclear_norm_regularizers() == num_nuclear_norm_regularizers() + 1
        !*/
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(0 <= first_dimension && first_dimension < get_num_dimensions() &&
                0 <= rows && 0 <= cols && rows*cols+first_dimension <= get_num_dimensions() &&
                0 < regularization_strength,
                "\t void structural_svm_problem::add_nuclear_norm_regularizer()"
                << "\n\t Invalid arguments were given to this function."
                << "\n\t first_dimension: " << first_dimension
                << "\n\t rows: " << rows
                << "\n\t cols: " << cols
                << "\n\t get_num_dimensions(): " << get_num_dimensions()
                << "\n\t regularization_strength: " << regularization_strength
                << "\n\t this: " << this
                );

            impl::nuclear_norm_regularizer temp;
            temp.first_dimension = first_dimension;
            temp.nr = rows;
            temp.nc = cols;
            temp.regularization_strength = regularization_strength;
            nuclear_norm_regularizers.push_back(temp);
        }

        unsigned long num_nuclear_norm_regularizers (
        ) const { return nuclear_norm_regularizers.size(); }

        void clear_nuclear_norm_regularizers (
        ) { nuclear_norm_regularizers.clear(); }

        // The following four pure virtual functions define the structural SVM problem
        // and must be supplied by the derived class.

        virtual long get_num_dimensions (
        ) const = 0;

        virtual long get_num_samples (
        ) const = 0;

        virtual void get_truth_joint_feature_vector (
            long idx,
            feature_vector_type& psi
        ) const = 0;

        virtual void separation_oracle (
            const long idx,
            const matrix_type& current_solution,
            scalar_type& loss,
            feature_vector_type& psi
        ) const = 0;

    private:

        // oca_problem interface: the risk (max over samples of loss-augmented score)
        // is never negative, so report 0 as a lower bound.
        virtual bool risk_has_lower_bound (
            scalar_type& lower_bound
        ) const
        {
            lower_bound = 0;
            return true;
        }

        virtual bool optimization_status (
            scalar_type current_objective_value,
            scalar_type current_error_gap,
            scalar_type current_risk_value,
            scalar_type current_risk_gap,
            unsigned long num_cutting_planes,
            unsigned long num_iterations
        ) const
        /*!
            ensures
                - returns true when the OCA solver should stop.  Also drives the state
                  machine that decides when to bypass the oracle cache (skip_cache) and
                  when to switch into cache-only refinement mode (converged).
        !*/
        {
            if (verbose)
            {
                using namespace std;
                // When nuclear norm terms are active the reported risk includes them,
                // so print both the raw risk and the combined value.
                if (nuclear_norm_regularizers.size() != 0)
                {
                    cout << "objective:     " << current_objective_value << endl;
                    cout << "objective gap: " << current_error_gap << endl;
                    cout << "risk:          " << current_risk_value-nuclear_norm_part << endl;
                    cout << "risk+nuclear norm:     " << current_risk_value << endl;
                    cout << "risk+nuclear norm gap: " << current_risk_gap << endl;
                    cout << "num planes:    " << num_cutting_planes << endl;
                    cout << "iter:          " << num_iterations << endl;
                }
                else
                {
                    cout << "objective:     " << current_objective_value << endl;
                    cout << "objective gap: " << current_error_gap << endl;
                    cout << "risk:          " << current_risk_value << endl;
                    cout << "risk gap:      " << current_risk_gap << endl;
                    cout << "num planes:    " << num_cutting_planes << endl;
                    cout << "iter:          " << num_iterations << endl;
                }
                cout << endl;
            }

            if (num_iterations >= max_iterations)
                return true;

            // Remember the gap so the caches can judge whether their entries are still
            // accurate enough to stand in for the true separation oracle.
            saved_current_risk_gap = current_risk_gap;

            if (converged)
            {
                return (current_risk_gap < std::max(cache_based_eps,cache_based_eps*current_risk_value)) ||
                       (current_risk_gap == 0);
            }

            if (current_risk_gap < eps)
            {
                // Only stop when we see that the risk gap is small enough on a non-cached
                // iteration.  But even then, if we are supposed to do the cache based
                // refinement then we just mark that we have "converged" to avoid further
                // calls to the separation oracle and run all subsequent iterations off the
                // cache.
                if (skip_cache || max_cache_size == 0)
                {
                    converged = true;
                    skip_cache = false;
                    return (current_risk_gap < std::max(cache_based_eps,cache_based_eps*current_risk_value)) ||
                           (current_risk_gap == 0);
                }

                ++count_below_eps;

                // Only disable the cache if we have seen a few consecutive iterations that
                // look to have converged.
                if (count_below_eps > 1)
                {
                    // Instead of stopping we shouldn't use the cache on the next iteration.  This way
                    // we can be sure to have the best solution rather than assuming the cache is up-to-date
                    // enough.
                    skip_cache = true;
                    count_below_eps = 0;
                }
            }
            else
            {
                count_below_eps = 0;
                skip_cache = false;
            }

            return false;
        }

        virtual void get_risk (
            matrix_type& w,
            scalar_type& risk,
            matrix_type& subgradient
        ) const
        /*!
            ensures
                - #risk == the average (over all samples) loss-augmented risk at w,
                  plus any nuclear norm regularization terms.
                - #subgradient == a subgradient of the risk at w.
        !*/
        {
            feature_vector_type ftemp;
            const unsigned long num = get_num_samples();

            // initialize the cache and compute psi_true.
            if (cache.size() == 0)
            {
                cache.resize(get_num_samples());
                for (unsigned long i = 0; i < cache.size(); ++i)
                    cache[i].init(this,i);

                // psi_true accumulates -sum_i PSI(x_i, y_i); computed once since it
                // does not depend on w.
                psi_true.set_size(w.size(),1);
                psi_true = 0;

                for (unsigned long i = 0; i < num; ++i)
                {
                    cache[i].get_truth_joint_feature_vector_cached(ftemp);

                    subtract_from(psi_true, ftemp);
                }
            }

            subgradient = psi_true;
            scalar_type total_loss = 0;
            call_separation_oracle_on_all_samples(w,subgradient,total_loss);

            subgradient /= num;
            total_loss /= num;
            risk = total_loss + dot(subgradient,w);

            if (nuclear_norm_regularizers.size() != 0)
            {
                matrix_type grad;
                scalar_type obj;
                compute_nuclear_norm_parts(w, grad, obj);
                risk += obj;
                subgradient += grad;
            }
        }

        virtual void call_separation_oracle_on_all_samples (
            const matrix_type& w,
            matrix_type& subgradient,
            scalar_type& total_loss
        ) const
        /*!
            ensures
                - Invokes the (cached) separation oracle for every sample, accumulating
                  each loss into total_loss and each PSI vector into subgradient.
                  Virtual so derived classes can parallelize it.
        !*/
        {
            feature_vector_type ftemp;
            const unsigned long num = get_num_samples();
            for (unsigned long i = 0; i < num; ++i)
            {
                scalar_type loss;
                separation_oracle_cached(i, w, loss, ftemp);
                total_loss += loss;
                add_to(subgradient, ftemp);
            }
        }

    protected:

        void compute_nuclear_norm_parts(
            const matrix_type& m,
            matrix_type& grad,
            scalar_type& obj
        ) const
        /*!
            ensures
                - #obj == the total nuclear norm penalty at m, divided by C.
                - #grad == a subgradient of that penalty (same size as m).
                - #nuclear_norm_part == the penalty's contribution to the risk,
                  recorded so verbose output can report the risk with and without it.
        !*/
        {
            obj = 0;
            grad.set_size(m.size(), 1);
            grad = 0;

            matrix<double> u,v,w,f;
            nuclear_norm_part = 0;
            for (unsigned long i = 0; i < nuclear_norm_regularizers.size(); ++i)
            {
                const long nr = nuclear_norm_regularizers[i].nr;
                const long nc = nuclear_norm_regularizers[i].nc;
                const long size = nr*nc;
                const long idx = nuclear_norm_regularizers[i].first_dimension;
                const double strength = nuclear_norm_regularizers[i].regularization_strength;

                // View the relevant slice of the weight vector as an nr x nc matrix.
                f = matrix_cast<double>(reshape(rowm(m, range(idx, idx+size-1)), nr, nc));
                svd3(f, u,w,v);


                // The nuclear norm is the sum of the singular values.
                const double norm = sum(w);
                obj += strength*norm;
                nuclear_norm_part += strength*norm/C;

                // u*trans(v) is a subgradient of the nuclear norm at f.
                f = u*trans(v);

                set_rowm(grad, range(idx, idx+size-1)) = matrix_cast<double>(strength*reshape_to_column_vector(f));
            }

            obj /= C;
            grad /= C;
        }

        void separation_oracle_cached (
            const long idx,
            const matrix_type& current_solution,
            scalar_type& loss,
            feature_vector_type& psi
        ) const
        /*!
            ensures
                - Delegates to the idx-th sample's cache element, passing along the
                  current convergence state so it can decide between a cached result
                  and a real separation_oracle() call.
        !*/
        {
            cache[idx].separation_oracle_cached(converged,
                                                skip_cache,
                                                saved_current_risk_gap,
                                                current_solution,
                                                loss,
                                                psi);
        }

        std::vector<impl::nuclear_norm_regularizer> nuclear_norm_regularizers;

        // Many of these members are mutable because the oca_problem interface
        // functions (get_risk(), optimization_status()) are const yet need to
        // maintain optimizer state between calls.
        mutable scalar_type saved_current_risk_gap;  // gap reported on the last iteration
        mutable matrix_type psi_true;                // -sum_i PSI(x_i, y_i), built lazily
        scalar_type eps;
        unsigned long max_iterations;
        mutable bool verbose;


        mutable std::vector<cache_element_structural_svm<structural_svm_problem> > cache;
        mutable bool skip_cache;         // bypass the cache on the next oracle calls
        mutable int count_below_eps;     // consecutive iterations with risk gap < eps
        unsigned long max_cache_size;
        mutable bool converged;          // true once we switch to cache-only refinement
        mutable double nuclear_norm_part;
        scalar_type cache_based_eps;

        scalar_type C;
    };

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_STRUCTURAL_SVM_PRObLEM_Hh_