Diffstat (limited to 'ml/dlib/dlib/svm/structural_svm_problem_abstract.h')
-rw-r--r-- ml/dlib/dlib/svm/structural_svm_problem_abstract.h | 348
1 file changed, 0 insertions(+), 348 deletions(-)
diff --git a/ml/dlib/dlib/svm/structural_svm_problem_abstract.h b/ml/dlib/dlib/svm/structural_svm_problem_abstract.h
deleted file mode 100644
index 20b3d73a7..000000000
--- a/ml/dlib/dlib/svm/structural_svm_problem_abstract.h
+++ /dev/null
@@ -1,348 +0,0 @@
-// Copyright (C) 2011 Davis E. King (davis@dlib.net)
-// License: Boost Software License. See LICENSE.txt for the full license.
-#undef DLIB_STRUCTURAL_SVM_PRObLEM_ABSTRACT_Hh_
-#ifdef DLIB_STRUCTURAL_SVM_PRObLEM_ABSTRACT_Hh_
-
-#include "../optimization/optimization_oca_abstract.h"
-#include "sparse_vector_abstract.h"
-#include "../matrix.h"
-
-namespace dlib
-{
-
-// ----------------------------------------------------------------------------------------
-
- template <
- typename matrix_type_,
- typename feature_vector_type_ = matrix_type_
- >
- class structural_svm_problem : public oca_problem<matrix_type_>
- {
- public:
- /*!
- REQUIREMENTS ON matrix_type_
- - matrix_type_ == a dlib::matrix capable of storing column vectors
-
- REQUIREMENTS ON feature_vector_type_
- - feature_vector_type_ == a dlib::matrix capable of storing column vectors
- or an unsorted sparse vector type as defined in dlib/svm/sparse_vector_abstract.h.
-
- INITIAL VALUE
- - get_epsilon() == 0.001
- - get_max_iterations() == 10000
- - get_max_cache_size() == 5
- - get_c() == 1
- - get_cache_based_epsilon() == std::numeric_limits<scalar_type>::infinity()
- (i.e. the cache based epsilon feature is disabled)
- - num_nuclear_norm_regularizers() == 0
- - This object will not be verbose
-
- WHAT THIS OBJECT REPRESENTS
- This object is a tool for solving the optimization problem associated with
- a structural support vector machine. A structural SVM is a supervised
- machine learning method for learning to predict complex outputs. This is
- contrasted with a binary classifier which makes only simple yes/no
- predictions. A structural SVM, on the other hand, can learn to predict
- complex outputs such as entire parse trees or DNA sequence alignments. To
- do this, it learns a function F(x,y) which measures how well a particular
- data sample x matches a label y. When used for prediction, the best label
- for a new x is given by the y which maximizes F(x,y).
-
- To use this object you inherit from it, provide implementations of its four
- pure virtual functions, and then pass your object to the oca optimizer (see
- the example sketched in the comment just after this one). Also, you should
- only pass an instance of this object to the oca optimizer once: running the
- oca solver on a structural_svm_problem consumes that instance. If you want
- to solve the same problem multiple times then you must use a fresh instance
- of your structural_svm_problem each time.
-
-
- To define the optimization problem precisely, we first introduce some notation:
- - let PSI(x,y) == the joint feature vector for input x and a label y.
- - let F(x,y|w) == dot(w,PSI(x,y)).
- - let LOSS(idx,y) == the loss incurred for predicting that the idx-th training
- sample has a label of y. Note that LOSS() should always be >= 0 and should
- become exactly 0 when y is the correct label for the idx-th sample.
- - let x_i == the i-th training sample.
- - let y_i == the correct label for the i-th training sample.
- - The number of data samples is N.
-
- Then the optimization problem solved using this object is the following:
- Minimize: h(w) == 0.5*dot(w,w) + C*R(w)
-
- Where R(w) == sum from i=1 to N: 1/N * sample_risk(i,w)
- and sample_risk(i,w) == max over all Y: LOSS(i,Y) + F(x_i,Y|w) - F(x_i,y_i|w)
- and C > 0
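-
- As a concrete special case: for binary classification with labels
- y in {-1,+1}, choosing PSI(x,y) == 0.5*y*x and LOSS(i,y) == 1 when
- y != y_i (and 0 otherwise) gives
- sample_risk(i,w) == max(0, 1 - y_i*dot(w,x_i))
- which is exactly the usual hinge loss, so in that case the problem
- above reduces to a standard linear C-SVM.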
-
-
-
- For an introduction to structured support vector machines you should consult
- the following paper:
- Predicting Structured Objects with Support Vector Machines by
- Thorsten Joachims, Thomas Hofmann, Yisong Yue, and Chun-nam Yu
-
- For a more detailed discussion of the particular algorithm implemented by this
- object see the following paper:
- T. Joachims, T. Finley, Chun-Nam Yu, Cutting-Plane Training of Structural SVMs,
- Machine Learning, 77(1):27-59, 2009.
-
- Note that this object is essentially a tool for solving the 1-Slack structural
- SVM with margin-rescaling. Specifically, see Algorithm 3 in the above referenced
- paper.
- !*/
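-
- /*
- For illustration only (this example is not part of dlib): a minimal
- sketch of a subclass for a toy multiclass problem. Here w is the
- concatenation of one weight block per label, PSI(x,y) copies the
- input vector x into the y-th block, and LOSS is the 0-1 loss. The
- names toy_multiclass_problem, samples, labels, and make_psi are
- hypothetical.
-
- class toy_multiclass_problem : public structural_svm_problem<matrix<double,0,1> >
- {
- public:
- toy_multiclass_problem (
- const std::vector<matrix<double,0,1> >& samples_,
- const std::vector<long>& labels_,
- long num_labels_
- ) : samples(samples_), labels(labels_), num_labels(num_labels_) {}
-
- virtual long get_num_dimensions () const { return samples[0].size()*num_labels; }
-
- virtual long get_num_samples () const { return (long)samples.size(); }
-
- virtual void get_truth_joint_feature_vector (
- long idx,
- matrix<double,0,1>& psi
- ) const { psi = make_psi(samples[idx], labels[idx]); }
-
- virtual void separation_oracle (
- const long idx,
- const matrix<double,0,1>& current_solution,
- double& loss,
- matrix<double,0,1>& psi
- ) const
- {
- // find the Y which maximizes LOSS(idx,Y) + F(x_idx,Y|w)
- double best_score = -std::numeric_limits<double>::infinity();
- for (long y = 0; y < num_labels; ++y)
- {
- const matrix<double,0,1> p = make_psi(samples[idx], y);
- const double l = (y == labels[idx]) ? 0 : 1; // 0-1 loss
- const double score = l + dot(current_solution, p);
- if (score > best_score) { best_score = score; loss = l; psi = p; }
- }
- }
-
- private:
- matrix<double,0,1> make_psi (const matrix<double,0,1>& x, long y) const
- {
- // place x into the y-th block of a num_labels*x.size() long vector
- matrix<double,0,1> p = zeros_matrix<double>(get_num_dimensions(), 1);
- set_rowm(p, range(y*x.size(), (y+1)*x.size()-1)) = x;
- return p;
- }
-
- const std::vector<matrix<double,0,1> >& samples;
- const std::vector<long>& labels;
- const long num_labels;
- };
-
- A problem like this would then be handed to the oca solver roughly as
- follows, and (per the note above) each instance only once:
-
- toy_multiclass_problem prob(samples, labels, num_labels);
- prob.set_c(10);
- matrix<double,0,1> w;
- oca solver;
- solver(prob, w);
- */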
-
- typedef matrix_type_ matrix_type;
- typedef typename matrix_type::type scalar_type;
- typedef feature_vector_type_ feature_vector_type;
-
- structural_svm_problem (
- );
- /*!
- ensures
- - this object is properly initialized
- !*/
-
- void set_epsilon (
- scalar_type eps
- );
- /*!
- requires
- - eps > 0
- ensures
- - #get_epsilon() == eps
- !*/
-
- const scalar_type get_epsilon (
- ) const;
- /*!
- ensures
- - returns the error epsilon that determines when training should stop.
- Smaller values may result in a more accurate solution but take longer
- to execute. Specifically, the algorithm stops when the average sample
- risk (i.e. R(w) as defined above) is within epsilon of its optimal value.
-
- Also note that sample risk is an upper bound on a sample's loss. So
- you can think of this epsilon value as saying "solve the optimization
- problem until the average loss per sample is within epsilon of its
- optimal value".
- !*/
-
- scalar_type get_cache_based_epsilon (
- ) const;
- /*!
- ensures
- - if (get_max_cache_size() != 0) then
- - The solver will not stop when the average sample risk is within
- get_epsilon() of its optimal value. Instead, it will keep running
- but will run the optimizer completely on the cache until the average
- sample risk is within #get_cache_based_epsilon() of its optimal
- value. This means that it will perform this additional refinement in
- the solution accuracy without making any additional calls to the
- separation_oracle(). This is useful when using a nuclear norm
- regularization term because it allows you to quickly solve the
- optimization problem to high precision. For a nuclear norm regularized
- problem, high precision means that many of the learned matrices will be
- low rank, or very close to low rank, due to the nuclear norm
- regularizer. Without a high accuracy solution this may not happen, or
- the ranks may be difficult to determine, so the extra accuracy given by
- the cache based refinement is very useful. Finally, note that we
- include the nuclear norm term as part of the "risk" for the purposes of
- determining when to stop.
- - else
- - The value of #get_cache_based_epsilon() has no effect.
- !*/
-
- void set_cache_based_epsilon (
- scalar_type eps
- );
- /*!
- requires
- - eps > 0
- ensures
- - #get_cache_based_epsilon() == eps
- !*/
-
- void set_max_iterations (
- unsigned long max_iter
- );
- /*!
- ensures
- - #get_max_iterations() == max_iter
- !*/
-
- unsigned long get_max_iterations (
- );
- /*!
- ensures
- - returns the maximum number of iterations the SVM optimizer is allowed to
- run before it is required to stop and return a result.
- !*/
-
- void set_max_cache_size (
- unsigned long max_size
- );
- /*!
- ensures
- - #get_max_cache_size() == max_size
- !*/
-
- unsigned long get_max_cache_size (
- ) const;
- /*!
- ensures
- - Returns the number of joint feature vectors per training sample kept in
- the separation oracle cache. This cache is used to avoid unnecessary
- calls to the user supplied separation_oracle() function. Note that a
- value of 0 means that caching is not used at all. This is appropriate
- if the separation oracle is cheap to evaluate.
- !*/
-
- void add_nuclear_norm_regularizer (
- long first_dimension,
- long rows,
- long cols,
- double regularization_strength
- );
- /*!
- requires
- - 0 <= first_dimension < get_num_dimensions()
- - 0 <= rows
- - 0 <= cols
- - first_dimension+rows*cols <= get_num_dimensions()
- - 0 < regularization_strength
- ensures
- - Adds a nuclear norm regularization term to the optimization problem
- solved by this object. That is, instead of solving:
- Minimize: h(w) == 0.5*dot(w,w) + C*R(w)
- this object will solve:
- Minimize: h(w) == 0.5*dot(w,w) + C*R(w) + regularization_strength*nuclear_norm_of(part of w)
- where "part of w" is the part of w indicated by the arguments to this
- function. In particular, the part of w included in the nuclear norm is
- exactly the matrix reshape(rowm(w, range(first_dimension, first_dimension+rows*cols-1)), rows, cols).
- Therefore, if you think of the w vector as being the concatenation of a
- bunch of matrices then you can use multiple calls to add_nuclear_norm_regularizer()
- to add nuclear norm regularization terms to any of the matrices packed into w.
- - #num_nuclear_norm_regularizers() == num_nuclear_norm_regularizers() + 1
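-
- For example, if w is the concatenation of a 3x10 matrix M1 followed by
- a 5x5 matrix M2 (so get_num_dimensions() == 55), then the hypothetical
- calls
- add_nuclear_norm_regularizer(0, 3, 10, 1.0); // regularizes M1
- add_nuclear_norm_regularizer(30, 5, 5, 2.5); // regularizes M2
- would add a nuclear norm term for each matrix; the sizes and strengths
- here are purely illustrative.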
- !*/
-
- unsigned long num_nuclear_norm_regularizers (
- ) const;
- /*!
- ensures
- - returns the number of nuclear norm regularizers that are currently a part
- of this optimization problem. That is, returns the number of times
- add_nuclear_norm_regularizer() has been called since the last call to
- clear_nuclear_norm_regularizers() or object construction, whichever is
- most recent.
- !*/
-
- void clear_nuclear_norm_regularizers (
- );
- /*!
- ensures
- - #num_nuclear_norm_regularizers() == 0
- !*/
-
- void be_verbose (
- );
- /*!
- ensures
- - This object will print status messages to standard out so that a
- user can observe the progress of the algorithm.
- !*/
-
- void be_quiet(
- );
- /*!
- ensures
- - this object will not print anything to standard out
- !*/
-
- scalar_type get_c (
- ) const;
- /*!
- ensures
- - returns the SVM regularization parameter. It is the parameter that
- determines the trade-off between trying to fit the training data
- exactly or allowing more errors but hopefully improving the
- generalization of the resulting classifier. Larger values encourage
- exact fitting while smaller values of C may encourage better
- generalization.
- !*/
-
- void set_c (
- scalar_type C
- );
- /*!
- requires
- - C > 0
- ensures
- - #get_c() == C
- !*/
-
- // --------------------------------
- // User supplied routines
- // --------------------------------
-
- virtual long get_num_dimensions (
- ) const = 0;
- /*!
- ensures
- - returns the dimensionality of a joint feature vector
- !*/
-
- virtual long get_num_samples (
- ) const = 0;
- /*!
- ensures
- - returns the number of training samples in this problem.
- !*/
-
- virtual void get_truth_joint_feature_vector (
- long idx,
- feature_vector_type& psi
- ) const = 0;
- /*!
- requires
- - 0 <= idx < get_num_samples()
- ensures
- - #psi == PSI(x_idx, y_idx)
- (i.e. the joint feature vector for the idx-th training sample and its true label.)
- !*/
-
- virtual void separation_oracle (
- const long idx,
- const matrix_type& current_solution,
- scalar_type& loss,
- feature_vector_type& psi
- ) const = 0;
- /*!
- requires
- - 0 <= idx < get_num_samples()
- - current_solution.size() == get_num_dimensions()
- ensures
- - runs the separation oracle on the idx-th sample. We define this as follows:
- - let X == the idx-th training sample.
- - let PSI(X,y) == the joint feature vector for input X and an arbitrary label y.
- - let F(X,y) == dot(current_solution,PSI(X,y)).
- - let LOSS(idx,y) == the loss incurred for predicting that the idx-th sample
- has a label of y. Note that LOSS() should always be >= 0 and should
- become exactly 0 when y is the correct label for the idx-th sample.
-
- Then the separation oracle finds a Y such that:
- Y = argmax over all y: LOSS(idx,y) + F(X,y)
- (i.e. it finds the label which maximizes the above expression.)
-
- Finally, we can define the outputs of this function as:
- - #loss == LOSS(idx,Y)
- - #psi == PSI(X,Y)
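-
- For instance, in a multiclass setting with 0-1 loss where the labels
- can be enumerated (as in the hypothetical toy_multiclass_problem
- sketched earlier), separation_oracle() is just a loop over all labels
- y that keeps the one with the largest
- LOSS(idx,y) + dot(current_solution, PSI(X,y)). For genuinely
- structured outputs (e.g. parse trees or sequence alignments) it must
- instead be implemented with a problem specific loss-augmented search,
- such as a dynamic program.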
- !*/
- };
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_STRUCTURAL_SVM_PRObLEM_ABSTRACT_Hh_
-
-