1 files changed, 136 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/feature_ranking_abstract.h b/ml/dlib/dlib/svm/feature_ranking_abstract.h
new file mode 100644
index 000000000..5a6fd3bb9
--- /dev/null
+++ b/ml/dlib/dlib/svm/feature_ranking_abstract.h
@@ -0,0 +1,136 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_KERNEL_FEATURE_RANKINg_ABSTRACT_H_
+#ifdef DLIB_KERNEL_FEATURE_RANKINg_ABSTRACT_H_
+
+#include <vector>
+#include <limits>
+
+#include "svm_abstract.h"
+#include "kcentroid_abstract.h"
+#include "../is_kind.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename kernel_type,
+        typename sample_matrix_type,
+        typename label_matrix_type
+        >
+    matrix<typename kernel_type::scalar_type> rank_features (
+        const kcentroid<kernel_type>& kc,
+        const sample_matrix_type& samples,
+        const label_matrix_type& labels,
+        const long num_features = samples(0).nr()
+    );
+    /*!
+        requires
+            - sample_matrix_type == a matrix or something convertible to a matrix via mat()
+            - label_matrix_type  == a matrix or something convertible to a matrix via mat()
+            - is_binary_classification_problem(samples, labels) == true
+            - kc.train(samples(0)) must be a valid expression.  This means that
+              kc must use a kernel type that is capable of operating on the
+              contents of the samples matrix
+            - 0 < num_features <= samples(0).nr()
+        ensures
+            - Let Class1 denote the set of all the samples with labels that are < 0
+            - Let Class2 denote the set of all the samples with labels that are > 0
+            - finds a ranking of the features where the best features come first.  This
+              function does this by computing the distance between the centroid of the Class1
+              samples and the centroid of the Class2 samples in kernel defined feature space.
+              Good features are then ones that result in the biggest separation between
+              the two centroids of Class1 and Class2.
+            - Uses the kc object to compute the centroids of the two classes
+            - returns a ranking matrix R where:
+                - R.nr() == num_features
+                - R.nc() == 2
+                - R(i,0) == the index of the ith best feature according to our ranking.
+                  (e.g. samples(n)(R(0,0)) is the best feature from samples(n) and
+                   samples(n)(R(1,0)) is the second best, samples(n)(R(2,0)) the
+                   third best and so on)
+                - R(i,1) == a number that indicates how much separation exists between
+                  the two centroids when features 0 through i are used.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename sample_matrix_type,
+        typename label_matrix_type
+        >
+    double find_gamma_with_big_centroid_gap (
+        const sample_matrix_type& samples,
+        const label_matrix_type& labels,
+        double initial_gamma = 0.1,
+        unsigned long num_sv = 40
+    );
+    /*!
+        requires
+            - initial_gamma > 0
+            - num_sv > 0
+            - is_binary_classification_problem(samples, labels) == true
+        ensures
+            - This is a function that tries to pick a reasonable default value for the gamma
+              parameter of the radial_basis_kernel.  It picks the parameter that gives the
+              largest separation between the centroids, in kernel feature space, of two classes
+              of data.  It does this using the kcentroid object and it sets the kcentroid up
+              to use num_sv dictionary vectors.
+            - This function does a search for the best gamma and the search starts with
+              the value given by initial_gamma.  Better initial guesses will give
+              better results since the routine may get stuck in a local optimum.
+            - returns the value of gamma that results in the largest separation.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename sample_matrix_type,
+        typename label_matrix_type
+        >
+    double verbose_find_gamma_with_big_centroid_gap (
+        const sample_matrix_type& samples,
+        const label_matrix_type& labels,
+        double initial_gamma = 0.1,
+        unsigned long num_sv = 40
+    );
+    /*!
+        requires
+            - initial_gamma > 0
+            - num_sv > 0
+            - is_binary_classification_problem(samples, labels) == true
+        ensures
+            - This function does exactly the same thing as the above find_gamma_with_big_centroid_gap()
+              except that it is also verbose in the sense that it will print status messages to
+              standard output during its processing.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename vector_type
+        >
+    double compute_mean_squared_distance (
+        const vector_type& samples
+    );
+    /*!
+        requires
+            - vector_type is something with an interface compatible with std::vector.
+              Additionally, it must in turn contain dlib::matrix types which contain
+              scalars such as float or double values.
+            - for all valid i: is_vector(samples[i]) == true
+        ensures
+            - computes the average value of the squared distances between all
+              pairs of elements of samples.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_KERNEL_FEATURE_RANKINg_ABSTRACT_H_
+
+
+