author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-05 12:08:03 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-05 12:08:18 +0000
commit    5da14042f70711ea5cf66e034699730335462f66 (patch)
tree      0f6354ccac934ed87a2d555f45be4c831cf92f4a /src/ml/dlib/examples/linear_manifold_regularizer_ex.cpp
parent    Releasing debian version 1.44.3-2. (diff)
Merging upstream version 1.45.3+dfsg.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/ml/dlib/examples/linear_manifold_regularizer_ex.cpp')
-rw-r--r--  src/ml/dlib/examples/linear_manifold_regularizer_ex.cpp  284
1 file changed, 284 insertions, 0 deletions
diff --git a/src/ml/dlib/examples/linear_manifold_regularizer_ex.cpp b/src/ml/dlib/examples/linear_manifold_regularizer_ex.cpp
new file mode 100644
index 000000000..9c6f10f26
--- /dev/null
+++ b/src/ml/dlib/examples/linear_manifold_regularizer_ex.cpp
@@ -0,0 +1,284 @@
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+
+ This is an example illustrating the use of the linear_manifold_regularizer
+ and empirical_kernel_map from the dlib C++ Library.
+
+ This example program assumes you are familiar with some general elements of
+ the library. In particular, you should have at least read the svm_ex.cpp
+ and matrix_ex.cpp examples. You should also have read the empirical_kernel_map_ex.cpp
+ example program as the present example builds upon it.
+
+
+
+ This program shows an example of what is called semi-supervised learning.
+ That is, a small amount of labeled data is augmented with a large amount
+ of unlabeled data. A learning algorithm is then run on all the data
+ and the hope is that by including the unlabeled data we will end up with
+ a better result.
+
+
+ In this particular example we will generate 200,000 sample points of
+ unlabeled data along with 2 samples of labeled data. The sample points
+ will be drawn randomly from two concentric circles. One labeled data
+ point will be drawn from each circle. The goal is to learn to
+ correctly separate the two circles using only the 2 labeled points
+ and the unlabeled data.
+
+ To do this we will first run an approximate form of k nearest neighbors
+ to determine which of the unlabeled samples are closest together. We will
+ then make the manifold assumption, that is, we will assume that points close
+ to each other should share the same classification label.
+
+ Once we have determined which points are near neighbors we will use the
+ empirical_kernel_map and linear_manifold_regularizer to transform all the
+ data points into a new vector space where any linear rule will have similar
+ output for points which we have decided are near neighbors.
+
+ Finally, we will classify all the unlabeled data according to which of
+ the two labeled points is nearest. Normally this would not work, but by
+ using the manifold assumption we will be able to successfully classify
+ all the unlabeled data.
+
+
+
+ For further information on this subject you should begin with the following
+ paper as it discusses a very similar application of manifold regularization.
+
+ Beyond the Point Cloud: from Transductive to Semi-supervised Learning
+ by Vikas Sindhwani, Partha Niyogi, and Mikhail Belkin
+
+
+
+
+ ******** SAMPLE PROGRAM OUTPUT ********
+
+ Testing manifold regularization with an intrinsic_regularization_strength of 0.
+ number of edges generated: 49998
+ Running simple test...
+ error: 0.37022
+ error: 0.44036
+ error: 0.376715
+ error: 0.307545
+ error: 0.463455
+ error: 0.426065
+ error: 0.416155
+ error: 0.288295
+ error: 0.400115
+ error: 0.46347
+
+ Testing manifold regularization with an intrinsic_regularization_strength of 10000.
+ number of edges generated: 49998
+ Running simple test...
+ error: 0
+ error: 0
+ error: 0
+ error: 0
+ error: 0
+ error: 0
+ error: 0
+ error: 0
+ error: 0
+ error: 0
+
+
+*/
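+
+// (A hedged build note, not part of the original file: dlib examples can
+//  typically be compiled from the dlib examples folder with something like
+//      g++ -std=c++14 -O3 -I.. ../dlib/all/source.cpp -lpthread -lX11 linear_manifold_regularizer_ex.cpp
+//  or via the CMakeLists.txt that ships with dlib.  The relative paths here
+//  are assumptions about where the file sits in the dlib source tree.)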
+
+#include <dlib/manifold_regularization.h>
+#include <dlib/svm.h>
+#include <dlib/rand.h>
+#include <dlib/statistics.h>
+#include <iostream>
+#include <vector>
+#include <ctime>
+
+
+using namespace std;
+using namespace dlib;
+
+// ----------------------------------------------------------------------------------------
+
+// First let's make a typedef for the kind of samples we will be using.
+typedef matrix<double, 0, 1> sample_type;
+
+// We will be using the radial_basis_kernel in this example program.
+typedef radial_basis_kernel<sample_type> kernel_type;
+
+// ----------------------------------------------------------------------------------------
+
+void generate_circle (
+    std::vector<sample_type>& samples,
+    double radius,
+    const long num
+);
+/*!
+    requires
+        - num > 0
+        - radius > 0
+    ensures
+        - generates num points lying on a circle of the given radius centered
+          at (0,0) and appends them to the given samples vector.
+!*/
+
+// ----------------------------------------------------------------------------------------
+
+void test_manifold_regularization (
+    const double intrinsic_regularization_strength
+);
+/*!
+    ensures
+        - Runs an example test using the linear_manifold_regularizer with the given
+          intrinsic_regularization_strength.
+!*/
+
+// ----------------------------------------------------------------------------------------
+
+int main()
+{
+    // Run the test without any manifold regularization.
+    test_manifold_regularization(0);
+
+    // Run the test with manifold regularization.  You can think of this number as
+    // a measure of how much we trust the manifold assumption.  So if you are really
+    // confident that you can select neighboring points which should have the same
+    // classification then make this number big.
+    test_manifold_regularization(10000.0);
+}
+
+// ----------------------------------------------------------------------------------------
+
+void test_manifold_regularization (
+    const double intrinsic_regularization_strength
+)
+{
+    cout << "Testing manifold regularization with an intrinsic_regularization_strength of "
+         << intrinsic_regularization_strength << ".\n";
+
+    std::vector<sample_type> samples;
+
+    // Declare an instance of the kernel we will be using.
+    const kernel_type kern(0.1);
+
+    const unsigned long num_points = 100000;
+
+    // Create a large dataset with two concentric circles.  There will be 100000
+    // points on each circle for a total of 200000 samples.
+    generate_circle(samples, 2, num_points); // circle of radius 2
+    generate_circle(samples, 4, num_points); // circle of radius 4
+
+    // Create a set of sample_pairs that tells us which samples are "close" and should thus
+    // be classified similarly.  These edges will be used to define the manifold regularizer.
+    // To find these edges we use a simple function that samples point pairs randomly and
+    // keeps the 5% of pairs with the shortest distances between them.
+    std::vector<sample_pair> edges;
+    find_percent_shortest_edges_randomly(samples, squared_euclidean_distance(), 0.05, 1000000, time(0), edges);
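+
+    // A hedged aside: on a smaller dataset an exact k-nearest-neighbor graph
+    // could be built instead with dlib's find_k_nearest_neighbors(), e.g.:
+    //     find_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, edges);
+    // The randomized search above is used because examining every pair of our
+    // 200,000 samples would be far too slow.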
+
+ cout << "number of edges generated: " << edges.size() << endl;
+
+ empirical_kernel_map<kernel_type> ekm;
+
+ // Since the circles are not linearly separable we will use an empirical kernel map to
+ // map them into a space where they are separable. We create an empirical_kernel_map
+ // using a random subset of our data samples as basis samples. Note, however, that even
+ // though the circles are linearly separable in this new space given by the empirical_kernel_map
+ // we still won't be able to correctly classify all the points given just the 2 labeled examples.
+ // We will need to make use of the nearest neighbor information stored in edges. To do that
+ // we will use the linear_manifold_regularizer.
+ ekm.load(kern, randomly_subsample(samples, 50));
+
+ // Project all the samples into the span of our 50 basis samples
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ samples[i] = ekm.project(samples[i]);
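+    // (After this loop each sample is a vector with ekm.out_vector_size()
+    //  dimensions, at most 50 here, one per basis sample retained by the
+    //  empirical_kernel_map.)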
+
+
+    // Now create the manifold regularizer.  The result is a transformation matrix that
+    // embodies the manifold assumption discussed above.
+    linear_manifold_regularizer<sample_type> lmr;
+    // use_gaussian_weights is a function object that tells lmr how to weight each edge.  In this
+    // case we let the weight decay as edges get longer.  So shorter edges are more important than
+    // longer edges.
+    lmr.build(samples, edges, use_gaussian_weights(0.1));
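+    // (A hedged note: use_gaussian_weights(gamma) is understood here to give
+    //  each edge a weight of roughly exp(-gamma*distance).  Since the distances
+    //  above came from squared_euclidean_distance, the weights decay with the
+    //  squared Euclidean length of each edge.)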
+    const matrix<double> T = lmr.get_transformation_matrix(intrinsic_regularization_strength);
+
+    // Apply the transformation generated by the linear_manifold_regularizer to
+    // all our samples.
+    for (unsigned long i = 0; i < samples.size(); ++i)
+        samples[i] = T*samples[i];
+
+
+    // For convenience, generate a projection_function and merge the transformation
+    // matrix T into it.  That is, we will have: proj(x) == T*ekm.project(x).
+    projection_function<kernel_type> proj = ekm.get_projection_function();
+    proj.weights = T*proj.weights;
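+
+    // A hedged aside, not part of the original example: with T merged in, a
+    // brand new 2-D point maps straight into the regularized space with one
+    // call.  For instance:
+    //     sample_type x(2,1);
+    //     x = 0, 2;                       // a point on the radius 2 circle
+    //     const sample_type z = proj(x);  // same result as T*ekm.project(x)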
+
+ cout << "Running simple test..." << endl;
+
+ // Pick 2 different labeled points. One on the inner circle and another on the outer.
+ // For each of these test points we will see if using the single plane that separates
+ // them is a good way to separate the concentric circles. We also do this a bunch
+ // of times with different randomly chosen points so we can see how robust the result is.
+ for (int itr = 0; itr < 10; ++itr)
+ {
+ std::vector<sample_type> test_points;
+ // generate a random point from the radius 2 circle
+ generate_circle(test_points, 2, 1);
+ // generate a random point from the radius 4 circle
+ generate_circle(test_points, 4, 1);
+
+ // project the two test points into kernel space. Recall that this projection_function
+ // has the manifold regularizer incorporated into it.
+ const sample_type class1_point = proj(test_points[0]);
+ const sample_type class2_point = proj(test_points[1]);
+
+ double num_wrong = 0;
+
+ // Now attempt to classify all the data samples according to which point
+ // they are closest to. The output of this program shows that without manifold
+ // regularization this test will fail but with it it will perfectly classify
+ // all the points.
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ double distance_to_class1 = length(samples[i] - class1_point);
+ double distance_to_class2 = length(samples[i] - class2_point);
+
+ bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);
+
+ bool really_is_class_1 = (i < num_points);
+
+ // now count how many times we make a mistake
+ if (predicted_as_class_1 != really_is_class_1)
+ ++num_wrong;
+ }
+
+ cout << "error: "<< num_wrong/samples.size() << endl;
+ }
+
+ cout << endl;
+}
+
+// ----------------------------------------------------------------------------------------
+
+dlib::rand rnd;
+
+void generate_circle (
+    std::vector<sample_type>& samples,
+    double radius,
+    const long num
+)
+{
+    sample_type m(2,1);
+
+    for (long i = 0; i < num; ++i)
+    {
+        double sign = 1;
+        if (rnd.get_random_double() < 0.5)
+            sign = -1;
+        m(0) = 2*radius*rnd.get_random_double()-radius;
+        m(1) = sign*sqrt(radius*radius - m(0)*m(0));
+
+        samples.push_back(m);
+    }
+}
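+
+// A hedged note on generate_circle: drawing m(0) uniformly and solving for the
+// matching m(1) puts every point exactly on the circle, but it concentrates
+// samples near x = +/-radius rather than spreading them uniformly by angle.
+// That is harmless for this example.  If a uniform-in-angle distribution were
+// wanted instead, a sketch (an alternative, not what this example uses, with
+// pi taken from dlib's numeric constants) would be:
+//
+//     const double theta = 2*pi*rnd.get_random_double();
+//     m(0) = radius*std::cos(theta);
+//     m(1) = radius*std::sin(theta);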
+
+// ----------------------------------------------------------------------------------------
+