Merging upstream version 1.44.3.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-03-09 13:19:48 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-03-09 13:20:02 +0000
commit: 58daab21cd043e1dc37024a7f99b396788372918 (patch)
tree: 96771e43bb69f7c1c2b0b4f7374cb74d7866d0cb /ml/dlib/dlib/svm/kkmeans_abstract.h
parent: Releasing debian version 1.43.2-1. (diff)
download: netdata-58daab21cd043e1dc37024a7f99b396788372918.tar.xz
netdata-58daab21cd043e1dc37024a7f99b396788372918.zip
1 files changed, 365 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/kkmeans_abstract.h b/ml/dlib/dlib/svm/kkmeans_abstract.h
new file mode 100644
index 000000000..9f9d7ccce
--- /dev/null
+++ b/ml/dlib/dlib/svm/kkmeans_abstract.h
@@ -0,0 +1,365 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_KKMEANs_ABSTRACT_
+#ifdef DLIB_KKMEANs_ABSTRACT_
+
+#include <cmath>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+#include "../serialize.h"
+#include "kernel_abstract.h"
+#include "kcentroid_abstract.h"
+#include "../noncopyable.h"
+
+namespace dlib
+{
+
+    template <
+        typename kernel_type 
+        >
+    class kkmeans : public noncopyable
+    {
+        /*!
+            REQUIREMENTS ON kernel_type
+                is a kernel function object as defined in dlib/svm/kernel_abstract.h 
+
+            INITIAL VALUE
+                - number_of_centers() == 1
+                - get_min_change() == 0.01
+
+            WHAT THIS OBJECT REPRESENTS
+                This is an implementation of a kernelized k-means clustering algorithm.  
+                It performs k-means clustering by using the kcentroid object.  
+        !*/
+
+    public:
+        typedef typename kernel_type::scalar_type scalar_type;
+        typedef typename kernel_type::sample_type sample_type;
+        typedef typename kernel_type::mem_manager_type mem_manager_type;
+
+        kkmeans (
+            const kcentroid<kernel_type>& kc_ 
+        );
+        /*!
+            ensures
+                - #number_of_centers() == 1
+                - #get_min_change() == 0.01
+                - #get_kcentroid(0) == a copy of kc_
+        !*/
+
+        ~kkmeans(
+        );
+        /*!
+            ensures
+                - all resources associated with *this have been released
+        !*/
+
+        void set_kcentroid (
+            const kcentroid<kernel_type>& kc_
+        );
+        /*!
+            ensures
+                - for all idx:  
+                    - #get_kcentroid(idx) == a copy of kc_
+        !*/
+
+        const kcentroid<kernel_type>& get_kcentroid (
+            unsigned long i
+        ) const;
+        /*!
+            requires
+                - i < number_of_centers()
+            ensures
+                - returns a const reference to the ith kcentroid object contained in
+                  this object.  Each kcentroid represents one of the centers found
+                  by the k-means clustering algorithm.
+        !*/
+
+        const kernel_type& get_kernel (
+        ) const;
+        /*!
+            ensures
+                - returns a const reference to the kernel used by this object
+        !*/
+
+        void set_number_of_centers (
+            unsigned long num
+        );
+        /*!
+            requires
+                - num > 0
+            ensures
+                - #number_of_centers() == num
+        !*/
+
+        unsigned long number_of_centers (
+        ) const;
+        /*!
+            ensures
+                - returns the number of centers used in this instance of the k-means clustering
+                  algorithm.
+        !*/
+
+        template <
+            typename matrix_type,
+            typename matrix_type2
+            >
+        void train (
+            const matrix_type& samples,
+            const matrix_type2& initial_centers,
+            long max_iter = 1000
+        );
+        /*!
+            requires
+                - matrix_type and matrix_type2 must either be dlib::matrix objects or convertible to dlib::matrix
+                  via mat()
+                - matrix_type::type == sample_type  (i.e. matrix_type should contain sample_type objects)
+                - matrix_type2::type == sample_type (i.e. matrix_type2 should contain sample_type objects)
+                - initial_centers.nc() == 1         (i.e. must be a column vector)
+                - samples.nc() == 1                 (i.e. must be a column vector)
+                - initial_centers.nr() == number_of_centers()
+            ensures
+                - performs k-means clustering of the given set of samples.  The initial center points
+                  are taken from the initial_centers argument.
+                - loops over the data and continues to refine the clustering until either less than 
+                  get_min_change() fraction of the data points change clusters or we have done max_iter 
+                  iterations over the data.
+                - After this function finishes you can call the operator() function below
+                  to determine which centroid a given sample is closest to.
+        !*/
+
+        unsigned long operator() (
+            const sample_type& sample
+        ) const;
+        /*!
+            ensures
+                - returns a number idx such that:
+                    - idx < number_of_centers()
+                    - get_kcentroid(idx) == the centroid that is closest to the given
+                      sample.
+        !*/
+
+        void set_min_change (
+            scalar_type min_change
+        );
+        /*!
+            requires
+                - 0 <= min_change < 1
+            ensures
+                - #get_min_change() == min_change
+        !*/
+
+        const scalar_type get_min_change (
+        ) const;
+        /*!
+            ensures
+                - returns the minimum fraction of data points that need to change
+                  centers in an iteration of kmeans for the algorithm to keep going.
+        !*/
+
+        void swap (
+            kkmeans& item
+        );
+        /*!
+            ensures
+                - swaps *this and item
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename kernel_type
+        >
+    void swap(
+        kkmeans<kernel_type>& a, 
+        kkmeans<kernel_type>& b
+    ) { a.swap(b); }
+    /*!
+        provides a global swap function
+    !*/
+
+    template <
+        typename kernel_type
+        >
+    void serialize (
+        const kkmeans<kernel_type>& item,
+        std::ostream& out
+    );
+    /*!
+        provides serialization support for kkmeans objects
+    !*/
+
+    template <
+        typename kernel_type 
+        >
+    void deserialize (
+        kkmeans<kernel_type>& item,
+        std::istream& in 
+    );
+    /*!
+        provides serialization support for kkmeans objects
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename vector_type1, 
+        typename vector_type2, 
+        typename kernel_type
+        >
+    void pick_initial_centers(
+        long num_centers, 
+        vector_type1& centers, 
+        const vector_type2& samples, 
+        const kernel_type& k, 
+        double percentile = 0.01
+    );
+    /*!
+        requires
+            - num_centers > 1
+            - 0 <= percentile < 1
+            - samples.size() > 1
+            - vector_type1 == something with an interface compatible with std::vector
+            - vector_type2 == something with an interface compatible with std::vector
+            - k(samples[0],samples[0]) must be a valid expression that returns a double
+            - both centers and samples must be able to contain kernel_type::sample_type 
+              objects
+        ensures
+            - finds num_centers candidate cluster centers in the data in the samples 
+              vector.  Assumes that k is the kernel that will be used during clustering 
+              to define the space in which clustering occurs.
+            - The centers are found by looking for points that are far away from other 
+              candidate centers.  However, if the data is noisy you probably want to 
+              ignore the farthest way points since they will be outliers.  To do this 
+              set percentile to the fraction of outliers you expect the data to contain.
+            - #centers.size() == num_centers
+            - #centers == a vector containing the candidate centers found
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename vector_type1, 
+        typename vector_type2
+        >
+    void pick_initial_centers(
+        long num_centers, 
+        vector_type1& centers, 
+        const vector_type2& samples, 
+        double percentile = 0.01
+    );
+    /*!
+        requires
+            - num_centers > 1
+            - 0 <= percentile < 1
+            - samples.size() > 1
+            - vector_type1 == something with an interface compatible with std::vector
+            - vector_type2 == something with an interface compatible with std::vector
+            - Both centers and samples must be able to contain dlib::matrix based row or
+              column vectors.
+        ensures
+            - performs: pick_initial_centers(num_centers, centers, samples, linear_kernel<sample_type>(), percentile)
+              (i.e. this function is simply an overload that uses the linear kernel.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename array_type, 
+        typename sample_type,
+        typename alloc
+        >
+    void find_clusters_using_kmeans (
+        const array_type& samples,
+        std::vector<sample_type, alloc>& centers,
+        unsigned long max_iter = 1000
+    );
+    /*!
+        requires
+            - samples.size() > 0
+            - samples == a bunch of row or column vectors and they all must be of the
+              same length.
+            - centers.size() > 0
+            - array_type == something with an interface compatible with std::vector
+              and it must contain row or column vectors capable of being stored in 
+              sample_type objects.
+            - sample_type == a dlib::matrix capable of representing vectors
+        ensures
+            - performs regular old linear kmeans clustering on the samples.  The clustering
+              begins with the initial set of centers given as an argument to this function.
+              When it finishes #centers will contain the resulting centers.
+            - no more than max_iter iterations will be performed before this function
+              terminates.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename array_type, 
+        typename sample_type,
+        typename alloc
+        >
+    void find_clusters_using_angular_kmeans (
+        const array_type& samples,
+        std::vector<sample_type, alloc>& centers,
+        unsigned long max_iter = 1000
+    );
+    /*!
+        requires
+            - samples.size() > 0
+            - samples == a bunch of row or column vectors and they all must be of the
+              same length.
+            - centers.size() > 0
+            - array_type == something with an interface compatible with std::vector
+              and it must contain row or column vectors capable of being stored in 
+              sample_type objects.
+            - sample_type == a dlib::matrix capable of representing vectors
+        ensures
+            - performs linear kmeans clustering on the samples, except instead of using
+              Euclidean distance to compare samples to the centers it uses the angle
+              between a sample and a center (with respect to the origin).  So we try to
+              cluster samples together if they have small angles with respect to each
+              other. The clustering begins with the initial set of centers given as an
+              argument to this function.  When it finishes #centers will contain the
+              resulting centers.
+            - for all valid i:
+                - length(#centers[i]) == 1
+                  (i.e. the output centers are scaled to be unit vectors since their
+                  magnitude is irrelevant.  Moreover, this makes it so you can use
+                  functions like nearest_center() with #centers to find the cluster
+                  assignments.)
+            - No more than max_iter iterations will be performed before this function
+              terminates.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename array_type, 
+        typename EXP 
+        >
+    unsigned long nearest_center (
+        const array_type& centers,
+        const matrix_exp<EXP>& sample
+    );
+    /*!
+        requires
+            - centers.size() > 0
+            - sample.size() > 0
+            - is_vector(sample) == true
+            - centers must be an array of vectors such that the following expression is
+              valid: length_squared(sample - centers[0]).  (e.g. centers could be a
+              std::vector of matrix objects holding column vectors).
+        ensures
+            - returns the index that identifies the element of centers that is nearest to
+              sample.  That is, returns a number IDX such that centers[IDX] is the element
+              of centers that minimizes length(centers[IDX]-sample).
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_KKMEANs_ABSTRACT_
+
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-03-09 13:19:48 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-03-09 13:20:02 +0000
commit	58daab21cd043e1dc37024a7f99b396788372918 (patch)
tree	96771e43bb69f7c1c2b0b4f7374cb74d7866d0cb /ml/dlib/dlib/svm/kkmeans_abstract.h
parent	Releasing debian version 1.43.2-1. (diff)
download	netdata-58daab21cd043e1dc37024a7f99b396788372918.tar.xz netdata-58daab21cd043e1dc37024a7f99b396788372918.zip