summaryrefslogtreecommitdiffstats
path: root/ml/dlib/dlib/svm/kkmeans_abstract.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-03-09 13:19:48 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-03-09 13:20:02 +0000
commit58daab21cd043e1dc37024a7f99b396788372918 (patch)
tree96771e43bb69f7c1c2b0b4f7374cb74d7866d0cb /ml/dlib/dlib/svm/kkmeans_abstract.h
parentReleasing debian version 1.43.2-1. (diff)
downloadnetdata-58daab21cd043e1dc37024a7f99b396788372918.tar.xz
netdata-58daab21cd043e1dc37024a7f99b396788372918.zip
Merging upstream version 1.44.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ml/dlib/dlib/svm/kkmeans_abstract.h')
-rw-r--r--ml/dlib/dlib/svm/kkmeans_abstract.h365
1 files changed, 365 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/kkmeans_abstract.h b/ml/dlib/dlib/svm/kkmeans_abstract.h
new file mode 100644
index 000000000..9f9d7ccce
--- /dev/null
+++ b/ml/dlib/dlib/svm/kkmeans_abstract.h
@@ -0,0 +1,365 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_KKMEANs_ABSTRACT_
+#ifdef DLIB_KKMEANs_ABSTRACT_
+
+#include <cmath>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+#include "../serialize.h"
+#include "kernel_abstract.h"
+#include "kcentroid_abstract.h"
+#include "../noncopyable.h"
+
+namespace dlib
+{
+
+ template <
+ typename kernel_type
+ >
+ class kkmeans : public noncopyable
+ {
+ /*!
+ REQUIREMENTS ON kernel_type
+ is a kernel function object as defined in dlib/svm/kernel_abstract.h
+
+ INITIAL VALUE
+ - number_of_centers() == 1
+ - get_min_change() == 0.01
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of a kernelized k-means clustering algorithm.
+ It performs k-means clustering by using the kcentroid object.
+ !*/
+
+ public:
+ typedef typename kernel_type::scalar_type scalar_type;
+ typedef typename kernel_type::sample_type sample_type;
+ typedef typename kernel_type::mem_manager_type mem_manager_type;
+
+ kkmeans (
+ const kcentroid<kernel_type>& kc_
+ );
+ /*!
+ ensures
+ - #number_of_centers() == 1
+ - #get_min_change() == 0.01
+ - #get_kcentroid(0) == a copy of kc_
+ !*/
+
+ ~kkmeans(
+ );
+ /*!
+ ensures
+ - all resources associated with *this have been released
+ !*/
+
+ void set_kcentroid (
+ const kcentroid<kernel_type>& kc_
+ );
+ /*!
+ ensures
+ - for all idx:
+ - #get_kcentroid(idx) == a copy of kc_
+ !*/
+
+ const kcentroid<kernel_type>& get_kcentroid (
+ unsigned long i
+ ) const;
+ /*!
+ requires
+ - i < number_of_centers()
+ ensures
+ - returns a const reference to the ith kcentroid object contained in
+ this object. Each kcentroid represents one of the centers found
+ by the k-means clustering algorithm.
+ !*/
+
+ const kernel_type& get_kernel (
+ ) const;
+ /*!
+ ensures
+ - returns a const reference to the kernel used by this object
+ !*/
+
+ void set_number_of_centers (
+ unsigned long num
+ );
+ /*!
+ requires
+ - num > 0
+ ensures
+ - #number_of_centers() == num
+ !*/
+
+ unsigned long number_of_centers (
+ ) const;
+ /*!
+ ensures
+ - returns the number of centers used in this instance of the k-means clustering
+ algorithm.
+ !*/
+
+ template <
+ typename matrix_type,
+ typename matrix_type2
+ >
+ void train (
+ const matrix_type& samples,
+ const matrix_type2& initial_centers,
+ long max_iter = 1000
+ );
+ /*!
+ requires
+ - matrix_type and matrix_type2 must either be dlib::matrix objects or convertible to dlib::matrix
+ via mat()
+ - matrix_type::type == sample_type (i.e. matrix_type should contain sample_type objects)
+ - matrix_type2::type == sample_type (i.e. matrix_type2 should contain sample_type objects)
+ - initial_centers.nc() == 1 (i.e. must be a column vector)
+ - samples.nc() == 1 (i.e. must be a column vector)
+ - initial_centers.nr() == number_of_centers()
+ ensures
+ - performs k-means clustering of the given set of samples. The initial center points
+ are taken from the initial_centers argument.
+ - loops over the data and continues to refine the clustering until either less than
+ get_min_change() fraction of the data points change clusters or we have done max_iter
+ iterations over the data.
+ - After this function finishes you can call the operator() function below
+ to determine which centroid a given sample is closest to.
+ !*/
+
+ unsigned long operator() (
+ const sample_type& sample
+ ) const;
+ /*!
+ ensures
+ - returns a number idx such that:
+ - idx < number_of_centers()
+ - get_kcentroid(idx) == the centroid that is closest to the given
+ sample.
+ !*/
+
+ void set_min_change (
+ scalar_type min_change
+ );
+ /*!
+ requires
+ - 0 <= min_change < 1
+ ensures
+ - #get_min_change() == min_change
+ !*/
+
+ const scalar_type get_min_change (
+ ) const;
+ /*!
+ ensures
+ - returns the minimum fraction of data points that need to change
+ centers in an iteration of kmeans for the algorithm to keep going.
+ !*/
+
+ void swap (
+ kkmeans& item
+ );
+ /*!
+ ensures
+ - swaps *this and item
+ !*/
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename kernel_type
+ >
+ void swap(
+ kkmeans<kernel_type>& a,
+ kkmeans<kernel_type>& b
+ ) { a.swap(b); }
+ /*!
+ provides a global swap function
+ !*/
+
+ template <
+ typename kernel_type
+ >
+ void serialize (
+ const kkmeans<kernel_type>& item,
+ std::ostream& out
+ );
+ /*!
+ provides serialization support for kkmeans objects
+ !*/
+
+ template <
+ typename kernel_type
+ >
+ void deserialize (
+ kkmeans<kernel_type>& item,
+ std::istream& in
+ );
+ /*!
+ provides serialization support for kkmeans objects
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type1,
+ typename vector_type2,
+ typename kernel_type
+ >
+ void pick_initial_centers(
+ long num_centers,
+ vector_type1& centers,
+ const vector_type2& samples,
+ const kernel_type& k,
+ double percentile = 0.01
+ );
+ /*!
+ requires
+ - num_centers > 1
+ - 0 <= percentile < 1
+ - samples.size() > 1
+ - vector_type1 == something with an interface compatible with std::vector
+ - vector_type2 == something with an interface compatible with std::vector
+ - k(samples[0],samples[0]) must be a valid expression that returns a double
+ - both centers and samples must be able to contain kernel_type::sample_type
+ objects
+ ensures
+ - finds num_centers candidate cluster centers in the data in the samples
+ vector. Assumes that k is the kernel that will be used during clustering
+ to define the space in which clustering occurs.
+ - The centers are found by looking for points that are far away from other
+ candidate centers. However, if the data is noisy you probably want to
+ ignore the farthest way points since they will be outliers. To do this
+ set percentile to the fraction of outliers you expect the data to contain.
+ - #centers.size() == num_centers
+ - #centers == a vector containing the candidate centers found
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type1,
+ typename vector_type2
+ >
+ void pick_initial_centers(
+ long num_centers,
+ vector_type1& centers,
+ const vector_type2& samples,
+ double percentile = 0.01
+ );
+ /*!
+ requires
+ - num_centers > 1
+ - 0 <= percentile < 1
+ - samples.size() > 1
+ - vector_type1 == something with an interface compatible with std::vector
+ - vector_type2 == something with an interface compatible with std::vector
+ - Both centers and samples must be able to contain dlib::matrix based row or
+ column vectors.
+ ensures
+ - performs: pick_initial_centers(num_centers, centers, samples, linear_kernel<sample_type>(), percentile)
+ (i.e. this function is simply an overload that uses the linear kernel.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename array_type,
+ typename sample_type,
+ typename alloc
+ >
+ void find_clusters_using_kmeans (
+ const array_type& samples,
+ std::vector<sample_type, alloc>& centers,
+ unsigned long max_iter = 1000
+ );
+ /*!
+ requires
+ - samples.size() > 0
+ - samples == a bunch of row or column vectors and they all must be of the
+ same length.
+ - centers.size() > 0
+ - array_type == something with an interface compatible with std::vector
+ and it must contain row or column vectors capable of being stored in
+ sample_type objects.
+ - sample_type == a dlib::matrix capable of representing vectors
+ ensures
+ - performs regular old linear kmeans clustering on the samples. The clustering
+ begins with the initial set of centers given as an argument to this function.
+ When it finishes #centers will contain the resulting centers.
+ - no more than max_iter iterations will be performed before this function
+ terminates.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename array_type,
+ typename sample_type,
+ typename alloc
+ >
+ void find_clusters_using_angular_kmeans (
+ const array_type& samples,
+ std::vector<sample_type, alloc>& centers,
+ unsigned long max_iter = 1000
+ );
+ /*!
+ requires
+ - samples.size() > 0
+ - samples == a bunch of row or column vectors and they all must be of the
+ same length.
+ - centers.size() > 0
+ - array_type == something with an interface compatible with std::vector
+ and it must contain row or column vectors capable of being stored in
+ sample_type objects.
+ - sample_type == a dlib::matrix capable of representing vectors
+ ensures
+ - performs linear kmeans clustering on the samples, except instead of using
+ Euclidean distance to compare samples to the centers it uses the angle
+ between a sample and a center (with respect to the origin). So we try to
+ cluster samples together if they have small angles with respect to each
+ other. The clustering begins with the initial set of centers given as an
+ argument to this function. When it finishes #centers will contain the
+ resulting centers.
+ - for all valid i:
+ - length(#centers[i]) == 1
+ (i.e. the output centers are scaled to be unit vectors since their
+ magnitude is irrelevant. Moreover, this makes it so you can use
+ functions like nearest_center() with #centers to find the cluster
+ assignments.)
+ - No more than max_iter iterations will be performed before this function
+ terminates.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename array_type,
+ typename EXP
+ >
+ unsigned long nearest_center (
+ const array_type& centers,
+ const matrix_exp<EXP>& sample
+ );
+ /*!
+ requires
+ - centers.size() > 0
+ - sample.size() > 0
+ - is_vector(sample) == true
+ - centers must be an array of vectors such that the following expression is
+ valid: length_squared(sample - centers[0]). (e.g. centers could be a
+ std::vector of matrix objects holding column vectors).
+ ensures
+ - returns the index that identifies the element of centers that is nearest to
+ sample. That is, returns a number IDX such that centers[IDX] is the element
+ of centers that minimizes length(centers[IDX]-sample).
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_KKMEANs_ABSTRACT_
+