diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:19:48 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:20:02 +0000 |
commit | 58daab21cd043e1dc37024a7f99b396788372918 (patch) | |
tree | 96771e43bb69f7c1c2b0b4f7374cb74d7866d0cb /ml/dlib/dlib/svm/kkmeans_abstract.h | |
parent | Releasing debian version 1.43.2-1. (diff) | |
download | netdata-58daab21cd043e1dc37024a7f99b396788372918.tar.xz netdata-58daab21cd043e1dc37024a7f99b396788372918.zip |
Merging upstream version 1.44.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ml/dlib/dlib/svm/kkmeans_abstract.h')
-rw-r--r-- | ml/dlib/dlib/svm/kkmeans_abstract.h | 365 |
1 files changed, 365 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/kkmeans_abstract.h b/ml/dlib/dlib/svm/kkmeans_abstract.h new file mode 100644 index 000000000..9f9d7ccce --- /dev/null +++ b/ml/dlib/dlib/svm/kkmeans_abstract.h @@ -0,0 +1,365 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_KKMEANs_ABSTRACT_ +#ifdef DLIB_KKMEANs_ABSTRACT_ + +#include <cmath> +#include "../matrix/matrix_abstract.h" +#include "../algs.h" +#include "../serialize.h" +#include "kernel_abstract.h" +#include "kcentroid_abstract.h" +#include "../noncopyable.h" + +namespace dlib +{ + + template < + typename kernel_type + > + class kkmeans : public noncopyable + { + /*! + REQUIREMENTS ON kernel_type + is a kernel function object as defined in dlib/svm/kernel_abstract.h + + INITIAL VALUE + - number_of_centers() == 1 + - get_min_change() == 0.01 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of a kernelized k-means clustering algorithm. + It performs k-means clustering by using the kcentroid object. + !*/ + + public: + typedef typename kernel_type::scalar_type scalar_type; + typedef typename kernel_type::sample_type sample_type; + typedef typename kernel_type::mem_manager_type mem_manager_type; + + kkmeans ( + const kcentroid<kernel_type>& kc_ + ); + /*! + ensures + - #number_of_centers() == 1 + - #get_min_change() == 0.01 + - #get_kcentroid(0) == a copy of kc_ + !*/ + + ~kkmeans( + ); + /*! + ensures + - all resources associated with *this have been released + !*/ + + void set_kcentroid ( + const kcentroid<kernel_type>& kc_ + ); + /*! + ensures + - for all idx: + - #get_kcentroid(idx) == a copy of kc_ + !*/ + + const kcentroid<kernel_type>& get_kcentroid ( + unsigned long i + ) const; + /*! + requires + - i < number_of_centers() + ensures + - returns a const reference to the ith kcentroid object contained in + this object. Each kcentroid represents one of the centers found + by the k-means clustering algorithm. + !*/ + + const kernel_type& get_kernel ( + ) const; + /*! + ensures + - returns a const reference to the kernel used by this object + !*/ + + void set_number_of_centers ( + unsigned long num + ); + /*! + requires + - num > 0 + ensures + - #number_of_centers() == num + !*/ + + unsigned long number_of_centers ( + ) const; + /*! + ensures + - returns the number of centers used in this instance of the k-means clustering + algorithm. + !*/ + + template < + typename matrix_type, + typename matrix_type2 + > + void train ( + const matrix_type& samples, + const matrix_type2& initial_centers, + long max_iter = 1000 + ); + /*! + requires + - matrix_type and matrix_type2 must either be dlib::matrix objects or convertible to dlib::matrix + via mat() + - matrix_type::type == sample_type (i.e. matrix_type should contain sample_type objects) + - matrix_type2::type == sample_type (i.e. matrix_type2 should contain sample_type objects) + - initial_centers.nc() == 1 (i.e. must be a column vector) + - samples.nc() == 1 (i.e. must be a column vector) + - initial_centers.nr() == number_of_centers() + ensures + - performs k-means clustering of the given set of samples. The initial center points + are taken from the initial_centers argument. + - loops over the data and continues to refine the clustering until either less than + get_min_change() fraction of the data points change clusters or we have done max_iter + iterations over the data. + - After this function finishes you can call the operator() function below + to determine which centroid a given sample is closest to. + !*/ + + unsigned long operator() ( + const sample_type& sample + ) const; + /*! + ensures + - returns a number idx such that: + - idx < number_of_centers() + - get_kcentroid(idx) == the centroid that is closest to the given + sample. + !*/ + + void set_min_change ( + scalar_type min_change + ); + /*! + requires + - 0 <= min_change < 1 + ensures + - #get_min_change() == min_change + !*/ + + const scalar_type get_min_change ( + ) const; + /*! + ensures + - returns the minimum fraction of data points that need to change + centers in an iteration of kmeans for the algorithm to keep going. + !*/ + + void swap ( + kkmeans& item + ); + /*! + ensures + - swaps *this and item + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename kernel_type + > + void swap( + kkmeans<kernel_type>& a, + kkmeans<kernel_type>& b + ) { a.swap(b); } + /*! + provides a global swap function + !*/ + + template < + typename kernel_type + > + void serialize ( + const kkmeans<kernel_type>& item, + std::ostream& out + ); + /*! + provides serialization support for kkmeans objects + !*/ + + template < + typename kernel_type + > + void deserialize ( + kkmeans<kernel_type>& item, + std::istream& in + ); + /*! + provides serialization support for kkmeans objects + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type1, + typename vector_type2, + typename kernel_type + > + void pick_initial_centers( + long num_centers, + vector_type1& centers, + const vector_type2& samples, + const kernel_type& k, + double percentile = 0.01 + ); + /*! + requires + - num_centers > 1 + - 0 <= percentile < 1 + - samples.size() > 1 + - vector_type1 == something with an interface compatible with std::vector + - vector_type2 == something with an interface compatible with std::vector + - k(samples[0],samples[0]) must be a valid expression that returns a double + - both centers and samples must be able to contain kernel_type::sample_type + objects + ensures + - finds num_centers candidate cluster centers in the data in the samples + vector. Assumes that k is the kernel that will be used during clustering + to define the space in which clustering occurs. + - The centers are found by looking for points that are far away from other + candidate centers. However, if the data is noisy you probably want to + ignore the farthest way points since they will be outliers. To do this + set percentile to the fraction of outliers you expect the data to contain. + - #centers.size() == num_centers + - #centers == a vector containing the candidate centers found + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type1, + typename vector_type2 + > + void pick_initial_centers( + long num_centers, + vector_type1& centers, + const vector_type2& samples, + double percentile = 0.01 + ); + /*! + requires + - num_centers > 1 + - 0 <= percentile < 1 + - samples.size() > 1 + - vector_type1 == something with an interface compatible with std::vector + - vector_type2 == something with an interface compatible with std::vector + - Both centers and samples must be able to contain dlib::matrix based row or + column vectors. + ensures + - performs: pick_initial_centers(num_centers, centers, samples, linear_kernel<sample_type>(), percentile) + (i.e. this function is simply an overload that uses the linear kernel. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename array_type, + typename sample_type, + typename alloc + > + void find_clusters_using_kmeans ( + const array_type& samples, + std::vector<sample_type, alloc>& centers, + unsigned long max_iter = 1000 + ); + /*! + requires + - samples.size() > 0 + - samples == a bunch of row or column vectors and they all must be of the + same length. + - centers.size() > 0 + - array_type == something with an interface compatible with std::vector + and it must contain row or column vectors capable of being stored in + sample_type objects. + - sample_type == a dlib::matrix capable of representing vectors + ensures + - performs regular old linear kmeans clustering on the samples. The clustering + begins with the initial set of centers given as an argument to this function. + When it finishes #centers will contain the resulting centers. + - no more than max_iter iterations will be performed before this function + terminates. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename array_type, + typename sample_type, + typename alloc + > + void find_clusters_using_angular_kmeans ( + const array_type& samples, + std::vector<sample_type, alloc>& centers, + unsigned long max_iter = 1000 + ); + /*! + requires + - samples.size() > 0 + - samples == a bunch of row or column vectors and they all must be of the + same length. + - centers.size() > 0 + - array_type == something with an interface compatible with std::vector + and it must contain row or column vectors capable of being stored in + sample_type objects. + - sample_type == a dlib::matrix capable of representing vectors + ensures + - performs linear kmeans clustering on the samples, except instead of using + Euclidean distance to compare samples to the centers it uses the angle + between a sample and a center (with respect to the origin). So we try to + cluster samples together if they have small angles with respect to each + other. The clustering begins with the initial set of centers given as an + argument to this function. When it finishes #centers will contain the + resulting centers. + - for all valid i: + - length(#centers[i]) == 1 + (i.e. the output centers are scaled to be unit vectors since their + magnitude is irrelevant. Moreover, this makes it so you can use + functions like nearest_center() with #centers to find the cluster + assignments.) + - No more than max_iter iterations will be performed before this function + terminates. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename array_type, + typename EXP + > + unsigned long nearest_center ( + const array_type& centers, + const matrix_exp<EXP>& sample + ); + /*! + requires + - centers.size() > 0 + - sample.size() > 0 + - is_vector(sample) == true + - centers must be an array of vectors such that the following expression is + valid: length_squared(sample - centers[0]). (e.g. centers could be a + std::vector of matrix objects holding column vectors). + ensures + - returns the index that identifies the element of centers that is nearest to + sample. That is, returns a number IDX such that centers[IDX] is the element + of centers that minimizes length(centers[IDX]-sample). + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_KKMEANs_ABSTRACT_ + |