summaryrefslogtreecommitdiffstats
path: root/ml/dlib/dlib/svm/kkmeans.h
diff options
context:
space:
mode:
Diffstat (limited to 'ml/dlib/dlib/svm/kkmeans.h')
-rw-r--r--ml/dlib/dlib/svm/kkmeans.h654
1 files changed, 654 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/kkmeans.h b/ml/dlib/dlib/svm/kkmeans.h
new file mode 100644
index 00000000..4c72106d
--- /dev/null
+++ b/ml/dlib/dlib/svm/kkmeans.h
@@ -0,0 +1,654 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_KKMEANs_
+#define DLIB_KKMEANs_
+
+#include <cmath>
+#include <vector>
+
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+#include "../serialize.h"
+#include "kernel.h"
+#include "../array.h"
+#include "kcentroid.h"
+#include "kkmeans_abstract.h"
+#include "../noncopyable.h"
+
+namespace dlib
+{
+
+ template <
+ typename kernel_type
+ >
+ class kkmeans : public noncopyable
+ {
+ public:
+ typedef typename kernel_type::scalar_type scalar_type;
+ typedef typename kernel_type::sample_type sample_type;
+ typedef typename kernel_type::mem_manager_type mem_manager_type;
+
+ kkmeans (
+ const kcentroid<kernel_type>& kc_
+ ):
+ kc(kc_),
+ min_change(0.01)
+ {
+ set_number_of_centers(1);
+ }
+
+ ~kkmeans()
+ {
+ }
+
+ const kernel_type& get_kernel (
+ ) const
+ {
+ return kc.get_kernel();
+ }
+
+ void set_kcentroid (
+ const kcentroid<kernel_type>& kc_
+ )
+ {
+ kc = kc_;
+ set_number_of_centers(number_of_centers());
+ }
+
+ const kcentroid<kernel_type>& get_kcentroid (
+ unsigned long i
+ ) const
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(i < number_of_centers(),
+ "\tkcentroid kkmeans::get_kcentroid(i)"
+ << "\n\tYou have given an invalid value for i"
+ << "\n\ti: " << i
+ << "\n\tnumber_of_centers(): " << number_of_centers()
+ << "\n\tthis: " << this
+ );
+
+ return *centers[i];
+ }
+
+ void set_number_of_centers (
+ unsigned long num
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(num > 0,
+ "\tvoid kkmeans::set_number_of_centers()"
+ << "\n\tYou can't set the number of centers to zero"
+ << "\n\tthis: " << this
+ );
+
+ centers.set_max_size(num);
+ centers.set_size(num);
+
+ for (unsigned long i = 0; i < centers.size(); ++i)
+ {
+ centers[i].reset(new kcentroid<kernel_type>(kc));
+ }
+ }
+
+ unsigned long number_of_centers (
+ ) const
+ {
+ return centers.size();
+ }
+
+ template <typename T, typename U>
+ void train (
+ const T& samples,
+ const U& initial_centers,
+ long max_iter = 1000
+ )
+ {
+ do_train(mat(samples),mat(initial_centers),max_iter);
+ }
+
+ unsigned long operator() (
+ const sample_type& sample
+ ) const
+ {
+ unsigned long label = 0;
+ scalar_type best_score = (*centers[0])(sample);
+
+ // figure out which center the given sample is closest too
+ for (unsigned long i = 1; i < centers.size(); ++i)
+ {
+ scalar_type temp = (*centers[i])(sample);
+ if (temp < best_score)
+ {
+ label = i;
+ best_score = temp;
+ }
+ }
+
+ return label;
+ }
+
+ void set_min_change (
+ scalar_type min_change_
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT( 0 <= min_change_ < 1,
+ "\tvoid kkmeans::set_min_change()"
+ << "\n\tInvalid arguments to this function"
+ << "\n\tthis: " << this
+ << "\n\tmin_change_: " << min_change_
+ );
+ min_change = min_change_;
+ }
+
+ const scalar_type get_min_change (
+ ) const
+ {
+ return min_change;
+ }
+
+ void swap (
+ kkmeans& item
+ )
+ {
+ centers.swap(item.centers);
+ kc.swap(item.kc);
+ assignments.swap(item.assignments);
+ exchange(min_change, item.min_change);
+ }
+
+ friend void serialize(const kkmeans& item, std::ostream& out)
+ {
+ serialize(item.centers.size(),out);
+ for (unsigned long i = 0; i < item.centers.size(); ++i)
+ {
+ serialize(*item.centers[i], out);
+ }
+ serialize(item.kc, out);
+ serialize(item.min_change, out);
+ }
+
+ friend void deserialize(kkmeans& item, std::istream& in)
+ {
+ unsigned long num;
+ deserialize(num, in);
+ item.centers.resize(num);
+ for (unsigned long i = 0; i < item.centers.size(); ++i)
+ {
+ std::unique_ptr<kcentroid<kernel_type> > temp(new kcentroid<kernel_type>(kernel_type()));
+ deserialize(*temp, in);
+ item.centers[i].swap(temp);
+ }
+
+ deserialize(item.kc, in);
+ deserialize(item.min_change, in);
+ }
+
+ private:
+
+ template <typename matrix_type, typename matrix_type2>
+ void do_train (
+ const matrix_type& samples,
+ const matrix_type2& initial_centers,
+ long max_iter = 1000
+ )
+ {
+ COMPILE_TIME_ASSERT((is_same_type<typename matrix_type::type, sample_type>::value));
+ COMPILE_TIME_ASSERT((is_same_type<typename matrix_type2::type, sample_type>::value));
+
+ // make sure requires clause is not broken
+ DLIB_ASSERT(samples.nc() == 1 && initial_centers.nc() == 1 &&
+ initial_centers.nr() == static_cast<long>(number_of_centers()),
+ "\tvoid kkmeans::train()"
+ << "\n\tInvalid arguments to this function"
+ << "\n\tthis: " << this
+ << "\n\tsamples.nc(): " << samples.nc()
+ << "\n\tinitial_centers.nc(): " << initial_centers.nc()
+ << "\n\tinitial_centers.nr(): " << initial_centers.nr()
+ );
+
+ // clear out the old data and initialize the centers
+ for (unsigned long i = 0; i < centers.size(); ++i)
+ {
+ centers[i]->clear_dictionary();
+ centers[i]->train(initial_centers(i));
+ }
+
+ assignments.resize(samples.size());
+
+ bool assignment_changed = true;
+
+ // loop until the centers stabilize
+ long count = 0;
+ const unsigned long min_num_change = static_cast<unsigned long>(min_change*samples.size());
+ unsigned long num_changed = min_num_change;
+ while (assignment_changed && count < max_iter && num_changed >= min_num_change)
+ {
+ ++count;
+ assignment_changed = false;
+ num_changed = 0;
+
+ // loop over all the samples and assign them to their closest centers
+ for (long i = 0; i < samples.size(); ++i)
+ {
+ // find the best center
+ unsigned long best_center = 0;
+ scalar_type best_score = (*centers[0])(samples(i));
+ for (unsigned long c = 1; c < centers.size(); ++c)
+ {
+ scalar_type temp = (*centers[c])(samples(i));
+ if (temp < best_score)
+ {
+ best_score = temp;
+ best_center = c;
+ }
+ }
+
+ // if the current sample changed centers then make note of that
+ if (assignments[i] != best_center)
+ {
+ assignments[i] = best_center;
+ assignment_changed = true;
+ ++num_changed;
+ }
+ }
+
+ if (assignment_changed)
+ {
+ // now clear out the old data
+ for (unsigned long i = 0; i < centers.size(); ++i)
+ centers[i]->clear_dictionary();
+
+ // recalculate the cluster centers
+ for (unsigned long i = 0; i < assignments.size(); ++i)
+ centers[assignments[i]]->train(samples(i));
+ }
+
+ }
+
+
+ }
+
+ array<std::unique_ptr<kcentroid<kernel_type> > > centers;
+ kcentroid<kernel_type> kc;
+ scalar_type min_change;
+
+ // temp variables
+ array<unsigned long> assignments;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename kernel_type>
+ void swap(kkmeans<kernel_type>& a, kkmeans<kernel_type>& b)
+ { a.swap(b); }
+
+// ----------------------------------------------------------------------------------------
+
+ struct dlib_pick_initial_centers_data
+ {
+ dlib_pick_initial_centers_data():idx(0), dist(std::numeric_limits<double>::infinity()){}
+ long idx;
+ double dist;
+ bool operator< (const dlib_pick_initial_centers_data& d) const { return dist < d.dist; }
+ };
+
+ template <
+ typename vector_type1,
+ typename vector_type2,
+ typename kernel_type
+ >
+ void pick_initial_centers(
+ long num_centers,
+ vector_type1& centers,
+ const vector_type2& samples,
+ const kernel_type& k,
+ double percentile = 0.01
+ )
+ {
+ /*
+ This function is basically just a non-randomized version of the kmeans++ algorithm
+ described in the paper:
+ kmeans++: The Advantages of Careful Seeding by Arthur and Vassilvitskii
+
+ */
+
+
+ // make sure requires clause is not broken
+ DLIB_ASSERT(num_centers > 1 && 0 <= percentile && percentile < 1 && samples.size() > 1,
+ "\tvoid pick_initial_centers()"
+ << "\n\tYou passed invalid arguments to this function"
+ << "\n\tnum_centers: " << num_centers
+ << "\n\tpercentile: " << percentile
+ << "\n\tsamples.size(): " << samples.size()
+ );
+
+ std::vector<dlib_pick_initial_centers_data> scores(samples.size());
+ std::vector<dlib_pick_initial_centers_data> scores_sorted(samples.size());
+ centers.clear();
+
+ // pick the first sample as one of the centers
+ centers.push_back(samples[0]);
+
+ const long best_idx = static_cast<long>(std::max(0.0,samples.size() - samples.size()*percentile - 1));
+
+ // pick the next center
+ for (long i = 0; i < num_centers-1; ++i)
+ {
+ // Loop over the samples and compare them to the most recent center. Store
+ // the distance from each sample to its closest center in scores.
+ const double k_cc = k(centers[i], centers[i]);
+ for (unsigned long s = 0; s < samples.size(); ++s)
+ {
+ // compute the distance between this sample and the current center
+ const double dist = k_cc + k(samples[s],samples[s]) - 2*k(samples[s], centers[i]);
+
+ if (dist < scores[s].dist)
+ {
+ scores[s].dist = dist;
+ scores[s].idx = s;
+ }
+ }
+
+ scores_sorted = scores;
+
+ // now find the winning center and add it to centers. It is the one that is
+ // far away from all the other centers.
+ sort(scores_sorted.begin(), scores_sorted.end());
+ centers.push_back(samples[scores_sorted[best_idx].idx]);
+ }
+
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type1,
+ typename vector_type2
+ >
+ void pick_initial_centers(
+ long num_centers,
+ vector_type1& centers,
+ const vector_type2& samples,
+ double percentile = 0.01
+ )
+ {
+ typedef typename vector_type1::value_type sample_type;
+ linear_kernel<sample_type> kern;
+ pick_initial_centers(num_centers, centers, samples, kern, percentile);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename array_type,
+ typename sample_type,
+ typename alloc
+ >
+ void find_clusters_using_kmeans (
+ const array_type& samples,
+ std::vector<sample_type, alloc>& centers,
+ unsigned long max_iter = 1000
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(samples.size() > 0 && centers.size() > 0,
+ "\tvoid find_clusters_using_kmeans()"
+ << "\n\tYou passed invalid arguments to this function"
+ << "\n\t samples.size(): " << samples.size()
+ << "\n\t centers.size(): " << centers.size()
+ );
+
+#ifdef ENABLE_ASSERTS
+ {
+ const long nr = samples[0].nr();
+ const long nc = samples[0].nc();
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ DLIB_ASSERT(is_vector(samples[i]) && samples[i].nr() == nr && samples[i].nc() == nc,
+ "\tvoid find_clusters_using_kmeans()"
+ << "\n\t You passed invalid arguments to this function"
+ << "\n\t is_vector(samples[i]): " << is_vector(samples[i])
+ << "\n\t samples[i].nr(): " << samples[i].nr()
+ << "\n\t nr: " << nr
+ << "\n\t samples[i].nc(): " << samples[i].nc()
+ << "\n\t nc: " << nc
+ << "\n\t i: " << i
+ );
+ }
+ }
+#endif
+
+ typedef typename sample_type::type scalar_type;
+
+ sample_type zero(centers[0]);
+ set_all_elements(zero, 0);
+
+ std::vector<unsigned long> center_element_count;
+
+ // tells which center a sample belongs to
+ std::vector<unsigned long> assignments(samples.size(), samples.size());
+
+
+ unsigned long iter = 0;
+ bool centers_changed = true;
+ while (centers_changed && iter < max_iter)
+ {
+ ++iter;
+ centers_changed = false;
+ center_element_count.assign(centers.size(), 0);
+
+ // loop over each sample and see which center it is closest to
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ // find the best center for sample[i]
+ scalar_type best_dist = std::numeric_limits<scalar_type>::max();
+ unsigned long best_center = 0;
+ for (unsigned long j = 0; j < centers.size(); ++j)
+ {
+ scalar_type dist = length(centers[j] - samples[i]);
+ if (dist < best_dist)
+ {
+ best_dist = dist;
+ best_center = j;
+ }
+ }
+
+ if (assignments[i] != best_center)
+ {
+ centers_changed = true;
+ assignments[i] = best_center;
+ }
+
+ center_element_count[best_center] += 1;
+ }
+
+ // now update all the centers
+ centers.assign(centers.size(), zero);
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ centers[assignments[i]] += samples[i];
+ }
+ for (unsigned long i = 0; i < centers.size(); ++i)
+ {
+ if (center_element_count[i] != 0)
+ centers[i] /= center_element_count[i];
+ }
+ }
+
+
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename array_type,
+ typename sample_type,
+ typename alloc
+ >
+ void find_clusters_using_angular_kmeans (
+ const array_type& samples,
+ std::vector<sample_type, alloc>& centers,
+ unsigned long max_iter = 1000
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(samples.size() > 0 && centers.size() > 0,
+ "\tvoid find_clusters_using_angular_kmeans()"
+ << "\n\tYou passed invalid arguments to this function"
+ << "\n\t samples.size(): " << samples.size()
+ << "\n\t centers.size(): " << centers.size()
+ );
+
+#ifdef ENABLE_ASSERTS
+ {
+ const long nr = samples[0].nr();
+ const long nc = samples[0].nc();
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ DLIB_ASSERT(is_vector(samples[i]) && samples[i].nr() == nr && samples[i].nc() == nc,
+ "\tvoid find_clusters_using_angular_kmeans()"
+ << "\n\t You passed invalid arguments to this function"
+ << "\n\t is_vector(samples[i]): " << is_vector(samples[i])
+ << "\n\t samples[i].nr(): " << samples[i].nr()
+ << "\n\t nr: " << nr
+ << "\n\t samples[i].nc(): " << samples[i].nc()
+ << "\n\t nc: " << nc
+ << "\n\t i: " << i
+ );
+ }
+ }
+#endif
+
+ typedef typename sample_type::type scalar_type;
+
+ sample_type zero(centers[0]);
+ set_all_elements(zero, 0);
+
+ unsigned long seed = 0;
+
+ // tells which center a sample belongs to
+ std::vector<unsigned long> assignments(samples.size(), samples.size());
+ std::vector<double> lengths;
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ lengths.push_back(length(samples[i]));
+ // If there are zero vectors in samples then just say their length is 1 so we
+ // can avoid a division by zero check later on. Also, this doesn't matter
+ // since zero vectors can be assigned to any cluster randomly as there is no
+ // basis for picking one based on angle.
+ if (lengths.back() == 0)
+ lengths.back() = 1;
+ }
+
+ // We will keep the centers as unit vectors at all times throughout the processing.
+ for (unsigned long i = 0; i < centers.size(); ++i)
+ {
+ double len = length(centers[i]);
+ // Avoid having length 0 centers. If that is the case then pick another center
+ // at random.
+ while(len == 0)
+ {
+ centers[i] = matrix_cast<scalar_type>(gaussian_randm(centers[i].nr(), centers[i].nc(), seed++));
+ len = length(centers[i]);
+ }
+ centers[i] /= len;
+ }
+
+
+ unsigned long iter = 0;
+ bool centers_changed = true;
+ while (centers_changed && iter < max_iter)
+ {
+ ++iter;
+ centers_changed = false;
+
+ // loop over each sample and see which center it is closest to
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ // find the best center for sample[i]
+ scalar_type best_angle = std::numeric_limits<scalar_type>::max();
+ unsigned long best_center = 0;
+ for (unsigned long j = 0; j < centers.size(); ++j)
+ {
+ scalar_type angle = -dot(centers[j],samples[i])/lengths[i];
+
+ if (angle < best_angle)
+ {
+ best_angle = angle;
+ best_center = j;
+ }
+ }
+
+ if (assignments[i] != best_center)
+ {
+ centers_changed = true;
+ assignments[i] = best_center;
+ }
+ }
+
+ // now update all the centers
+ centers.assign(centers.size(), zero);
+ for (unsigned long i = 0; i < samples.size(); ++i)
+ {
+ centers[assignments[i]] += samples[i];
+ }
+ // Now length normalize all the centers.
+ for (unsigned long i = 0; i < centers.size(); ++i)
+ {
+ double len = length(centers[i]);
+ // Avoid having length 0 centers. If that is the case then pick another center
+ // at random.
+ while(len == 0)
+ {
+ centers[i] = matrix_cast<scalar_type>(gaussian_randm(centers[i].nr(), centers[i].nc(), seed++));
+ len = length(centers[i]);
+ centers_changed = true;
+ }
+ centers[i] /= len;
+ }
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename array_type,
+ typename EXP
+ >
+ unsigned long nearest_center (
+ const array_type& centers,
+ const matrix_exp<EXP>& sample
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(centers.size() > 0 && sample.size() > 0 && is_vector(sample),
+ "\t unsigned long nearest_center()"
+ << "\n\t You have given invalid inputs to this function."
+ << "\n\t centers.size(): " << centers.size()
+ << "\n\t sample.size(): " << sample.size()
+ << "\n\t is_vector(sample): " << is_vector(sample)
+ );
+
+ double best_dist = length_squared(centers[0] - sample);
+ unsigned long best_idx = 0;
+ for (unsigned long i = 1; i < centers.size(); ++i)
+ {
+ const double dist = length_squared(centers[i] - sample);
+ if (dist < best_dist)
+ {
+ best_dist = dist;
+ best_idx = i;
+ }
+ }
+ return best_idx;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_KKMEANs_
+
+