diff options
Diffstat (limited to 'ml/dlib/dlib/svm/kkmeans.h')
-rw-r--r-- | ml/dlib/dlib/svm/kkmeans.h | 654 |
1 files changed, 654 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/kkmeans.h b/ml/dlib/dlib/svm/kkmeans.h new file mode 100644 index 00000000..4c72106d --- /dev/null +++ b/ml/dlib/dlib/svm/kkmeans.h @@ -0,0 +1,654 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_KKMEANs_ +#define DLIB_KKMEANs_ + +#include <cmath> +#include <vector> + +#include "../matrix/matrix_abstract.h" +#include "../algs.h" +#include "../serialize.h" +#include "kernel.h" +#include "../array.h" +#include "kcentroid.h" +#include "kkmeans_abstract.h" +#include "../noncopyable.h" + +namespace dlib +{ + + template < + typename kernel_type + > + class kkmeans : public noncopyable + { + public: + typedef typename kernel_type::scalar_type scalar_type; + typedef typename kernel_type::sample_type sample_type; + typedef typename kernel_type::mem_manager_type mem_manager_type; + + kkmeans ( + const kcentroid<kernel_type>& kc_ + ): + kc(kc_), + min_change(0.01) + { + set_number_of_centers(1); + } + + ~kkmeans() + { + } + + const kernel_type& get_kernel ( + ) const + { + return kc.get_kernel(); + } + + void set_kcentroid ( + const kcentroid<kernel_type>& kc_ + ) + { + kc = kc_; + set_number_of_centers(number_of_centers()); + } + + const kcentroid<kernel_type>& get_kcentroid ( + unsigned long i + ) const + { + // make sure requires clause is not broken + DLIB_ASSERT(i < number_of_centers(), + "\tkcentroid kkmeans::get_kcentroid(i)" + << "\n\tYou have given an invalid value for i" + << "\n\ti: " << i + << "\n\tnumber_of_centers(): " << number_of_centers() + << "\n\tthis: " << this + ); + + return *centers[i]; + } + + void set_number_of_centers ( + unsigned long num + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(num > 0, + "\tvoid kkmeans::set_number_of_centers()" + << "\n\tYou can't set the number of centers to zero" + << "\n\tthis: " << this + ); + + centers.set_max_size(num); + centers.set_size(num); + + for (unsigned long i = 0; i < centers.size(); ++i) + { + centers[i].reset(new kcentroid<kernel_type>(kc)); + } + } + + unsigned long number_of_centers ( + ) const + { + return centers.size(); + } + + template <typename T, typename U> + void train ( + const T& samples, + const U& initial_centers, + long max_iter = 1000 + ) + { + do_train(mat(samples),mat(initial_centers),max_iter); + } + + unsigned long operator() ( + const sample_type& sample + ) const + { + unsigned long label = 0; + scalar_type best_score = (*centers[0])(sample); + + // figure out which center the given sample is closest too + for (unsigned long i = 1; i < centers.size(); ++i) + { + scalar_type temp = (*centers[i])(sample); + if (temp < best_score) + { + label = i; + best_score = temp; + } + } + + return label; + } + + void set_min_change ( + scalar_type min_change_ + ) + { + // make sure requires clause is not broken + DLIB_ASSERT( 0 <= min_change_ < 1, + "\tvoid kkmeans::set_min_change()" + << "\n\tInvalid arguments to this function" + << "\n\tthis: " << this + << "\n\tmin_change_: " << min_change_ + ); + min_change = min_change_; + } + + const scalar_type get_min_change ( + ) const + { + return min_change; + } + + void swap ( + kkmeans& item + ) + { + centers.swap(item.centers); + kc.swap(item.kc); + assignments.swap(item.assignments); + exchange(min_change, item.min_change); + } + + friend void serialize(const kkmeans& item, std::ostream& out) + { + serialize(item.centers.size(),out); + for (unsigned long i = 0; i < item.centers.size(); ++i) + { + serialize(*item.centers[i], out); + } + serialize(item.kc, out); + serialize(item.min_change, out); + } + + friend void deserialize(kkmeans& item, std::istream& in) + { + unsigned long num; + deserialize(num, in); + item.centers.resize(num); + for (unsigned long i = 0; i < item.centers.size(); ++i) + { + std::unique_ptr<kcentroid<kernel_type> > temp(new kcentroid<kernel_type>(kernel_type())); + deserialize(*temp, in); + item.centers[i].swap(temp); + } + + deserialize(item.kc, in); + deserialize(item.min_change, in); + } + + private: + + template <typename matrix_type, typename matrix_type2> + void do_train ( + const matrix_type& samples, + const matrix_type2& initial_centers, + long max_iter = 1000 + ) + { + COMPILE_TIME_ASSERT((is_same_type<typename matrix_type::type, sample_type>::value)); + COMPILE_TIME_ASSERT((is_same_type<typename matrix_type2::type, sample_type>::value)); + + // make sure requires clause is not broken + DLIB_ASSERT(samples.nc() == 1 && initial_centers.nc() == 1 && + initial_centers.nr() == static_cast<long>(number_of_centers()), + "\tvoid kkmeans::train()" + << "\n\tInvalid arguments to this function" + << "\n\tthis: " << this + << "\n\tsamples.nc(): " << samples.nc() + << "\n\tinitial_centers.nc(): " << initial_centers.nc() + << "\n\tinitial_centers.nr(): " << initial_centers.nr() + ); + + // clear out the old data and initialize the centers + for (unsigned long i = 0; i < centers.size(); ++i) + { + centers[i]->clear_dictionary(); + centers[i]->train(initial_centers(i)); + } + + assignments.resize(samples.size()); + + bool assignment_changed = true; + + // loop until the centers stabilize + long count = 0; + const unsigned long min_num_change = static_cast<unsigned long>(min_change*samples.size()); + unsigned long num_changed = min_num_change; + while (assignment_changed && count < max_iter && num_changed >= min_num_change) + { + ++count; + assignment_changed = false; + num_changed = 0; + + // loop over all the samples and assign them to their closest centers + for (long i = 0; i < samples.size(); ++i) + { + // find the best center + unsigned long best_center = 0; + scalar_type best_score = (*centers[0])(samples(i)); + for (unsigned long c = 1; c < centers.size(); ++c) + { + scalar_type temp = (*centers[c])(samples(i)); + if (temp < best_score) + { + best_score = temp; + best_center = c; + } + } + + // if the current sample changed centers then make note of that + if (assignments[i] != best_center) + { + assignments[i] = best_center; + assignment_changed = true; + ++num_changed; + } + } + + if (assignment_changed) + { + // now clear out the old data + for (unsigned long i = 0; i < centers.size(); ++i) + centers[i]->clear_dictionary(); + + // recalculate the cluster centers + for (unsigned long i = 0; i < assignments.size(); ++i) + centers[assignments[i]]->train(samples(i)); + } + + } + + + } + + array<std::unique_ptr<kcentroid<kernel_type> > > centers; + kcentroid<kernel_type> kc; + scalar_type min_change; + + // temp variables + array<unsigned long> assignments; + }; + +// ---------------------------------------------------------------------------------------- + + template <typename kernel_type> + void swap(kkmeans<kernel_type>& a, kkmeans<kernel_type>& b) + { a.swap(b); } + +// ---------------------------------------------------------------------------------------- + + struct dlib_pick_initial_centers_data + { + dlib_pick_initial_centers_data():idx(0), dist(std::numeric_limits<double>::infinity()){} + long idx; + double dist; + bool operator< (const dlib_pick_initial_centers_data& d) const { return dist < d.dist; } + }; + + template < + typename vector_type1, + typename vector_type2, + typename kernel_type + > + void pick_initial_centers( + long num_centers, + vector_type1& centers, + const vector_type2& samples, + const kernel_type& k, + double percentile = 0.01 + ) + { + /* + This function is basically just a non-randomized version of the kmeans++ algorithm + described in the paper: + kmeans++: The Advantages of Careful Seeding by Arthur and Vassilvitskii + + */ + + + // make sure requires clause is not broken + DLIB_ASSERT(num_centers > 1 && 0 <= percentile && percentile < 1 && samples.size() > 1, + "\tvoid pick_initial_centers()" + << "\n\tYou passed invalid arguments to this function" + << "\n\tnum_centers: " << num_centers + << "\n\tpercentile: " << percentile + << "\n\tsamples.size(): " << samples.size() + ); + + std::vector<dlib_pick_initial_centers_data> scores(samples.size()); + std::vector<dlib_pick_initial_centers_data> scores_sorted(samples.size()); + centers.clear(); + + // pick the first sample as one of the centers + centers.push_back(samples[0]); + + const long best_idx = static_cast<long>(std::max(0.0,samples.size() - samples.size()*percentile - 1)); + + // pick the next center + for (long i = 0; i < num_centers-1; ++i) + { + // Loop over the samples and compare them to the most recent center. Store + // the distance from each sample to its closest center in scores. + const double k_cc = k(centers[i], centers[i]); + for (unsigned long s = 0; s < samples.size(); ++s) + { + // compute the distance between this sample and the current center + const double dist = k_cc + k(samples[s],samples[s]) - 2*k(samples[s], centers[i]); + + if (dist < scores[s].dist) + { + scores[s].dist = dist; + scores[s].idx = s; + } + } + + scores_sorted = scores; + + // now find the winning center and add it to centers. It is the one that is + // far away from all the other centers. + sort(scores_sorted.begin(), scores_sorted.end()); + centers.push_back(samples[scores_sorted[best_idx].idx]); + } + + } + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type1, + typename vector_type2 + > + void pick_initial_centers( + long num_centers, + vector_type1& centers, + const vector_type2& samples, + double percentile = 0.01 + ) + { + typedef typename vector_type1::value_type sample_type; + linear_kernel<sample_type> kern; + pick_initial_centers(num_centers, centers, samples, kern, percentile); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename array_type, + typename sample_type, + typename alloc + > + void find_clusters_using_kmeans ( + const array_type& samples, + std::vector<sample_type, alloc>& centers, + unsigned long max_iter = 1000 + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(samples.size() > 0 && centers.size() > 0, + "\tvoid find_clusters_using_kmeans()" + << "\n\tYou passed invalid arguments to this function" + << "\n\t samples.size(): " << samples.size() + << "\n\t centers.size(): " << centers.size() + ); + +#ifdef ENABLE_ASSERTS + { + const long nr = samples[0].nr(); + const long nc = samples[0].nc(); + for (unsigned long i = 0; i < samples.size(); ++i) + { + DLIB_ASSERT(is_vector(samples[i]) && samples[i].nr() == nr && samples[i].nc() == nc, + "\tvoid find_clusters_using_kmeans()" + << "\n\t You passed invalid arguments to this function" + << "\n\t is_vector(samples[i]): " << is_vector(samples[i]) + << "\n\t samples[i].nr(): " << samples[i].nr() + << "\n\t nr: " << nr + << "\n\t samples[i].nc(): " << samples[i].nc() + << "\n\t nc: " << nc + << "\n\t i: " << i + ); + } + } +#endif + + typedef typename sample_type::type scalar_type; + + sample_type zero(centers[0]); + set_all_elements(zero, 0); + + std::vector<unsigned long> center_element_count; + + // tells which center a sample belongs to + std::vector<unsigned long> assignments(samples.size(), samples.size()); + + + unsigned long iter = 0; + bool centers_changed = true; + while (centers_changed && iter < max_iter) + { + ++iter; + centers_changed = false; + center_element_count.assign(centers.size(), 0); + + // loop over each sample and see which center it is closest to + for (unsigned long i = 0; i < samples.size(); ++i) + { + // find the best center for sample[i] + scalar_type best_dist = std::numeric_limits<scalar_type>::max(); + unsigned long best_center = 0; + for (unsigned long j = 0; j < centers.size(); ++j) + { + scalar_type dist = length(centers[j] - samples[i]); + if (dist < best_dist) + { + best_dist = dist; + best_center = j; + } + } + + if (assignments[i] != best_center) + { + centers_changed = true; + assignments[i] = best_center; + } + + center_element_count[best_center] += 1; + } + + // now update all the centers + centers.assign(centers.size(), zero); + for (unsigned long i = 0; i < samples.size(); ++i) + { + centers[assignments[i]] += samples[i]; + } + for (unsigned long i = 0; i < centers.size(); ++i) + { + if (center_element_count[i] != 0) + centers[i] /= center_element_count[i]; + } + } + + + } + +// ---------------------------------------------------------------------------------------- + + template < + typename array_type, + typename sample_type, + typename alloc + > + void find_clusters_using_angular_kmeans ( + const array_type& samples, + std::vector<sample_type, alloc>& centers, + unsigned long max_iter = 1000 + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(samples.size() > 0 && centers.size() > 0, + "\tvoid find_clusters_using_angular_kmeans()" + << "\n\tYou passed invalid arguments to this function" + << "\n\t samples.size(): " << samples.size() + << "\n\t centers.size(): " << centers.size() + ); + +#ifdef ENABLE_ASSERTS + { + const long nr = samples[0].nr(); + const long nc = samples[0].nc(); + for (unsigned long i = 0; i < samples.size(); ++i) + { + DLIB_ASSERT(is_vector(samples[i]) && samples[i].nr() == nr && samples[i].nc() == nc, + "\tvoid find_clusters_using_angular_kmeans()" + << "\n\t You passed invalid arguments to this function" + << "\n\t is_vector(samples[i]): " << is_vector(samples[i]) + << "\n\t samples[i].nr(): " << samples[i].nr() + << "\n\t nr: " << nr + << "\n\t samples[i].nc(): " << samples[i].nc() + << "\n\t nc: " << nc + << "\n\t i: " << i + ); + } + } +#endif + + typedef typename sample_type::type scalar_type; + + sample_type zero(centers[0]); + set_all_elements(zero, 0); + + unsigned long seed = 0; + + // tells which center a sample belongs to + std::vector<unsigned long> assignments(samples.size(), samples.size()); + std::vector<double> lengths; + for (unsigned long i = 0; i < samples.size(); ++i) + { + lengths.push_back(length(samples[i])); + // If there are zero vectors in samples then just say their length is 1 so we + // can avoid a division by zero check later on. Also, this doesn't matter + // since zero vectors can be assigned to any cluster randomly as there is no + // basis for picking one based on angle. + if (lengths.back() == 0) + lengths.back() = 1; + } + + // We will keep the centers as unit vectors at all times throughout the processing. + for (unsigned long i = 0; i < centers.size(); ++i) + { + double len = length(centers[i]); + // Avoid having length 0 centers. If that is the case then pick another center + // at random. + while(len == 0) + { + centers[i] = matrix_cast<scalar_type>(gaussian_randm(centers[i].nr(), centers[i].nc(), seed++)); + len = length(centers[i]); + } + centers[i] /= len; + } + + + unsigned long iter = 0; + bool centers_changed = true; + while (centers_changed && iter < max_iter) + { + ++iter; + centers_changed = false; + + // loop over each sample and see which center it is closest to + for (unsigned long i = 0; i < samples.size(); ++i) + { + // find the best center for sample[i] + scalar_type best_angle = std::numeric_limits<scalar_type>::max(); + unsigned long best_center = 0; + for (unsigned long j = 0; j < centers.size(); ++j) + { + scalar_type angle = -dot(centers[j],samples[i])/lengths[i]; + + if (angle < best_angle) + { + best_angle = angle; + best_center = j; + } + } + + if (assignments[i] != best_center) + { + centers_changed = true; + assignments[i] = best_center; + } + } + + // now update all the centers + centers.assign(centers.size(), zero); + for (unsigned long i = 0; i < samples.size(); ++i) + { + centers[assignments[i]] += samples[i]; + } + // Now length normalize all the centers. + for (unsigned long i = 0; i < centers.size(); ++i) + { + double len = length(centers[i]); + // Avoid having length 0 centers. If that is the case then pick another center + // at random. + while(len == 0) + { + centers[i] = matrix_cast<scalar_type>(gaussian_randm(centers[i].nr(), centers[i].nc(), seed++)); + len = length(centers[i]); + centers_changed = true; + } + centers[i] /= len; + } + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename array_type, + typename EXP + > + unsigned long nearest_center ( + const array_type& centers, + const matrix_exp<EXP>& sample + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(centers.size() > 0 && sample.size() > 0 && is_vector(sample), + "\t unsigned long nearest_center()" + << "\n\t You have given invalid inputs to this function." + << "\n\t centers.size(): " << centers.size() + << "\n\t sample.size(): " << sample.size() + << "\n\t is_vector(sample): " << is_vector(sample) + ); + + double best_dist = length_squared(centers[0] - sample); + unsigned long best_idx = 0; + for (unsigned long i = 1; i < centers.size(); ++i) + { + const double dist = length_squared(centers[i] - sample); + if (dist < best_dist) + { + best_dist = dist; + best_idx = i; + } + } + return best_idx; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_KKMEANs_ + + |