diff options
Diffstat (limited to 'ml/dlib/dlib/clustering')
-rw-r--r-- | ml/dlib/dlib/clustering/bottom_up_cluster.h | 253 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h | 136 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/chinese_whispers.h | 135 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/chinese_whispers_abstract.h | 97 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/modularity_clustering.h | 515 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/modularity_clustering_abstract.h | 125 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/spectral_cluster.h | 80 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/spectral_cluster_abstract.h | 43 |
8 files changed, 0 insertions, 1384 deletions
diff --git a/ml/dlib/dlib/clustering/bottom_up_cluster.h b/ml/dlib/dlib/clustering/bottom_up_cluster.h deleted file mode 100644 index f80b65108..000000000 --- a/ml/dlib/dlib/clustering/bottom_up_cluster.h +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright (C) 2015 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_ -#define DLIB_BOTTOM_uP_CLUSTER_Hh_ - -#include <queue> -#include <map> - -#include "bottom_up_cluster_abstract.h" -#include "../algs.h" -#include "../matrix.h" -#include "../disjoint_subsets.h" -#include "../graph_utils.h" - - -namespace dlib -{ - -// ---------------------------------------------------------------------------------------- - - namespace buc_impl - { - inline void merge_sets ( - matrix<double>& dists, - unsigned long dest, - unsigned long src - ) - { - for (long r = 0; r < dists.nr(); ++r) - dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src)); - } - - struct compare_dist - { - bool operator() ( - const sample_pair& a, - const sample_pair& b - ) const - { - return a.distance() > b.distance(); - } - }; - } - -// ---------------------------------------------------------------------------------------- - - template < - typename EXP - > - unsigned long bottom_up_cluster ( - const matrix_exp<EXP>& dists_, - std::vector<unsigned long>& labels, - unsigned long min_num_clusters, - double max_dist = std::numeric_limits<double>::infinity() - ) - { - matrix<double> dists = matrix_cast<double>(dists_); - // make sure requires clause is not broken - DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0, - "\t unsigned long bottom_up_cluster()" - << "\n\t Invalid inputs were given to this function." - << "\n\t dists.nr(): " << dists.nr() - << "\n\t dists.nc(): " << dists.nc() - << "\n\t min_num_clusters: " << min_num_clusters - ); - - using namespace buc_impl; - - labels.resize(dists.nr()); - disjoint_subsets sets; - sets.set_size(dists.nr()); - if (labels.size() == 0) - return 0; - - // push all the edges in the graph into a priority queue so the best edges to merge - // come first. - std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que; - for (long r = 0; r < dists.nr(); ++r) - for (long c = r+1; c < dists.nc(); ++c) - que.push(sample_pair(r,c,dists(r,c))); - - // Now start merging nodes. - for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter) - { - // find the next best thing to merge. - double best_dist = que.top().distance(); - unsigned long a = sets.find_set(que.top().index1()); - unsigned long b = sets.find_set(que.top().index2()); - que.pop(); - // we have been merging and modifying the distances, so make sure this distance - // is still valid and these guys haven't been merged already. - while(a == b || best_dist < dists(a,b)) - { - // Haven't merged it yet, so put it back in with updated distance for - // reconsideration later. - if (a != b) - que.push(sample_pair(a, b, dists(a, b))); - - best_dist = que.top().distance(); - a = sets.find_set(que.top().index1()); - b = sets.find_set(que.top().index2()); - que.pop(); - } - - - // now merge these sets if the best distance is small enough - if (best_dist > max_dist) - break; - unsigned long news = sets.merge_sets(a,b); - unsigned long olds = (news==a)?b:a; - merge_sets(dists, news, olds); - } - - // figure out which cluster each element is in. Also make sure the labels are - // contiguous. - std::map<unsigned long, unsigned long> relabel; - for (unsigned long r = 0; r < labels.size(); ++r) - { - unsigned long l = sets.find_set(r); - // relabel to make contiguous - if (relabel.count(l) == 0) - { - unsigned long next = relabel.size(); - relabel[l] = next; - } - labels[r] = relabel[l]; - } - - - return relabel.size(); - } - -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- - - struct snl_range - { - snl_range() = default; - snl_range(double val) : lower(val), upper(val) {} - snl_range(double l, double u) : lower(l), upper(u) { DLIB_ASSERT(lower <= upper)} - - double lower = 0; - double upper = 0; - - double width() const { return upper-lower; } - bool operator<(const snl_range& item) const { return lower < item.lower; } - }; - - inline snl_range merge(const snl_range& a, const snl_range& b) - { - return snl_range(std::min(a.lower, b.lower), std::max(a.upper, b.upper)); - } - - inline double distance (const snl_range& a, const snl_range& b) - { - return std::max(a.lower,b.lower) - std::min(a.upper,b.upper); - } - - inline std::ostream& operator<< (std::ostream& out, const snl_range& item ) - { - out << "["<<item.lower<<","<<item.upper<<"]"; - return out; - } - -// ---------------------------------------------------------------------------------------- - - inline std::vector<snl_range> segment_number_line ( - const std::vector<double>& x, - const double max_range_width - ) - { - DLIB_CASSERT(max_range_width >= 0); - - // create initial ranges, one for each value in x. So initially, all the ranges have - // width of 0. - std::vector<snl_range> ranges; - for (auto v : x) - ranges.push_back(v); - std::sort(ranges.begin(), ranges.end()); - - std::vector<snl_range> greedy_final_ranges; - if (ranges.size() == 0) - return greedy_final_ranges; - // We will try two different clustering strategies. One that does a simple greedy left - // to right sweep and another that does a bottom up agglomerative clustering. This - // first loop runs the greedy left to right sweep. Then at the end of this routine we - // will return the results that produced the tightest clustering. - greedy_final_ranges.push_back(ranges[0]); - for (size_t i = 1; i < ranges.size(); ++i) - { - auto m = merge(greedy_final_ranges.back(), ranges[i]); - if (m.width() <= max_range_width) - greedy_final_ranges.back() = m; - else - greedy_final_ranges.push_back(ranges[i]); - } - - - // Here we do the bottom up clustering. So compute the edges connecting our ranges. - // We will simply say there are edges between ranges if and only if they are - // immediately adjacent on the number line. - std::vector<sample_pair> edges; - for (size_t i = 1; i < ranges.size(); ++i) - edges.push_back(sample_pair(i-1,i, distance(ranges[i-1],ranges[i]))); - std::sort(edges.begin(), edges.end(), order_by_distance<sample_pair>); - - disjoint_subsets sets; - sets.set_size(ranges.size()); - - // Now start merging nodes. - for (auto edge : edges) - { - // find the next best thing to merge. - unsigned long a = sets.find_set(edge.index1()); - unsigned long b = sets.find_set(edge.index2()); - - // merge it if it doesn't result in an interval that's too big. - auto m = merge(ranges[a], ranges[b]); - if (m.width() <= max_range_width) - { - unsigned long news = sets.merge_sets(a,b); - ranges[news] = m; - } - } - - // Now create a list of the final ranges. We will do this by keeping track of which - // range we already added to final_ranges. - std::vector<snl_range> final_ranges; - std::vector<bool> already_output(ranges.size(), false); - for (unsigned long i = 0; i < sets.size(); ++i) - { - auto s = sets.find_set(i); - if (!already_output[s]) - { - final_ranges.push_back(ranges[s]); - already_output[s] = true; - } - } - - // only use the greedy clusters if they found a clustering with fewer clusters. - // Otherwise, the bottom up clustering probably produced a more sensible clustering. - if (final_ranges.size() <= greedy_final_ranges.size()) - return final_ranges; - else - return greedy_final_ranges; - } - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_ - diff --git a/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h b/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h deleted file mode 100644 index 72d362c12..000000000 --- a/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (C) 2015 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_ -#ifdef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_ - -#include "../matrix.h" - -namespace dlib -{ - -// ---------------------------------------------------------------------------------------- - - template < - typename EXP - > - unsigned long bottom_up_cluster ( - const matrix_exp<EXP>& dists, - std::vector<unsigned long>& labels, - unsigned long min_num_clusters, - double max_dist = std::numeric_limits<double>::infinity() - ); - /*! - requires - - dists.nr() == dists.nc() - - min_num_clusters > 0 - - dists == trans(dists) - (l.e. dists should be symmetric) - ensures - - Runs a bottom up agglomerative clustering algorithm. - - Interprets dists as a matrix that gives the distances between dists.nr() - items. In particular, we take dists(i,j) to be the distance between the ith - and jth element of some set. This function clusters the elements of this set - into at least min_num_clusters (or dists.nr() if there aren't enough - elements). Additionally, within each cluster, the maximum pairwise distance - between any two cluster elements is <= max_dist. - - returns the number of clusters found. - - #labels.size() == dists.nr() - - for all valid i: - - #labels[i] == the cluster ID of the node with index i (i.e. the node - corresponding to the distances dists(i,*)). - - 0 <= #labels[i] < the number of clusters found - (i.e. cluster IDs are assigned contiguously and start at 0) - !*/ - -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- - - struct snl_range - { - /*! - WHAT THIS OBJECT REPRESENTS - This object represents an interval on the real number line. It is used - to store the outputs of the segment_number_line() routine defined below. - !*/ - - snl_range( - ); - /*! - ensures - - #lower == 0 - - #upper == 0 - !*/ - - snl_range( - double val - ); - /*! - ensures - - #lower == val - - #upper == val - !*/ - - snl_range( - double l, - double u - ); - /*! - requires - - l <= u - ensures - - #lower == l - - #upper == u - !*/ - - double lower; - double upper; - - double width( - ) const { return upper-lower; } - /*! - ensures - - returns the width of this interval on the number line. - !*/ - - bool operator<(const snl_range& item) const { return lower < item.lower; } - /*! - ensures - - provides a total ordering of snl_range objects assuming they are - non-overlapping. - !*/ - }; - - std::ostream& operator<< (std::ostream& out, const snl_range& item ); - /*! - ensures - - prints item to out in the form [lower,upper]. - !*/ - -// ---------------------------------------------------------------------------------------- - - std::vector<snl_range> segment_number_line ( - const std::vector<double>& x, - const double max_range_width - ); - /*! - requires - - max_range_width >= 0 - ensures - - Finds a clustering of the values in x and returns the ranges that define the - clustering. This routine uses a combination of bottom up clustering and a - simple greedy scan to try and find the most compact set of ranges that - contain all the values in x. - - This routine has approximately linear runtime. - - Every value in x will be contained inside one of the returned snl_range - objects; - - All returned snl_range object's will have a width() <= max_range_width and - will also be non-overlapping. - !*/ - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_ - - diff --git a/ml/dlib/dlib/clustering/chinese_whispers.h b/ml/dlib/dlib/clustering/chinese_whispers.h deleted file mode 100644 index 332cce1a0..000000000 --- a/ml/dlib/dlib/clustering/chinese_whispers.h +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (C) 2012 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#ifndef DLIB_CHINESE_WHISPErS_Hh_ -#define DLIB_CHINESE_WHISPErS_Hh_ - -#include "chinese_whispers_abstract.h" -#include <vector> -#include "../rand.h" -#include "../graph_utils/edge_list_graphs.h" - -namespace dlib -{ - -// ---------------------------------------------------------------------------------------- - - inline unsigned long chinese_whispers ( - const std::vector<ordered_sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations, - dlib::rand& rnd - ) - { - // make sure requires clause is not broken - DLIB_ASSERT(is_ordered_by_index(edges), - "\t unsigned long chinese_whispers()" - << "\n\t Invalid inputs were given to this function" - ); - - labels.clear(); - if (edges.size() == 0) - return 0; - - std::vector<std::pair<unsigned long, unsigned long> > neighbors; - find_neighbor_ranges(edges, neighbors); - - // Initialize the labels, each node gets a different label. - labels.resize(neighbors.size()); - for (unsigned long i = 0; i < labels.size(); ++i) - labels[i] = i; - - - for (unsigned long iter = 0; iter < neighbors.size()*num_iterations; ++iter) - { - // Pick a random node. - const unsigned long idx = rnd.get_random_64bit_number()%neighbors.size(); - - // Count how many times each label happens amongst our neighbors. - std::map<unsigned long, double> labels_to_counts; - const unsigned long end = neighbors[idx].second; - for (unsigned long i = neighbors[idx].first; i != end; ++i) - { - labels_to_counts[labels[edges[i].index2()]] += edges[i].distance(); - } - - // find the most common label - std::map<unsigned long, double>::iterator i; - double best_score = -std::numeric_limits<double>::infinity(); - unsigned long best_label = labels[idx]; - for (i = labels_to_counts.begin(); i != labels_to_counts.end(); ++i) - { - if (i->second > best_score) - { - best_score = i->second; - best_label = i->first; - } - } - - labels[idx] = best_label; - } - - - // Remap the labels into a contiguous range. First we find the - // mapping. - std::map<unsigned long,unsigned long> label_remap; - for (unsigned long i = 0; i < labels.size(); ++i) - { - const unsigned long next_id = label_remap.size(); - if (label_remap.count(labels[i]) == 0) - label_remap[labels[i]] = next_id; - } - // now apply the mapping to all the labels. - for (unsigned long i = 0; i < labels.size(); ++i) - { - labels[i] = label_remap[labels[i]]; - } - - return label_remap.size(); - } - -// ---------------------------------------------------------------------------------------- - - inline unsigned long chinese_whispers ( - const std::vector<sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations, - dlib::rand& rnd - ) - { - std::vector<ordered_sample_pair> oedges; - convert_unordered_to_ordered(edges, oedges); - std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>); - - return chinese_whispers(oedges, labels, num_iterations, rnd); - } - -// ---------------------------------------------------------------------------------------- - - inline unsigned long chinese_whispers ( - const std::vector<sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations = 100 - ) - { - dlib::rand rnd; - return chinese_whispers(edges, labels, num_iterations, rnd); - } - -// ---------------------------------------------------------------------------------------- - - inline unsigned long chinese_whispers ( - const std::vector<ordered_sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations = 100 - ) - { - dlib::rand rnd; - return chinese_whispers(edges, labels, num_iterations, rnd); - } - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_CHINESE_WHISPErS_Hh_ - diff --git a/ml/dlib/dlib/clustering/chinese_whispers_abstract.h b/ml/dlib/dlib/clustering/chinese_whispers_abstract.h deleted file mode 100644 index 7a184c6f9..000000000 --- a/ml/dlib/dlib/clustering/chinese_whispers_abstract.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (C) 2012 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_ -#ifdef DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_ - -#include <vector> -#include "../rand.h" -#include "../graph_utils/ordered_sample_pair_abstract.h" -#include "../graph_utils/sample_pair_abstract.h" - -namespace dlib -{ - -// ---------------------------------------------------------------------------------------- - - unsigned long chinese_whispers ( - const std::vector<ordered_sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations, - dlib::rand& rnd - ); - /*! - requires - - is_ordered_by_index(edges) == true - ensures - - This function implements the graph clustering algorithm described in the - paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its - Application to Natural Language Processing Problems by Chris Biemann. - - Interprets edges as a directed graph. That is, it contains the edges on the - said graph and the ordered_sample_pair::distance() values define the edge - weights (larger values indicating a stronger edge connection between the - nodes). If an edge has a distance() value of infinity then it is considered - a "must link" edge. - - returns the number of clusters found. - - #labels.size() == max_index_plus_one(edges) - - for all valid i: - - #labels[i] == the cluster ID of the node with index i in the graph. - - 0 <= #labels[i] < the number of clusters found - (i.e. cluster IDs are assigned contiguously and start at 0) - - Duplicate edges are interpreted as if there had been just one edge with a - distance value equal to the sum of all the duplicate edge's distance values. - - The algorithm performs exactly num_iterations passes over the graph before - terminating. - !*/ - -// ---------------------------------------------------------------------------------------- - - unsigned long chinese_whispers ( - const std::vector<sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations, - dlib::rand& rnd - ); - /*! - ensures - - This function is identical to the above chinese_whispers() routine except - that it operates on a vector of sample_pair objects instead of - ordered_sample_pairs. Therefore, this is simply a convenience routine. In - particular, it is implemented by transforming the given edges into - ordered_sample_pairs and then calling the chinese_whispers() routine defined - above. - !*/ - -// ---------------------------------------------------------------------------------------- - - unsigned long chinese_whispers ( - const std::vector<ordered_sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations = 100 - ); - /*! - requires - - is_ordered_by_index(edges) == true - ensures - - performs: return chinese_whispers(edges, labels, num_iterations, rnd) - where rnd is a default initialized dlib::rand object. - !*/ - -// ---------------------------------------------------------------------------------------- - - unsigned long chinese_whispers ( - const std::vector<sample_pair>& edges, - std::vector<unsigned long>& labels, - const unsigned long num_iterations = 100 - ); - /*! - ensures - - performs: return chinese_whispers(edges, labels, num_iterations, rnd) - where rnd is a default initialized dlib::rand object. - !*/ - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_ - diff --git a/ml/dlib/dlib/clustering/modularity_clustering.h b/ml/dlib/dlib/clustering/modularity_clustering.h deleted file mode 100644 index 8b8a0b0a5..000000000 --- a/ml/dlib/dlib/clustering/modularity_clustering.h +++ /dev/null @@ -1,515 +0,0 @@ -// Copyright (C) 2012 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#ifndef DLIB_MODULARITY_ClUSTERING__H__ -#define DLIB_MODULARITY_ClUSTERING__H__ - -#include "modularity_clustering_abstract.h" -#include "../sparse_vector.h" -#include "../graph_utils/edge_list_graphs.h" -#include "../matrix.h" -#include "../rand.h" - -namespace dlib -{ - -// ----------------------------------------------------------------------------------------- - - namespace impl - { - inline double newman_cluster_split ( - dlib::rand& rnd, - const std::vector<ordered_sample_pair>& edges, - const matrix<double,0,1>& node_degrees, // k from the Newman paper - const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper - const double& edge_sum, // m from the Newman paper - matrix<double,0,1>& labels, - const double eps, - const unsigned long max_iterations - ) - /*! - requires - - node_degrees.size() == max_index_plus_one(edges) - - Bdiag.size() == max_index_plus_one(edges) - - edges must be sorted according to order_by_index() - ensures - - This routine splits a graph into two subgraphs using the Newman - clustering method. - - returns the modularity obtained when the graph is split according - to the contents of #labels. - - #labels.size() == node_degrees.size() - - for all valid i: #labels(i) == -1 or +1 - - if (this function returns 0) then - - all the labels are equal, i.e. the graph is not split. - !*/ - { - // Scale epsilon so that it is relative to the expected value of an element of a - // unit vector of length node_degrees.size(). - const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size()); - - // Make a random unit vector and put in labels. - labels.set_size(node_degrees.size()); - for (long i = 0; i < labels.size(); ++i) - labels(i) = rnd.get_random_gaussian(); - labels /= length(labels); - - matrix<double,0,1> Bv, Bv_unit; - - // Do the power iteration for a while. - double eig = -1; - double offset = 0; - while (eig < 0) - { - - // any number larger than power_iter_eps - double iteration_change = power_iter_eps*2+1; - for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i) - { - sparse_matrix_vector_multiply(edges, labels, Bv); - Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees; - - if (offset != 0) - { - Bv -= offset*labels; - } - - - const double len = length(Bv); - if (len != 0) - { - Bv_unit = Bv/len; - iteration_change = max(abs(labels-Bv_unit)); - labels.swap(Bv_unit); - } - else - { - // Had a bad time, pick another random vector and try it with the - // power iteration. - for (long i = 0; i < labels.size(); ++i) - labels(i) = rnd.get_random_gaussian(); - } - } - - eig = dot(Bv,labels); - // we will repeat this loop if the largest eigenvalue is negative - offset = eig; - } - - - for (long i = 0; i < labels.size(); ++i) - { - if (labels(i) > 0) - labels(i) = 1; - else - labels(i) = -1; - } - - - // compute B*labels, store result in Bv. - sparse_matrix_vector_multiply(edges, labels, Bv); - Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees; - - // Do some label refinement. In this step we swap labels if it - // improves the modularity score. - bool flipped_label = true; - while(flipped_label) - { - flipped_label = false; - unsigned long idx = 0; - for (long i = 0; i < labels.size(); ++i) - { - const double val = -2*labels(i); - const double increase = 4*Bdiag(i) + 2*val*Bv(i); - - // if there is an increase in modularity for swapping this label - if (increase > 0) - { - labels(i) *= -1; - while (idx < edges.size() && edges[idx].index1() == (unsigned long)i) - { - const long j = edges[idx].index2(); - Bv(j) += val*edges[idx].distance(); - ++idx; - } - - Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees; - - flipped_label = true; - } - else - { - while (idx < edges.size() && edges[idx].index1() == (unsigned long)i) - { - ++idx; - } - } - } - } - - - const double modularity = dot(Bv, labels)/(4*edge_sum); - - return modularity; - } - - // ------------------------------------------------------------------------------------- - - inline unsigned long newman_cluster_helper ( - dlib::rand& rnd, - const std::vector<ordered_sample_pair>& edges, - const matrix<double,0,1>& node_degrees, // k from the Newman paper - const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper - const double& edge_sum, // m from the Newman paper - std::vector<unsigned long>& labels, - double modularity_threshold, - const double eps, - const unsigned long max_iterations - ) - /*! - ensures - - returns the number of clusters the data was split into - !*/ - { - matrix<double,0,1> l; - const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations); - - - // We need to collapse the node index values down to contiguous values. So - // we use the following two vectors to contain the mappings from input index - // values to their corresponding index values in each split. - std::vector<unsigned long> left_idx_map(node_degrees.size()); - std::vector<unsigned long> right_idx_map(node_degrees.size()); - - // figure out how many nodes went into each side of the split. - unsigned long num_left_split = 0; - unsigned long num_right_split = 0; - for (long i = 0; i < l.size(); ++i) - { - if (l(i) > 0) - { - left_idx_map[i] = num_left_split; - ++num_left_split; - } - else - { - right_idx_map[i] = num_right_split; - ++num_right_split; - } - } - - // do a recursive split if it will improve the modularity. - if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0) - { - - // split the node_degrees and Bdiag matrices into left and right split parts - matrix<double,0,1> left_node_degrees(num_left_split); - matrix<double,0,1> right_node_degrees(num_right_split); - matrix<double,0,1> left_Bdiag(num_left_split); - matrix<double,0,1> right_Bdiag(num_right_split); - for (long i = 0; i < l.size(); ++i) - { - if (l(i) > 0) - { - left_node_degrees(left_idx_map[i]) = node_degrees(i); - left_Bdiag(left_idx_map[i]) = Bdiag(i); - } - else - { - right_node_degrees(right_idx_map[i]) = node_degrees(i); - right_Bdiag(right_idx_map[i]) = Bdiag(i); - } - } - - - // put the edges from one side of the split into split_edges - std::vector<ordered_sample_pair> split_edges; - modularity_threshold = 0; - for (unsigned long k = 0; k < edges.size(); ++k) - { - const unsigned long i = edges[k].index1(); - const unsigned long j = edges[k].index2(); - const double d = edges[k].distance(); - if (l(i) > 0 && l(j) > 0) - { - split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d)); - modularity_threshold += d; - } - } - modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum); - modularity_threshold /= 4*edge_sum; - - unsigned long num_left_clusters; - std::vector<unsigned long> left_labels; - num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag, - edge_sum,left_labels,modularity_threshold, - eps, max_iterations); - - // now load the other side into split_edges and cluster it as well - split_edges.clear(); - modularity_threshold = 0; - for (unsigned long k = 0; k < edges.size(); ++k) - { - const unsigned long i = edges[k].index1(); - const unsigned long j = edges[k].index2(); - const double d = edges[k].distance(); - if (l(i) < 0 && l(j) < 0) - { - split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d)); - modularity_threshold += d; - } - } - modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum); - modularity_threshold /= 4*edge_sum; - - unsigned long num_right_clusters; - std::vector<unsigned long> right_labels; - num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag, - edge_sum,right_labels,modularity_threshold, - eps, max_iterations); - - // Now merge the labels from the two splits. - labels.resize(node_degrees.size()); - for (unsigned long i = 0; i < labels.size(); ++i) - { - // if this node was in the left split - if (l(i) > 0) - { - labels[i] = left_labels[left_idx_map[i]]; - } - else // if this node was in the right split - { - labels[i] = right_labels[right_idx_map[i]] + num_left_clusters; - } - } - - - return num_left_clusters + num_right_clusters; - } - else - { - labels.assign(node_degrees.size(),0); - return 1; - } - - } - } - -// ---------------------------------------------------------------------------------------- - - inline unsigned long newman_cluster ( - const std::vector<ordered_sample_pair>& edges, - std::vector<unsigned long>& labels, - const double eps = 1e-4, - const unsigned long max_iterations = 2000 - ) - { - // make sure requires clause is not broken - DLIB_ASSERT(is_ordered_by_index(edges), - "\t unsigned long newman_cluster()" - << "\n\t Invalid inputs were given to this function" - ); - - labels.clear(); - if (edges.size() == 0) - return 0; - - const unsigned long num_nodes = max_index_plus_one(edges); - - // compute the node_degrees vector, edge_sum value, and diag(B). - matrix<double,0,1> node_degrees(num_nodes); - matrix<double,0,1> Bdiag(num_nodes); - Bdiag = 0; - double edge_sum = 0; - node_degrees = 0; - for (unsigned long i = 0; i < edges.size(); ++i) - { - node_degrees(edges[i].index1()) += edges[i].distance(); - edge_sum += edges[i].distance(); - if (edges[i].index1() == edges[i].index2()) - Bdiag(edges[i].index1()) += edges[i].distance(); - } - edge_sum /= 2; - Bdiag -= squared(node_degrees)/(2*edge_sum); - - - dlib::rand rnd; - return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations); - } - -// ---------------------------------------------------------------------------------------- - - inline unsigned long newman_cluster ( - const std::vector<sample_pair>& edges, - std::vector<unsigned long>& labels, - const double eps = 1e-4, - const unsigned long max_iterations = 2000 - ) - { - std::vector<ordered_sample_pair> oedges; - convert_unordered_to_ordered(edges, oedges); - std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>); - - return newman_cluster(oedges, labels, eps, max_iterations); - } - -// ---------------------------------------------------------------------------------------- - - namespace impl - { - inline std::vector<unsigned long> remap_labels ( - const std::vector<unsigned long>& labels, - unsigned long& num_labels - ) - /*! - ensures - - This function takes labels and produces a mapping which maps elements of - labels into the most compact range in [0, max] as possible. In particular, - there won't be any unused integers in the mapped range. - - #num_labels == the number of distinct values in labels. - - returns a vector V such that: - - V.size() == labels.size() - - max(mat(V))+1 == num_labels. - - for all valid i,j: - - if (labels[i] == labels[j]) then - - V[i] == V[j] - - else - - V[i] != V[j] - !*/ - { - std::map<unsigned long, unsigned long> temp; - for (unsigned long i = 0; i < labels.size(); ++i) - { - if (temp.count(labels[i]) == 0) - { - const unsigned long next = temp.size(); - temp[labels[i]] = next; - } - } - - num_labels = temp.size(); - - std::vector<unsigned long> result(labels.size()); - for (unsigned long i = 0; i < labels.size(); ++i) - { - result[i] = temp[labels[i]]; - } - return result; - } - } - -// ---------------------------------------------------------------------------------------- - - inline double modularity ( - const std::vector<sample_pair>& edges, - const std::vector<unsigned long>& labels - ) - { - const unsigned long num_nodes = max_index_plus_one(edges); - // make sure requires clause is not broken - DLIB_ASSERT(labels.size() == num_nodes, - "\t double modularity()" - << "\n\t Invalid inputs were given to this function" - ); - - unsigned long num_labels; - const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); - - std::vector<double> cluster_sums(num_labels,0); - std::vector<double> k(num_nodes,0); - - double Q = 0; - double m = 0; - for (unsigned long i = 0; i < edges.size(); ++i) - { - const unsigned long n1 = edges[i].index1(); - const unsigned long n2 = edges[i].index2(); - k[n1] += edges[i].distance(); - if (n1 != n2) - k[n2] += edges[i].distance(); - - if (n1 != n2) - m += edges[i].distance(); - else - m += edges[i].distance()/2; - - if (labels_[n1] == labels_[n2]) - { - if (n1 != n2) - Q += 2*edges[i].distance(); - else - Q += edges[i].distance(); - } - } - - if (m == 0) - return 0; - - for (unsigned long i = 0; i < labels_.size(); ++i) - { - cluster_sums[labels_[i]] += k[i]; - } - - for (unsigned long i = 0; i < labels_.size(); ++i) - { - Q -= k[i]*cluster_sums[labels_[i]]/(2*m); - } - - return 1.0/(2*m)*Q; - } - -// ---------------------------------------------------------------------------------------- - - inline double modularity ( - const std::vector<ordered_sample_pair>& edges, - const std::vector<unsigned long>& labels - ) - { - const unsigned long num_nodes = max_index_plus_one(edges); - // make sure requires clause is not broken - DLIB_ASSERT(labels.size() == num_nodes, - "\t double modularity()" - << "\n\t Invalid inputs were given to this function" - ); - - - unsigned long num_labels; - const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); - - std::vector<double> cluster_sums(num_labels,0); - std::vector<double> k(num_nodes,0); - - double Q = 0; - double m = 0; - for (unsigned long i = 0; i < edges.size(); ++i) - { - const unsigned long n1 = edges[i].index1(); - const unsigned long n2 = edges[i].index2(); - k[n1] += edges[i].distance(); - m += edges[i].distance(); - if (labels_[n1] == labels_[n2]) - { - Q += edges[i].distance(); - } - } - - if (m == 0) - return 0; - - for (unsigned long i = 0; i < labels_.size(); ++i) - { - cluster_sums[labels_[i]] += k[i]; - } - - for (unsigned long i = 0; i < labels_.size(); ++i) - { - Q -= k[i]*cluster_sums[labels_[i]]/m; - } - - return 1.0/m*Q; - } - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_MODULARITY_ClUSTERING__H__ - diff --git a/ml/dlib/dlib/clustering/modularity_clustering_abstract.h b/ml/dlib/dlib/clustering/modularity_clustering_abstract.h deleted file mode 100644 index c1e7c20c4..000000000 --- a/ml/dlib/dlib/clustering/modularity_clustering_abstract.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (C) 2012 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_ -#ifdef DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_ - -#include <vector> -#include "../graph_utils/ordered_sample_pair_abstract.h" -#include "../graph_utils/sample_pair_abstract.h" - -namespace dlib -{ - -// ----------------------------------------------------------------------------------------- - - double modularity ( - const std::vector<sample_pair>& edges, - const std::vector<unsigned long>& labels - ); - /*! - requires - - labels.size() == max_index_plus_one(edges) - - for all valid i: - - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() - ensures - - Interprets edges as an undirected graph. That is, it contains the edges on - the said graph and the sample_pair::distance() values define the edge weights - (larger values indicating a stronger edge connection between the nodes). - - This function returns the modularity value obtained when the given input - graph is broken into subgraphs according to the contents of labels. In - particular, we say that two nodes with indices i and j are in the same - subgraph or community if and only if labels[i] == labels[j]. - - Duplicate edges are interpreted as if there had been just one edge with a - distance value equal to the sum of all the duplicate edge's distance values. - - See the paper Modularity and community structure in networks by M. E. J. Newman - for a detailed definition. - !*/ - -// ---------------------------------------------------------------------------------------- - - double modularity ( - const std::vector<ordered_sample_pair>& edges, - const std::vector<unsigned long>& labels - ); - /*! - requires - - labels.size() == max_index_plus_one(edges) - - for all valid i: - - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() - ensures - - Interprets edges as a directed graph. That is, it contains the edges on the - said graph and the ordered_sample_pair::distance() values define the edge - weights (larger values indicating a stronger edge connection between the - nodes). Note that, generally, modularity is only really defined for - undirected graphs. Therefore, the "directed graph" given to this function - should have symmetric edges between all nodes. The reason this function is - provided at all is because sometimes a vector of ordered_sample_pair objects - is a useful representation of an undirected graph. - - This function returns the modularity value obtained when the given input - graph is broken into subgraphs according to the contents of labels. In - particular, we say that two nodes with indices i and j are in the same - subgraph or community if and only if labels[i] == labels[j]. - - Duplicate edges are interpreted as if there had been just one edge with a - distance value equal to the sum of all the duplicate edge's distance values. - - See the paper Modularity and community structure in networks by M. E. J. Newman - for a detailed definition. - !*/ - -// ---------------------------------------------------------------------------------------- - - unsigned long newman_cluster ( - const std::vector<ordered_sample_pair>& edges, - std::vector<unsigned long>& labels, - const double eps = 1e-4, - const unsigned long max_iterations = 2000 - ); - /*! - requires - - is_ordered_by_index(edges) == true - - for all valid i: - - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() - ensures - - This function performs the clustering algorithm described in the paper - Modularity and community structure in networks by M. E. J. Newman. - - This function interprets edges as a graph and attempts to find the labeling - that maximizes modularity(edges, #labels). - - returns the number of clusters found. - - #labels.size() == max_index_plus_one(edges) - - for all valid i: - - #labels[i] == the cluster ID of the node with index i in the graph. - - 0 <= #labels[i] < the number of clusters found - (i.e. cluster IDs are assigned contiguously and start at 0) - - The main computation of the algorithm is involved in finding an eigenvector - of a certain matrix. To do this, we use the power iteration. In particular, - each time we try to find an eigenvector we will let the power iteration loop - at most max_iterations times or until it reaches an accuracy of eps. - Whichever comes first. - !*/ - -// ---------------------------------------------------------------------------------------- - - unsigned long newman_cluster ( - const std::vector<sample_pair>& edges, - std::vector<unsigned long>& labels, - const double eps = 1e-4, - const unsigned long max_iterations = 2000 - ); - /*! - requires - - for all valid i: - - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() - ensures - - This function is identical to the above newman_cluster() routine except that - it operates on a vector of sample_pair objects instead of - ordered_sample_pairs. Therefore, this is simply a convenience routine. In - particular, it is implemented by transforming the given edges into - ordered_sample_pairs and then calling the newman_cluster() routine defined - above. - !*/ - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_ - diff --git a/ml/dlib/dlib/clustering/spectral_cluster.h b/ml/dlib/dlib/clustering/spectral_cluster.h deleted file mode 100644 index 2cac9870f..000000000 --- a/ml/dlib/dlib/clustering/spectral_cluster.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2015 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#ifndef DLIB_SPECTRAL_CLUSTEr_H_ -#define DLIB_SPECTRAL_CLUSTEr_H_ - -#include "spectral_cluster_abstract.h" -#include <vector> -#include "../matrix.h" -#include "../svm/kkmeans.h" - -namespace dlib -{ - template < - typename kernel_type, - typename vector_type - > - std::vector<unsigned long> spectral_cluster ( - const kernel_type& k, - const vector_type& samples, - const unsigned long num_clusters - ) - { - DLIB_CASSERT(num_clusters > 0, - "\t std::vector<unsigned long> spectral_cluster(k,samples,num_clusters)" - << "\n\t num_clusters can't be 0." - ); - - if (num_clusters == 1) - { - // nothing to do, just assign everything to the 0 cluster. - return std::vector<unsigned long>(samples.size(), 0); - } - - // compute the similarity matrix. - matrix<double> K(samples.size(), samples.size()); - for (long r = 0; r < K.nr(); ++r) - for (long c = r+1; c < K.nc(); ++c) - K(r,c) = K(c,r) = (double)k(samples[r], samples[c]); - for (long r = 0; r < K.nr(); ++r) - K(r,r) = 0; - - matrix<double,0,1> D(K.nr()); - for (long r = 0; r < K.nr(); ++r) - D(r) = sum(rowm(K,r)); - D = sqrt(reciprocal(D)); - K = diagm(D)*K*diagm(D); - matrix<double> u,w,v; - // Use the normal SVD routine unless the matrix is really big, then use the fast - // approximate version. - if (K.nr() < 1000) - svd3(K,u,w,v); - else - svd_fast(K,u,w,v, num_clusters+100, 5); - // Pick out the eigenvectors associated with the largest eigenvalues. - rsort_columns(v,w); - v = colm(v, range(0,num_clusters-1)); - // Now build the normalized spectral vectors, one for each input vector. - std::vector<matrix<double,0,1> > spec_samps, centers; - for (long r = 0; r < v.nr(); ++r) - { - spec_samps.push_back(trans(rowm(v,r))); - const double len = length(spec_samps.back()); - if (len != 0) - spec_samps.back() /= len; - } - // Finally do the K-means clustering - pick_initial_centers(num_clusters, centers, spec_samps); - find_clusters_using_kmeans(spec_samps, centers); - // And then compute the cluster assignments based on the output of K-means. - std::vector<unsigned long> assignments; - for (unsigned long i = 0; i < spec_samps.size(); ++i) - assignments.push_back(nearest_center(centers, spec_samps[i])); - - return assignments; - } - -} - -#endif // DLIB_SPECTRAL_CLUSTEr_H_ - diff --git a/ml/dlib/dlib/clustering/spectral_cluster_abstract.h b/ml/dlib/dlib/clustering/spectral_cluster_abstract.h deleted file mode 100644 index 880ad80af..000000000 --- a/ml/dlib/dlib/clustering/spectral_cluster_abstract.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2015 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_ -#ifdef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_ - -#include <vector> - -namespace dlib -{ - template < - typename kernel_type, - typename vector_type - > - std::vector<unsigned long> spectral_cluster ( - const kernel_type& k, - const vector_type& samples, - const unsigned long num_clusters - ); - /*! - requires - - samples must be something with an interface compatible with std::vector. - - The following expression must evaluate to a double or float: - k(samples[i], samples[j]) - - num_clusters > 0 - ensures - - Performs the spectral clustering algorithm described in the paper: - On spectral clustering: Analysis and an algorithm by Ng, Jordan, and Weiss. - and returns the results. - - This function clusters the input data samples into num_clusters clusters and - returns a vector that indicates which cluster each sample falls into. In - particular, we return an array A such that: - - A.size() == samples.size() - - A[i] == the cluster assignment of samples[i]. - - for all valid i: 0 <= A[i] < num_clusters - - The "similarity" of samples[i] with samples[j] is given by - k(samples[i],samples[j]). This means that k() should output a number >= 0 - and the number should be larger for samples that are more similar. - !*/ -} - -#endif // DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_ - - |