diff options
Diffstat (limited to 'ml/dlib/dlib/clustering')
-rw-r--r-- | ml/dlib/dlib/clustering/bottom_up_cluster.h | 253 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h | 136 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/chinese_whispers.h | 135 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/chinese_whispers_abstract.h | 97 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/modularity_clustering.h | 515 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/modularity_clustering_abstract.h | 125 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/spectral_cluster.h | 80 | ||||
-rw-r--r-- | ml/dlib/dlib/clustering/spectral_cluster_abstract.h | 43 |
8 files changed, 1384 insertions, 0 deletions
diff --git a/ml/dlib/dlib/clustering/bottom_up_cluster.h b/ml/dlib/dlib/clustering/bottom_up_cluster.h new file mode 100644 index 000000000..f80b65108 --- /dev/null +++ b/ml/dlib/dlib/clustering/bottom_up_cluster.h @@ -0,0 +1,253 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_ +#define DLIB_BOTTOM_uP_CLUSTER_Hh_ + +#include <queue> +#include <map> + +#include "bottom_up_cluster_abstract.h" +#include "../algs.h" +#include "../matrix.h" +#include "../disjoint_subsets.h" +#include "../graph_utils.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace buc_impl + { + inline void merge_sets ( + matrix<double>& dists, + unsigned long dest, + unsigned long src + ) + { + for (long r = 0; r < dists.nr(); ++r) + dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src)); + } + + struct compare_dist + { + bool operator() ( + const sample_pair& a, + const sample_pair& b + ) const + { + return a.distance() > b.distance(); + } + }; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP + > + unsigned long bottom_up_cluster ( + const matrix_exp<EXP>& dists_, + std::vector<unsigned long>& labels, + unsigned long min_num_clusters, + double max_dist = std::numeric_limits<double>::infinity() + ) + { + matrix<double> dists = matrix_cast<double>(dists_); + // make sure requires clause is not broken + DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0, + "\t unsigned long bottom_up_cluster()" + << "\n\t Invalid inputs were given to this function." + << "\n\t dists.nr(): " << dists.nr() + << "\n\t dists.nc(): " << dists.nc() + << "\n\t min_num_clusters: " << min_num_clusters + ); + + using namespace buc_impl; + + labels.resize(dists.nr()); + disjoint_subsets sets; + sets.set_size(dists.nr()); + if (labels.size() == 0) + return 0; + + // push all the edges in the graph into a priority queue so the best edges to merge + // come first. + std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que; + for (long r = 0; r < dists.nr(); ++r) + for (long c = r+1; c < dists.nc(); ++c) + que.push(sample_pair(r,c,dists(r,c))); + + // Now start merging nodes. + for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter) + { + // find the next best thing to merge. + double best_dist = que.top().distance(); + unsigned long a = sets.find_set(que.top().index1()); + unsigned long b = sets.find_set(que.top().index2()); + que.pop(); + // we have been merging and modifying the distances, so make sure this distance + // is still valid and these guys haven't been merged already. + while(a == b || best_dist < dists(a,b)) + { + // Haven't merged it yet, so put it back in with updated distance for + // reconsideration later. + if (a != b) + que.push(sample_pair(a, b, dists(a, b))); + + best_dist = que.top().distance(); + a = sets.find_set(que.top().index1()); + b = sets.find_set(que.top().index2()); + que.pop(); + } + + + // now merge these sets if the best distance is small enough + if (best_dist > max_dist) + break; + unsigned long news = sets.merge_sets(a,b); + unsigned long olds = (news==a)?b:a; + merge_sets(dists, news, olds); + } + + // figure out which cluster each element is in. Also make sure the labels are + // contiguous. + std::map<unsigned long, unsigned long> relabel; + for (unsigned long r = 0; r < labels.size(); ++r) + { + unsigned long l = sets.find_set(r); + // relabel to make contiguous + if (relabel.count(l) == 0) + { + unsigned long next = relabel.size(); + relabel[l] = next; + } + labels[r] = relabel[l]; + } + + + return relabel.size(); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + struct snl_range + { + snl_range() = default; + snl_range(double val) : lower(val), upper(val) {} + snl_range(double l, double u) : lower(l), upper(u) { DLIB_ASSERT(lower <= upper)} + + double lower = 0; + double upper = 0; + + double width() const { return upper-lower; } + bool operator<(const snl_range& item) const { return lower < item.lower; } + }; + + inline snl_range merge(const snl_range& a, const snl_range& b) + { + return snl_range(std::min(a.lower, b.lower), std::max(a.upper, b.upper)); + } + + inline double distance (const snl_range& a, const snl_range& b) + { + return std::max(a.lower,b.lower) - std::min(a.upper,b.upper); + } + + inline std::ostream& operator<< (std::ostream& out, const snl_range& item ) + { + out << "["<<item.lower<<","<<item.upper<<"]"; + return out; + } + +// ---------------------------------------------------------------------------------------- + + inline std::vector<snl_range> segment_number_line ( + const std::vector<double>& x, + const double max_range_width + ) + { + DLIB_CASSERT(max_range_width >= 0); + + // create initial ranges, one for each value in x. So initially, all the ranges have + // width of 0. + std::vector<snl_range> ranges; + for (auto v : x) + ranges.push_back(v); + std::sort(ranges.begin(), ranges.end()); + + std::vector<snl_range> greedy_final_ranges; + if (ranges.size() == 0) + return greedy_final_ranges; + // We will try two different clustering strategies. One that does a simple greedy left + // to right sweep and another that does a bottom up agglomerative clustering. This + // first loop runs the greedy left to right sweep. Then at the end of this routine we + // will return the results that produced the tightest clustering. + greedy_final_ranges.push_back(ranges[0]); + for (size_t i = 1; i < ranges.size(); ++i) + { + auto m = merge(greedy_final_ranges.back(), ranges[i]); + if (m.width() <= max_range_width) + greedy_final_ranges.back() = m; + else + greedy_final_ranges.push_back(ranges[i]); + } + + + // Here we do the bottom up clustering. So compute the edges connecting our ranges. + // We will simply say there are edges between ranges if and only if they are + // immediately adjacent on the number line. + std::vector<sample_pair> edges; + for (size_t i = 1; i < ranges.size(); ++i) + edges.push_back(sample_pair(i-1,i, distance(ranges[i-1],ranges[i]))); + std::sort(edges.begin(), edges.end(), order_by_distance<sample_pair>); + + disjoint_subsets sets; + sets.set_size(ranges.size()); + + // Now start merging nodes. + for (auto edge : edges) + { + // find the next best thing to merge. + unsigned long a = sets.find_set(edge.index1()); + unsigned long b = sets.find_set(edge.index2()); + + // merge it if it doesn't result in an interval that's too big. + auto m = merge(ranges[a], ranges[b]); + if (m.width() <= max_range_width) + { + unsigned long news = sets.merge_sets(a,b); + ranges[news] = m; + } + } + + // Now create a list of the final ranges. We will do this by keeping track of which + // range we already added to final_ranges. + std::vector<snl_range> final_ranges; + std::vector<bool> already_output(ranges.size(), false); + for (unsigned long i = 0; i < sets.size(); ++i) + { + auto s = sets.find_set(i); + if (!already_output[s]) + { + final_ranges.push_back(ranges[s]); + already_output[s] = true; + } + } + + // only use the greedy clusters if they found a clustering with fewer clusters. + // Otherwise, the bottom up clustering probably produced a more sensible clustering. + if (final_ranges.size() <= greedy_final_ranges.size()) + return final_ranges; + else + return greedy_final_ranges; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_ + diff --git a/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h b/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h new file mode 100644 index 000000000..72d362c12 --- /dev/null +++ b/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h @@ -0,0 +1,136 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_ +#ifdef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_ + +#include "../matrix.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP + > + unsigned long bottom_up_cluster ( + const matrix_exp<EXP>& dists, + std::vector<unsigned long>& labels, + unsigned long min_num_clusters, + double max_dist = std::numeric_limits<double>::infinity() + ); + /*! + requires + - dists.nr() == dists.nc() + - min_num_clusters > 0 + - dists == trans(dists) + (l.e. dists should be symmetric) + ensures + - Runs a bottom up agglomerative clustering algorithm. + - Interprets dists as a matrix that gives the distances between dists.nr() + items. In particular, we take dists(i,j) to be the distance between the ith + and jth element of some set. This function clusters the elements of this set + into at least min_num_clusters (or dists.nr() if there aren't enough + elements). Additionally, within each cluster, the maximum pairwise distance + between any two cluster elements is <= max_dist. + - returns the number of clusters found. + - #labels.size() == dists.nr() + - for all valid i: + - #labels[i] == the cluster ID of the node with index i (i.e. the node + corresponding to the distances dists(i,*)). + - 0 <= #labels[i] < the number of clusters found + (i.e. cluster IDs are assigned contiguously and start at 0) + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + struct snl_range + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents an interval on the real number line. It is used + to store the outputs of the segment_number_line() routine defined below. + !*/ + + snl_range( + ); + /*! + ensures + - #lower == 0 + - #upper == 0 + !*/ + + snl_range( + double val + ); + /*! + ensures + - #lower == val + - #upper == val + !*/ + + snl_range( + double l, + double u + ); + /*! + requires + - l <= u + ensures + - #lower == l + - #upper == u + !*/ + + double lower; + double upper; + + double width( + ) const { return upper-lower; } + /*! + ensures + - returns the width of this interval on the number line. + !*/ + + bool operator<(const snl_range& item) const { return lower < item.lower; } + /*! + ensures + - provides a total ordering of snl_range objects assuming they are + non-overlapping. + !*/ + }; + + std::ostream& operator<< (std::ostream& out, const snl_range& item ); + /*! + ensures + - prints item to out in the form [lower,upper]. + !*/ + +// ---------------------------------------------------------------------------------------- + + std::vector<snl_range> segment_number_line ( + const std::vector<double>& x, + const double max_range_width + ); + /*! + requires + - max_range_width >= 0 + ensures + - Finds a clustering of the values in x and returns the ranges that define the + clustering. This routine uses a combination of bottom up clustering and a + simple greedy scan to try and find the most compact set of ranges that + contain all the values in x. + - This routine has approximately linear runtime. + - Every value in x will be contained inside one of the returned snl_range + objects; + - All returned snl_range object's will have a width() <= max_range_width and + will also be non-overlapping. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/clustering/chinese_whispers.h b/ml/dlib/dlib/clustering/chinese_whispers.h new file mode 100644 index 000000000..332cce1a0 --- /dev/null +++ b/ml/dlib/dlib/clustering/chinese_whispers.h @@ -0,0 +1,135 @@ +// Copyright (C) 2012 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CHINESE_WHISPErS_Hh_ +#define DLIB_CHINESE_WHISPErS_Hh_ + +#include "chinese_whispers_abstract.h" +#include <vector> +#include "../rand.h" +#include "../graph_utils/edge_list_graphs.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + inline unsigned long chinese_whispers ( + const std::vector<ordered_sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations, + dlib::rand& rnd + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(is_ordered_by_index(edges), + "\t unsigned long chinese_whispers()" + << "\n\t Invalid inputs were given to this function" + ); + + labels.clear(); + if (edges.size() == 0) + return 0; + + std::vector<std::pair<unsigned long, unsigned long> > neighbors; + find_neighbor_ranges(edges, neighbors); + + // Initialize the labels, each node gets a different label. + labels.resize(neighbors.size()); + for (unsigned long i = 0; i < labels.size(); ++i) + labels[i] = i; + + + for (unsigned long iter = 0; iter < neighbors.size()*num_iterations; ++iter) + { + // Pick a random node. + const unsigned long idx = rnd.get_random_64bit_number()%neighbors.size(); + + // Count how many times each label happens amongst our neighbors. + std::map<unsigned long, double> labels_to_counts; + const unsigned long end = neighbors[idx].second; + for (unsigned long i = neighbors[idx].first; i != end; ++i) + { + labels_to_counts[labels[edges[i].index2()]] += edges[i].distance(); + } + + // find the most common label + std::map<unsigned long, double>::iterator i; + double best_score = -std::numeric_limits<double>::infinity(); + unsigned long best_label = labels[idx]; + for (i = labels_to_counts.begin(); i != labels_to_counts.end(); ++i) + { + if (i->second > best_score) + { + best_score = i->second; + best_label = i->first; + } + } + + labels[idx] = best_label; + } + + + // Remap the labels into a contiguous range. First we find the + // mapping. + std::map<unsigned long,unsigned long> label_remap; + for (unsigned long i = 0; i < labels.size(); ++i) + { + const unsigned long next_id = label_remap.size(); + if (label_remap.count(labels[i]) == 0) + label_remap[labels[i]] = next_id; + } + // now apply the mapping to all the labels. + for (unsigned long i = 0; i < labels.size(); ++i) + { + labels[i] = label_remap[labels[i]]; + } + + return label_remap.size(); + } + +// ---------------------------------------------------------------------------------------- + + inline unsigned long chinese_whispers ( + const std::vector<sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations, + dlib::rand& rnd + ) + { + std::vector<ordered_sample_pair> oedges; + convert_unordered_to_ordered(edges, oedges); + std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>); + + return chinese_whispers(oedges, labels, num_iterations, rnd); + } + +// ---------------------------------------------------------------------------------------- + + inline unsigned long chinese_whispers ( + const std::vector<sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations = 100 + ) + { + dlib::rand rnd; + return chinese_whispers(edges, labels, num_iterations, rnd); + } + +// ---------------------------------------------------------------------------------------- + + inline unsigned long chinese_whispers ( + const std::vector<ordered_sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations = 100 + ) + { + dlib::rand rnd; + return chinese_whispers(edges, labels, num_iterations, rnd); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_CHINESE_WHISPErS_Hh_ + diff --git a/ml/dlib/dlib/clustering/chinese_whispers_abstract.h b/ml/dlib/dlib/clustering/chinese_whispers_abstract.h new file mode 100644 index 000000000..7a184c6f9 --- /dev/null +++ b/ml/dlib/dlib/clustering/chinese_whispers_abstract.h @@ -0,0 +1,97 @@ +// Copyright (C) 2012 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_ +#ifdef DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_ + +#include <vector> +#include "../rand.h" +#include "../graph_utils/ordered_sample_pair_abstract.h" +#include "../graph_utils/sample_pair_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + unsigned long chinese_whispers ( + const std::vector<ordered_sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations, + dlib::rand& rnd + ); + /*! + requires + - is_ordered_by_index(edges) == true + ensures + - This function implements the graph clustering algorithm described in the + paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its + Application to Natural Language Processing Problems by Chris Biemann. + - Interprets edges as a directed graph. That is, it contains the edges on the + said graph and the ordered_sample_pair::distance() values define the edge + weights (larger values indicating a stronger edge connection between the + nodes). If an edge has a distance() value of infinity then it is considered + a "must link" edge. + - returns the number of clusters found. + - #labels.size() == max_index_plus_one(edges) + - for all valid i: + - #labels[i] == the cluster ID of the node with index i in the graph. + - 0 <= #labels[i] < the number of clusters found + (i.e. cluster IDs are assigned contiguously and start at 0) + - Duplicate edges are interpreted as if there had been just one edge with a + distance value equal to the sum of all the duplicate edge's distance values. + - The algorithm performs exactly num_iterations passes over the graph before + terminating. + !*/ + +// ---------------------------------------------------------------------------------------- + + unsigned long chinese_whispers ( + const std::vector<sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations, + dlib::rand& rnd + ); + /*! + ensures + - This function is identical to the above chinese_whispers() routine except + that it operates on a vector of sample_pair objects instead of + ordered_sample_pairs. Therefore, this is simply a convenience routine. In + particular, it is implemented by transforming the given edges into + ordered_sample_pairs and then calling the chinese_whispers() routine defined + above. + !*/ + +// ---------------------------------------------------------------------------------------- + + unsigned long chinese_whispers ( + const std::vector<ordered_sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations = 100 + ); + /*! + requires + - is_ordered_by_index(edges) == true + ensures + - performs: return chinese_whispers(edges, labels, num_iterations, rnd) + where rnd is a default initialized dlib::rand object. + !*/ + +// ---------------------------------------------------------------------------------------- + + unsigned long chinese_whispers ( + const std::vector<sample_pair>& edges, + std::vector<unsigned long>& labels, + const unsigned long num_iterations = 100 + ); + /*! + ensures + - performs: return chinese_whispers(edges, labels, num_iterations, rnd) + where rnd is a default initialized dlib::rand object. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_ + diff --git a/ml/dlib/dlib/clustering/modularity_clustering.h b/ml/dlib/dlib/clustering/modularity_clustering.h new file mode 100644 index 000000000..8b8a0b0a5 --- /dev/null +++ b/ml/dlib/dlib/clustering/modularity_clustering.h @@ -0,0 +1,515 @@ +// Copyright (C) 2012 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_MODULARITY_ClUSTERING__H__ +#define DLIB_MODULARITY_ClUSTERING__H__ + +#include "modularity_clustering_abstract.h" +#include "../sparse_vector.h" +#include "../graph_utils/edge_list_graphs.h" +#include "../matrix.h" +#include "../rand.h" + +namespace dlib +{ + +// ----------------------------------------------------------------------------------------- + + namespace impl + { + inline double newman_cluster_split ( + dlib::rand& rnd, + const std::vector<ordered_sample_pair>& edges, + const matrix<double,0,1>& node_degrees, // k from the Newman paper + const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper + const double& edge_sum, // m from the Newman paper + matrix<double,0,1>& labels, + const double eps, + const unsigned long max_iterations + ) + /*! + requires + - node_degrees.size() == max_index_plus_one(edges) + - Bdiag.size() == max_index_plus_one(edges) + - edges must be sorted according to order_by_index() + ensures + - This routine splits a graph into two subgraphs using the Newman + clustering method. + - returns the modularity obtained when the graph is split according + to the contents of #labels. + - #labels.size() == node_degrees.size() + - for all valid i: #labels(i) == -1 or +1 + - if (this function returns 0) then + - all the labels are equal, i.e. the graph is not split. + !*/ + { + // Scale epsilon so that it is relative to the expected value of an element of a + // unit vector of length node_degrees.size(). + const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size()); + + // Make a random unit vector and put in labels. + labels.set_size(node_degrees.size()); + for (long i = 0; i < labels.size(); ++i) + labels(i) = rnd.get_random_gaussian(); + labels /= length(labels); + + matrix<double,0,1> Bv, Bv_unit; + + // Do the power iteration for a while. + double eig = -1; + double offset = 0; + while (eig < 0) + { + + // any number larger than power_iter_eps + double iteration_change = power_iter_eps*2+1; + for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i) + { + sparse_matrix_vector_multiply(edges, labels, Bv); + Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees; + + if (offset != 0) + { + Bv -= offset*labels; + } + + + const double len = length(Bv); + if (len != 0) + { + Bv_unit = Bv/len; + iteration_change = max(abs(labels-Bv_unit)); + labels.swap(Bv_unit); + } + else + { + // Had a bad time, pick another random vector and try it with the + // power iteration. + for (long i = 0; i < labels.size(); ++i) + labels(i) = rnd.get_random_gaussian(); + } + } + + eig = dot(Bv,labels); + // we will repeat this loop if the largest eigenvalue is negative + offset = eig; + } + + + for (long i = 0; i < labels.size(); ++i) + { + if (labels(i) > 0) + labels(i) = 1; + else + labels(i) = -1; + } + + + // compute B*labels, store result in Bv. + sparse_matrix_vector_multiply(edges, labels, Bv); + Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees; + + // Do some label refinement. In this step we swap labels if it + // improves the modularity score. + bool flipped_label = true; + while(flipped_label) + { + flipped_label = false; + unsigned long idx = 0; + for (long i = 0; i < labels.size(); ++i) + { + const double val = -2*labels(i); + const double increase = 4*Bdiag(i) + 2*val*Bv(i); + + // if there is an increase in modularity for swapping this label + if (increase > 0) + { + labels(i) *= -1; + while (idx < edges.size() && edges[idx].index1() == (unsigned long)i) + { + const long j = edges[idx].index2(); + Bv(j) += val*edges[idx].distance(); + ++idx; + } + + Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees; + + flipped_label = true; + } + else + { + while (idx < edges.size() && edges[idx].index1() == (unsigned long)i) + { + ++idx; + } + } + } + } + + + const double modularity = dot(Bv, labels)/(4*edge_sum); + + return modularity; + } + + // ------------------------------------------------------------------------------------- + + inline unsigned long newman_cluster_helper ( + dlib::rand& rnd, + const std::vector<ordered_sample_pair>& edges, + const matrix<double,0,1>& node_degrees, // k from the Newman paper + const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper + const double& edge_sum, // m from the Newman paper + std::vector<unsigned long>& labels, + double modularity_threshold, + const double eps, + const unsigned long max_iterations + ) + /*! + ensures + - returns the number of clusters the data was split into + !*/ + { + matrix<double,0,1> l; + const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations); + + + // We need to collapse the node index values down to contiguous values. So + // we use the following two vectors to contain the mappings from input index + // values to their corresponding index values in each split. + std::vector<unsigned long> left_idx_map(node_degrees.size()); + std::vector<unsigned long> right_idx_map(node_degrees.size()); + + // figure out how many nodes went into each side of the split. + unsigned long num_left_split = 0; + unsigned long num_right_split = 0; + for (long i = 0; i < l.size(); ++i) + { + if (l(i) > 0) + { + left_idx_map[i] = num_left_split; + ++num_left_split; + } + else + { + right_idx_map[i] = num_right_split; + ++num_right_split; + } + } + + // do a recursive split if it will improve the modularity. + if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0) + { + + // split the node_degrees and Bdiag matrices into left and right split parts + matrix<double,0,1> left_node_degrees(num_left_split); + matrix<double,0,1> right_node_degrees(num_right_split); + matrix<double,0,1> left_Bdiag(num_left_split); + matrix<double,0,1> right_Bdiag(num_right_split); + for (long i = 0; i < l.size(); ++i) + { + if (l(i) > 0) + { + left_node_degrees(left_idx_map[i]) = node_degrees(i); + left_Bdiag(left_idx_map[i]) = Bdiag(i); + } + else + { + right_node_degrees(right_idx_map[i]) = node_degrees(i); + right_Bdiag(right_idx_map[i]) = Bdiag(i); + } + } + + + // put the edges from one side of the split into split_edges + std::vector<ordered_sample_pair> split_edges; + modularity_threshold = 0; + for (unsigned long k = 0; k < edges.size(); ++k) + { + const unsigned long i = edges[k].index1(); + const unsigned long j = edges[k].index2(); + const double d = edges[k].distance(); + if (l(i) > 0 && l(j) > 0) + { + split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d)); + modularity_threshold += d; + } + } + modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum); + modularity_threshold /= 4*edge_sum; + + unsigned long num_left_clusters; + std::vector<unsigned long> left_labels; + num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag, + edge_sum,left_labels,modularity_threshold, + eps, max_iterations); + + // now load the other side into split_edges and cluster it as well + split_edges.clear(); + modularity_threshold = 0; + for (unsigned long k = 0; k < edges.size(); ++k) + { + const unsigned long i = edges[k].index1(); + const unsigned long j = edges[k].index2(); + const double d = edges[k].distance(); + if (l(i) < 0 && l(j) < 0) + { + split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d)); + modularity_threshold += d; + } + } + modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum); + modularity_threshold /= 4*edge_sum; + + unsigned long num_right_clusters; + std::vector<unsigned long> right_labels; + num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag, + edge_sum,right_labels,modularity_threshold, + eps, max_iterations); + + // Now merge the labels from the two splits. + labels.resize(node_degrees.size()); + for (unsigned long i = 0; i < labels.size(); ++i) + { + // if this node was in the left split + if (l(i) > 0) + { + labels[i] = left_labels[left_idx_map[i]]; + } + else // if this node was in the right split + { + labels[i] = right_labels[right_idx_map[i]] + num_left_clusters; + } + } + + + return num_left_clusters + num_right_clusters; + } + else + { + labels.assign(node_degrees.size(),0); + return 1; + } + + } + } + +// ---------------------------------------------------------------------------------------- + + inline unsigned long newman_cluster ( + const std::vector<ordered_sample_pair>& edges, + std::vector<unsigned long>& labels, + const double eps = 1e-4, + const unsigned long max_iterations = 2000 + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(is_ordered_by_index(edges), + "\t unsigned long newman_cluster()" + << "\n\t Invalid inputs were given to this function" + ); + + labels.clear(); + if (edges.size() == 0) + return 0; + + const unsigned long num_nodes = max_index_plus_one(edges); + + // compute the node_degrees vector, edge_sum value, and diag(B). + matrix<double,0,1> node_degrees(num_nodes); + matrix<double,0,1> Bdiag(num_nodes); + Bdiag = 0; + double edge_sum = 0; + node_degrees = 0; + for (unsigned long i = 0; i < edges.size(); ++i) + { + node_degrees(edges[i].index1()) += edges[i].distance(); + edge_sum += edges[i].distance(); + if (edges[i].index1() == edges[i].index2()) + Bdiag(edges[i].index1()) += edges[i].distance(); + } + edge_sum /= 2; + Bdiag -= squared(node_degrees)/(2*edge_sum); + + + dlib::rand rnd; + return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations); + } + +// ---------------------------------------------------------------------------------------- + + inline unsigned long newman_cluster ( + const std::vector<sample_pair>& edges, + std::vector<unsigned long>& labels, + const double eps = 1e-4, + const unsigned long max_iterations = 2000 + ) + { + std::vector<ordered_sample_pair> oedges; + convert_unordered_to_ordered(edges, oedges); + std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>); + + return newman_cluster(oedges, labels, eps, max_iterations); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + inline std::vector<unsigned long> remap_labels ( + const std::vector<unsigned long>& labels, + unsigned long& num_labels + ) + /*! + ensures + - This function takes labels and produces a mapping which maps elements of + labels into the most compact range in [0, max] as possible. In particular, + there won't be any unused integers in the mapped range. + - #num_labels == the number of distinct values in labels. + - returns a vector V such that: + - V.size() == labels.size() + - max(mat(V))+1 == num_labels. + - for all valid i,j: + - if (labels[i] == labels[j]) then + - V[i] == V[j] + - else + - V[i] != V[j] + !*/ + { + std::map<unsigned long, unsigned long> temp; + for (unsigned long i = 0; i < labels.size(); ++i) + { + if (temp.count(labels[i]) == 0) + { + const unsigned long next = temp.size(); + temp[labels[i]] = next; + } + } + + num_labels = temp.size(); + + std::vector<unsigned long> result(labels.size()); + for (unsigned long i = 0; i < labels.size(); ++i) + { + result[i] = temp[labels[i]]; + } + return result; + } + } + +// ---------------------------------------------------------------------------------------- + + inline double modularity ( + const std::vector<sample_pair>& edges, + const std::vector<unsigned long>& labels + ) + { + const unsigned long num_nodes = max_index_plus_one(edges); + // make sure requires clause is not broken + DLIB_ASSERT(labels.size() == num_nodes, + "\t double modularity()" + << "\n\t Invalid inputs were given to this function" + ); + + unsigned long num_labels; + const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); + + std::vector<double> cluster_sums(num_labels,0); + std::vector<double> k(num_nodes,0); + + double Q = 0; + double m = 0; + for (unsigned long i = 0; i < edges.size(); ++i) + { + const unsigned long n1 = edges[i].index1(); + const unsigned long n2 = edges[i].index2(); + k[n1] += edges[i].distance(); + if (n1 != n2) + k[n2] += edges[i].distance(); + + if (n1 != n2) + m += edges[i].distance(); + else + m += edges[i].distance()/2; + + if (labels_[n1] == labels_[n2]) + { + if (n1 != n2) + Q += 2*edges[i].distance(); + else + Q += edges[i].distance(); + } + } + + if (m == 0) + return 0; + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + cluster_sums[labels_[i]] += k[i]; + } + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + Q -= k[i]*cluster_sums[labels_[i]]/(2*m); + } + + return 1.0/(2*m)*Q; + } + +// ---------------------------------------------------------------------------------------- + + inline double modularity ( + const std::vector<ordered_sample_pair>& edges, + const std::vector<unsigned long>& labels + ) + { + const unsigned long num_nodes = max_index_plus_one(edges); + // make sure requires clause is not broken + DLIB_ASSERT(labels.size() == num_nodes, + "\t double modularity()" + << "\n\t Invalid inputs were given to this function" + ); + + + unsigned long num_labels; + const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); + + std::vector<double> cluster_sums(num_labels,0); + std::vector<double> k(num_nodes,0); + + double Q = 0; + double m = 0; + for (unsigned long i = 0; i < edges.size(); ++i) + { + const unsigned long n1 = edges[i].index1(); + const unsigned long n2 = edges[i].index2(); + k[n1] += edges[i].distance(); + m += edges[i].distance(); + if (labels_[n1] == labels_[n2]) + { + Q += edges[i].distance(); + } + } + + if (m == 0) + return 0; + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + cluster_sums[labels_[i]] += k[i]; + } + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + Q -= k[i]*cluster_sums[labels_[i]]/m; + } + + return 1.0/m*Q; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_MODULARITY_ClUSTERING__H__ + diff --git a/ml/dlib/dlib/clustering/modularity_clustering_abstract.h b/ml/dlib/dlib/clustering/modularity_clustering_abstract.h new file mode 100644 index 000000000..c1e7c20c4 --- /dev/null +++ b/ml/dlib/dlib/clustering/modularity_clustering_abstract.h @@ -0,0 +1,125 @@ +// Copyright (C) 2012 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_ +#ifdef DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_ + +#include <vector> +#include "../graph_utils/ordered_sample_pair_abstract.h" +#include "../graph_utils/sample_pair_abstract.h" + +namespace dlib +{ + +// ----------------------------------------------------------------------------------------- + + double modularity ( + const std::vector<sample_pair>& edges, + const std::vector<unsigned long>& labels + ); + /*! + requires + - labels.size() == max_index_plus_one(edges) + - for all valid i: + - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() + ensures + - Interprets edges as an undirected graph. That is, it contains the edges on + the said graph and the sample_pair::distance() values define the edge weights + (larger values indicating a stronger edge connection between the nodes). + - This function returns the modularity value obtained when the given input + graph is broken into subgraphs according to the contents of labels. In + particular, we say that two nodes with indices i and j are in the same + subgraph or community if and only if labels[i] == labels[j]. + - Duplicate edges are interpreted as if there had been just one edge with a + distance value equal to the sum of all the duplicate edge's distance values. + - See the paper Modularity and community structure in networks by M. E. J. Newman + for a detailed definition. + !*/ + +// ---------------------------------------------------------------------------------------- + + double modularity ( + const std::vector<ordered_sample_pair>& edges, + const std::vector<unsigned long>& labels + ); + /*! + requires + - labels.size() == max_index_plus_one(edges) + - for all valid i: + - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() + ensures + - Interprets edges as a directed graph. That is, it contains the edges on the + said graph and the ordered_sample_pair::distance() values define the edge + weights (larger values indicating a stronger edge connection between the + nodes). Note that, generally, modularity is only really defined for + undirected graphs. Therefore, the "directed graph" given to this function + should have symmetric edges between all nodes. The reason this function is + provided at all is because sometimes a vector of ordered_sample_pair objects + is a useful representation of an undirected graph. + - This function returns the modularity value obtained when the given input + graph is broken into subgraphs according to the contents of labels. In + particular, we say that two nodes with indices i and j are in the same + subgraph or community if and only if labels[i] == labels[j]. + - Duplicate edges are interpreted as if there had been just one edge with a + distance value equal to the sum of all the duplicate edge's distance values. + - See the paper Modularity and community structure in networks by M. E. J. Newman + for a detailed definition. + !*/ + +// ---------------------------------------------------------------------------------------- + + unsigned long newman_cluster ( + const std::vector<ordered_sample_pair>& edges, + std::vector<unsigned long>& labels, + const double eps = 1e-4, + const unsigned long max_iterations = 2000 + ); + /*! + requires + - is_ordered_by_index(edges) == true + - for all valid i: + - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() + ensures + - This function performs the clustering algorithm described in the paper + Modularity and community structure in networks by M. E. J. Newman. + - This function interprets edges as a graph and attempts to find the labeling + that maximizes modularity(edges, #labels). + - returns the number of clusters found. + - #labels.size() == max_index_plus_one(edges) + - for all valid i: + - #labels[i] == the cluster ID of the node with index i in the graph. + - 0 <= #labels[i] < the number of clusters found + (i.e. cluster IDs are assigned contiguously and start at 0) + - The main computation of the algorithm is involved in finding an eigenvector + of a certain matrix. To do this, we use the power iteration. In particular, + each time we try to find an eigenvector we will let the power iteration loop + at most max_iterations times or until it reaches an accuracy of eps. + Whichever comes first. + !*/ + +// ---------------------------------------------------------------------------------------- + + unsigned long newman_cluster ( + const std::vector<sample_pair>& edges, + std::vector<unsigned long>& labels, + const double eps = 1e-4, + const unsigned long max_iterations = 2000 + ); + /*! + requires + - for all valid i: + - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity() + ensures + - This function is identical to the above newman_cluster() routine except that + it operates on a vector of sample_pair objects instead of + ordered_sample_pairs. Therefore, this is simply a convenience routine. In + particular, it is implemented by transforming the given edges into + ordered_sample_pairs and then calling the newman_cluster() routine defined + above. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_ + diff --git a/ml/dlib/dlib/clustering/spectral_cluster.h b/ml/dlib/dlib/clustering/spectral_cluster.h new file mode 100644 index 000000000..2cac9870f --- /dev/null +++ b/ml/dlib/dlib/clustering/spectral_cluster.h @@ -0,0 +1,80 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_SPECTRAL_CLUSTEr_H_ +#define DLIB_SPECTRAL_CLUSTEr_H_ + +#include "spectral_cluster_abstract.h" +#include <vector> +#include "../matrix.h" +#include "../svm/kkmeans.h" + +namespace dlib +{ + template < + typename kernel_type, + typename vector_type + > + std::vector<unsigned long> spectral_cluster ( + const kernel_type& k, + const vector_type& samples, + const unsigned long num_clusters + ) + { + DLIB_CASSERT(num_clusters > 0, + "\t std::vector<unsigned long> spectral_cluster(k,samples,num_clusters)" + << "\n\t num_clusters can't be 0." + ); + + if (num_clusters == 1) + { + // nothing to do, just assign everything to the 0 cluster. + return std::vector<unsigned long>(samples.size(), 0); + } + + // compute the similarity matrix. + matrix<double> K(samples.size(), samples.size()); + for (long r = 0; r < K.nr(); ++r) + for (long c = r+1; c < K.nc(); ++c) + K(r,c) = K(c,r) = (double)k(samples[r], samples[c]); + for (long r = 0; r < K.nr(); ++r) + K(r,r) = 0; + + matrix<double,0,1> D(K.nr()); + for (long r = 0; r < K.nr(); ++r) + D(r) = sum(rowm(K,r)); + D = sqrt(reciprocal(D)); + K = diagm(D)*K*diagm(D); + matrix<double> u,w,v; + // Use the normal SVD routine unless the matrix is really big, then use the fast + // approximate version. + if (K.nr() < 1000) + svd3(K,u,w,v); + else + svd_fast(K,u,w,v, num_clusters+100, 5); + // Pick out the eigenvectors associated with the largest eigenvalues. + rsort_columns(v,w); + v = colm(v, range(0,num_clusters-1)); + // Now build the normalized spectral vectors, one for each input vector. + std::vector<matrix<double,0,1> > spec_samps, centers; + for (long r = 0; r < v.nr(); ++r) + { + spec_samps.push_back(trans(rowm(v,r))); + const double len = length(spec_samps.back()); + if (len != 0) + spec_samps.back() /= len; + } + // Finally do the K-means clustering + pick_initial_centers(num_clusters, centers, spec_samps); + find_clusters_using_kmeans(spec_samps, centers); + // And then compute the cluster assignments based on the output of K-means. + std::vector<unsigned long> assignments; + for (unsigned long i = 0; i < spec_samps.size(); ++i) + assignments.push_back(nearest_center(centers, spec_samps[i])); + + return assignments; + } + +} + +#endif // DLIB_SPECTRAL_CLUSTEr_H_ + diff --git a/ml/dlib/dlib/clustering/spectral_cluster_abstract.h b/ml/dlib/dlib/clustering/spectral_cluster_abstract.h new file mode 100644 index 000000000..880ad80af --- /dev/null +++ b/ml/dlib/dlib/clustering/spectral_cluster_abstract.h @@ -0,0 +1,43 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_ +#ifdef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_ + +#include <vector> + +namespace dlib +{ + template < + typename kernel_type, + typename vector_type + > + std::vector<unsigned long> spectral_cluster ( + const kernel_type& k, + const vector_type& samples, + const unsigned long num_clusters + ); + /*! + requires + - samples must be something with an interface compatible with std::vector. + - The following expression must evaluate to a double or float: + k(samples[i], samples[j]) + - num_clusters > 0 + ensures + - Performs the spectral clustering algorithm described in the paper: + On spectral clustering: Analysis and an algorithm by Ng, Jordan, and Weiss. + and returns the results. + - This function clusters the input data samples into num_clusters clusters and + returns a vector that indicates which cluster each sample falls into. In + particular, we return an array A such that: + - A.size() == samples.size() + - A[i] == the cluster assignment of samples[i]. + - for all valid i: 0 <= A[i] < num_clusters + - The "similarity" of samples[i] with samples[j] is given by + k(samples[i],samples[j]). This means that k() should output a number >= 0 + and the number should be larger for samples that are more similar. + !*/ +} + +#endif // DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_ + + |