diff options
Diffstat (limited to 'ml/dlib/dlib/clustering/modularity_clustering.h')
-rw-r--r-- | ml/dlib/dlib/clustering/modularity_clustering.h | 515 |
1 files changed, 515 insertions, 0 deletions
diff --git a/ml/dlib/dlib/clustering/modularity_clustering.h b/ml/dlib/dlib/clustering/modularity_clustering.h new file mode 100644 index 000000000..8b8a0b0a5 --- /dev/null +++ b/ml/dlib/dlib/clustering/modularity_clustering.h @@ -0,0 +1,515 @@ +// Copyright (C) 2012 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_MODULARITY_ClUSTERING__H__ +#define DLIB_MODULARITY_ClUSTERING__H__ + +#include "modularity_clustering_abstract.h" +#include "../sparse_vector.h" +#include "../graph_utils/edge_list_graphs.h" +#include "../matrix.h" +#include "../rand.h" + +namespace dlib +{ + +// ----------------------------------------------------------------------------------------- + + namespace impl + { + inline double newman_cluster_split ( + dlib::rand& rnd, + const std::vector<ordered_sample_pair>& edges, + const matrix<double,0,1>& node_degrees, // k from the Newman paper + const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper + const double& edge_sum, // m from the Newman paper + matrix<double,0,1>& labels, + const double eps, + const unsigned long max_iterations + ) + /*! + requires + - node_degrees.size() == max_index_plus_one(edges) + - Bdiag.size() == max_index_plus_one(edges) + - edges must be sorted according to order_by_index() + ensures + - This routine splits a graph into two subgraphs using the Newman + clustering method. + - returns the modularity obtained when the graph is split according + to the contents of #labels. + - #labels.size() == node_degrees.size() + - for all valid i: #labels(i) == -1 or +1 + - if (this function returns 0) then + - all the labels are equal, i.e. the graph is not split. + !*/ + { + // Scale epsilon so that it is relative to the expected value of an element of a + // unit vector of length node_degrees.size(). + const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size()); + + // Make a random unit vector and put in labels. + labels.set_size(node_degrees.size()); + for (long i = 0; i < labels.size(); ++i) + labels(i) = rnd.get_random_gaussian(); + labels /= length(labels); + + matrix<double,0,1> Bv, Bv_unit; + + // Do the power iteration for a while. + double eig = -1; + double offset = 0; + while (eig < 0) + { + + // any number larger than power_iter_eps + double iteration_change = power_iter_eps*2+1; + for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i) + { + sparse_matrix_vector_multiply(edges, labels, Bv); + Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees; + + if (offset != 0) + { + Bv -= offset*labels; + } + + + const double len = length(Bv); + if (len != 0) + { + Bv_unit = Bv/len; + iteration_change = max(abs(labels-Bv_unit)); + labels.swap(Bv_unit); + } + else + { + // Had a bad time, pick another random vector and try it with the + // power iteration. + for (long i = 0; i < labels.size(); ++i) + labels(i) = rnd.get_random_gaussian(); + } + } + + eig = dot(Bv,labels); + // we will repeat this loop if the largest eigenvalue is negative + offset = eig; + } + + + for (long i = 0; i < labels.size(); ++i) + { + if (labels(i) > 0) + labels(i) = 1; + else + labels(i) = -1; + } + + + // compute B*labels, store result in Bv. + sparse_matrix_vector_multiply(edges, labels, Bv); + Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees; + + // Do some label refinement. In this step we swap labels if it + // improves the modularity score. + bool flipped_label = true; + while(flipped_label) + { + flipped_label = false; + unsigned long idx = 0; + for (long i = 0; i < labels.size(); ++i) + { + const double val = -2*labels(i); + const double increase = 4*Bdiag(i) + 2*val*Bv(i); + + // if there is an increase in modularity for swapping this label + if (increase > 0) + { + labels(i) *= -1; + while (idx < edges.size() && edges[idx].index1() == (unsigned long)i) + { + const long j = edges[idx].index2(); + Bv(j) += val*edges[idx].distance(); + ++idx; + } + + Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees; + + flipped_label = true; + } + else + { + while (idx < edges.size() && edges[idx].index1() == (unsigned long)i) + { + ++idx; + } + } + } + } + + + const double modularity = dot(Bv, labels)/(4*edge_sum); + + return modularity; + } + + // ------------------------------------------------------------------------------------- + + inline unsigned long newman_cluster_helper ( + dlib::rand& rnd, + const std::vector<ordered_sample_pair>& edges, + const matrix<double,0,1>& node_degrees, // k from the Newman paper + const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper + const double& edge_sum, // m from the Newman paper + std::vector<unsigned long>& labels, + double modularity_threshold, + const double eps, + const unsigned long max_iterations + ) + /*! + ensures + - returns the number of clusters the data was split into + !*/ + { + matrix<double,0,1> l; + const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations); + + + // We need to collapse the node index values down to contiguous values. So + // we use the following two vectors to contain the mappings from input index + // values to their corresponding index values in each split. + std::vector<unsigned long> left_idx_map(node_degrees.size()); + std::vector<unsigned long> right_idx_map(node_degrees.size()); + + // figure out how many nodes went into each side of the split. + unsigned long num_left_split = 0; + unsigned long num_right_split = 0; + for (long i = 0; i < l.size(); ++i) + { + if (l(i) > 0) + { + left_idx_map[i] = num_left_split; + ++num_left_split; + } + else + { + right_idx_map[i] = num_right_split; + ++num_right_split; + } + } + + // do a recursive split if it will improve the modularity. + if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0) + { + + // split the node_degrees and Bdiag matrices into left and right split parts + matrix<double,0,1> left_node_degrees(num_left_split); + matrix<double,0,1> right_node_degrees(num_right_split); + matrix<double,0,1> left_Bdiag(num_left_split); + matrix<double,0,1> right_Bdiag(num_right_split); + for (long i = 0; i < l.size(); ++i) + { + if (l(i) > 0) + { + left_node_degrees(left_idx_map[i]) = node_degrees(i); + left_Bdiag(left_idx_map[i]) = Bdiag(i); + } + else + { + right_node_degrees(right_idx_map[i]) = node_degrees(i); + right_Bdiag(right_idx_map[i]) = Bdiag(i); + } + } + + + // put the edges from one side of the split into split_edges + std::vector<ordered_sample_pair> split_edges; + modularity_threshold = 0; + for (unsigned long k = 0; k < edges.size(); ++k) + { + const unsigned long i = edges[k].index1(); + const unsigned long j = edges[k].index2(); + const double d = edges[k].distance(); + if (l(i) > 0 && l(j) > 0) + { + split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d)); + modularity_threshold += d; + } + } + modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum); + modularity_threshold /= 4*edge_sum; + + unsigned long num_left_clusters; + std::vector<unsigned long> left_labels; + num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag, + edge_sum,left_labels,modularity_threshold, + eps, max_iterations); + + // now load the other side into split_edges and cluster it as well + split_edges.clear(); + modularity_threshold = 0; + for (unsigned long k = 0; k < edges.size(); ++k) + { + const unsigned long i = edges[k].index1(); + const unsigned long j = edges[k].index2(); + const double d = edges[k].distance(); + if (l(i) < 0 && l(j) < 0) + { + split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d)); + modularity_threshold += d; + } + } + modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum); + modularity_threshold /= 4*edge_sum; + + unsigned long num_right_clusters; + std::vector<unsigned long> right_labels; + num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag, + edge_sum,right_labels,modularity_threshold, + eps, max_iterations); + + // Now merge the labels from the two splits. + labels.resize(node_degrees.size()); + for (unsigned long i = 0; i < labels.size(); ++i) + { + // if this node was in the left split + if (l(i) > 0) + { + labels[i] = left_labels[left_idx_map[i]]; + } + else // if this node was in the right split + { + labels[i] = right_labels[right_idx_map[i]] + num_left_clusters; + } + } + + + return num_left_clusters + num_right_clusters; + } + else + { + labels.assign(node_degrees.size(),0); + return 1; + } + + } + } + +// ---------------------------------------------------------------------------------------- + + inline unsigned long newman_cluster ( + const std::vector<ordered_sample_pair>& edges, + std::vector<unsigned long>& labels, + const double eps = 1e-4, + const unsigned long max_iterations = 2000 + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(is_ordered_by_index(edges), + "\t unsigned long newman_cluster()" + << "\n\t Invalid inputs were given to this function" + ); + + labels.clear(); + if (edges.size() == 0) + return 0; + + const unsigned long num_nodes = max_index_plus_one(edges); + + // compute the node_degrees vector, edge_sum value, and diag(B). + matrix<double,0,1> node_degrees(num_nodes); + matrix<double,0,1> Bdiag(num_nodes); + Bdiag = 0; + double edge_sum = 0; + node_degrees = 0; + for (unsigned long i = 0; i < edges.size(); ++i) + { + node_degrees(edges[i].index1()) += edges[i].distance(); + edge_sum += edges[i].distance(); + if (edges[i].index1() == edges[i].index2()) + Bdiag(edges[i].index1()) += edges[i].distance(); + } + edge_sum /= 2; + Bdiag -= squared(node_degrees)/(2*edge_sum); + + + dlib::rand rnd; + return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations); + } + +// ---------------------------------------------------------------------------------------- + + inline unsigned long newman_cluster ( + const std::vector<sample_pair>& edges, + std::vector<unsigned long>& labels, + const double eps = 1e-4, + const unsigned long max_iterations = 2000 + ) + { + std::vector<ordered_sample_pair> oedges; + convert_unordered_to_ordered(edges, oedges); + std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>); + + return newman_cluster(oedges, labels, eps, max_iterations); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + inline std::vector<unsigned long> remap_labels ( + const std::vector<unsigned long>& labels, + unsigned long& num_labels + ) + /*! + ensures + - This function takes labels and produces a mapping which maps elements of + labels into the most compact range in [0, max] as possible. In particular, + there won't be any unused integers in the mapped range. + - #num_labels == the number of distinct values in labels. + - returns a vector V such that: + - V.size() == labels.size() + - max(mat(V))+1 == num_labels. + - for all valid i,j: + - if (labels[i] == labels[j]) then + - V[i] == V[j] + - else + - V[i] != V[j] + !*/ + { + std::map<unsigned long, unsigned long> temp; + for (unsigned long i = 0; i < labels.size(); ++i) + { + if (temp.count(labels[i]) == 0) + { + const unsigned long next = temp.size(); + temp[labels[i]] = next; + } + } + + num_labels = temp.size(); + + std::vector<unsigned long> result(labels.size()); + for (unsigned long i = 0; i < labels.size(); ++i) + { + result[i] = temp[labels[i]]; + } + return result; + } + } + +// ---------------------------------------------------------------------------------------- + + inline double modularity ( + const std::vector<sample_pair>& edges, + const std::vector<unsigned long>& labels + ) + { + const unsigned long num_nodes = max_index_plus_one(edges); + // make sure requires clause is not broken + DLIB_ASSERT(labels.size() == num_nodes, + "\t double modularity()" + << "\n\t Invalid inputs were given to this function" + ); + + unsigned long num_labels; + const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); + + std::vector<double> cluster_sums(num_labels,0); + std::vector<double> k(num_nodes,0); + + double Q = 0; + double m = 0; + for (unsigned long i = 0; i < edges.size(); ++i) + { + const unsigned long n1 = edges[i].index1(); + const unsigned long n2 = edges[i].index2(); + k[n1] += edges[i].distance(); + if (n1 != n2) + k[n2] += edges[i].distance(); + + if (n1 != n2) + m += edges[i].distance(); + else + m += edges[i].distance()/2; + + if (labels_[n1] == labels_[n2]) + { + if (n1 != n2) + Q += 2*edges[i].distance(); + else + Q += edges[i].distance(); + } + } + + if (m == 0) + return 0; + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + cluster_sums[labels_[i]] += k[i]; + } + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + Q -= k[i]*cluster_sums[labels_[i]]/(2*m); + } + + return 1.0/(2*m)*Q; + } + +// ---------------------------------------------------------------------------------------- + + inline double modularity ( + const std::vector<ordered_sample_pair>& edges, + const std::vector<unsigned long>& labels + ) + { + const unsigned long num_nodes = max_index_plus_one(edges); + // make sure requires clause is not broken + DLIB_ASSERT(labels.size() == num_nodes, + "\t double modularity()" + << "\n\t Invalid inputs were given to this function" + ); + + + unsigned long num_labels; + const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); + + std::vector<double> cluster_sums(num_labels,0); + std::vector<double> k(num_nodes,0); + + double Q = 0; + double m = 0; + for (unsigned long i = 0; i < edges.size(); ++i) + { + const unsigned long n1 = edges[i].index1(); + const unsigned long n2 = edges[i].index2(); + k[n1] += edges[i].distance(); + m += edges[i].distance(); + if (labels_[n1] == labels_[n2]) + { + Q += edges[i].distance(); + } + } + + if (m == 0) + return 0; + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + cluster_sums[labels_[i]] += k[i]; + } + + for (unsigned long i = 0; i < labels_.size(); ++i) + { + Q -= k[i]*cluster_sums[labels_[i]]/m; + } + + return 1.0/m*Q; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_MODULARITY_ClUSTERING__H__ + |