// Copyright (C) 2012  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_MODULARITY_ClUSTERING__H__
#define DLIB_MODULARITY_ClUSTERING__H__

#include "modularity_clustering_abstract.h"
#include "../sparse_vector.h"
#include "../graph_utils/edge_list_graphs.h"
#include "../matrix.h"
#include "../rand.h"

namespace dlib
{

// -----------------------------------------------------------------------------------------

    namespace impl
    {
        inline double newman_cluster_split (
            dlib::rand& rnd,
            const std::vector<ordered_sample_pair>& edges,
            const matrix<double,0,1>& node_degrees, // k from the Newman paper
            const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
            const double& edge_sum,                 // m from the Newman paper
            matrix<double,0,1>& labels,
            const double eps,
            const unsigned long max_iterations
        )
        /*!
            requires
                - node_degrees.size() == max_index_plus_one(edges)
                - Bdiag.size() == max_index_plus_one(edges)
                - edges must be sorted according to order_by_index()
            ensures
                - This routine splits a graph into two subgraphs using the Newman
                  clustering method.
                - returns the modularity obtained when the graph is split according
                  to the contents of #labels.
                - #labels.size() == node_degrees.size()
                - for all valid i: #labels(i) == -1 or +1
                - if (this function returns 0) then
                    - all the labels are equal, i.e. the graph is not split.
        !*/
        {
            // Scale epsilon so that it is relative to the expected value of an element of a
            // unit vector of length node_degrees.size().
            const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());

            // Make a random unit vector and put it in labels.
            labels.set_size(node_degrees.size());
            for (long i = 0; i < labels.size(); ++i)
                labels(i) = rnd.get_random_gaussian();
            labels /= length(labels);

            matrix<double,0,1> Bv, Bv_unit;

            // Do the power iteration for a while.
            double eig = -1;
            double offset = 0;
            while (eig < 0)
            {
                // any number larger than power_iter_eps
                double iteration_change = power_iter_eps*2+1;
                for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i)
                {
                    sparse_matrix_vector_multiply(edges, labels, Bv);
                    Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

                    if (offset != 0)
                    {
                        Bv -= offset*labels;
                    }

                    const double len = length(Bv);
                    if (len != 0)
                    {
                        Bv_unit = Bv/len;
                        iteration_change = max(abs(labels-Bv_unit));
                        labels.swap(Bv_unit);
                    }
                    else
                    {
                        // Had a bad time, pick another random vector and try it with the
                        // power iteration.
                        for (long i = 0; i < labels.size(); ++i)
                            labels(i) = rnd.get_random_gaussian();
                    }
                }

                eig = dot(Bv,labels);
                // we will repeat this loop if the largest eigenvalue is negative
                offset = eig;
            }

            for (long i = 0; i < labels.size(); ++i)
            {
                if (labels(i) > 0)
                    labels(i) = 1;
                else
                    labels(i) = -1;
            }

            // compute B*labels, store result in Bv.
            sparse_matrix_vector_multiply(edges, labels, Bv);
            Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

            // Do some label refinement.  In this step we swap labels if doing so
            // improves the modularity score.
            bool flipped_label = true;
            while (flipped_label)
            {
                flipped_label = false;
                unsigned long idx = 0;
                for (long i = 0; i < labels.size(); ++i)
                {
                    const double val = -2*labels(i);
                    const double increase = 4*Bdiag(i) + 2*val*Bv(i);
                    // if there is an increase in modularity for swapping this label
                    if (increase > 0)
                    {
                        labels(i) *= -1;
                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
                        {
                            const long j = edges[idx].index2();
                            Bv(j) += val*edges[idx].distance();
                            ++idx;
                        }
                        Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;

                        flipped_label = true;
                    }
                    else
                    {
                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
                        {
                            ++idx;
                        }
                    }
                }
            }

            const double modularity = dot(Bv, labels)/(4*edge_sum);

            return modularity;
        }
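
        /*
            Implementation note on the power iteration above: the modularity matrix
            B = A - k*k'/(2*m) is dense, but it never needs to be formed explicitly.
            Since B*v == A*v - (dot(k,v)/(2*m))*k, each iteration costs only one
            sparse multiply by the adjacency matrix A (the
            sparse_matrix_vector_multiply() call) plus a rank-one correction.  The
            "offset" variable implements a standard spectral shift: if the dominant
            eigenvalue found is negative, the iteration is rerun on B - eig*I, which
            raises every eigenvalue by |eig| so that the algebraically largest
            eigenvalue of B becomes the dominant one.
        */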
    // -------------------------------------------------------------------------------------

        inline unsigned long newman_cluster_helper (
            dlib::rand& rnd,
            const std::vector<ordered_sample_pair>& edges,
            const matrix<double,0,1>& node_degrees, // k from the Newman paper
            const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
            const double& edge_sum,                 // m from the Newman paper
            std::vector<unsigned long>& labels,
            double modularity_threshold,
            const double eps,
            const unsigned long max_iterations
        )
        /*!
            ensures
                - returns the number of clusters the data was split into
        !*/
        {
            matrix<double,0,1> l;
            const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);

            // We need to collapse the node index values down to contiguous values.  So
            // we use the following two vectors to contain the mappings from input index
            // values to their corresponding index values in each split.
            std::vector<unsigned long> left_idx_map(node_degrees.size());
            std::vector<unsigned long> right_idx_map(node_degrees.size());

            // figure out how many nodes went into each side of the split.
            unsigned long num_left_split = 0;
            unsigned long num_right_split = 0;
            for (long i = 0; i < l.size(); ++i)
            {
                if (l(i) > 0)
                {
                    left_idx_map[i] = num_left_split;
                    ++num_left_split;
                }
                else
                {
                    right_idx_map[i] = num_right_split;
                    ++num_right_split;
                }
            }

            // do a recursive split if it will improve the modularity.
            if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0)
            {
                // split the node_degrees and Bdiag matrices into left and right split parts
                matrix<double,0,1> left_node_degrees(num_left_split);
                matrix<double,0,1> right_node_degrees(num_right_split);
                matrix<double,0,1> left_Bdiag(num_left_split);
                matrix<double,0,1> right_Bdiag(num_right_split);
                for (long i = 0; i < l.size(); ++i)
                {
                    if (l(i) > 0)
                    {
                        left_node_degrees(left_idx_map[i]) = node_degrees(i);
                        left_Bdiag(left_idx_map[i]) = Bdiag(i);
                    }
                    else
                    {
                        right_node_degrees(right_idx_map[i]) = node_degrees(i);
                        right_Bdiag(right_idx_map[i]) = Bdiag(i);
                    }
                }

                // put the edges from one side of the split into split_edges
                std::vector<ordered_sample_pair> split_edges;
                modularity_threshold = 0;
                for (unsigned long k = 0; k < edges.size(); ++k)
                {
                    const unsigned long i = edges[k].index1();
                    const unsigned long j = edges[k].index2();
                    const double d = edges[k].distance();
                    if (l(i) > 0 && l(j) > 0)
                    {
                        split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
                        modularity_threshold += d;
                    }
                }
                modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
                modularity_threshold /= 4*edge_sum;

                unsigned long num_left_clusters;
                std::vector<unsigned long> left_labels;
                num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
                                                          edge_sum,left_labels,modularity_threshold,
                                                          eps, max_iterations);

                // now load the other side into split_edges and cluster it as well
                split_edges.clear();
                modularity_threshold = 0;
                for (unsigned long k = 0; k < edges.size(); ++k)
                {
                    const unsigned long i = edges[k].index1();
                    const unsigned long j = edges[k].index2();
                    const double d = edges[k].distance();
                    if (l(i) < 0 && l(j) < 0)
                    {
                        split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
                        modularity_threshold += d;
                    }
                }
                modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
                modularity_threshold /= 4*edge_sum;

                unsigned long num_right_clusters;
                std::vector<unsigned long> right_labels;
                num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
                                                           edge_sum,right_labels,modularity_threshold,
                                                           eps, max_iterations);

                // Now merge the labels from the two splits.
                labels.resize(node_degrees.size());
                for (unsigned long i = 0; i < labels.size(); ++i)
                {
                    // if this node was in the left split
                    if (l(i) > 0)
                    {
                        labels[i] = left_labels[left_idx_map[i]];
                    }
                    else // if this node was in the right split
                    {
                        labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
                    }
                }

                return num_left_clusters + num_right_clusters;
            }
            else
            {
                labels.assign(node_degrees.size(),0);
                return 1;
            }
        }
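
        /*
            Note on the recursion above: the modularity_threshold handed to each
            recursive call is the modularity the subgraph would contribute if it were
            kept as a single cluster (its internal edge weight minus the k*k'/(2*m)
            expectation, on the same 1/(4*m) scale as the value returned by
            newman_cluster_split()).  Splitting changes the overall modularity by
            exactly the difference between these two quantities, so a subgraph is
            subdivided further only while doing so strictly increases the total
            modularity of the clustering.
        */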
    }

// ----------------------------------------------------------------------------------------

    inline unsigned long newman_cluster (
        const std::vector<ordered_sample_pair>& edges,
        std::vector<unsigned long>& labels,
        const double eps = 1e-4,
        const unsigned long max_iterations = 2000
    )
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(is_ordered_by_index(edges),
            "\t unsigned long newman_cluster()"
            << "\n\t Invalid inputs were given to this function"
            );

        labels.clear();
        if (edges.size() == 0)
            return 0;

        const unsigned long num_nodes = max_index_plus_one(edges);

        // compute the node_degrees vector, edge_sum value, and diag(B).
        matrix<double,0,1> node_degrees(num_nodes);
        matrix<double,0,1> Bdiag(num_nodes);
        Bdiag = 0;
        double edge_sum = 0;
        node_degrees = 0;
        for (unsigned long i = 0; i < edges.size(); ++i)
        {
            node_degrees(edges[i].index1()) += edges[i].distance();
            edge_sum += edges[i].distance();
            if (edges[i].index1() == edges[i].index2())
                Bdiag(edges[i].index1()) += edges[i].distance();
        }
        edge_sum /= 2;
        Bdiag -= squared(node_degrees)/(2*edge_sum);

        dlib::rand rnd;
        return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations);
    }

// ----------------------------------------------------------------------------------------

    inline unsigned long newman_cluster (
        const std::vector<sample_pair>& edges,
        std::vector<unsigned long>& labels,
        const double eps = 1e-4,
        const unsigned long max_iterations = 2000
    )
    {
        std::vector<ordered_sample_pair> oedges;
        convert_unordered_to_ordered(edges, oedges);
        std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);

        return newman_cluster(oedges, labels, eps, max_iterations);
    }
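
    /*
        A minimal usage sketch for newman_cluster() (the graph here is illustrative
        only):

            std::vector<sample_pair> edges;
            edges.push_back(sample_pair(0,1,1)); // an edge between nodes 0 and 1 with weight 1
            edges.push_back(sample_pair(1,2,1));
            edges.push_back(sample_pair(3,4,1));

            std::vector<unsigned long> labels;
            const unsigned long num_clusters = newman_cluster(edges, labels);
            // Now labels[i] gives the cluster of node i, all cluster labels are in
            // the range [0, num_clusters), and modularity(edges, labels) measures
            // how good the clustering is.
    */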
// ----------------------------------------------------------------------------------------

    namespace impl
    {
        inline std::vector<unsigned long> remap_labels (
            const std::vector<unsigned long>& labels,
            unsigned long& num_labels
        )
        /*!
            ensures
                - This function takes labels and produces a mapping which maps elements
                  of labels into the most compact range [0, max] possible.  In
                  particular, there won't be any unused integers in the mapped range.
                - #num_labels == the number of distinct values in labels.
                - returns a vector V such that:
                    - V.size() == labels.size()
                    - max(mat(V))+1 == num_labels.
                    - for all valid i,j:
                        - if (labels[i] == labels[j]) then
                            - V[i] == V[j]
                        - else
                            - V[i] != V[j]
        !*/
        {
            std::map<unsigned long, unsigned long> temp;
            for (unsigned long i = 0; i < labels.size(); ++i)
            {
                if (temp.count(labels[i]) == 0)
                {
                    const unsigned long next = temp.size();
                    temp[labels[i]] = next;
                }
            }

            num_labels = temp.size();

            std::vector<unsigned long> result(labels.size());
            for (unsigned long i = 0; i < labels.size(); ++i)
            {
                result[i] = temp[labels[i]];
            }
            return result;
        }
    }

// ----------------------------------------------------------------------------------------
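
    /*
        The two modularity() overloads below evaluate the usual Newman modularity

            Q = 1/(2*m) * sum over all node pairs (i,j) of
                    (A(i,j) - k(i)*k(j)/(2*m)) * delta(label(i),label(j))

        where A(i,j) is the weight of the edge between nodes i and j, k(i) is the
        weighted degree of node i, m is the total edge weight in the graph, and
        delta() is 1 when its arguments are equal and 0 otherwise.  Rather than
        looping over all node pairs, they accumulate the within-cluster edge weight
        in one pass over the edges and then subtract the k(i)*k(j)/(2*m) expectation
        using per-cluster degree sums.  The first overload takes each undirected edge
        once while the second takes edges in both directions, which is why their
        scale factors differ.
    */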
    inline double modularity (
        const std::vector<sample_pair>& edges,
        const std::vector<unsigned long>& labels
    )
    {
        const unsigned long num_nodes = max_index_plus_one(edges);
        // make sure requires clause is not broken
        DLIB_ASSERT(labels.size() == num_nodes,
            "\t double modularity()"
            << "\n\t Invalid inputs were given to this function"
            );

        unsigned long num_labels;
        const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);

        std::vector<double> cluster_sums(num_labels,0);
        std::vector<double> k(num_nodes,0);

        double Q = 0;
        double m = 0;
        for (unsigned long i = 0; i < edges.size(); ++i)
        {
            const unsigned long n1 = edges[i].index1();
            const unsigned long n2 = edges[i].index2();
            k[n1] += edges[i].distance();
            if (n1 != n2)
                k[n2] += edges[i].distance();

            if (n1 != n2)
                m += edges[i].distance();
            else
                m += edges[i].distance()/2;

            if (labels_[n1] == labels_[n2])
            {
                if (n1 != n2)
                    Q += 2*edges[i].distance();
                else
                    Q += edges[i].distance();
            }
        }

        if (m == 0)
            return 0;

        for (unsigned long i = 0; i < labels_.size(); ++i)
        {
            cluster_sums[labels_[i]] += k[i];
        }

        for (unsigned long i = 0; i < labels_.size(); ++i)
        {
            Q -= k[i]*cluster_sums[labels_[i]]/(2*m);
        }

        return 1.0/(2*m)*Q;
    }

// ----------------------------------------------------------------------------------------

    inline double modularity (
        const std::vector<ordered_sample_pair>& edges,
        const std::vector<unsigned long>& labels
    )
    {
        const unsigned long num_nodes = max_index_plus_one(edges);
        // make sure requires clause is not broken
        DLIB_ASSERT(labels.size() == num_nodes,
            "\t double modularity()"
            << "\n\t Invalid inputs were given to this function"
            );

        unsigned long num_labels;
        const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);

        std::vector<double> cluster_sums(num_labels,0);
        std::vector<double> k(num_nodes,0);

        double Q = 0;
        double m = 0;
        for (unsigned long i = 0; i < edges.size(); ++i)
        {
            const unsigned long n1 = edges[i].index1();
            const unsigned long n2 = edges[i].index2();
            k[n1] += edges[i].distance();
            m += edges[i].distance();
            if (labels_[n1] == labels_[n2])
            {
                Q += edges[i].distance();
            }
        }

        if (m == 0)
            return 0;

        for (unsigned long i = 0; i < labels_.size(); ++i)
        {
            cluster_sums[labels_[i]] += k[i];
        }

        for (unsigned long i = 0; i < labels_.size(); ++i)
        {
            Q -= k[i]*cluster_sums[labels_[i]]/m;
        }

        return 1.0/m*Q;
    }

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_MODULARITY_ClUSTERING__H__