Diffstat (limited to 'ml/dlib/dlib/clustering/modularity_clustering.h')
-rw-r--r--  ml/dlib/dlib/clustering/modularity_clustering.h  515
1 file changed, 515 insertions, 0 deletions
diff --git a/ml/dlib/dlib/clustering/modularity_clustering.h b/ml/dlib/dlib/clustering/modularity_clustering.h
new file mode 100644
index 000000000..8b8a0b0a5
--- /dev/null
+++ b/ml/dlib/dlib/clustering/modularity_clustering.h
@@ -0,0 +1,515 @@
+// Copyright (C) 2012 Davis E. King (davis@dlib.net)
+// License: Boost Software License.  See LICENSE.txt for the full license.
+#ifndef DLIB_MODULARITY_CLUSTERING__H__
+#define DLIB_MODULARITY_CLUSTERING__H__
+
+#include "modularity_clustering_abstract.h"
+#include "../sparse_vector.h"
+#include "../graph_utils/edge_list_graphs.h"
+#include "../matrix.h"
+#include "../rand.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
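+
+    // The routines in this file implement the leading-eigenvector modularity
+    // clustering method described in "Modularity and community structure in
+    // networks" by M. E. J. Newman, referred to below as "the Newman paper".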
+
+ namespace impl
+ {
+ inline double newman_cluster_split (
+ dlib::rand& rnd,
+ const std::vector<ordered_sample_pair>& edges,
+ const matrix<double,0,1>& node_degrees, // k from the Newman paper
+ const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper
+ const double& edge_sum, // m from the Newman paper
+ matrix<double,0,1>& labels,
+ const double eps,
+ const unsigned long max_iterations
+ )
+ /*!
+ requires
+ - node_degrees.size() == max_index_plus_one(edges)
+ - Bdiag.size() == max_index_plus_one(edges)
+ - edges must be sorted according to order_by_index()
+ ensures
+ - This routine splits a graph into two subgraphs using the Newman
+ clustering method.
+ - returns the modularity obtained when the graph is split according
+ to the contents of #labels.
+ - #labels.size() == node_degrees.size()
+ - for all valid i: #labels(i) == -1 or +1
+ - if (this function returns 0) then
+ - all the labels are equal, i.e. the graph is not split.
+ !*/
+ {
+ // Scale epsilon so that it is relative to the expected value of an element of a
+ // unit vector of length node_degrees.size().
+ const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());
+
+ // Make a random unit vector and put in labels.
+ labels.set_size(node_degrees.size());
+ for (long i = 0; i < labels.size(); ++i)
+ labels(i) = rnd.get_random_gaussian();
+ labels /= length(labels);
+
+ matrix<double,0,1> Bv, Bv_unit;
+
+ // Do the power iteration for a while.
+ double eig = -1;
+ double offset = 0;
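+
+            // Here B = A - k*k'/(2*m) is the modularity matrix from the Newman
+            // paper.  It is never formed explicitly; the first two statements in
+            // the loop below apply it to a vector implicitly.  If the power
+            // iteration converges to a negative eigenvalue then we shift the
+            // spectrum by that value (the offset term) and run it again so that
+            // it converges to the most positive eigenvalue instead.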
+ while (eig < 0)
+ {
+
+ // any number larger than power_iter_eps
+ double iteration_change = power_iter_eps*2+1;
+ for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i)
+ {
+ sparse_matrix_vector_multiply(edges, labels, Bv);
+ Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
+
+ if (offset != 0)
+ {
+ Bv -= offset*labels;
+ }
+
+
+ const double len = length(Bv);
+ if (len != 0)
+ {
+ Bv_unit = Bv/len;
+ iteration_change = max(abs(labels-Bv_unit));
+ labels.swap(Bv_unit);
+ }
+ else
+ {
+                        // The iterate collapsed to zero.  Pick another random
+                        // vector and continue the power iteration with it.
+ for (long i = 0; i < labels.size(); ++i)
+ labels(i) = rnd.get_random_gaussian();
+ }
+ }
+
+ eig = dot(Bv,labels);
+ // we will repeat this loop if the largest eigenvalue is negative
+ offset = eig;
+ }
+
+
+ for (long i = 0; i < labels.size(); ++i)
+ {
+ if (labels(i) > 0)
+ labels(i) = 1;
+ else
+ labels(i) = -1;
+ }
+
+
+ // compute B*labels, store result in Bv.
+ sparse_matrix_vector_multiply(edges, labels, Bv);
+ Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
+
+ // Do some label refinement. In this step we swap labels if it
+ // improves the modularity score.
+ bool flipped_label = true;
+ while(flipped_label)
+ {
+ flipped_label = false;
+ unsigned long idx = 0;
+ for (long i = 0; i < labels.size(); ++i)
+ {
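+                    // Flipping labels(i) changes the quantity s'*B*s by
+                    // 4*B(i,i) - 4*labels(i)*(B*s)(i).  With val = -2*labels(i)
+                    // that is exactly the "increase" computed below.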
+ const double val = -2*labels(i);
+ const double increase = 4*Bdiag(i) + 2*val*Bv(i);
+
+ // if there is an increase in modularity for swapping this label
+ if (increase > 0)
+ {
+ labels(i) *= -1;
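+                        // Keep Bv == B*labels in sync with the flip: the loop
+                        // below updates the adjacency part for i's neighbors and
+                        // the line after it updates the rank-one k*k'/(2*m) part.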
+ while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
+ {
+ const long j = edges[idx].index2();
+ Bv(j) += val*edges[idx].distance();
+ ++idx;
+ }
+
+ Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;
+
+ flipped_label = true;
+ }
+ else
+ {
+ while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
+ {
+ ++idx;
+ }
+ }
+ }
+ }
+
+
+ const double modularity = dot(Bv, labels)/(4*edge_sum);
+
+ return modularity;
+ }
+
+ // -------------------------------------------------------------------------------------
+
+ inline unsigned long newman_cluster_helper (
+ dlib::rand& rnd,
+ const std::vector<ordered_sample_pair>& edges,
+ const matrix<double,0,1>& node_degrees, // k from the Newman paper
+ const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper
+ const double& edge_sum, // m from the Newman paper
+ std::vector<unsigned long>& labels,
+ double modularity_threshold,
+ const double eps,
+ const unsigned long max_iterations
+ )
+ /*!
+ ensures
+ - returns the number of clusters the data was split into
+ !*/
+ {
+ matrix<double,0,1> l;
+ const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);
+
+
+ // We need to collapse the node index values down to contiguous values. So
+ // we use the following two vectors to contain the mappings from input index
+ // values to their corresponding index values in each split.
+ std::vector<unsigned long> left_idx_map(node_degrees.size());
+ std::vector<unsigned long> right_idx_map(node_degrees.size());
+
+ // figure out how many nodes went into each side of the split.
+ unsigned long num_left_split = 0;
+ unsigned long num_right_split = 0;
+ for (long i = 0; i < l.size(); ++i)
+ {
+ if (l(i) > 0)
+ {
+ left_idx_map[i] = num_left_split;
+ ++num_left_split;
+ }
+ else
+ {
+ right_idx_map[i] = num_right_split;
+ ++num_right_split;
+ }
+ }
+
+ // do a recursive split if it will improve the modularity.
+ if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0)
+ {
+
+ // split the node_degrees and Bdiag matrices into left and right split parts
+ matrix<double,0,1> left_node_degrees(num_left_split);
+ matrix<double,0,1> right_node_degrees(num_right_split);
+ matrix<double,0,1> left_Bdiag(num_left_split);
+ matrix<double,0,1> right_Bdiag(num_right_split);
+ for (long i = 0; i < l.size(); ++i)
+ {
+ if (l(i) > 0)
+ {
+ left_node_degrees(left_idx_map[i]) = node_degrees(i);
+ left_Bdiag(left_idx_map[i]) = Bdiag(i);
+ }
+ else
+ {
+ right_node_degrees(right_idx_map[i]) = node_degrees(i);
+ right_Bdiag(right_idx_map[i]) = Bdiag(i);
+ }
+ }
+
+
+ // put the edges from one side of the split into split_edges
+ std::vector<ordered_sample_pair> split_edges;
+ modularity_threshold = 0;
+ for (unsigned long k = 0; k < edges.size(); ++k)
+ {
+ const unsigned long i = edges[k].index1();
+ const unsigned long j = edges[k].index2();
+ const double d = edges[k].distance();
+ if (l(i) > 0 && l(j) > 0)
+ {
+ split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
+ modularity_threshold += d;
+ }
+ }
+ modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
+ modularity_threshold /= 4*edge_sum;
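+
+                // modularity_threshold now holds the modularity of the left split
+                // kept as a single cluster; the recursive call below only accepts
+                // a further split if it improves on this value.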
+
+ unsigned long num_left_clusters;
+ std::vector<unsigned long> left_labels;
+ num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
+ edge_sum,left_labels,modularity_threshold,
+ eps, max_iterations);
+
+ // now load the other side into split_edges and cluster it as well
+ split_edges.clear();
+ modularity_threshold = 0;
+ for (unsigned long k = 0; k < edges.size(); ++k)
+ {
+ const unsigned long i = edges[k].index1();
+ const unsigned long j = edges[k].index2();
+ const double d = edges[k].distance();
+ if (l(i) < 0 && l(j) < 0)
+ {
+ split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
+ modularity_threshold += d;
+ }
+ }
+ modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
+ modularity_threshold /= 4*edge_sum;
+
+ unsigned long num_right_clusters;
+ std::vector<unsigned long> right_labels;
+ num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
+ edge_sum,right_labels,modularity_threshold,
+ eps, max_iterations);
+
+ // Now merge the labels from the two splits.
+ labels.resize(node_degrees.size());
+ for (unsigned long i = 0; i < labels.size(); ++i)
+ {
+ // if this node was in the left split
+ if (l(i) > 0)
+ {
+ labels[i] = left_labels[left_idx_map[i]];
+ }
+ else // if this node was in the right split
+ {
+ labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
+ }
+ }
+
+
+ return num_left_clusters + num_right_clusters;
+ }
+ else
+ {
+ labels.assign(node_degrees.size(),0);
+ return 1;
+ }
+
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline unsigned long newman_cluster (
+ const std::vector<ordered_sample_pair>& edges,
+ std::vector<unsigned long>& labels,
+ const double eps = 1e-4,
+ const unsigned long max_iterations = 2000
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_ordered_by_index(edges),
+ "\t unsigned long newman_cluster()"
+ << "\n\t Invalid inputs were given to this function"
+ );
+
+ labels.clear();
+ if (edges.size() == 0)
+ return 0;
+
+ const unsigned long num_nodes = max_index_plus_one(edges);
+
+ // compute the node_degrees vector, edge_sum value, and diag(B).
+ matrix<double,0,1> node_degrees(num_nodes);
+ matrix<double,0,1> Bdiag(num_nodes);
+ Bdiag = 0;
+ double edge_sum = 0;
+ node_degrees = 0;
+ for (unsigned long i = 0; i < edges.size(); ++i)
+ {
+ node_degrees(edges[i].index1()) += edges[i].distance();
+ edge_sum += edges[i].distance();
+ if (edges[i].index1() == edges[i].index2())
+ Bdiag(edges[i].index1()) += edges[i].distance();
+ }
+ edge_sum /= 2;
+ Bdiag -= squared(node_degrees)/(2*edge_sum);
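+        // Each undirected edge appears twice in an ordered edge list, so edge_sum
+        // accumulated 2*m before the division above recovered m.  Bdiag now holds
+        // diag(B), i.e. B(i,i) = A(i,i) - k(i)^2/(2*m).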
+
+
+ dlib::rand rnd;
+ return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline unsigned long newman_cluster (
+ const std::vector<sample_pair>& edges,
+ std::vector<unsigned long>& labels,
+ const double eps = 1e-4,
+ const unsigned long max_iterations = 2000
+ )
+ {
+ std::vector<ordered_sample_pair> oedges;
+ convert_unordered_to_ordered(edges, oedges);
+ std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
+
+ return newman_cluster(oedges, labels, eps, max_iterations);
+ }
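+
+    // A minimal usage sketch (hypothetical graph, not part of the original file):
+    // nodes 0,1,2 form one connected group and nodes 3,4 another, so the call
+    // should report two clusters.
+    //
+    //     std::vector<sample_pair> edges;
+    //     edges.push_back(sample_pair(0,1,1));
+    //     edges.push_back(sample_pair(1,2,1));
+    //     edges.push_back(sample_pair(3,4,1));
+    //     std::vector<unsigned long> labels;
+    //     const unsigned long num_clusters = newman_cluster(edges, labels);
+    //     // labels[i] gives the cluster id of node i, in [0, num_clusters).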
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ inline std::vector<unsigned long> remap_labels (
+ const std::vector<unsigned long>& labels,
+ unsigned long& num_labels
+ )
+ /*!
+ ensures
+            - This function takes labels and produces a mapping which maps the
+              elements of labels into as compact a range [0, max] as possible.
+              In particular, there won't be any unused integers in the mapped range.
+ - #num_labels == the number of distinct values in labels.
+ - returns a vector V such that:
+ - V.size() == labels.size()
+ - max(mat(V))+1 == num_labels.
+ - for all valid i,j:
+ - if (labels[i] == labels[j]) then
+ - V[i] == V[j]
+ - else
+ - V[i] != V[j]
+ !*/
+ {
+ std::map<unsigned long, unsigned long> temp;
+ for (unsigned long i = 0; i < labels.size(); ++i)
+ {
+ if (temp.count(labels[i]) == 0)
+ {
+ const unsigned long next = temp.size();
+ temp[labels[i]] = next;
+ }
+ }
+
+ num_labels = temp.size();
+
+ std::vector<unsigned long> result(labels.size());
+ for (unsigned long i = 0; i < labels.size(); ++i)
+ {
+ result[i] = temp[labels[i]];
+ }
+ return result;
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
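+
+    // Computes Q = (1/(2*m)) * sum over all pairs (i,j) in the same cluster of
+    // (A(i,j) - k(i)*k(j)/(2*m)), where k(i) is the weighted degree of node i and
+    // m is the total edge weight in the graph.  The k(i)*k(j) term is accumulated
+    // per cluster via cluster_sums rather than per pair.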
+
+ inline double modularity (
+ const std::vector<sample_pair>& edges,
+ const std::vector<unsigned long>& labels
+ )
+ {
+ const unsigned long num_nodes = max_index_plus_one(edges);
+ // make sure requires clause is not broken
+ DLIB_ASSERT(labels.size() == num_nodes,
+ "\t double modularity()"
+ << "\n\t Invalid inputs were given to this function"
+ );
+
+ unsigned long num_labels;
+ const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
+
+ std::vector<double> cluster_sums(num_labels,0);
+ std::vector<double> k(num_nodes,0);
+
+ double Q = 0;
+ double m = 0;
+ for (unsigned long i = 0; i < edges.size(); ++i)
+ {
+ const unsigned long n1 = edges[i].index1();
+ const unsigned long n2 = edges[i].index2();
+ k[n1] += edges[i].distance();
+ if (n1 != n2)
+ k[n2] += edges[i].distance();
+
+ if (n1 != n2)
+ m += edges[i].distance();
+ else
+ m += edges[i].distance()/2;
+
+ if (labels_[n1] == labels_[n2])
+ {
+ if (n1 != n2)
+ Q += 2*edges[i].distance();
+ else
+ Q += edges[i].distance();
+ }
+ }
+
+ if (m == 0)
+ return 0;
+
+ for (unsigned long i = 0; i < labels_.size(); ++i)
+ {
+ cluster_sums[labels_[i]] += k[i];
+ }
+
+ for (unsigned long i = 0; i < labels_.size(); ++i)
+ {
+ Q -= k[i]*cluster_sums[labels_[i]]/(2*m);
+ }
+
+ return 1.0/(2*m)*Q;
+ }
+
+// ----------------------------------------------------------------------------------------
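+
+    // Same as above, but for an ordered edge list where each undirected edge
+    // appears once in each direction.  Here m accumulates twice the undirected
+    // edge weight, so normalizing by m rather than 2*m yields the same Q.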
+
+ inline double modularity (
+ const std::vector<ordered_sample_pair>& edges,
+ const std::vector<unsigned long>& labels
+ )
+ {
+ const unsigned long num_nodes = max_index_plus_one(edges);
+ // make sure requires clause is not broken
+ DLIB_ASSERT(labels.size() == num_nodes,
+ "\t double modularity()"
+ << "\n\t Invalid inputs were given to this function"
+ );
+
+
+ unsigned long num_labels;
+ const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
+
+ std::vector<double> cluster_sums(num_labels,0);
+ std::vector<double> k(num_nodes,0);
+
+ double Q = 0;
+ double m = 0;
+ for (unsigned long i = 0; i < edges.size(); ++i)
+ {
+ const unsigned long n1 = edges[i].index1();
+ const unsigned long n2 = edges[i].index2();
+ k[n1] += edges[i].distance();
+ m += edges[i].distance();
+ if (labels_[n1] == labels_[n2])
+ {
+ Q += edges[i].distance();
+ }
+ }
+
+ if (m == 0)
+ return 0;
+
+ for (unsigned long i = 0; i < labels_.size(); ++i)
+ {
+ cluster_sums[labels_[i]] += k[i];
+ }
+
+ for (unsigned long i = 0; i < labels_.size(); ++i)
+ {
+ Q -= k[i]*cluster_sums[labels_[i]]/m;
+ }
+
+ return 1.0/m*Q;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MODULARITY_CLUSTERING__H__
+