1 files changed, 515 insertions, 0 deletions
diff --git a/ml/dlib/dlib/clustering/modularity_clustering.h b/ml/dlib/dlib/clustering/modularity_clustering.h
new file mode 100644
index 000000000..8b8a0b0a5
--- /dev/null
+++ b/ml/dlib/dlib/clustering/modularity_clustering.h
@@ -0,0 +1,515 @@
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_MODULARITY_ClUSTERING__H__
+#define DLIB_MODULARITY_ClUSTERING__H__
+
+#include "modularity_clustering_abstract.h"
+#include "../sparse_vector.h"
+#include "../graph_utils/edge_list_graphs.h"
+#include "../matrix.h"
+#include "../rand.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        inline double newman_cluster_split (
+            dlib::rand& rnd,
+            const std::vector<ordered_sample_pair>& edges,
+            const matrix<double,0,1>& node_degrees, // k from the Newman paper
+            const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
+            const double& edge_sum,                 // m from the Newman paper
+            matrix<double,0,1>& labels,
+            const double eps,
+            const unsigned long max_iterations
+        )
+        /*!
+            requires
+                - node_degrees.size() == max_index_plus_one(edges)
+                - Bdiag.size() == max_index_plus_one(edges)
+                - edges must be sorted according to order_by_index()
+            ensures
+                - This routine splits a graph into two subgraphs using the Newman 
+                  clustering method.  
+                - returns the modularity obtained when the graph is split according
+                  to the contents of #labels. 
+                - #labels.size() == node_degrees.size()
+                - for all valid i: #labels(i) == -1 or +1
+                - if (this function returns 0) then
+                    - all the labels are equal, i.e. the graph is not split.
+        !*/
+        {
+            // Scale epsilon so that it is relative to the expected value of an element of a
+            // unit vector of length node_degrees.size().
+            const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());
+
+            // Make a random unit vector and put in labels.
+            labels.set_size(node_degrees.size());
+            for (long i = 0; i < labels.size(); ++i)
+                labels(i) = rnd.get_random_gaussian();
+            labels /= length(labels);
+
+            matrix<double,0,1> Bv, Bv_unit;
+
+            // Do the power iteration for a while.
+            double eig = -1;
+            double offset = 0;
+            while (eig < 0)
+            {
+
+                // any number larger than power_iter_eps
+                double iteration_change = power_iter_eps*2+1; 
+                for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i) 
+                {
+                    sparse_matrix_vector_multiply(edges, labels, Bv);
+                    Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
+
+                    if (offset != 0)
+                    {
+                        Bv -= offset*labels;
+                    }
+
+
+                    const double len = length(Bv);
+                    if (len != 0)
+                    {
+                        Bv_unit = Bv/len;
+                        iteration_change = max(abs(labels-Bv_unit));
+                        labels.swap(Bv_unit);
+                    }
+                    else
+                    {
+                        // Had a bad time, pick another random vector and try it with the
+                        // power iteration.
+                        for (long i = 0; i < labels.size(); ++i)
+                            labels(i) = rnd.get_random_gaussian();
+                    }
+                }
+
+                eig = dot(Bv,labels);
+                // we will repeat this loop if the largest eigenvalue is negative
+                offset = eig;
+            }
+
+
+            for (long i = 0; i < labels.size(); ++i)
+            {
+                if (labels(i) > 0)
+                    labels(i) = 1;
+                else
+                    labels(i) = -1;
+            }
+
+
+            // compute B*labels, store result in Bv.
+            sparse_matrix_vector_multiply(edges, labels, Bv);
+            Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
+
+            // Do some label refinement.  In this step we swap labels if it
+            // improves the modularity score.
+            bool flipped_label = true;
+            while(flipped_label)
+            {
+                flipped_label = false;
+                unsigned long idx = 0;
+                for (long i = 0; i < labels.size(); ++i)
+                {
+                    const double val = -2*labels(i);
+                    const double increase = 4*Bdiag(i) + 2*val*Bv(i);
+
+                    // if there is an increase in modularity for swapping this label
+                    if (increase > 0)
+                    {
+                        labels(i) *= -1;
+                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
+                        {
+                            const long j = edges[idx].index2();
+                            Bv(j) += val*edges[idx].distance();
+                            ++idx;
+                        }
+
+                        Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;
+
+                        flipped_label = true;
+                    }
+                    else
+                    {
+                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
+                        {
+                            ++idx;
+                        }
+                    }
+                }
+            }
+
+
+            const double modularity = dot(Bv, labels)/(4*edge_sum);
+
+            return modularity;
+        }
+
+    // -------------------------------------------------------------------------------------
+
+        inline unsigned long newman_cluster_helper (
+            dlib::rand& rnd,
+            const std::vector<ordered_sample_pair>& edges,
+            const matrix<double,0,1>& node_degrees, // k from the Newman paper
+            const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
+            const double& edge_sum,                 // m from the Newman paper
+            std::vector<unsigned long>& labels,
+            double modularity_threshold,
+            const double eps,
+            const unsigned long max_iterations
+        )
+        /*!
+            ensures
+                - returns the number of clusters the data was split into
+        !*/
+        {
+            matrix<double,0,1> l;
+            const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);
+
+
+            // We need to collapse the node index values down to contiguous values.  So
+            // we use the following two vectors to contain the mappings from input index
+            // values to their corresponding index values in each split.
+            std::vector<unsigned long> left_idx_map(node_degrees.size());
+            std::vector<unsigned long> right_idx_map(node_degrees.size());
+
+            // figure out how many nodes went into each side of the split.
+            unsigned long num_left_split = 0;
+            unsigned long num_right_split = 0;
+            for (long i = 0; i < l.size(); ++i)
+            {
+                if (l(i) > 0)
+                {
+                    left_idx_map[i] = num_left_split;
+                    ++num_left_split;
+                }
+                else
+                {
+                    right_idx_map[i] = num_right_split;
+                    ++num_right_split;
+                }
+            }
+
+            // do a recursive split if it will improve the modularity.
+            if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0)
+            {
+
+                // split the node_degrees and Bdiag matrices into left and right split parts
+                matrix<double,0,1> left_node_degrees(num_left_split);
+                matrix<double,0,1> right_node_degrees(num_right_split);
+                matrix<double,0,1> left_Bdiag(num_left_split);
+                matrix<double,0,1> right_Bdiag(num_right_split);
+                for (long i = 0; i < l.size(); ++i)
+                {
+                    if (l(i) > 0)
+                    {
+                        left_node_degrees(left_idx_map[i]) = node_degrees(i);
+                        left_Bdiag(left_idx_map[i]) = Bdiag(i);
+                    }
+                    else
+                    {
+                        right_node_degrees(right_idx_map[i]) = node_degrees(i);
+                        right_Bdiag(right_idx_map[i]) = Bdiag(i);
+                    }
+                }
+
+
+                // put the edges from one side of the split into split_edges
+                std::vector<ordered_sample_pair> split_edges;
+                modularity_threshold = 0;
+                for (unsigned long k = 0; k < edges.size(); ++k)
+                {
+                    const unsigned long i = edges[k].index1();
+                    const unsigned long j = edges[k].index2();
+                    const double d = edges[k].distance();
+                    if (l(i) > 0 && l(j) > 0)
+                    {
+                        split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
+                        modularity_threshold += d;
+                    }
+                }
+                modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
+                modularity_threshold /= 4*edge_sum;
+
+                unsigned long num_left_clusters;
+                std::vector<unsigned long> left_labels;
+                num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
+                                                          edge_sum,left_labels,modularity_threshold,
+                                                          eps, max_iterations);
+
+                // now load the other side into split_edges and cluster it as well
+                split_edges.clear();
+                modularity_threshold = 0;
+                for (unsigned long k = 0; k < edges.size(); ++k)
+                {
+                    const unsigned long i = edges[k].index1();
+                    const unsigned long j = edges[k].index2();
+                    const double d = edges[k].distance();
+                    if (l(i) < 0 && l(j) < 0)
+                    {
+                        split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
+                        modularity_threshold += d;
+                    }
+                }
+                modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
+                modularity_threshold /= 4*edge_sum;
+
+                unsigned long num_right_clusters;
+                std::vector<unsigned long> right_labels;
+                num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
+                                                           edge_sum,right_labels,modularity_threshold,
+                                                           eps, max_iterations);
+
+                // Now merge the labels from the two splits.
+                labels.resize(node_degrees.size());
+                for (unsigned long i = 0; i < labels.size(); ++i)
+                {
+                    // if this node was in the left split
+                    if (l(i) > 0)
+                    {
+                        labels[i] = left_labels[left_idx_map[i]];
+                    }
+                    else // if this node was in the right split
+                    {
+                        labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
+                    }
+                }
+
+
+                return num_left_clusters + num_right_clusters;
+            }
+            else
+            {
+                labels.assign(node_degrees.size(),0);
+                return 1;
+            }
+
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long newman_cluster (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_ordered_by_index(edges),
+                    "\t unsigned long newman_cluster()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+        labels.clear();
+        if (edges.size() == 0)
+            return 0;
+
+        const unsigned long num_nodes = max_index_plus_one(edges);
+
+        // compute the node_degrees vector, edge_sum value, and diag(B).
+        matrix<double,0,1> node_degrees(num_nodes);
+        matrix<double,0,1> Bdiag(num_nodes);
+        Bdiag = 0;
+        double edge_sum = 0;
+        node_degrees = 0;
+        for (unsigned long i = 0; i < edges.size(); ++i)
+        {
+            node_degrees(edges[i].index1()) += edges[i].distance();
+            edge_sum += edges[i].distance();
+            if (edges[i].index1() == edges[i].index2())
+                Bdiag(edges[i].index1()) += edges[i].distance();
+        }
+        edge_sum /= 2;
+        Bdiag -= squared(node_degrees)/(2*edge_sum);
+
+
+        dlib::rand rnd;
+        return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long newman_cluster (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    )
+    {
+        std::vector<ordered_sample_pair> oedges;
+        convert_unordered_to_ordered(edges, oedges);
+        std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
+
+        return newman_cluster(oedges, labels, eps, max_iterations);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        inline std::vector<unsigned long> remap_labels (
+            const std::vector<unsigned long>& labels,
+            unsigned long& num_labels
+        )
+        /*!
+            ensures
+                - This function takes labels and produces a mapping which maps elements of
+                  labels into the most compact range in [0, max] as possible.  In particular,
+                  there won't be any unused integers in the mapped range.
+                - #num_labels == the number of distinct values in labels.
+                - returns a vector V such that:
+                    - V.size() == labels.size()
+                    - max(mat(V))+1 == num_labels.
+                    - for all valid i,j:
+                        - if (labels[i] == labels[j]) then
+                            - V[i] == V[j]
+                        - else
+                            - V[i] != V[j]
+        !*/
+        {
+            std::map<unsigned long, unsigned long> temp;
+            for (unsigned long i = 0; i < labels.size(); ++i)
+            {
+                if (temp.count(labels[i]) == 0)
+                {
+                    const unsigned long next = temp.size();
+                    temp[labels[i]] = next;
+                }
+            }
+
+            num_labels = temp.size();
+
+            std::vector<unsigned long> result(labels.size());
+            for (unsigned long i = 0; i < labels.size(); ++i)
+            {
+                result[i] = temp[labels[i]];
+            }
+            return result;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double modularity (
+        const std::vector<sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    )
+    {
+        const unsigned long num_nodes = max_index_plus_one(edges);
+        // make sure requires clause is not broken
+        DLIB_ASSERT(labels.size() == num_nodes,
+                    "\t double modularity()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+        unsigned long num_labels;
+        const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
+
+        std::vector<double> cluster_sums(num_labels,0);
+        std::vector<double> k(num_nodes,0);
+
+        double Q = 0;
+        double m = 0;
+        for (unsigned long i = 0; i < edges.size(); ++i)
+        {
+            const unsigned long n1 = edges[i].index1();
+            const unsigned long n2 = edges[i].index2();
+            k[n1] += edges[i].distance();
+            if (n1 != n2)
+                k[n2] += edges[i].distance();
+
+            if (n1 != n2)
+                m += edges[i].distance();
+            else
+                m += edges[i].distance()/2;
+
+            if (labels_[n1] == labels_[n2])
+            {
+                if (n1 != n2)
+                    Q += 2*edges[i].distance();
+                else
+                    Q += edges[i].distance();
+            }
+        }
+
+        if (m == 0)
+            return 0;
+
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            cluster_sums[labels_[i]] += k[i];
+        }
+
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            Q -= k[i]*cluster_sums[labels_[i]]/(2*m);
+        }
+
+        return 1.0/(2*m)*Q;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double modularity (
+        const std::vector<ordered_sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    )
+    {
+        const unsigned long num_nodes = max_index_plus_one(edges);
+        // make sure requires clause is not broken
+        DLIB_ASSERT(labels.size() == num_nodes,
+                    "\t double modularity()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+
+        unsigned long num_labels;
+        const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
+
+        std::vector<double> cluster_sums(num_labels,0);
+        std::vector<double> k(num_nodes,0);
+
+        double Q = 0;
+        double m = 0;
+        for (unsigned long i = 0; i < edges.size(); ++i)
+        {
+            const unsigned long n1 = edges[i].index1();
+            const unsigned long n2 = edges[i].index2();
+            k[n1] += edges[i].distance();
+            m += edges[i].distance();
+            if (labels_[n1] == labels_[n2])
+            {
+                Q += edges[i].distance();
+            }
+        }
+
+        if (m == 0)
+            return 0;
+
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            cluster_sums[labels_[i]] += k[i];
+        }
+
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            Q -= k[i]*cluster_sums[labels_[i]]/m;
+        }
+
+        return 1.0/m*Q;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MODULARITY_ClUSTERING__H__
+