summaryrefslogtreecommitdiffstats
path: root/ml/dlib/dlib/clustering
diff options
context:
space:
mode:
Diffstat (limited to 'ml/dlib/dlib/clustering')
-rw-r--r--ml/dlib/dlib/clustering/bottom_up_cluster.h253
-rw-r--r--ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h136
-rw-r--r--ml/dlib/dlib/clustering/chinese_whispers.h135
-rw-r--r--ml/dlib/dlib/clustering/chinese_whispers_abstract.h97
-rw-r--r--ml/dlib/dlib/clustering/modularity_clustering.h515
-rw-r--r--ml/dlib/dlib/clustering/modularity_clustering_abstract.h125
-rw-r--r--ml/dlib/dlib/clustering/spectral_cluster.h80
-rw-r--r--ml/dlib/dlib/clustering/spectral_cluster_abstract.h43
8 files changed, 0 insertions, 1384 deletions
diff --git a/ml/dlib/dlib/clustering/bottom_up_cluster.h b/ml/dlib/dlib/clustering/bottom_up_cluster.h
deleted file mode 100644
index f80b65108..000000000
--- a/ml/dlib/dlib/clustering/bottom_up_cluster.h
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright (C) 2015 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_
-#define DLIB_BOTTOM_uP_CLUSTER_Hh_
-
-#include <queue>
-#include <map>
-
-#include "bottom_up_cluster_abstract.h"
-#include "../algs.h"
-#include "../matrix.h"
-#include "../disjoint_subsets.h"
-#include "../graph_utils.h"
-
-
-namespace dlib
-{
-
-// ----------------------------------------------------------------------------------------
-
- namespace buc_impl
- {
- inline void merge_sets (
- matrix<double>& dists,
- unsigned long dest,
- unsigned long src
- )
- {
- for (long r = 0; r < dists.nr(); ++r)
- dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src));
- }
-
- struct compare_dist
- {
- bool operator() (
- const sample_pair& a,
- const sample_pair& b
- ) const
- {
- return a.distance() > b.distance();
- }
- };
- }
-
-// ----------------------------------------------------------------------------------------
-
- template <
- typename EXP
- >
- unsigned long bottom_up_cluster (
- const matrix_exp<EXP>& dists_,
- std::vector<unsigned long>& labels,
- unsigned long min_num_clusters,
- double max_dist = std::numeric_limits<double>::infinity()
- )
- {
- matrix<double> dists = matrix_cast<double>(dists_);
- // make sure requires clause is not broken
- DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0,
- "\t unsigned long bottom_up_cluster()"
- << "\n\t Invalid inputs were given to this function."
- << "\n\t dists.nr(): " << dists.nr()
- << "\n\t dists.nc(): " << dists.nc()
- << "\n\t min_num_clusters: " << min_num_clusters
- );
-
- using namespace buc_impl;
-
- labels.resize(dists.nr());
- disjoint_subsets sets;
- sets.set_size(dists.nr());
- if (labels.size() == 0)
- return 0;
-
- // push all the edges in the graph into a priority queue so the best edges to merge
- // come first.
- std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que;
- for (long r = 0; r < dists.nr(); ++r)
- for (long c = r+1; c < dists.nc(); ++c)
- que.push(sample_pair(r,c,dists(r,c)));
-
- // Now start merging nodes.
- for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter)
- {
- // find the next best thing to merge.
- double best_dist = que.top().distance();
- unsigned long a = sets.find_set(que.top().index1());
- unsigned long b = sets.find_set(que.top().index2());
- que.pop();
- // we have been merging and modifying the distances, so make sure this distance
- // is still valid and these guys haven't been merged already.
- while(a == b || best_dist < dists(a,b))
- {
- // Haven't merged it yet, so put it back in with updated distance for
- // reconsideration later.
- if (a != b)
- que.push(sample_pair(a, b, dists(a, b)));
-
- best_dist = que.top().distance();
- a = sets.find_set(que.top().index1());
- b = sets.find_set(que.top().index2());
- que.pop();
- }
-
-
- // now merge these sets if the best distance is small enough
- if (best_dist > max_dist)
- break;
- unsigned long news = sets.merge_sets(a,b);
- unsigned long olds = (news==a)?b:a;
- merge_sets(dists, news, olds);
- }
-
- // figure out which cluster each element is in. Also make sure the labels are
- // contiguous.
- std::map<unsigned long, unsigned long> relabel;
- for (unsigned long r = 0; r < labels.size(); ++r)
- {
- unsigned long l = sets.find_set(r);
- // relabel to make contiguous
- if (relabel.count(l) == 0)
- {
- unsigned long next = relabel.size();
- relabel[l] = next;
- }
- labels[r] = relabel[l];
- }
-
-
- return relabel.size();
- }
-
-// ----------------------------------------------------------------------------------------
-// ----------------------------------------------------------------------------------------
-
- struct snl_range
- {
- snl_range() = default;
- snl_range(double val) : lower(val), upper(val) {}
- snl_range(double l, double u) : lower(l), upper(u) { DLIB_ASSERT(lower <= upper)}
-
- double lower = 0;
- double upper = 0;
-
- double width() const { return upper-lower; }
- bool operator<(const snl_range& item) const { return lower < item.lower; }
- };
-
- inline snl_range merge(const snl_range& a, const snl_range& b)
- {
- return snl_range(std::min(a.lower, b.lower), std::max(a.upper, b.upper));
- }
-
- inline double distance (const snl_range& a, const snl_range& b)
- {
- return std::max(a.lower,b.lower) - std::min(a.upper,b.upper);
- }
-
- inline std::ostream& operator<< (std::ostream& out, const snl_range& item )
- {
- out << "["<<item.lower<<","<<item.upper<<"]";
- return out;
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline std::vector<snl_range> segment_number_line (
- const std::vector<double>& x,
- const double max_range_width
- )
- {
- DLIB_CASSERT(max_range_width >= 0);
-
- // create initial ranges, one for each value in x. So initially, all the ranges have
- // width of 0.
- std::vector<snl_range> ranges;
- for (auto v : x)
- ranges.push_back(v);
- std::sort(ranges.begin(), ranges.end());
-
- std::vector<snl_range> greedy_final_ranges;
- if (ranges.size() == 0)
- return greedy_final_ranges;
- // We will try two different clustering strategies. One that does a simple greedy left
- // to right sweep and another that does a bottom up agglomerative clustering. This
- // first loop runs the greedy left to right sweep. Then at the end of this routine we
- // will return the results that produced the tightest clustering.
- greedy_final_ranges.push_back(ranges[0]);
- for (size_t i = 1; i < ranges.size(); ++i)
- {
- auto m = merge(greedy_final_ranges.back(), ranges[i]);
- if (m.width() <= max_range_width)
- greedy_final_ranges.back() = m;
- else
- greedy_final_ranges.push_back(ranges[i]);
- }
-
-
- // Here we do the bottom up clustering. So compute the edges connecting our ranges.
- // We will simply say there are edges between ranges if and only if they are
- // immediately adjacent on the number line.
- std::vector<sample_pair> edges;
- for (size_t i = 1; i < ranges.size(); ++i)
- edges.push_back(sample_pair(i-1,i, distance(ranges[i-1],ranges[i])));
- std::sort(edges.begin(), edges.end(), order_by_distance<sample_pair>);
-
- disjoint_subsets sets;
- sets.set_size(ranges.size());
-
- // Now start merging nodes.
- for (auto edge : edges)
- {
- // find the next best thing to merge.
- unsigned long a = sets.find_set(edge.index1());
- unsigned long b = sets.find_set(edge.index2());
-
- // merge it if it doesn't result in an interval that's too big.
- auto m = merge(ranges[a], ranges[b]);
- if (m.width() <= max_range_width)
- {
- unsigned long news = sets.merge_sets(a,b);
- ranges[news] = m;
- }
- }
-
- // Now create a list of the final ranges. We will do this by keeping track of which
- // range we already added to final_ranges.
- std::vector<snl_range> final_ranges;
- std::vector<bool> already_output(ranges.size(), false);
- for (unsigned long i = 0; i < sets.size(); ++i)
- {
- auto s = sets.find_set(i);
- if (!already_output[s])
- {
- final_ranges.push_back(ranges[s]);
- already_output[s] = true;
- }
- }
-
- // only use the greedy clusters if they found a clustering with fewer clusters.
- // Otherwise, the bottom up clustering probably produced a more sensible clustering.
- if (final_ranges.size() <= greedy_final_ranges.size())
- return final_ranges;
- else
- return greedy_final_ranges;
- }
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_
-
diff --git a/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h b/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h
deleted file mode 100644
index 72d362c12..000000000
--- a/ml/dlib/dlib/clustering/bottom_up_cluster_abstract.h
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright (C) 2015 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#undef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
-#ifdef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
-
-#include "../matrix.h"
-
-namespace dlib
-{
-
-// ----------------------------------------------------------------------------------------
-
- template <
- typename EXP
- >
- unsigned long bottom_up_cluster (
- const matrix_exp<EXP>& dists,
- std::vector<unsigned long>& labels,
- unsigned long min_num_clusters,
- double max_dist = std::numeric_limits<double>::infinity()
- );
- /*!
- requires
- - dists.nr() == dists.nc()
- - min_num_clusters > 0
- - dists == trans(dists)
- (l.e. dists should be symmetric)
- ensures
- - Runs a bottom up agglomerative clustering algorithm.
- - Interprets dists as a matrix that gives the distances between dists.nr()
- items. In particular, we take dists(i,j) to be the distance between the ith
- and jth element of some set. This function clusters the elements of this set
- into at least min_num_clusters (or dists.nr() if there aren't enough
- elements). Additionally, within each cluster, the maximum pairwise distance
- between any two cluster elements is <= max_dist.
- - returns the number of clusters found.
- - #labels.size() == dists.nr()
- - for all valid i:
- - #labels[i] == the cluster ID of the node with index i (i.e. the node
- corresponding to the distances dists(i,*)).
- - 0 <= #labels[i] < the number of clusters found
- (i.e. cluster IDs are assigned contiguously and start at 0)
- !*/
-
-// ----------------------------------------------------------------------------------------
-// ----------------------------------------------------------------------------------------
-
- struct snl_range
- {
- /*!
- WHAT THIS OBJECT REPRESENTS
- This object represents an interval on the real number line. It is used
- to store the outputs of the segment_number_line() routine defined below.
- !*/
-
- snl_range(
- );
- /*!
- ensures
- - #lower == 0
- - #upper == 0
- !*/
-
- snl_range(
- double val
- );
- /*!
- ensures
- - #lower == val
- - #upper == val
- !*/
-
- snl_range(
- double l,
- double u
- );
- /*!
- requires
- - l <= u
- ensures
- - #lower == l
- - #upper == u
- !*/
-
- double lower;
- double upper;
-
- double width(
- ) const { return upper-lower; }
- /*!
- ensures
- - returns the width of this interval on the number line.
- !*/
-
- bool operator<(const snl_range& item) const { return lower < item.lower; }
- /*!
- ensures
- - provides a total ordering of snl_range objects assuming they are
- non-overlapping.
- !*/
- };
-
- std::ostream& operator<< (std::ostream& out, const snl_range& item );
- /*!
- ensures
- - prints item to out in the form [lower,upper].
- !*/
-
-// ----------------------------------------------------------------------------------------
-
- std::vector<snl_range> segment_number_line (
- const std::vector<double>& x,
- const double max_range_width
- );
- /*!
- requires
- - max_range_width >= 0
- ensures
- - Finds a clustering of the values in x and returns the ranges that define the
- clustering. This routine uses a combination of bottom up clustering and a
- simple greedy scan to try and find the most compact set of ranges that
- contain all the values in x.
- - This routine has approximately linear runtime.
- - Every value in x will be contained inside one of the returned snl_range
- objects;
- - All returned snl_range object's will have a width() <= max_range_width and
- will also be non-overlapping.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
-
-
diff --git a/ml/dlib/dlib/clustering/chinese_whispers.h b/ml/dlib/dlib/clustering/chinese_whispers.h
deleted file mode 100644
index 332cce1a0..000000000
--- a/ml/dlib/dlib/clustering/chinese_whispers.h
+++ /dev/null
@@ -1,135 +0,0 @@
-// Copyright (C) 2012 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#ifndef DLIB_CHINESE_WHISPErS_Hh_
-#define DLIB_CHINESE_WHISPErS_Hh_
-
-#include "chinese_whispers_abstract.h"
-#include <vector>
-#include "../rand.h"
-#include "../graph_utils/edge_list_graphs.h"
-
-namespace dlib
-{
-
-// ----------------------------------------------------------------------------------------
-
- inline unsigned long chinese_whispers (
- const std::vector<ordered_sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations,
- dlib::rand& rnd
- )
- {
- // make sure requires clause is not broken
- DLIB_ASSERT(is_ordered_by_index(edges),
- "\t unsigned long chinese_whispers()"
- << "\n\t Invalid inputs were given to this function"
- );
-
- labels.clear();
- if (edges.size() == 0)
- return 0;
-
- std::vector<std::pair<unsigned long, unsigned long> > neighbors;
- find_neighbor_ranges(edges, neighbors);
-
- // Initialize the labels, each node gets a different label.
- labels.resize(neighbors.size());
- for (unsigned long i = 0; i < labels.size(); ++i)
- labels[i] = i;
-
-
- for (unsigned long iter = 0; iter < neighbors.size()*num_iterations; ++iter)
- {
- // Pick a random node.
- const unsigned long idx = rnd.get_random_64bit_number()%neighbors.size();
-
- // Count how many times each label happens amongst our neighbors.
- std::map<unsigned long, double> labels_to_counts;
- const unsigned long end = neighbors[idx].second;
- for (unsigned long i = neighbors[idx].first; i != end; ++i)
- {
- labels_to_counts[labels[edges[i].index2()]] += edges[i].distance();
- }
-
- // find the most common label
- std::map<unsigned long, double>::iterator i;
- double best_score = -std::numeric_limits<double>::infinity();
- unsigned long best_label = labels[idx];
- for (i = labels_to_counts.begin(); i != labels_to_counts.end(); ++i)
- {
- if (i->second > best_score)
- {
- best_score = i->second;
- best_label = i->first;
- }
- }
-
- labels[idx] = best_label;
- }
-
-
- // Remap the labels into a contiguous range. First we find the
- // mapping.
- std::map<unsigned long,unsigned long> label_remap;
- for (unsigned long i = 0; i < labels.size(); ++i)
- {
- const unsigned long next_id = label_remap.size();
- if (label_remap.count(labels[i]) == 0)
- label_remap[labels[i]] = next_id;
- }
- // now apply the mapping to all the labels.
- for (unsigned long i = 0; i < labels.size(); ++i)
- {
- labels[i] = label_remap[labels[i]];
- }
-
- return label_remap.size();
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline unsigned long chinese_whispers (
- const std::vector<sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations,
- dlib::rand& rnd
- )
- {
- std::vector<ordered_sample_pair> oedges;
- convert_unordered_to_ordered(edges, oedges);
- std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
-
- return chinese_whispers(oedges, labels, num_iterations, rnd);
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline unsigned long chinese_whispers (
- const std::vector<sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations = 100
- )
- {
- dlib::rand rnd;
- return chinese_whispers(edges, labels, num_iterations, rnd);
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline unsigned long chinese_whispers (
- const std::vector<ordered_sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations = 100
- )
- {
- dlib::rand rnd;
- return chinese_whispers(edges, labels, num_iterations, rnd);
- }
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_CHINESE_WHISPErS_Hh_
-
diff --git a/ml/dlib/dlib/clustering/chinese_whispers_abstract.h b/ml/dlib/dlib/clustering/chinese_whispers_abstract.h
deleted file mode 100644
index 7a184c6f9..000000000
--- a/ml/dlib/dlib/clustering/chinese_whispers_abstract.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (C) 2012 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#undef DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_
-#ifdef DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_
-
-#include <vector>
-#include "../rand.h"
-#include "../graph_utils/ordered_sample_pair_abstract.h"
-#include "../graph_utils/sample_pair_abstract.h"
-
-namespace dlib
-{
-
-// ----------------------------------------------------------------------------------------
-
- unsigned long chinese_whispers (
- const std::vector<ordered_sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations,
- dlib::rand& rnd
- );
- /*!
- requires
- - is_ordered_by_index(edges) == true
- ensures
- - This function implements the graph clustering algorithm described in the
- paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
- Application to Natural Language Processing Problems by Chris Biemann.
- - Interprets edges as a directed graph. That is, it contains the edges on the
- said graph and the ordered_sample_pair::distance() values define the edge
- weights (larger values indicating a stronger edge connection between the
- nodes). If an edge has a distance() value of infinity then it is considered
- a "must link" edge.
- - returns the number of clusters found.
- - #labels.size() == max_index_plus_one(edges)
- - for all valid i:
- - #labels[i] == the cluster ID of the node with index i in the graph.
- - 0 <= #labels[i] < the number of clusters found
- (i.e. cluster IDs are assigned contiguously and start at 0)
- - Duplicate edges are interpreted as if there had been just one edge with a
- distance value equal to the sum of all the duplicate edge's distance values.
- - The algorithm performs exactly num_iterations passes over the graph before
- terminating.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
- unsigned long chinese_whispers (
- const std::vector<sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations,
- dlib::rand& rnd
- );
- /*!
- ensures
- - This function is identical to the above chinese_whispers() routine except
- that it operates on a vector of sample_pair objects instead of
- ordered_sample_pairs. Therefore, this is simply a convenience routine. In
- particular, it is implemented by transforming the given edges into
- ordered_sample_pairs and then calling the chinese_whispers() routine defined
- above.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
- unsigned long chinese_whispers (
- const std::vector<ordered_sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations = 100
- );
- /*!
- requires
- - is_ordered_by_index(edges) == true
- ensures
- - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
- where rnd is a default initialized dlib::rand object.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
- unsigned long chinese_whispers (
- const std::vector<sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const unsigned long num_iterations = 100
- );
- /*!
- ensures
- - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
- where rnd is a default initialized dlib::rand object.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_CHINESE_WHISPErS_ABSTRACT_Hh_
-
diff --git a/ml/dlib/dlib/clustering/modularity_clustering.h b/ml/dlib/dlib/clustering/modularity_clustering.h
deleted file mode 100644
index 8b8a0b0a5..000000000
--- a/ml/dlib/dlib/clustering/modularity_clustering.h
+++ /dev/null
@@ -1,515 +0,0 @@
-// Copyright (C) 2012 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#ifndef DLIB_MODULARITY_ClUSTERING__H__
-#define DLIB_MODULARITY_ClUSTERING__H__
-
-#include "modularity_clustering_abstract.h"
-#include "../sparse_vector.h"
-#include "../graph_utils/edge_list_graphs.h"
-#include "../matrix.h"
-#include "../rand.h"
-
-namespace dlib
-{
-
-// -----------------------------------------------------------------------------------------
-
- namespace impl
- {
- inline double newman_cluster_split (
- dlib::rand& rnd,
- const std::vector<ordered_sample_pair>& edges,
- const matrix<double,0,1>& node_degrees, // k from the Newman paper
- const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper
- const double& edge_sum, // m from the Newman paper
- matrix<double,0,1>& labels,
- const double eps,
- const unsigned long max_iterations
- )
- /*!
- requires
- - node_degrees.size() == max_index_plus_one(edges)
- - Bdiag.size() == max_index_plus_one(edges)
- - edges must be sorted according to order_by_index()
- ensures
- - This routine splits a graph into two subgraphs using the Newman
- clustering method.
- - returns the modularity obtained when the graph is split according
- to the contents of #labels.
- - #labels.size() == node_degrees.size()
- - for all valid i: #labels(i) == -1 or +1
- - if (this function returns 0) then
- - all the labels are equal, i.e. the graph is not split.
- !*/
- {
- // Scale epsilon so that it is relative to the expected value of an element of a
- // unit vector of length node_degrees.size().
- const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());
-
- // Make a random unit vector and put in labels.
- labels.set_size(node_degrees.size());
- for (long i = 0; i < labels.size(); ++i)
- labels(i) = rnd.get_random_gaussian();
- labels /= length(labels);
-
- matrix<double,0,1> Bv, Bv_unit;
-
- // Do the power iteration for a while.
- double eig = -1;
- double offset = 0;
- while (eig < 0)
- {
-
- // any number larger than power_iter_eps
- double iteration_change = power_iter_eps*2+1;
- for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i)
- {
- sparse_matrix_vector_multiply(edges, labels, Bv);
- Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
-
- if (offset != 0)
- {
- Bv -= offset*labels;
- }
-
-
- const double len = length(Bv);
- if (len != 0)
- {
- Bv_unit = Bv/len;
- iteration_change = max(abs(labels-Bv_unit));
- labels.swap(Bv_unit);
- }
- else
- {
- // Had a bad time, pick another random vector and try it with the
- // power iteration.
- for (long i = 0; i < labels.size(); ++i)
- labels(i) = rnd.get_random_gaussian();
- }
- }
-
- eig = dot(Bv,labels);
- // we will repeat this loop if the largest eigenvalue is negative
- offset = eig;
- }
-
-
- for (long i = 0; i < labels.size(); ++i)
- {
- if (labels(i) > 0)
- labels(i) = 1;
- else
- labels(i) = -1;
- }
-
-
- // compute B*labels, store result in Bv.
- sparse_matrix_vector_multiply(edges, labels, Bv);
- Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
-
- // Do some label refinement. In this step we swap labels if it
- // improves the modularity score.
- bool flipped_label = true;
- while(flipped_label)
- {
- flipped_label = false;
- unsigned long idx = 0;
- for (long i = 0; i < labels.size(); ++i)
- {
- const double val = -2*labels(i);
- const double increase = 4*Bdiag(i) + 2*val*Bv(i);
-
- // if there is an increase in modularity for swapping this label
- if (increase > 0)
- {
- labels(i) *= -1;
- while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
- {
- const long j = edges[idx].index2();
- Bv(j) += val*edges[idx].distance();
- ++idx;
- }
-
- Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;
-
- flipped_label = true;
- }
- else
- {
- while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
- {
- ++idx;
- }
- }
- }
- }
-
-
- const double modularity = dot(Bv, labels)/(4*edge_sum);
-
- return modularity;
- }
-
- // -------------------------------------------------------------------------------------
-
- inline unsigned long newman_cluster_helper (
- dlib::rand& rnd,
- const std::vector<ordered_sample_pair>& edges,
- const matrix<double,0,1>& node_degrees, // k from the Newman paper
- const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper
- const double& edge_sum, // m from the Newman paper
- std::vector<unsigned long>& labels,
- double modularity_threshold,
- const double eps,
- const unsigned long max_iterations
- )
- /*!
- ensures
- - returns the number of clusters the data was split into
- !*/
- {
- matrix<double,0,1> l;
- const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);
-
-
- // We need to collapse the node index values down to contiguous values. So
- // we use the following two vectors to contain the mappings from input index
- // values to their corresponding index values in each split.
- std::vector<unsigned long> left_idx_map(node_degrees.size());
- std::vector<unsigned long> right_idx_map(node_degrees.size());
-
- // figure out how many nodes went into each side of the split.
- unsigned long num_left_split = 0;
- unsigned long num_right_split = 0;
- for (long i = 0; i < l.size(); ++i)
- {
- if (l(i) > 0)
- {
- left_idx_map[i] = num_left_split;
- ++num_left_split;
- }
- else
- {
- right_idx_map[i] = num_right_split;
- ++num_right_split;
- }
- }
-
- // do a recursive split if it will improve the modularity.
- if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0)
- {
-
- // split the node_degrees and Bdiag matrices into left and right split parts
- matrix<double,0,1> left_node_degrees(num_left_split);
- matrix<double,0,1> right_node_degrees(num_right_split);
- matrix<double,0,1> left_Bdiag(num_left_split);
- matrix<double,0,1> right_Bdiag(num_right_split);
- for (long i = 0; i < l.size(); ++i)
- {
- if (l(i) > 0)
- {
- left_node_degrees(left_idx_map[i]) = node_degrees(i);
- left_Bdiag(left_idx_map[i]) = Bdiag(i);
- }
- else
- {
- right_node_degrees(right_idx_map[i]) = node_degrees(i);
- right_Bdiag(right_idx_map[i]) = Bdiag(i);
- }
- }
-
-
- // put the edges from one side of the split into split_edges
- std::vector<ordered_sample_pair> split_edges;
- modularity_threshold = 0;
- for (unsigned long k = 0; k < edges.size(); ++k)
- {
- const unsigned long i = edges[k].index1();
- const unsigned long j = edges[k].index2();
- const double d = edges[k].distance();
- if (l(i) > 0 && l(j) > 0)
- {
- split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
- modularity_threshold += d;
- }
- }
- modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
- modularity_threshold /= 4*edge_sum;
-
- unsigned long num_left_clusters;
- std::vector<unsigned long> left_labels;
- num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
- edge_sum,left_labels,modularity_threshold,
- eps, max_iterations);
-
- // now load the other side into split_edges and cluster it as well
- split_edges.clear();
- modularity_threshold = 0;
- for (unsigned long k = 0; k < edges.size(); ++k)
- {
- const unsigned long i = edges[k].index1();
- const unsigned long j = edges[k].index2();
- const double d = edges[k].distance();
- if (l(i) < 0 && l(j) < 0)
- {
- split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
- modularity_threshold += d;
- }
- }
- modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
- modularity_threshold /= 4*edge_sum;
-
- unsigned long num_right_clusters;
- std::vector<unsigned long> right_labels;
- num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
- edge_sum,right_labels,modularity_threshold,
- eps, max_iterations);
-
- // Now merge the labels from the two splits.
- labels.resize(node_degrees.size());
- for (unsigned long i = 0; i < labels.size(); ++i)
- {
- // if this node was in the left split
- if (l(i) > 0)
- {
- labels[i] = left_labels[left_idx_map[i]];
- }
- else // if this node was in the right split
- {
- labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
- }
- }
-
-
- return num_left_clusters + num_right_clusters;
- }
- else
- {
- labels.assign(node_degrees.size(),0);
- return 1;
- }
-
- }
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline unsigned long newman_cluster (
- const std::vector<ordered_sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const double eps = 1e-4,
- const unsigned long max_iterations = 2000
- )
- {
- // make sure requires clause is not broken
- DLIB_ASSERT(is_ordered_by_index(edges),
- "\t unsigned long newman_cluster()"
- << "\n\t Invalid inputs were given to this function"
- );
-
- labels.clear();
- if (edges.size() == 0)
- return 0;
-
- const unsigned long num_nodes = max_index_plus_one(edges);
-
- // compute the node_degrees vector, edge_sum value, and diag(B).
- matrix<double,0,1> node_degrees(num_nodes);
- matrix<double,0,1> Bdiag(num_nodes);
- Bdiag = 0;
- double edge_sum = 0;
- node_degrees = 0;
- for (unsigned long i = 0; i < edges.size(); ++i)
- {
- node_degrees(edges[i].index1()) += edges[i].distance();
- edge_sum += edges[i].distance();
- if (edges[i].index1() == edges[i].index2())
- Bdiag(edges[i].index1()) += edges[i].distance();
- }
- edge_sum /= 2;
- Bdiag -= squared(node_degrees)/(2*edge_sum);
-
-
- dlib::rand rnd;
- return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations);
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline unsigned long newman_cluster (
- const std::vector<sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const double eps = 1e-4,
- const unsigned long max_iterations = 2000
- )
- {
- std::vector<ordered_sample_pair> oedges;
- convert_unordered_to_ordered(edges, oedges);
- std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
-
- return newman_cluster(oedges, labels, eps, max_iterations);
- }
-
-// ----------------------------------------------------------------------------------------
-
- namespace impl
- {
- inline std::vector<unsigned long> remap_labels (
- const std::vector<unsigned long>& labels,
- unsigned long& num_labels
- )
- /*!
- ensures
- - This function takes labels and produces a mapping which maps elements of
- labels into the most compact range in [0, max] as possible. In particular,
- there won't be any unused integers in the mapped range.
- - #num_labels == the number of distinct values in labels.
- - returns a vector V such that:
- - V.size() == labels.size()
- - max(mat(V))+1 == num_labels.
- - for all valid i,j:
- - if (labels[i] == labels[j]) then
- - V[i] == V[j]
- - else
- - V[i] != V[j]
- !*/
- {
- std::map<unsigned long, unsigned long> temp;
- for (unsigned long i = 0; i < labels.size(); ++i)
- {
- if (temp.count(labels[i]) == 0)
- {
- const unsigned long next = temp.size();
- temp[labels[i]] = next;
- }
- }
-
- num_labels = temp.size();
-
- std::vector<unsigned long> result(labels.size());
- for (unsigned long i = 0; i < labels.size(); ++i)
- {
- result[i] = temp[labels[i]];
- }
- return result;
- }
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline double modularity (
- const std::vector<sample_pair>& edges,
- const std::vector<unsigned long>& labels
- )
- {
- const unsigned long num_nodes = max_index_plus_one(edges);
- // make sure requires clause is not broken
- DLIB_ASSERT(labels.size() == num_nodes,
- "\t double modularity()"
- << "\n\t Invalid inputs were given to this function"
- );
-
- unsigned long num_labels;
- const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
-
- std::vector<double> cluster_sums(num_labels,0);
- std::vector<double> k(num_nodes,0);
-
- double Q = 0;
- double m = 0;
- for (unsigned long i = 0; i < edges.size(); ++i)
- {
- const unsigned long n1 = edges[i].index1();
- const unsigned long n2 = edges[i].index2();
- k[n1] += edges[i].distance();
- if (n1 != n2)
- k[n2] += edges[i].distance();
-
- if (n1 != n2)
- m += edges[i].distance();
- else
- m += edges[i].distance()/2;
-
- if (labels_[n1] == labels_[n2])
- {
- if (n1 != n2)
- Q += 2*edges[i].distance();
- else
- Q += edges[i].distance();
- }
- }
-
- if (m == 0)
- return 0;
-
- for (unsigned long i = 0; i < labels_.size(); ++i)
- {
- cluster_sums[labels_[i]] += k[i];
- }
-
- for (unsigned long i = 0; i < labels_.size(); ++i)
- {
- Q -= k[i]*cluster_sums[labels_[i]]/(2*m);
- }
-
- return 1.0/(2*m)*Q;
- }
-
-// ----------------------------------------------------------------------------------------
-
- inline double modularity (
- const std::vector<ordered_sample_pair>& edges,
- const std::vector<unsigned long>& labels
- )
- {
- const unsigned long num_nodes = max_index_plus_one(edges);
- // make sure requires clause is not broken
- DLIB_ASSERT(labels.size() == num_nodes,
- "\t double modularity()"
- << "\n\t Invalid inputs were given to this function"
- );
-
-
- unsigned long num_labels;
- const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
-
- std::vector<double> cluster_sums(num_labels,0);
- std::vector<double> k(num_nodes,0);
-
- double Q = 0;
- double m = 0;
- for (unsigned long i = 0; i < edges.size(); ++i)
- {
- const unsigned long n1 = edges[i].index1();
- const unsigned long n2 = edges[i].index2();
- k[n1] += edges[i].distance();
- m += edges[i].distance();
- if (labels_[n1] == labels_[n2])
- {
- Q += edges[i].distance();
- }
- }
-
- if (m == 0)
- return 0;
-
- for (unsigned long i = 0; i < labels_.size(); ++i)
- {
- cluster_sums[labels_[i]] += k[i];
- }
-
- for (unsigned long i = 0; i < labels_.size(); ++i)
- {
- Q -= k[i]*cluster_sums[labels_[i]]/m;
- }
-
- return 1.0/m*Q;
- }
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_MODULARITY_ClUSTERING__H__
-
diff --git a/ml/dlib/dlib/clustering/modularity_clustering_abstract.h b/ml/dlib/dlib/clustering/modularity_clustering_abstract.h
deleted file mode 100644
index c1e7c20c4..000000000
--- a/ml/dlib/dlib/clustering/modularity_clustering_abstract.h
+++ /dev/null
@@ -1,125 +0,0 @@
-// Copyright (C) 2012 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#undef DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_
-#ifdef DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_
-
-#include <vector>
-#include "../graph_utils/ordered_sample_pair_abstract.h"
-#include "../graph_utils/sample_pair_abstract.h"
-
-namespace dlib
-{
-
-// -----------------------------------------------------------------------------------------
-
- double modularity (
- const std::vector<sample_pair>& edges,
- const std::vector<unsigned long>& labels
- );
- /*!
- requires
- - labels.size() == max_index_plus_one(edges)
- - for all valid i:
- - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity()
- ensures
- - Interprets edges as an undirected graph. That is, it contains the edges on
- the said graph and the sample_pair::distance() values define the edge weights
- (larger values indicating a stronger edge connection between the nodes).
- - This function returns the modularity value obtained when the given input
- graph is broken into subgraphs according to the contents of labels. In
- particular, we say that two nodes with indices i and j are in the same
- subgraph or community if and only if labels[i] == labels[j].
- - Duplicate edges are interpreted as if there had been just one edge with a
- distance value equal to the sum of all the duplicate edge's distance values.
- - See the paper Modularity and community structure in networks by M. E. J. Newman
- for a detailed definition.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
- double modularity (
- const std::vector<ordered_sample_pair>& edges,
- const std::vector<unsigned long>& labels
- );
- /*!
- requires
- - labels.size() == max_index_plus_one(edges)
- - for all valid i:
- - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity()
- ensures
- - Interprets edges as a directed graph. That is, it contains the edges on the
- said graph and the ordered_sample_pair::distance() values define the edge
- weights (larger values indicating a stronger edge connection between the
- nodes). Note that, generally, modularity is only really defined for
- undirected graphs. Therefore, the "directed graph" given to this function
- should have symmetric edges between all nodes. The reason this function is
- provided at all is because sometimes a vector of ordered_sample_pair objects
- is a useful representation of an undirected graph.
- - This function returns the modularity value obtained when the given input
- graph is broken into subgraphs according to the contents of labels. In
- particular, we say that two nodes with indices i and j are in the same
- subgraph or community if and only if labels[i] == labels[j].
- - Duplicate edges are interpreted as if there had been just one edge with a
- distance value equal to the sum of all the duplicate edge's distance values.
- - See the paper Modularity and community structure in networks by M. E. J. Newman
- for a detailed definition.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
- unsigned long newman_cluster (
- const std::vector<ordered_sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const double eps = 1e-4,
- const unsigned long max_iterations = 2000
- );
- /*!
- requires
- - is_ordered_by_index(edges) == true
- - for all valid i:
- - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity()
- ensures
- - This function performs the clustering algorithm described in the paper
- Modularity and community structure in networks by M. E. J. Newman.
- - This function interprets edges as a graph and attempts to find the labeling
- that maximizes modularity(edges, #labels).
- - returns the number of clusters found.
- - #labels.size() == max_index_plus_one(edges)
- - for all valid i:
- - #labels[i] == the cluster ID of the node with index i in the graph.
- - 0 <= #labels[i] < the number of clusters found
- (i.e. cluster IDs are assigned contiguously and start at 0)
- - The main computation of the algorithm is involved in finding an eigenvector
- of a certain matrix. To do this, we use the power iteration. In particular,
- each time we try to find an eigenvector we will let the power iteration loop
- at most max_iterations times or until it reaches an accuracy of eps.
- Whichever comes first.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
- unsigned long newman_cluster (
- const std::vector<sample_pair>& edges,
- std::vector<unsigned long>& labels,
- const double eps = 1e-4,
- const unsigned long max_iterations = 2000
- );
- /*!
- requires
- - for all valid i:
- - 0 <= edges[i].distance() < std::numeric_limits<double>::infinity()
- ensures
- - This function is identical to the above newman_cluster() routine except that
- it operates on a vector of sample_pair objects instead of
- ordered_sample_pairs. Therefore, this is simply a convenience routine. In
- particular, it is implemented by transforming the given edges into
- ordered_sample_pairs and then calling the newman_cluster() routine defined
- above.
- !*/
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_MODULARITY_ClUSTERING_ABSTRACT_Hh_
-
diff --git a/ml/dlib/dlib/clustering/spectral_cluster.h b/ml/dlib/dlib/clustering/spectral_cluster.h
deleted file mode 100644
index 2cac9870f..000000000
--- a/ml/dlib/dlib/clustering/spectral_cluster.h
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (C) 2015 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#ifndef DLIB_SPECTRAL_CLUSTEr_H_
-#define DLIB_SPECTRAL_CLUSTEr_H_
-
-#include "spectral_cluster_abstract.h"
-#include <vector>
-#include "../matrix.h"
-#include "../svm/kkmeans.h"
-
-namespace dlib
-{
- template <
- typename kernel_type,
- typename vector_type
- >
- std::vector<unsigned long> spectral_cluster (
- const kernel_type& k,
- const vector_type& samples,
- const unsigned long num_clusters
- )
- {
- DLIB_CASSERT(num_clusters > 0,
- "\t std::vector<unsigned long> spectral_cluster(k,samples,num_clusters)"
- << "\n\t num_clusters can't be 0."
- );
-
- if (num_clusters == 1)
- {
- // nothing to do, just assign everything to the 0 cluster.
- return std::vector<unsigned long>(samples.size(), 0);
- }
-
- // compute the similarity matrix.
- matrix<double> K(samples.size(), samples.size());
- for (long r = 0; r < K.nr(); ++r)
- for (long c = r+1; c < K.nc(); ++c)
- K(r,c) = K(c,r) = (double)k(samples[r], samples[c]);
- for (long r = 0; r < K.nr(); ++r)
- K(r,r) = 0;
-
- matrix<double,0,1> D(K.nr());
- for (long r = 0; r < K.nr(); ++r)
- D(r) = sum(rowm(K,r));
- D = sqrt(reciprocal(D));
- K = diagm(D)*K*diagm(D);
- matrix<double> u,w,v;
- // Use the normal SVD routine unless the matrix is really big, then use the fast
- // approximate version.
- if (K.nr() < 1000)
- svd3(K,u,w,v);
- else
- svd_fast(K,u,w,v, num_clusters+100, 5);
- // Pick out the eigenvectors associated with the largest eigenvalues.
- rsort_columns(v,w);
- v = colm(v, range(0,num_clusters-1));
- // Now build the normalized spectral vectors, one for each input vector.
- std::vector<matrix<double,0,1> > spec_samps, centers;
- for (long r = 0; r < v.nr(); ++r)
- {
- spec_samps.push_back(trans(rowm(v,r)));
- const double len = length(spec_samps.back());
- if (len != 0)
- spec_samps.back() /= len;
- }
- // Finally do the K-means clustering
- pick_initial_centers(num_clusters, centers, spec_samps);
- find_clusters_using_kmeans(spec_samps, centers);
- // And then compute the cluster assignments based on the output of K-means.
- std::vector<unsigned long> assignments;
- for (unsigned long i = 0; i < spec_samps.size(); ++i)
- assignments.push_back(nearest_center(centers, spec_samps[i]));
-
- return assignments;
- }
-
-}
-
-#endif // DLIB_SPECTRAL_CLUSTEr_H_
-
diff --git a/ml/dlib/dlib/clustering/spectral_cluster_abstract.h b/ml/dlib/dlib/clustering/spectral_cluster_abstract.h
deleted file mode 100644
index 880ad80af..000000000
--- a/ml/dlib/dlib/clustering/spectral_cluster_abstract.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (C) 2015 Davis E. King (davis@dlib.net)
-// License: Boost Software License See LICENSE.txt for the full license.
-#undef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_
-#ifdef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_
-
-#include <vector>
-
-namespace dlib
-{
- template <
- typename kernel_type,
- typename vector_type
- >
- std::vector<unsigned long> spectral_cluster (
- const kernel_type& k,
- const vector_type& samples,
- const unsigned long num_clusters
- );
- /*!
- requires
- - samples must be something with an interface compatible with std::vector.
- - The following expression must evaluate to a double or float:
- k(samples[i], samples[j])
- - num_clusters > 0
- ensures
- - Performs the spectral clustering algorithm described in the paper:
- On spectral clustering: Analysis and an algorithm by Ng, Jordan, and Weiss.
- and returns the results.
- - This function clusters the input data samples into num_clusters clusters and
- returns a vector that indicates which cluster each sample falls into. In
- particular, we return an array A such that:
- - A.size() == samples.size()
- - A[i] == the cluster assignment of samples[i].
- - for all valid i: 0 <= A[i] < num_clusters
- - The "similarity" of samples[i] with samples[j] is given by
- k(samples[i],samples[j]). This means that k() should output a number >= 0
- and the number should be larger for samples that are more similar.
- !*/
-}
-
-#endif // DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_
-
-