summaryrefslogtreecommitdiffstats
path: root/ml/dlib/dlib/graph_utils/edge_list_graphs.h
diff options
context:
space:
mode:
Diffstat (limited to 'ml/dlib/dlib/graph_utils/edge_list_graphs.h')
-rw-r--r--ml/dlib/dlib/graph_utils/edge_list_graphs.h593
1 files changed, 593 insertions, 0 deletions
diff --git a/ml/dlib/dlib/graph_utils/edge_list_graphs.h b/ml/dlib/dlib/graph_utils/edge_list_graphs.h
new file mode 100644
index 000000000..d2447acdb
--- /dev/null
+++ b/ml/dlib/dlib/graph_utils/edge_list_graphs.h
@@ -0,0 +1,593 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_EDGE_LIST_GrAPHS_Hh_
+#define DLIB_EDGE_LIST_GrAPHS_Hh_
+
+#include "edge_list_graphs_abstract.h"
+#include <limits>
+#include <vector>
+#include "../string.h"
+#include "../rand.h"
+#include <algorithm>
+#include "sample_pair.h"
+#include "ordered_sample_pair.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ void remove_duplicate_edges (
+ vector_type& pairs
+ )
+ {
+ typedef typename vector_type::value_type T;
+ if (pairs.size() > 0)
+ {
+ // sort pairs so that we can avoid duplicates in the loop below
+ std::sort(pairs.begin(), pairs.end(), &order_by_index<T>);
+
+ // now put edges into temp while avoiding duplicates
+ vector_type temp;
+ temp.reserve(pairs.size());
+ temp.push_back(pairs[0]);
+ for (unsigned long i = 1; i < pairs.size(); ++i)
+ {
+ if (pairs[i] != pairs[i-1])
+ {
+ temp.push_back(pairs[i]);
+ }
+ }
+
+ temp.swap(pairs);
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <typename iterator>
+ iterator iterator_of_worst (
+ iterator begin,
+ const iterator& end
+ )
+ /*!
+ ensures
+ - returns an iterator that points to the element in the given range
+ that has the biggest distance
+ !*/
+ {
+ double dist = begin->distance();
+ iterator worst = begin;
+ for (; begin != end; ++begin)
+ {
+ if (begin->distance() > dist)
+ {
+ dist = begin->distance();
+ worst = begin;
+ }
+ }
+
+ return worst;
+ }
+
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type,
+ typename distance_function_type,
+ typename alloc,
+ typename T
+ >
+ void find_percent_shortest_edges_randomly (
+ const vector_type& samples,
+ const distance_function_type& dist_funct,
+ const double percent,
+ const unsigned long num,
+ const T& random_seed,
+ std::vector<sample_pair, alloc>& out
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT( 0 < percent && percent <= 1 &&
+ num > 0,
+ "\t void find_percent_shortest_edges_randomly()"
+ << "\n\t Invalid inputs were given to this function."
+ << "\n\t samples.size(): " << samples.size()
+ << "\n\t percent: " << percent
+ << "\n\t num: " << num
+ );
+
+ out.clear();
+
+ if (samples.size() <= 1)
+ {
+ return;
+ }
+
+ std::vector<sample_pair, alloc> edges;
+ edges.reserve(num);
+
+ dlib::rand rnd;
+ rnd.set_seed(cast_to_string(random_seed));
+
+ // randomly sample a bunch of edges
+ for (unsigned long i = 0; i < num; ++i)
+ {
+ const unsigned long idx1 = rnd.get_random_32bit_number()%samples.size();
+ const unsigned long idx2 = rnd.get_random_32bit_number()%samples.size();
+ if (idx1 != idx2)
+ {
+ const double dist = dist_funct(samples[idx1], samples[idx2]);
+ if (dist < std::numeric_limits<double>::infinity())
+ {
+ edges.push_back(sample_pair(idx1, idx2, dist));
+ }
+ }
+ }
+
+
+ // now put edges into out while avoiding duplicates
+ if (edges.size() > 0)
+ {
+ remove_duplicate_edges(edges);
+
+ // now sort all the edges by distance and take the percent with the smallest distance
+ std::sort(edges.begin(), edges.end(), &order_by_distance<sample_pair>);
+
+ const unsigned long out_size = std::min<unsigned long>((unsigned long)(num*percent), edges.size());
+ out.assign(edges.begin(), edges.begin() + out_size);
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type,
+ typename distance_function_type,
+ typename alloc,
+ typename T
+ >
+ void find_approximate_k_nearest_neighbors (
+ const vector_type& samples,
+ const distance_function_type& dist_funct,
+ const unsigned long k,
+ unsigned long num,
+ const T& random_seed,
+ std::vector<sample_pair, alloc>& out
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT( num > 0 && k > 0,
+ "\t void find_approximate_k_nearest_neighbors()"
+ << "\n\t Invalid inputs were given to this function."
+ << "\n\t samples.size(): " << samples.size()
+ << "\n\t k: " << k
+ << "\n\t num: " << num
+ );
+
+ out.clear();
+
+ if (samples.size() <= 1)
+ {
+ return;
+ }
+
+ // we add each edge twice in the following loop. So multiply num by 2 to account for that.
+ num *= 2;
+
+ std::vector<ordered_sample_pair> edges;
+ edges.reserve(num);
+ std::vector<sample_pair, alloc> temp;
+ temp.reserve(num);
+
+ dlib::rand rnd;
+ rnd.set_seed(cast_to_string(random_seed));
+
+ // randomly sample a bunch of edges
+ for (unsigned long i = 0; i < num; ++i)
+ {
+ const unsigned long idx1 = rnd.get_random_32bit_number()%samples.size();
+ const unsigned long idx2 = rnd.get_random_32bit_number()%samples.size();
+ if (idx1 != idx2)
+ {
+ const double dist = dist_funct(samples[idx1], samples[idx2]);
+ if (dist < std::numeric_limits<double>::infinity())
+ {
+ edges.push_back(ordered_sample_pair(idx1, idx2, dist));
+ edges.push_back(ordered_sample_pair(idx2, idx1, dist));
+ }
+ }
+ }
+
+ std::sort(edges.begin(), edges.end(), &order_by_index<ordered_sample_pair>);
+
+ std::vector<ordered_sample_pair>::iterator beg, itr;
+ // now copy edges into temp when they aren't duplicates and also only move in the k shortest for
+ // each index.
+ itr = edges.begin();
+ while (itr != edges.end())
+ {
+ // first find the bounding range for all the edges connected to node itr->index1()
+ beg = itr;
+ while (itr != edges.end() && itr->index1() == beg->index1())
+ ++itr;
+
+ // If the node has more than k edges then sort them by distance so that
+ // we will end up with the k best.
+ if (static_cast<unsigned long>(itr - beg) > k)
+ {
+ std::sort(beg, itr, &order_by_distance_and_index<ordered_sample_pair>);
+ }
+
+ // take the k best unique edges from the range [beg,itr)
+ temp.push_back(sample_pair(beg->index1(), beg->index2(), beg->distance()));
+ unsigned long prev_index2 = beg->index2();
+ ++beg;
+ unsigned long count = 1;
+ for (; beg != itr && count < k; ++beg)
+ {
+ if (beg->index2() != prev_index2)
+ {
+ temp.push_back(sample_pair(beg->index1(), beg->index2(), beg->distance()));
+ ++count;
+ }
+ prev_index2 = beg->index2();
+ }
+ }
+
+
+ remove_duplicate_edges(temp);
+ temp.swap(out);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type,
+ typename distance_function_type,
+ typename alloc
+ >
+ void find_k_nearest_neighbors (
+ const vector_type& samples,
+ const distance_function_type& dist_funct,
+ const unsigned long k,
+ std::vector<sample_pair, alloc>& out
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(k > 0,
+ "\t void find_k_nearest_neighbors()"
+ << "\n\t Invalid inputs were given to this function."
+ << "\n\t samples.size(): " << samples.size()
+ << "\n\t k: " << k
+ );
+
+ out.clear();
+
+ if (samples.size() <= 1)
+ {
+ return;
+ }
+
+ using namespace impl;
+ std::vector<sample_pair> edges;
+
+ // Initialize all the edges to an edge with an invalid index
+ edges.resize(samples.size()*k,
+ sample_pair(samples.size(),samples.size(),std::numeric_limits<double>::infinity()));
+
+ // Hold the length for the longest edge for each node. Initially they are all infinity.
+ std::vector<double> worst_dists(samples.size(), std::numeric_limits<double>::infinity());
+
+ std::vector<sample_pair>::iterator begin_i, end_i, begin_j, end_j;
+ begin_i = edges.begin();
+ end_i = begin_i + k;
+
+ // Loop over all combinations of samples. We will maintain the iterator ranges so that
+ // within the inner for loop we have:
+ // [begin_i, end_i) == the range in edges that contains neighbors of samples[i]
+ // [begin_j, end_j) == the range in edges that contains neighbors of samples[j]
+ for (unsigned long i = 0; i+1 < samples.size(); ++i)
+ {
+ begin_j = begin_i;
+ end_j = end_i;
+
+ for (unsigned long j = i+1; j < samples.size(); ++j)
+ {
+ begin_j += k;
+ end_j += k;
+
+ const double dist = dist_funct(samples[i], samples[j]);
+
+ if (dist < worst_dists[i])
+ {
+ *iterator_of_worst(begin_i, end_i) = sample_pair(i, j, dist);
+ worst_dists[i] = iterator_of_worst(begin_i, end_i)->distance();
+ }
+
+ if (dist < worst_dists[j])
+ {
+ *iterator_of_worst(begin_j, end_j) = sample_pair(i, j, dist);
+ worst_dists[j] = iterator_of_worst(begin_j, end_j)->distance();
+ }
+ }
+
+ begin_i += k;
+ end_i += k;
+ }
+
+ // sort the edges so that duplicate edges will be adjacent
+ std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
+
+ // if the first edge is valid
+ if (edges[0].index1() < samples.size())
+ {
+ // now put edges into out while avoiding duplicates and any remaining invalid edges.
+ out.reserve(edges.size());
+ out.push_back(edges[0]);
+ for (unsigned long i = 1; i < edges.size(); ++i)
+ {
+ // if we hit an invalid edge then we can stop
+ if (edges[i].index1() >= samples.size())
+ break;
+
+ // if this isn't a duplicate edge
+ if (edges[i] != edges[i-1])
+ {
+ out.push_back(edges[i]);
+ }
+ }
+ }
+
+
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ bool contains_duplicate_pairs (
+ const vector_type& pairs
+ )
+ {
+ typedef typename vector_type::value_type T;
+ vector_type temp(pairs);
+ std::sort(temp.begin(), temp.end(), &order_by_index<T>);
+
+ for (unsigned long i = 1; i < temp.size(); ++i)
+ {
+ // if we found a duplicate
+ if (temp[i-1] == temp[i])
+ return true;
+ }
+
+ return false;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ typename enable_if_c<(is_same_type<sample_pair, typename vector_type::value_type>::value ||
+ is_same_type<ordered_sample_pair, typename vector_type::value_type>::value),
+ unsigned long>::type
+ max_index_plus_one (
+ const vector_type& pairs
+ )
+ {
+ if (pairs.size() == 0)
+ {
+ return 0;
+ }
+ else
+ {
+ unsigned long max_idx = 0;
+ for (unsigned long i = 0; i < pairs.size(); ++i)
+ {
+ if (pairs[i].index1() > max_idx)
+ max_idx = pairs[i].index1();
+ if (pairs[i].index2() > max_idx)
+ max_idx = pairs[i].index2();
+ }
+
+ return max_idx + 1;
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ void remove_long_edges (
+ vector_type& pairs,
+ double distance_threshold
+ )
+ {
+ vector_type temp;
+ temp.reserve(pairs.size());
+
+ // add all the pairs shorter than the given threshold into temp
+ for (unsigned long i = 0; i < pairs.size(); ++i)
+ {
+ if (pairs[i].distance() <= distance_threshold)
+ temp.push_back(pairs[i]);
+ }
+
+ // move temp into the output vector
+ temp.swap(pairs);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ void remove_short_edges (
+ vector_type& pairs,
+ double distance_threshold
+ )
+ {
+ vector_type temp;
+ temp.reserve(pairs.size());
+
+ // add all the pairs longer than the given threshold into temp
+ for (unsigned long i = 0; i < pairs.size(); ++i)
+ {
+ if (pairs[i].distance() >= distance_threshold)
+ temp.push_back(pairs[i]);
+ }
+
+ // move temp into the output vector
+ temp.swap(pairs);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ void remove_percent_longest_edges (
+ vector_type& pairs,
+ double percent
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT( 0 <= percent && percent < 1,
+ "\t void remove_percent_longest_edges()"
+ << "\n\t Invalid inputs were given to this function."
+ << "\n\t percent: " << percent
+ );
+
+ typedef typename vector_type::value_type T;
+ std::sort(pairs.begin(), pairs.end(), &order_by_distance<T>);
+
+ const unsigned long num = static_cast<unsigned long>((1.0-percent)*pairs.size());
+
+ // pick out the num shortest pairs
+ vector_type temp(pairs.begin(), pairs.begin() + num);
+
+ // move temp into the output vector
+ temp.swap(pairs);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ void remove_percent_shortest_edges (
+ vector_type& pairs,
+ double percent
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT( 0 <= percent && percent < 1,
+ "\t void remove_percent_shortest_edges()"
+ << "\n\t Invalid inputs were given to this function."
+ << "\n\t percent: " << percent
+ );
+
+ typedef typename vector_type::value_type T;
+ std::sort(pairs.rbegin(), pairs.rend(), &order_by_distance<T>);
+
+ const unsigned long num = static_cast<unsigned long>((1.0-percent)*pairs.size());
+
+ // pick out the num shortest pairs
+ vector_type temp(pairs.begin(), pairs.begin() + num);
+
+ // move temp into the output vector
+ temp.swap(pairs);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename vector_type
+ >
+ bool is_ordered_by_index (
+ const vector_type& edges
+ )
+ {
+ for (unsigned long i = 1; i < edges.size(); ++i)
+ {
+ if (order_by_index(edges[i], edges[i-1]))
+ return false;
+ }
+ return true;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename alloc1,
+ typename alloc2
+ >
+ void find_neighbor_ranges (
+ const std::vector<ordered_sample_pair,alloc1>& edges,
+ std::vector<std::pair<unsigned long, unsigned long>,alloc2>& neighbors
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_ordered_by_index(edges),
+ "\t void find_neighbor_ranges()"
+ << "\n\t Invalid inputs were given to this function"
+ );
+
+
+ // setup neighbors so that [neighbors[i].first, neighbors[i].second) is the range
+ // within edges that contains all node i's edges.
+ const unsigned long num_nodes = max_index_plus_one(edges);
+ neighbors.assign(num_nodes, std::make_pair(0,0));
+ unsigned long cur_node = 0;
+ unsigned long start_idx = 0;
+ for (unsigned long i = 0; i < edges.size(); ++i)
+ {
+ if (edges[i].index1() != cur_node)
+ {
+ neighbors[cur_node] = std::make_pair(start_idx, i);
+ start_idx = i;
+ cur_node = edges[i].index1();
+ }
+ }
+ if (neighbors.size() != 0)
+ neighbors[cur_node] = std::make_pair(start_idx, (unsigned long)edges.size());
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename alloc1,
+ typename alloc2
+ >
+ void convert_unordered_to_ordered (
+ const std::vector<sample_pair,alloc1>& edges,
+ std::vector<ordered_sample_pair,alloc2>& out_edges
+ )
+ {
+ out_edges.clear();
+ out_edges.reserve(edges.size()*2);
+ for (unsigned long i = 0; i < edges.size(); ++i)
+ {
+ out_edges.push_back(ordered_sample_pair(edges[i].index1(), edges[i].index2(), edges[i].distance()));
+ if (edges[i].index1() != edges[i].index2())
+ out_edges.push_back(ordered_sample_pair(edges[i].index2(), edges[i].index1(), edges[i].distance()));
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_EDGE_LIST_GrAPHS_Hh_
+
+