diff options
Diffstat (limited to 'ml/dlib/dlib/lsh')
-rw-r--r-- | ml/dlib/dlib/lsh/create_random_projection_hash.h | 232 | ||||
-rw-r--r-- | ml/dlib/dlib/lsh/create_random_projection_hash_abstract.h | 148 | ||||
-rw-r--r-- | ml/dlib/dlib/lsh/hashes.h | 219 | ||||
-rw-r--r-- | ml/dlib/dlib/lsh/hashes_abstract.h | 286 | ||||
-rw-r--r-- | ml/dlib/dlib/lsh/projection_hash.h | 118 | ||||
-rw-r--r-- | ml/dlib/dlib/lsh/projection_hash_abstract.h | 119 |
6 files changed, 1122 insertions, 0 deletions
diff --git a/ml/dlib/dlib/lsh/create_random_projection_hash.h b/ml/dlib/dlib/lsh/create_random_projection_hash.h new file mode 100644 index 000000000..b3aecd9ec --- /dev/null +++ b/ml/dlib/dlib/lsh/create_random_projection_hash.h @@ -0,0 +1,232 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CREATE_RANDOM_PROJECTION_HAsH_Hh_ +#define DLIB_CREATE_RANDOM_PROJECTION_HAsH_Hh_ + +#include "create_random_projection_hash_abstract.h" +#include "projection_hash.h" +#include "../matrix.h" +#include "../rand.h" +#include "../statistics.h" +#include "../svm.h" +#include <vector> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_random_projection_hash ( + const vector_type& v, + const int bits, + dlib::rand& rnd + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(0 < bits && bits <= 32 && + v.size() > 1, + "\t projection_hash create_random_projection_hash()" + << "\n\t Invalid arguments were given to this function." + << "\n\t bits: " << bits + << "\n\t v.size(): " << v.size() + ); + +#ifdef ENABLE_ASSERTS + for (unsigned long i = 0; i < v.size(); ++i) + { + DLIB_ASSERT(v[0].size() == v[i].size() && v[i].size() > 0 && is_col_vector(v[i]), + "\t projection_hash create_random_projection_hash()" + << "\n\t Invalid arguments were given to this function." + << "\n\t m(0).size(): " << v[0].size() + << "\n\t m("<<i<<").size(): " << v[i].size() + << "\n\t is_col_vector(v["<<i<<"]): " << is_col_vector(v[i]) + ); + } +#endif + + running_covariance<matrix<double> > rc; + for (unsigned long i = 0; i < v.size(); ++i) + rc.add(matrix_cast<double>(v[i])); + + // compute a whitening matrix + matrix<double> whiten = trans(chol(pinv(rc.covariance()))); + + + // hashes + std::vector<unsigned long> h(v.size(),0); + + std::vector<double> vals(v.size(),0); + + // number of hits for each hash value + std::vector<unsigned long> counts; + + std::vector<double> temp; + + // build a random projection matrix + matrix<double> proj(bits, v[0].size()); + for (long r = 0; r < proj.nr(); ++r) + for (long c = 0; c < proj.nc(); ++c) + proj(r,c) = rnd.get_random_gaussian(); + + // merge whitening matrix with projection matrix + proj = proj*whiten; + + matrix<double,0,1> offset(bits); + + + // figure out what the offset values should be + for (int itr = 0; itr < offset.size(); ++itr) + { + counts.assign(static_cast<unsigned long>(std::pow(2.0,bits)), 0); + // count the popularity of each hash value + for (unsigned long i = 0; i < h.size(); ++i) + { + h[i] <<= 1; + counts[h[i]] += 1; + } + + const unsigned long max_h = index_of_max(mat(counts)); + + temp.clear(); + for (unsigned long i = 0; i < v.size(); ++i) + { + vals[i] = dot(rowm(proj,itr), matrix_cast<double>(v[i])); + if (h[i] == max_h) + temp.push_back(vals[i]); + } + + // split down the middle + std::sort(temp.begin(), temp.end()); + const double split = temp[temp.size()/2]; + offset(itr) = -split; + + for (unsigned long i = 0; i < vals.size(); ++i) + { + if (vals[i] - split > 0) + h[i] |= 1; + } + } + + + return projection_hash(proj, offset); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_random_projection_hash ( + const vector_type& v, + const int bits + ) + { + dlib::rand rnd; + return create_random_projection_hash(v,bits,rnd); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_max_margin_projection_hash ( + const vector_type& v, + const int bits, + const double C, + dlib::rand& rnd + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(0 < bits && bits <= 32 && + v.size() > 1, + "\t projection_hash create_max_margin_projection_hash()" + << "\n\t Invalid arguments were given to this function." + << "\n\t bits: " << bits + << "\n\t v.size(): " << v.size() + ); + +#ifdef ENABLE_ASSERTS + for (unsigned long i = 0; i < v.size(); ++i) + { + DLIB_ASSERT(v[0].size() == v[i].size() && v[i].size() > 0 && is_col_vector(v[i]), + "\t projection_hash create_max_margin_projection_hash()" + << "\n\t Invalid arguments were given to this function." + << "\n\t m(0).size(): " << v[0].size() + << "\n\t m("<<i<<").size(): " << v[i].size() + << "\n\t is_col_vector(v["<<i<<"]): " << is_col_vector(v[i]) + ); + } +#endif + + running_covariance<matrix<double> > rc; + for (unsigned long i = 0; i < v.size(); ++i) + rc.add(matrix_cast<double>(v[i])); + + // compute a whitening matrix + matrix<double> whiten = trans(chol(pinv(rc.covariance()))); + const matrix<double,0,1> meanval = whiten*rc.mean(); + + + + typedef matrix<double,0,1> sample_type; + random_subset_selector<sample_type> training_samples; + random_subset_selector<double> training_labels; + // We set this up to use enough samples to cover the vector space used by elements + // of v. + training_samples.set_max_size(v[0].size()*10); + training_labels.set_max_size(v[0].size()*10); + + matrix<double> proj(bits, v[0].size()); + matrix<double,0,1> offset(bits); + + // learn the random planes and put them into proj and offset. + for (int itr = 0; itr < offset.size(); ++itr) + { + training_samples.make_empty(); + training_labels.make_empty(); + // pick random training data and give each sample a random label. + for (unsigned long i = 0; i < v.size(); ++i) + { + training_samples.add(whiten*v[i]-meanval); + if (rnd.get_random_double() > 0.5) + training_labels.add(+1); + else + training_labels.add(-1); + } + + svm_c_linear_dcd_trainer<linear_kernel<sample_type> > trainer; + trainer.set_c(C); + decision_function<linear_kernel<sample_type> > df = trainer.train(training_samples, training_labels); + offset(itr) = -df.b; + set_rowm(proj,itr) = trans(df.basis_vectors(0)); + } + + + return projection_hash(proj*whiten, offset-proj*meanval); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_max_margin_projection_hash ( + const vector_type& v, + const int bits, + const double C = 10 + ) + { + dlib::rand rnd; + return create_max_margin_projection_hash(v,bits,C,rnd); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_CREATE_RANDOM_PROJECTION_HAsH_Hh_ + diff --git a/ml/dlib/dlib/lsh/create_random_projection_hash_abstract.h b/ml/dlib/dlib/lsh/create_random_projection_hash_abstract.h new file mode 100644 index 000000000..cff55b9a5 --- /dev/null +++ b/ml/dlib/dlib/lsh/create_random_projection_hash_abstract.h @@ -0,0 +1,148 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_CREATE_RANDOM_PROJECTION_HAsH_ABSTRACT_Hh_ +#ifdef DLIB_CREATE_RANDOM_PROJECTION_HAsH_ABSTRACT_Hh_ + +#include "projection_hash_abstract.h" +#include "../rand.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_random_projection_hash ( + const vector_type& v, + const int bits, + dlib::rand& rnd + ); + /*! + requires + - 0 < bits <= 32 + - v.size() > 1 + - vector_type == a std::vector or compatible type containing dlib::matrix + objects, each representing a column vector of the same size. + - for all valid i, j: + - is_col_vector(v[i]) == true + - v[i].size() > 0 + - v[i].size() == v[j].size() + - i.e. v contains only column vectors and all the column vectors + have the same non-zero length + - rand_type == a type that implements the dlib/rand/rand_kernel_abstract.h interface + ensures + - returns a hash function H such that: + - H.num_hash_bins() == pow(2,bits) + - H will be setup so that it hashes the contents of v such that each bin + ends up with roughly the same number of elements in it. This is + accomplished by picking random hyperplanes passing though the data. In + particular, each plane normal vector is filled with Gaussian random + numbers and we also perform basic centering to ensure the plane passes + though the data. + - This function uses the supplied random number generator, rnd, to drive part + of it's processing. Therefore, giving different random number generators + will produce different outputs. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_random_projection_hash ( + const vector_type& v, + const int bits + ); + /*! + requires + - 0 < bits <= 32 + - v.size() > 1 + - vector_type == a std::vector or compatible type containing dlib::matrix + objects, each representing a column vector of the same size. + - for all valid i, j: + - is_col_vector(v[i]) == true + - v[i].size() > 0 + - v[i].size() == v[j].size() + - i.e. v contains only column vectors and all the column vectors + have the same non-zero length + ensures + - returns create_random_projection_hash(v,bits,dlib::rand()) + (i.e. calls the above function with a default initialized random number generator) + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_max_margin_projection_hash ( + const vector_type& v, + const int bits, + const double C, + dlib::rand& rnd + ); + /*! + requires + - 0 < bits <= 32 + - v.size() > 1 + - vector_type == a std::vector or compatible type containing dlib::matrix + objects, each representing a column vector of the same size. + - for all valid i, j: + - is_col_vector(v[i]) == true + - v[i].size() > 0 + - v[i].size() == v[j].size() + - i.e. v contains only column vectors and all the column vectors + have the same non-zero length + - rand_type == a type that implements the dlib/rand/rand_kernel_abstract.h interface + ensures + - returns a hash function H such that: + - H.num_hash_bins() == pow(2,bits) + - H will be setup so that it hashes the contents of v such that + each bin ends up with roughly the same number of elements + in it. This is accomplished using a variation on the random hyperplane + generation technique from the paper: + Random Maximum Margin Hashing by Alexis Joly and Olivier Buisson + In particular, we use the svm_c_linear_dcd_trainer to generate planes. + We train it on randomly selected and randomly labeled points from v. + The C SVM parameter is set to the given C argument. + - This function uses the supplied random number generator, rnd, to drive part + of it's processing. Therefore, giving different random number generators + will produce different outputs. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename vector_type + > + projection_hash create_max_margin_projection_hash ( + const vector_type& v, + const int bits, + const double C = 10 + ); + /*! + requires + - 0 < bits <= 32 + - v.size() > 1 + - vector_type == a std::vector or compatible type containing dlib::matrix + objects, each representing a column vector of the same size. + - for all valid i, j: + - is_col_vector(v[i]) == true + - v[i].size() > 0 + - v[i].size() == v[j].size() + - i.e. v contains only column vectors and all the column vectors + have the same non-zero length + ensures + - returns create_max_margin_projection_hash(v,bits,C,dlib::rand()) + (i.e. calls the above function with a default initialized random number generator) + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_CREATE_RANDOM_PROJECTION_HAsH_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/lsh/hashes.h b/ml/dlib/dlib/lsh/hashes.h new file mode 100644 index 000000000..35053ce4e --- /dev/null +++ b/ml/dlib/dlib/lsh/hashes.h @@ -0,0 +1,219 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_LSH_HAShES_Hh_ +#define DLIB_LSH_HAShES_Hh_ + +#include "hashes_abstract.h" +#include "../hash.h" +#include "../matrix.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class hash_similar_angles_64 + { + public: + hash_similar_angles_64 ( + ) : seed(0) {} + + hash_similar_angles_64 ( + const uint64 seed_ + ) : seed(seed_) {} + + uint64 get_seed ( + ) const { return seed; } + + + typedef uint64 result_type; + + template < + typename sparse_vector_type + > + typename disable_if<is_matrix<sparse_vector_type>,uint64>::type operator() ( + const sparse_vector_type& v + ) const + { + typedef typename sparse_vector_type::value_type::second_type scalar_type; + + uint64 temp = 0; + for (int i = 0; i < 64; ++i) + { + // compute the dot product between v and a Gaussian random vector. + scalar_type val = 0; + for (typename sparse_vector_type::const_iterator j = v.begin(); j != v.end(); ++j) + val += j->second*gaussian_random_hash(j->first, i, seed); + + if (val > 0) + temp |= 1; + temp <<= 1; + } + return temp; + } + + template <typename EXP> + uint64 operator() ( + const matrix_exp<EXP>& v + ) const + { + typedef typename EXP::type T; + uint64 temp = 0; + for (unsigned long i = 0; i < 64; ++i) + { + if (dot(matrix_cast<T>(gaussian_randm(v.size(),1,i+seed*64)), v) > 0) + temp |= 1; + temp <<= 1; + } + return temp; + } + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const + { + return hamming_distance(a,b); + } + + private: + const uint64 seed; + }; + +// ---------------------------------------------------------------------------------------- + + class hash_similar_angles_128 + { + public: + hash_similar_angles_128 ( + ) : seed(0),hasher1(0), hasher2(1) {} + + hash_similar_angles_128 ( + const uint64 seed_ + ) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {} + + uint64 get_seed ( + ) const { return seed; } + + typedef std::pair<uint64,uint64> result_type; + + template < + typename vector_type + > + result_type operator() ( + const vector_type& v + ) const + { + return std::make_pair(hasher1(v), hasher2(v)); + } + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const + { + return hamming_distance(a.first,b.first) + + hamming_distance(a.second,b.second); + } + + private: + const uint64 seed; + hash_similar_angles_64 hasher1; + hash_similar_angles_64 hasher2; + + }; + +// ---------------------------------------------------------------------------------------- + + class hash_similar_angles_256 + { + public: + hash_similar_angles_256 ( + ) : seed(0), hasher1(0), hasher2(1) {} + + hash_similar_angles_256 ( + const uint64 seed_ + ) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {} + + uint64 get_seed ( + ) const { return seed; } + + typedef std::pair<uint64,uint64> hash128_type; + typedef std::pair<hash128_type,hash128_type> result_type; + + template < + typename vector_type + > + result_type operator() ( + const vector_type& v + ) const + { + return std::make_pair(hasher1(v), hasher2(v)); + } + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const + { + return hasher1.distance(a.first,b.first) + + hasher1.distance(a.second,b.second); + } + + private: + const uint64 seed; + hash_similar_angles_128 hasher1; + hash_similar_angles_128 hasher2; + + }; + +// ---------------------------------------------------------------------------------------- + + class hash_similar_angles_512 + { + public: + hash_similar_angles_512 ( + ) : seed(0), hasher1(0), hasher2(1) {} + + hash_similar_angles_512 ( + const uint64 seed_ + ) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {} + + uint64 get_seed ( + ) const { return seed; } + + + typedef hash_similar_angles_256::result_type hash256_type; + typedef std::pair<hash256_type,hash256_type> result_type; + + template < + typename vector_type + > + result_type operator() ( + const vector_type& v + ) const + { + return std::make_pair(hasher1(v), hasher2(v)); + } + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const + { + return hasher1.distance(a.first,b.first) + + hasher1.distance(a.second,b.second); + } + + private: + const uint64 seed; + hash_similar_angles_256 hasher1; + hash_similar_angles_256 hasher2; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_LSH_HAShES_Hh_ + diff --git a/ml/dlib/dlib/lsh/hashes_abstract.h b/ml/dlib/dlib/lsh/hashes_abstract.h new file mode 100644 index 000000000..27f8ddb69 --- /dev/null +++ b/ml/dlib/dlib/lsh/hashes_abstract.h @@ -0,0 +1,286 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_LSH_HAShES_ABSTRACT_Hh_ +#ifdef DLIB_LSH_HAShES_ABSTRACT_Hh_ + +#include "../matrix.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class hash_similar_angles_64 + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for computing locality sensitive hashes that give + vectors with small angles between each other similar hash values. In + particular, this object creates 64 random planes which pass though the + origin and uses them to create a 64bit hash. To compute the hash for a new + vector, this object checks which side of each plane the vector falls on and + records this information into a 64bit integer. + !*/ + + public: + + hash_similar_angles_64 ( + ); + /*! + ensures + - #get_seed() == 0 + !*/ + + hash_similar_angles_64 ( + const uint64 seed + ); + /*! + ensures + - #get_seed() == seed + !*/ + + uint64 get_seed ( + ) const; + /*! + ensures + - returns the random seed used to generate the random planes used for + hashing. + !*/ + + typedef uint64 result_type; + + template <typename vector_type> + result_type perator() ( + const vector_type& v + ) const; + /*! + requires + - v is an unsorted sparse vector or a dlib matrix representing either a + column or row vector. + ensures + - returns a 64 bit hash of the input vector v. The bits in the hash record + which side of each random plane v falls on. + + !*/ + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const; + /*! + ensures + - returns the Hamming distance between the two hashes given to this + function. That is, we return the number of bits in a and b which differ. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + struct hash_similar_angles_128 + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for computing locality sensitive hashes that give + vectors with small angles between each other similar hash values. In + particular, this object creates 128 random planes which pass though the + origin and uses them to create a 128bit hash. To compute the hash for a new + vector, this object checks which side of each plane the vector falls on and + records this information into a 128bit integer. + !*/ + + public: + + hash_similar_angles_128 ( + ); + /*! + ensures + - #get_seed() == 0 + !*/ + + hash_similar_angles_128 ( + const uint64 seed + ); + /*! + ensures + - #get_seed() == seed + !*/ + + uint64 get_seed ( + ) const; + /*! + ensures + - returns the random seed used to generate the random planes used for + hashing. + !*/ + + typedef std::pair<uint64,uint64> result_type; + + template <typename vector_type> + result_type perator() ( + const vector_type& v + ) const; + /*! + requires + - v is an unsorted sparse vector or a dlib matrix representing either a + column or row vector. + ensures + - returns a 128 bit hash of the input vector v. The bits in the hash record + which side of each random plane v falls on. + + !*/ + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const; + /*! + ensures + - returns the Hamming distance between the two hashes given to this + function. That is, we return the number of bits in a and b which differ. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + struct hash_similar_angles_256 + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for computing locality sensitive hashes that give + vectors with small angles between each other similar hash values. In + particular, this object creates 256 random planes which pass though the + origin and uses them to create a 256bit hash. To compute the hash for a new + vector, this object checks which side of each plane the vector falls on and + records this information into a 256bit integer. + !*/ + + public: + + hash_similar_angles_256 ( + ); + /*! + ensures + - #get_seed() == 0 + !*/ + + hash_similar_angles_256 ( + const uint64 seed + ); + /*! + ensures + - #get_seed() == seed + !*/ + + uint64 get_seed ( + ) const; + /*! + ensures + - returns the random seed used to generate the random planes used for + hashing. + !*/ + + typedef std::pair<uint64,uint64> hash128_type; + typedef std::pair<hash128_type,hash128_type> result_type; + + template <typename vector_type> + result_type perator() ( + const vector_type& v + ) const; + /*! + requires + - v is an unsorted sparse vector or a dlib matrix representing either a + column or row vector. + ensures + - returns a 256 bit hash of the input vector v. The bits in the hash record + which side of each random plane v falls on. + + !*/ + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const; + /*! + ensures + - returns the Hamming distance between the two hashes given to this + function. That is, we return the number of bits in a and b which differ. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + struct hash_similar_angles_512 + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for computing locality sensitive hashes that give + vectors with small angles between each other similar hash values. In + particular, this object creates 512 random planes which pass though the + origin and uses them to create a 512bit hash. To compute the hash for a new + vector, this object checks which side of each plane the vector falls on and + records this information into a 512bit integer. + !*/ + + public: + + hash_similar_angles_512 ( + ); + /*! + ensures + - #get_seed() == 0 + !*/ + + hash_similar_angles_512 ( + const uint64 seed + ); + /*! + ensures + - #get_seed() == seed + !*/ + + uint64 get_seed ( + ) const; + /*! + ensures + - returns the random seed used to generate the random planes used for + hashing. + !*/ + + typedef hash_similar_angles_256::result_type hash256_type; + typedef std::pair<hash256_type,hash256_type> result_type; + + template <typename vector_type> + result_type perator() ( + const vector_type& v + ) const; + /*! + requires + - v is an unsorted sparse vector or a dlib matrix representing either a + column or row vector. + ensures + - returns a 512 bit hash of the input vector v. The bits in the hash record + which side of each random plane v falls on. + + !*/ + + unsigned int distance ( + const result_type& a, + const result_type& b + ) const; + /*! + ensures + - returns the Hamming distance between the two hashes given to this + function. That is, we return the number of bits in a and b which differ. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_LSH_HAShES_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/lsh/projection_hash.h b/ml/dlib/dlib/lsh/projection_hash.h new file mode 100644 index 000000000..16de0ba11 --- /dev/null +++ b/ml/dlib/dlib/lsh/projection_hash.h @@ -0,0 +1,118 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_PROJECTION_HASh_Hh_ +#define DLIB_PROJECTION_HASh_Hh_ + +#include "projection_hash_abstract.h" +#include "../matrix.h" +#include "../rand.h" +#include <vector> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class projection_hash + { + public: + + projection_hash() {} + + template <typename EXP1, typename EXP2> + projection_hash( + const matrix_exp<EXP1>& proj_, + const matrix_exp<EXP2>& offset_ + ) : proj(proj_), offset(offset_) + { + // make sure requires clause is not broken + DLIB_ASSERT(proj.nr() == offset.nr(), + "\t projection_hash::projection_hash()" + << "\n\t Invalid arguments were given to this function." + << "\n\t proj.nr(): " << proj.nr() + << "\n\t offset.nr(): " << offset.nr() + ); + + } + + const matrix<double>& get_projection_matrix ( + ) const { return proj; } + + const matrix<double,0,1>& get_offset_matrix ( + ) const { return offset; } + + unsigned long num_hash_bins ( + ) const + { + return static_cast<unsigned long>(std::pow(2.0, (double)offset.size())); + } + + template <typename EXP> + unsigned long operator() ( + const matrix_exp<EXP>& v + ) const + { + // make sure requires clause is not broken + DLIB_ASSERT(is_col_vector(v) && + v.size() == get_projection_matrix().nc() && + v.size() > 0, + "\t unsigned long projection_hash::operator()(v)" + << "\n\t Invalid arguments were given to this function." + << "\n\t is_col_vector(v): " << is_col_vector(v) + << "\n\t get_projection_matrix().nc(): " << get_projection_matrix().nc() + << "\n\t v.size(): " << v.size() + ); + + return do_hash(proj*matrix_cast<double>(v) + offset); + } + + private: + + template <typename EXP> + unsigned long do_hash ( + const matrix_exp<EXP>& v + ) const + { + unsigned long h = 0; + for (long i = 0; i < v.size(); ++i) + { + h <<= 1; + if (v(i) > 0) + h |= 1; + } + return h; + } + + matrix<double> proj; + matrix<double,0,1> offset; + }; + +// ---------------------------------------------------------------------------------------- + + inline void serialize ( + const projection_hash& item, + std::ostream& out + ) + { + serialize(item.get_projection_matrix(), out); + serialize(item.get_offset_matrix(), out); + } + + inline void deserialize ( + projection_hash& item, + std::istream& in + ) + { + matrix<double> proj; + matrix<double,0,1> offset; + deserialize(proj, in); + deserialize(offset, in); + item = projection_hash(proj, offset); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_PROJECTION_HASh_Hh_ + diff --git a/ml/dlib/dlib/lsh/projection_hash_abstract.h b/ml/dlib/dlib/lsh/projection_hash_abstract.h new file mode 100644 index 000000000..abe78d10c --- /dev/null +++ b/ml/dlib/dlib/lsh/projection_hash_abstract.h @@ -0,0 +1,119 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_PROJECTION_HASh_ABSTRACT_Hh_ +#ifdef DLIB_PROJECTION_HASh_ABSTRACT_Hh_ + +#include "../matrix.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class projection_hash + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool for hashing elements of a vector space into the integers. + It is intended to represent locality sensitive hashing functions such as + the popular random projection hashing method. + + In particular, it represents hash functions of the form: + hash bit 0 = sign(rowm(P*v + O,0)) + hash bit 1 = sign(rowm(P*v + O,1)) + hash bit 2 = sign(rowm(P*v + O,2)) + ... + Where v is the vector to be hashed. The parameters of the projection + hash are the P and O matrices. + + THREAD SAFETY + The const members of this object can be called concurrently from multiple + threads, however, any operation that modifies the state of an instance of + this object must serialize access to that instance. + !*/ + public: + + projection_hash( + ); + /*! + ensures + - #get_projection_matrix().size() == 0 + - #get_offset_matrix().size() == 0 + !*/ + + template <typename EXP1, typename EXP2> + projection_hash( + const matrix_exp<EXP1>& proj, + const matrix_exp<EXP2>& offset + ); + /*! + requires + - proj.nr() == offset.nr() + ensures + - #get_projection_matrix() == proj + - #get_offset_matrix() == offset + !*/ + + const matrix<double>& get_projection_matrix ( + ) const; + /*! + ensures + - returns the P matrix discussed above in the WHAT THIS OBJECT REPRESENTS + section. + !*/ + + const matrix<double,0,1>& get_offset_matrix ( + ) const; + /*! + ensures + - returns the O matrix discussed above in the WHAT THIS OBJECT REPRESENTS + section. + !*/ + + unsigned long num_hash_bins ( + ) const; + /*! + ensures + - returns the number of possible outputs from this hashing function. + - Specifically, returns: std::pow(2, get_offset_matrix().size()) + !*/ + + template <typename EXP> + unsigned long operator() ( + const matrix_exp<EXP>& v + ) const; + /*! + requires + - is_col_vector(v) == true + - v.size() == get_projection_matrix().nc() + - v.size() > 0 + ensures + - hashes v into the range [0, num_hash_bins()) using the method + discussed in the WHAT THIS OBJECT REPRESENTS section. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + void serialize ( + const projection_hash& item, + std::ostream& out + ); + /*! + provides serialization support + !*/ + + void deserialize ( + projection_hash& item, + std::istream& in + ); + /*! + provides deserialization support + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_PROJECTION_HASh_ABSTRACT_Hh_ + |