Diffstat (limited to 'ml/dlib/dlib/statistics/statistics_abstract.h')
-rw-r--r-- | ml/dlib/dlib/statistics/statistics_abstract.h | 1387 |
1 file changed, 1387 insertions, 0 deletions
diff --git a/ml/dlib/dlib/statistics/statistics_abstract.h b/ml/dlib/dlib/statistics/statistics_abstract.h new file mode 100644 index 00000000..ef8f1380 --- /dev/null +++ b/ml/dlib/dlib/statistics/statistics_abstract.h @@ -0,0 +1,1387 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net), Steve Taylor +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_STATISTICs_ABSTRACT_ +#ifdef DLIB_STATISTICs_ABSTRACT_ + +#include <limits> +#include <cmath> +#include "../matrix/matrix_abstract.h" +#include "../svm/sparse_vector_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename T, + typename alloc + > + double mean_sign_agreement ( + const std::vector<T,alloc>& a, + const std::vector<T,alloc>& b + ); + /*! + requires + - a.size() == b.size() + ensures + - returns the number of times a[i] has the same sign as b[i] divided by + a.size(). So we return the probability that elements of a and b have + the same sign. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T, + typename alloc + > + double correlation ( + const std::vector<T,alloc>& a, + const std::vector<T,alloc>& b + ); + /*! + requires + - a.size() == b.size() + - a.size() > 1 + ensures + - returns the correlation coefficient between all the elements of a and b. + (i.e. how correlated is a(i) with b(i)) + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T, + typename alloc + > + double covariance ( + const std::vector<T,alloc>& a, + const std::vector<T,alloc>& b + ); + /*! + requires + - a.size() == b.size() + - a.size() > 1 + ensures + - returns the covariance between all the elements of a and b. + (i.e. how does a(i) vary with b(i)) + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T, + typename alloc + > + double r_squared ( + const std::vector<T,alloc>& a, + const std::vector<T,alloc>& b + ); + /*! + requires + - a.size() == b.size() + - a.size() > 1 + ensures + - returns the R^2 coefficient of determination between all the elements of a and b. + This value is just the square of correlation(a,b). + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T, + typename alloc + > + double mean_squared_error ( + const std::vector<T,alloc>& a, + const std::vector<T,alloc>& b + ); + /*! + requires + - a.size() == b.size() + ensures + - returns the mean squared error between all the elements of a and b. + (i.e. mean(squared(mat(a)-mat(b)))) + !*/ + +// ---------------------------------------------------------------------------------------- + + double binomial_random_vars_are_different ( + uint64_t k1, + uint64_t n1, + uint64_t k2, + uint64_t n2 + ); + /*! + requires + - k1 <= n1 + - k2 <= n2 + ensures + - Given two binomially distributed random variables, X1 and X2, we want to know + if these variables have the same parameter (i.e. the chance of "success"). + So assume that: + - You observed X1 to give k1 successes out of n1 trials. + - You observed X2 to give k2 successes out of n2 trials. + - This function performs a simple likelihood ratio test to determine if X1 and + X2 have the same parameter. The return value of this function will be: + - Close to 0 if they are probably the same. 
+ - Larger than 0 if X1 probably has a higher "success" rate than X2. + - Smaller than 0 if X2 probably has a higher "success" rate than X1. + Moreover, the larger the absolute magnitude of the return value the more + likely it is that X1 and X2 have different distributions. + - For a discussion of the technique and applications see: + Dunning, Ted. "Accurate methods for the statistics of surprise and + coincidence." Computational linguistics 19.1 (1993): 61-74. + !*/ + +// ---------------------------------------------------------------------------------------- + + double event_correlation ( + uint64_t A_count, + uint64_t B_count, + uint64_t AB_count, + uint64_t total_num_observations + ); + /*! + requires + - AB_count <= A_count <= total_num_observations + - AB_count <= B_count <= total_num_observations + - A_count + B_count - AB_count <= total_num_observations + ensures + - This function does a statistical test to determine if two events co-occur in + a statistically significant way. In particular, we assume you performed + total_num_observations measurements and during those measurements you: + - Observed event A to happen A_count times. + - Observed event B to happen B_count times. + - Observed AB_count co-occurrences of the events. That is, AB_count is the + number of times the events happened together during the same measurement. + - This function returns a number, COR, which can take any real value. It has + the following interpretations: + - COR == 0: there is no evidence of correlation between the two events. + They appear to be unrelated. + - COR > 0: There is evidence that A and B co-occur together. That is, + they happen at the same times more often than you would expect if they + were independent events. The larger the magnitude of COR the more + evidence we have for the correlation. + - COR < 0: There is evidence that A and B are anti-correlated. That is, + when A happens B is unlikely to happen and vise versa. The larger the + magnitude of COR the more evidence we have for the anti-correlation. + - This function implements the simple likelihood ratio test discussed in the + following paper: + Dunning, Ted. "Accurate methods for the statistics of surprise and + coincidence." Computational linguistics 19.1 (1993): 61-74. + So for an extended discussion of the method see the above paper. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class running_stats + { + /*! + REQUIREMENTS ON T + - T must be a float, double, or long double type + + INITIAL VALUE + - mean() == 0 + - current_n() == 0 + + WHAT THIS OBJECT REPRESENTS + This object represents something that can compute the running mean, + variance, skewness, and excess kurtosis of a stream of real numbers. + !*/ + public: + + running_stats( + ); + /*! + ensures + - this object is properly initialized + !*/ + + void clear( + ); + /*! + ensures + - this object has its initial value + - clears all memory of any previous data points + !*/ + + T current_n ( + ) const; + /*! + ensures + - returns the number of points given to this object so far. + !*/ + + void add ( + const T& val + ); + /*! + ensures + - updates the mean, variance, skewness, and kurtosis stored in this object + so that the new value is factored into them. + - #mean() == mean()*current_n()/(current_n()+1) + val/(current_n()+1). + (i.e. the updated mean value that takes the new value into account) + - #variance() == the updated variance that takes this new value into account. 
+ - #skewness() == the updated skewness that takes this new value into account. + - #ex_kurtosis() == the updated kurtosis that takes this new value into account. + - #current_n() == current_n() + 1 + !*/ + + T mean ( + ) const; + /*! + ensures + - returns the mean of all the values presented to this object + so far. + !*/ + + T variance ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the unbiased sample variance of all the values presented to this + object so far. + !*/ + + T stddev ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the unbiased sampled standard deviation of all the values + presented to this object so far. + !*/ + + T skewness ( + ) const; + /*! + requires + - current_n() > 2 + ensures + - returns the unbiased sample skewness of all the values presented + to this object so far. + !*/ + + T ex_kurtosis( + ) const; + /*! + requires + - current_n() > 3 + ensures + - returns the unbiased sample kurtosis of all the values presented + to this object so far. + !*/ + + T max ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the largest value presented to this object so far. + !*/ + + T min ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the smallest value presented to this object so far. + !*/ + + T scale ( + const T& val + ) const; + /*! + requires + - current_n() > 1 + ensures + - return (val-mean())/stddev(); + !*/ + + running_stats operator+ ( + const running_stats& rhs + ) const; + /*! + ensures + - returns a new running_stats object that represents the combination of all + the values given to *this and rhs. That is, this function returns a + running_stats object, R, that is equivalent to what you would obtain if + all calls to this->add() and rhs.add() had instead been done to R. + !*/ + }; + + template <typename T> + void serialize ( + const running_stats<T>& item, + std::ostream& out + ); + /*! + provides serialization support + !*/ + + template <typename T> + void deserialize ( + running_stats<T>& item, + std::istream& in + ); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class running_scalar_covariance + { + /*! + REQUIREMENTS ON T + - T must be a float, double, or long double type + + INITIAL VALUE + - mean_x() == 0 + - mean_y() == 0 + - current_n() == 0 + + WHAT THIS OBJECT REPRESENTS + This object represents something that can compute the running covariance + of a stream of real number pairs. + !*/ + + public: + + running_scalar_covariance( + ); + /*! + ensures + - this object is properly initialized + !*/ + + void clear( + ); + /*! + ensures + - this object has its initial value + - clears all memory of any previous data points + !*/ + + void add ( + const T& x, + const T& y + ); + /*! + ensures + - updates the statistics stored in this object so that + the new pair (x,y) is factored into them. + - #current_n() == current_n() + 1 + !*/ + + T current_n ( + ) const; + /*! + ensures + - returns the number of points given to this object so far. + !*/ + + T mean_x ( + ) const; + /*! + ensures + - returns the mean value of all x samples presented to this object + via add(). + !*/ + + T mean_y ( + ) const; + /*! + ensures + - returns the mean value of all y samples presented to this object + via add(). + !*/ + + T covariance ( + ) const; + /*! 
+ requires + - current_n() > 1 + ensures + - returns the covariance between all the x and y samples presented + to this object via add() + !*/ + + T correlation ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the correlation coefficient between all the x and y samples + presented to this object via add() + !*/ + + T variance_x ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the unbiased sample variance value of all x samples presented + to this object via add(). + !*/ + + T variance_y ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the unbiased sample variance value of all y samples presented + to this object via add(). + !*/ + + T stddev_x ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the unbiased sample standard deviation of all x samples + presented to this object via add(). + !*/ + + T stddev_y ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the unbiased sample standard deviation of all y samples + presented to this object via add(). + !*/ + + running_scalar_covariance operator+ ( + const running_scalar_covariance& rhs + ) const; + /*! + ensures + - returns a new running_scalar_covariance object that represents the + combination of all the values given to *this and rhs. That is, this + function returns a running_scalar_covariance object, R, that is + equivalent to what you would obtain if all calls to this->add() and + rhs.add() had instead been done to R. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class running_scalar_covariance_decayed + { + /*! + REQUIREMENTS ON T + - T must be a float, double, or long double type + + INITIAL VALUE + - mean_x() == 0 + - mean_y() == 0 + - current_n() == 0 + + WHAT THIS OBJECT REPRESENTS + This object represents something that can compute the running covariance of + a stream of real number pairs. It is essentially the same as + running_scalar_covariance except that it forgets about data it has seen + after a certain period of time. It does this by exponentially decaying old + statistics. + !*/ + + public: + + running_scalar_covariance_decayed( + T decay_halflife = 1000 + ); + /*! + requires + - decay_halflife > 0 + ensures + - #forget_factor() == std::pow(0.5, 1/decay_halflife); + (i.e. after decay_halflife calls to add() the data given to the first add + will be down weighted by 0.5 in the statistics stored in this object). + !*/ + + T forget_factor ( + ) const; + /*! + ensures + - returns the exponential forget factor used to forget old statistics when + add() is called. + !*/ + + void add ( + const T& x, + const T& y + ); + /*! + ensures + - updates the statistics stored in this object so that + the new pair (x,y) is factored into them. + - #current_n() == current_n()*forget_factor() + forget_factor() + - Down weights old statistics by a factor of forget_factor(). + !*/ + + T current_n ( + ) const; + /*! + ensures + - returns the effective number of points given to this object. As add() + is called this value will converge to a constant, the value of which is + based on the decay_halflife supplied to the constructor. + !*/ + + T mean_x ( + ) const; + /*! + ensures + - returns the mean value of all x samples presented to this object + via add(). + !*/ + + T mean_y ( + ) const; + /*! + ensures + - returns the mean value of all y samples presented to this object + via add(). + !*/ + + T covariance ( + ) const; + /*!
+ requires + - current_n() > 1 + ensures + - returns the covariance between all the x and y samples presented + to this object via add() + !*/ + + T correlation ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the correlation coefficient between all the x and y samples + presented to this object via add() + !*/ + + T variance_x ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the sample variance value of all x samples presented + to this object via add(). + !*/ + + T variance_y ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the sample variance value of all y samples presented + to this object via add(). + !*/ + + T stddev_x ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the sample standard deviation of all x samples + presented to this object via add(). + !*/ + + T stddev_y ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the sample standard deviation of all y samples + presented to this object via add(). + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class running_stats_decayed + { + /*! + REQUIREMENTS ON T + - T must be a float, double, or long double type + + INITIAL VALUE + - mean() == 0 + - current_n() == 0 + + WHAT THIS OBJECT REPRESENTS + This object represents something that can compute the running mean and + variance of a stream of real numbers. It is similar to running_stats + except that it forgets about data it has seen after a certain period of + time. It does this by exponentially decaying old statistics. + !*/ + + public: + + running_stats_decayed( + T decay_halflife = 1000 + ); + /*! + requires + - decay_halflife > 0 + ensures + - #forget_factor() == std::pow(0.5, 1/decay_halflife); + (i.e. after decay_halflife calls to add() the data given to the first add + will be down weighted by 0.5 in the statistics stored in this object). + !*/ + + T forget_factor ( + ) const; + /*! + ensures + - returns the exponential forget factor used to forget old statistics when + add() is called. + !*/ + + void add ( + const T& x + ); + /*! + ensures + - updates the statistics stored in this object so that x is factored into + them. + - #current_n() == current_n()*forget_factor() + forget_factor() + - Down weights old statistics by a factor of forget_factor(). + !*/ + + T current_n ( + ) const; + /*! + ensures + - returns the effective number of points given to this object. As add() + is called this value will converge to a constant, the value of which is + based on the decay_halflife supplied to the constructor. + !*/ + + T mean ( + ) const; + /*! + ensures + - returns the mean value of all x samples presented to this object + via add(). + !*/ + + T variance ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the sample variance value of all x samples presented to this + object via add(). + !*/ + + T stddev ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the sample standard deviation of all x samples presented to this + object via add(). + !*/ + + }; + + template <typename T> + void serialize ( + const running_stats_decayed<T>& item, + std::ostream& out + ); + /*! + provides serialization support + !*/ + + template <typename T> + void deserialize ( + running_stats_decayed<T>& item, + std::istream& in + ); + /*! 
+ provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class running_covariance + { + /*! + REQUIREMENTS ON matrix_type + Must be some type of dlib::matrix. + + INITIAL VALUE + - in_vector_size() == 0 + - current_n() == 0 + + WHAT THIS OBJECT REPRESENTS + This object is a simple tool for computing the mean and + covariance of a sequence of vectors. + !*/ + public: + + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef typename matrix_type::layout_type layout_type; + typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; + typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; + + running_covariance( + ); + /*! + ensures + - this object is properly initialized + !*/ + + void clear( + ); + /*! + ensures + - this object has its initial value + - clears all memory of any previous data points + !*/ + + long current_n ( + ) const; + /*! + ensures + - returns the number of samples that have been presented to this object + !*/ + + long in_vector_size ( + ) const; + /*! + ensures + - if (this object has been presented with any input vectors or + set_dimension() has been called) then + - returns the dimension of the column vectors used with this object + - else + - returns 0 + !*/ + + void set_dimension ( + long size + ); + /*! + requires + - size > 0 + ensures + - #in_vector_size() == size + - #current_n() == 0 + !*/ + + template <typename T> + void add ( + const T& val + ); + /*! + requires + - val must represent a column vector. It can either be a dlib::matrix + object or some kind of unsorted sparse vector type. See the top of + dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector. + - val must have a number of dimensions which is compatible with the current + setting of in_vector_size(). In particular, this means that the + following must hold: + - if (val is a dlib::matrix) then + - in_vector_size() == 0 || val.size() == val_vector_size() + - else + - max_index_plus_one(val) <= in_vector_size() + - in_vector_size() > 0 + (i.e. you must call set_dimension() prior to calling add() if + you want to use sparse vectors.) + ensures + - updates the mean and covariance stored in this object so that + the new value is factored into them. + - if (val is a dlib::matrix) then + - #in_vector_size() == val.size() + !*/ + + const column_matrix mean ( + ) const; + /*! + requires + - in_vector_size() != 0 + ensures + - returns the mean of all the vectors presented to this object + so far. + !*/ + + const general_matrix covariance ( + ) const; + /*! + requires + - in_vector_size() != 0 + - current_n() > 1 + ensures + - returns the unbiased sample covariance matrix for all the vectors + presented to this object so far. + !*/ + + const running_covariance operator+ ( + const running_covariance& item + ) const; + /*! + requires + - in_vector_size() == 0 || item.in_vector_size() == 0 || in_vector_size() == item.in_vector_size() + (i.e. the in_vector_size() of *this and item must match or one must be zero) + ensures + - returns a new running_covariance object that represents the combination of all + the vectors given to *this and item. That is, this function returns a + running_covariance object, R, that is equivalent to what you would obtain if all + calls to this->add() and item.add() had instead been done to R. 
+ !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class running_cross_covariance + { + /*! + REQUIREMENTS ON matrix_type + Must be some type of dlib::matrix. + + INITIAL VALUE + - x_vector_size() == 0 + - y_vector_size() == 0 + - current_n() == 0 + + WHAT THIS OBJECT REPRESENTS + This object is a simple tool for computing the mean and cross-covariance + matrices of a sequence of pairs of vectors. + !*/ + + public: + + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef typename matrix_type::layout_type layout_type; + typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; + typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; + + running_cross_covariance( + ); + /*! + ensures + - this object is properly initialized + !*/ + + void clear( + ); + /*! + ensures + - This object has its initial value. + - Clears all memory of any previous data points. + !*/ + + long x_vector_size ( + ) const; + /*! + ensures + - if (this object has been presented with any input vectors or + set_dimensions() has been called) then + - returns the dimension of the x vectors given to this object via add(). + - else + - returns 0 + !*/ + + long y_vector_size ( + ) const; + /*! + ensures + - if (this object has been presented with any input vectors or + set_dimensions() has been called) then + - returns the dimension of the y vectors given to this object via add(). + - else + - returns 0 + !*/ + + void set_dimensions ( + long x_size, + long y_size + ); + /*! + requires + - x_size > 0 + - y_size > 0 + ensures + - #x_vector_size() == x_size + - #y_vector_size() == y_size + - #current_n() == 0 + !*/ + + long current_n ( + ) const; + /*! + ensures + - returns the number of samples that have been presented to this object. + !*/ + + template <typename T, typename U> + void add ( + const T& x, + const U& y + ); + /*! + requires + - x and y must represent column vectors. They can either be dlib::matrix + objects or some kind of unsorted sparse vector type. See the top of + dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector. + - x and y must have a number of dimensions which is compatible with the + current setting of x_vector_size() and y_vector_size(). In particular, + this means that the following must hold: + - if (x or y is a sparse vector type) then + - x_vector_size() > 0 && y_vector_size() > 0 + (i.e. you must call set_dimensions() prior to calling add() if + you want to use sparse vectors.) + - if (x is a dlib::matrix) then + - x_vector_size() == 0 || x.size() == x_vector_size() + - else + - max_index_plus_one(x) <= x_vector_size() + - if (y is a dlib::matrix) then + - y_vector_size() == 0 || y.size() == y_vector_size() + - else + - max_index_plus_one(y) <= y_vector_size() + ensures + - updates the mean and cross-covariance matrices stored in this object so + that the new (x,y) vector pair is factored into them. + - if (x is a dlib::matrix) then + - #x_vector_size() == x.size() + - if (y is a dlib::matrix) then + - #y_vector_size() == y.size() + !*/ + + const column_matrix mean_x ( + ) const; + /*! + requires + - current_n() != 0 + ensures + - returns the mean of all the x vectors presented to this object so far. + - The returned vector will have x_vector_size() dimensions. + !*/ + + const column_matrix mean_y ( + ) const; + /*! 
+ requires + - current_n() != 0 + ensures + - returns the mean of all the y vectors presented to this object so far. + - The returned vector will have y_vector_size() dimensions. + !*/ + + const general_matrix covariance_xy ( + ) const; + /*! + requires + - current_n() > 1 + ensures + - returns the unbiased sample cross-covariance matrix for all the vector + pairs presented to this object so far. In particular, returns a matrix + M such that: + - M.nr() == x_vector_size() + - M.nc() == y_vector_size() + - M == the cross-covariance matrix of the data given to add(). + !*/ + + const running_cross_covariance operator+ ( + const running_cross_covariance& item + ) const; + /*! + requires + - x_vector_size() == 0 || item.x_vector_size() == 0 || x_vector_size() == item.x_vector_size() + (i.e. the x_vector_size() of *this and item must match or one must be zero) + - y_vector_size() == 0 || item.y_vector_size() == 0 || y_vector_size() == item.y_vector_size() + (i.e. the y_vector_size() of *this and item must match or one must be zero) + ensures + - returns a new running_cross_covariance object that represents the + combination of all the vectors given to *this and item. That is, this + function returns a running_cross_covariance object, R, that is equivalent + to what you would obtain if all calls to this->add() and item.add() had + instead been done to R. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class vector_normalizer + { + /*! + REQUIREMENTS ON matrix_type + - must be a dlib::matrix object capable of representing column + vectors + + INITIAL VALUE + - in_vector_size() == 0 + - out_vector_size() == 0 + - means().size() == 0 + - std_devs().size() == 0 + + WHAT THIS OBJECT REPRESENTS + This object represents something that can learn to normalize a set + of column vectors. In particular, normalized column vectors should + have zero mean and a variance of one. + + Also, if desired, this object can use principal component + analysis for the purposes of reducing the number of elements in a + vector. + + THREAD SAFETY + Note that this object contains a cached matrix object it uses + to store intermediate results for normalization. This avoids + needing to reallocate it every time this object performs normalization + but also makes it non-thread safe. So make sure you don't share + instances of this object between threads. + !*/ + + public: + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef matrix_type result_type; + + template <typename vector_type> + void train ( + const vector_type& samples + ); + /*! + requires + - samples.size() > 0 + - samples == a column matrix or something convertible to a column + matrix via mat(). Also, x should contain + matrix_type objects that represent nonempty column vectors. + - samples does not contain any infinite or NaN values + ensures + - #in_vector_size() == samples(0).nr() + - #out_vector_size() == samples(0).nr() + - This object has learned how to normalize vectors that look like + vectors in the given set of samples. + - #means() == mean(samples) + - #std_devs() == reciprocal(sqrt(variance(samples))); + !*/ + + long in_vector_size ( + ) const; + /*! + ensures + - returns the number of rows that input vectors are + required to contain if they are to be normalized by + this object. + !*/ + + long out_vector_size ( + ) const; + /*! 
+ ensures + - returns the number of rows in the normalized vectors + that come out of this object. + !*/ + + const matrix_type& means ( + ) const; + /*! + ensures + - returns a matrix M such that: + - M.nc() == 1 + - M.nr() == in_vector_size() + - M(i) == the mean of the ith input feature shown to train() + !*/ + + const matrix_type& std_devs ( + ) const; + /*! + ensures + - returns a matrix SD such that: + - SD.nc() == 1 + - SD.nr() == in_vector_size() + - SD(i) == the reciprocal of the standard deviation of the ith + input feature shown to train() + !*/ + + const result_type& operator() ( + const matrix_type& x + ) const; + /*! + requires + - x.nr() == in_vector_size() + - x.nc() == 1 + ensures + - returns a normalized version of x, call it Z, that has the + following properties: + - Z.nr() == out_vector_size() + - Z.nc() == 1 + - the mean of each element of Z is 0 + - the variance of each element of Z is 1 + - Z == pointwise_multiply(x-means(), std_devs()); + !*/ + + void swap ( + vector_normalizer& item + ); + /*! + ensures + - swaps *this and item + !*/ + }; + + template < + typename matrix_type + > + inline void swap ( + vector_normalizer<matrix_type>& a, + vector_normalizer<matrix_type>& b + ) { a.swap(b); } + /*! + provides a global swap function + !*/ + + template < + typename matrix_type + > + void deserialize ( + vector_normalizer<matrix_type>& item, + std::istream& in + ); + /*! + provides deserialization support + !*/ + + template < + typename matrix_type + > + void serialize ( + const vector_normalizer<matrix_type>& item, + std::ostream& out + ); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class vector_normalizer_pca + { + /*! + REQUIREMENTS ON matrix_type + - must be a dlib::matrix object capable of representing column + vectors + + INITIAL VALUE + - in_vector_size() == 0 + - out_vector_size() == 0 + - means().size() == 0 + - std_devs().size() == 0 + - pca_matrix().size() == 0 + + WHAT THIS OBJECT REPRESENTS + This object represents something that can learn to normalize a set + of column vectors. In particular, normalized column vectors should + have zero mean and a variance of one. + + Also, this object uses principal component analysis for the purposes + of reducing the number of elements in a vector. + + THREAD SAFETY + Note that this object contains a cached matrix object it uses + to store intermediate results for normalization. This avoids + needing to reallocate it every time this object performs normalization + but also makes it non-thread safe. So make sure you don't share + instances of this object between threads. + !*/ + + public: + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef matrix<scalar_type,0,1,mem_manager_type> result_type; + + template <typename vector_type> + void train ( + const vector_type& samples, + const double eps = 0.99 + ); + /*! + requires + - 0 < eps <= 1 + - samples.size() > 0 + - samples == a column matrix or something convertible to a column + matrix via mat(). Also, samples should contain + matrix_type objects that represent nonempty column vectors. + - samples does not contain any infinite or NaN values + ensures + - This object has learned how to normalize vectors that look like + vectors in the given set of samples.
+ - Principal component analysis is performed to find a transform + that might reduce the number of output features. + - #in_vector_size() == samples(0).nr() + - 0 < #out_vector_size() <= samples(0).nr() + - eps is a number that controls how "lossy" the pca transform will be. + Large values of eps result in #out_vector_size() being larger and + smaller values of eps result in #out_vector_size() being smaller. + - #means() == mean(samples) + - #std_devs() == reciprocal(sqrt(variance(samples))); + - #pca_matrix() == the PCA transform matrix that is out_vector_size() + rows by in_vector_size() columns. + !*/ + + long in_vector_size ( + ) const; + /*! + ensures + - returns the number of rows that input vectors are + required to contain if they are to be normalized by + this object. + !*/ + + long out_vector_size ( + ) const; + /*! + ensures + - returns the number of rows in the normalized vectors + that come out of this object. + !*/ + + const matrix<scalar_type,0,1,mem_manager_type>& means ( + ) const; + /*! + ensures + - returns a matrix M such that: + - M.nc() == 1 + - M.nr() == in_vector_size() + - M(i) == the mean of the ith input feature shown to train() + !*/ + + const matrix<scalar_type,0,1,mem_manager_type>& std_devs ( + ) const; + /*! + ensures + - returns a matrix SD such that: + - SD.nc() == 1 + - SD.nr() == in_vector_size() + - SD(i) == the reciprocal of the standard deviation of the ith + input feature shown to train() + !*/ + + const matrix<scalar_type,0,0,mem_manager_type>& pca_matrix ( + ) const; + /*! + ensures + - returns a matrix PCA such that: + - PCA.nr() == out_vector_size() + - PCA.nc() == in_vector_size() + - PCA == the principal component analysis transformation + matrix + !*/ + + const result_type& operator() ( + const matrix_type& x + ) const; + /*! + requires + - x.nr() == in_vector_size() + - x.nc() == 1 + ensures + - returns a normalized version of x, call it Z, that has the + following properties: + - Z.nr() == out_vector_size() + - Z.nc() == 1 + - the mean of each element of Z is 0 + - the variance of each element of Z is 1 + - Z == pca_matrix()*pointwise_multiply(x-means(), std_devs()); + !*/ + + void swap ( + vector_normalizer_pca& item + ); + /*! + ensures + - swaps *this and item + !*/ + }; + + template < + typename matrix_type + > + inline void swap ( + vector_normalizer_pca<matrix_type>& a, + vector_normalizer_pca<matrix_type>& b + ) { a.swap(b); } + /*! + provides a global swap function + !*/ + + template < + typename matrix_type + > + void deserialize ( + vector_normalizer_pca<matrix_type>& item, + std::istream& in + ); + /*! + provides deserialization support + !*/ + + template < + typename matrix_type + > + void serialize ( + const vector_normalizer_pca<matrix_type>& item, + std::ostream& out + ); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_STATISTICs_ABSTRACT_ +
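The header above is an abstract (documentation-only) file, so it contains no executable code itself. As a rough usage sketch, not part of the patch, and assuming dlib is installed so that <dlib/statistics.h> provides the declarations documented above, the event_correlation() likelihood ratio test can be exercised with purely hypothetical counts like this:

    #include <iostream>
    #include <dlib/statistics.h>

    int main()
    {
        // Hypothetical counts: out of 1000 observations, event A occurred 100 times,
        // event B occurred 80 times, and they co-occurred 40 times.  Under independence
        // we would expect about 100*80/1000 = 8 co-occurrences, so the returned score
        // should come out well above 0 (evidence of positive correlation).
        double cor = dlib::event_correlation(100, 80, 40, 1000);
        std::cout << "event_correlation: " << cor << "\n";
        return 0;
    }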
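Similarly, a minimal sketch of the running_stats and running_scalar_covariance interfaces documented above (again assuming <dlib/statistics.h>; the sample values are made up):

    #include <iostream>
    #include <dlib/statistics.h>

    int main()
    {
        // Accumulate running statistics over a small stream of numbers.
        dlib::running_stats<double> rs;
        for (double x : {1.0, 2.0, 3.0, 4.0, 5.0})
            rs.add(x);
        std::cout << "mean:     " << rs.mean()     << "\n";  // 3
        std::cout << "variance: " << rs.variance() << "\n";  // unbiased sample variance, 2.5
        std::cout << "stddev:   " << rs.stddev()   << "\n";

        // Track how two paired streams co-vary.
        dlib::running_scalar_covariance<double> rc;
        for (double x : {1.0, 2.0, 3.0, 4.0})
            rc.add(x, 2*x + 1);  // y is an exact linear function of x
        std::cout << "correlation: " << rc.correlation() << "\n";  // essentially 1
        return 0;
    }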
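And a sketch of training and applying vector_normalizer, following the pattern described in its train() and operator() documentation (the toy 3-dimensional samples are invented for illustration):

    #include <vector>
    #include <dlib/matrix.h>
    #include <dlib/statistics.h>

    int main()
    {
        typedef dlib::matrix<double, 3, 1> sample_type;

        // Build a few toy column-vector samples.
        std::vector<sample_type> samples;
        for (int i = 0; i < 10; ++i)
        {
            sample_type s;
            s = 1.0*i, 2.0*i, 10.0 - i;  // dlib comma-initializer syntax
            samples.push_back(s);
        }

        // Learn the per-feature means and standard deviations, then normalize in place,
        // so each feature of the normalized samples has zero mean and unit variance.
        dlib::vector_normalizer<sample_type> normalizer;
        normalizer.train(samples);
        for (auto& s : samples)
            s = normalizer(s);
        return 0;
    }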