diff options
Diffstat (limited to 'ml/dlib/tools/python/src/cca.cpp')
-rw-r--r-- | ml/dlib/tools/python/src/cca.cpp | 137 |
1 files changed, 0 insertions, 137 deletions
diff --git a/ml/dlib/tools/python/src/cca.cpp b/ml/dlib/tools/python/src/cca.cpp deleted file mode 100644 index dcf476522..000000000 --- a/ml/dlib/tools/python/src/cca.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (C) 2013 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. - -#include "opaque_types.h" -#include <dlib/python.h> -#include <dlib/statistics.h> - -using namespace dlib; -namespace py = pybind11; - -typedef std::vector<std::pair<unsigned long,double> > sparse_vect; - -struct cca_outputs -{ - matrix<double,0,1> correlations; - matrix<double> Ltrans; - matrix<double> Rtrans; -}; - -cca_outputs _cca1 ( - const std::vector<sparse_vect>& L, - const std::vector<sparse_vect>& R, - unsigned long num_correlations, - unsigned long extra_rank, - unsigned long q, - double regularization -) -{ - pyassert(num_correlations > 0 && L.size() > 0 && R.size() > 0 && L.size() == R.size() && regularization >= 0, - "Invalid inputs"); - - cca_outputs temp; - temp.correlations = cca(L,R,temp.Ltrans,temp.Rtrans,num_correlations,extra_rank,q,regularization); - return temp; -} - -// ---------------------------------------------------------------------------------------- - -unsigned long sparse_vector_max_index_plus_one ( - const sparse_vect& v -) -{ - return max_index_plus_one(v); -} - -matrix<double,0,1> apply_cca_transform ( - const matrix<double>& m, - const sparse_vect& v -) -{ - pyassert((long)max_index_plus_one(v) <= m.nr(), "Invalid Inputs"); - return sparse_matrix_vector_multiply(trans(m), v); -} - -void bind_cca(py::module& m) -{ - py::class_<cca_outputs>(m, "cca_outputs") - .def_readwrite("correlations", &cca_outputs::correlations) - .def_readwrite("Ltrans", &cca_outputs::Ltrans) - .def_readwrite("Rtrans", &cca_outputs::Rtrans); - - m.def("max_index_plus_one", sparse_vector_max_index_plus_one, py::arg("v"), -"ensures \n\ - - returns the dimensionality of the given sparse vector. That is, returns a \n\ - number one larger than the maximum index value in the vector. If the vector \n\ - is empty then returns 0. " - ); - - - m.def("apply_cca_transform", apply_cca_transform, py::arg("m"), py::arg("v"), -"requires \n\ - - max_index_plus_one(v) <= m.nr() \n\ -ensures \n\ - - returns trans(m)*v \n\ - (i.e. multiply m by the vector v and return the result) " - ); - - - m.def("cca", _cca1, py::arg("L"), py::arg("R"), py::arg("num_correlations"), py::arg("extra_rank")=5, py::arg("q")=2, py::arg("regularization")=0, -"requires \n\ - - num_correlations > 0 \n\ - - len(L) > 0 \n\ - - len(R) > 0 \n\ - - len(L) == len(R) \n\ - - regularization >= 0 \n\ - - L and R must be properly sorted sparse vectors. This means they must list their \n\ - elements in ascending index order and not contain duplicate index values. You can use \n\ - make_sparse_vector() to ensure this is true. \n\ -ensures \n\ - - This function performs a canonical correlation analysis between the vectors \n\ - in L and R. That is, it finds two transformation matrices, Ltrans and \n\ - Rtrans, such that row vectors in the transformed matrices L*Ltrans and \n\ - R*Rtrans are as correlated as possible (note that in this notation we \n\ - interpret L as a matrix with the input vectors in its rows). Note also that \n\ - this function tries to find transformations which produce num_correlations \n\ - dimensional output vectors. \n\ - - Note that you can easily apply the transformation to a vector using \n\ - apply_cca_transform(). So for example, like this: \n\ - - apply_cca_transform(Ltrans, some_sparse_vector) \n\ - - returns a structure containing the Ltrans and Rtrans transformation matrices \n\ - as well as the estimated correlations between elements of the transformed \n\ - vectors. \n\ - - This function assumes the data vectors in L and R have already been centered \n\ - (i.e. we assume the vectors have zero means). However, in many cases it is \n\ - fine to use uncentered data with cca(). But if it is important for your \n\ - problem then you should center your data before passing it to cca(). \n\ - - This function works with reduced rank approximations of the L and R matrices. \n\ - This makes it fast when working with large matrices. In particular, we use \n\ - the dlib::svd_fast() routine to find reduced rank representations of the input \n\ - matrices by calling it as follows: svd_fast(L, U,D,V, num_correlations+extra_rank, q) \n\ - and similarly for R. This means that you can use the extra_rank and q \n\ - arguments to cca() to influence the accuracy of the reduced rank \n\ - approximation. However, the default values should work fine for most \n\ - problems. \n\ - - The dimensions of the output vectors produced by L*#Ltrans or R*#Rtrans are \n\ - ordered such that the dimensions with the highest correlations come first. \n\ - That is, after applying the transforms produced by cca() to a set of vectors \n\ - you will find that dimension 0 has the highest correlation, then dimension 1 \n\ - has the next highest, and so on. This also means that the list of estimated \n\ - correlations returned from cca() will always be listed in decreasing order. \n\ - - This function performs the ridge regression version of Canonical Correlation \n\ - Analysis when regularization is set to a value > 0. In particular, larger \n\ - values indicate the solution should be more heavily regularized. This can be \n\ - useful when the dimensionality of the data is larger than the number of \n\ - samples. \n\ - - A good discussion of CCA can be found in the paper \"Canonical Correlation \n\ - Analysis\" by David Weenink. In particular, this function is implemented \n\ - using equations 29 and 30 from his paper. We also use the idea of doing CCA \n\ - on a reduced rank approximation of L and R as suggested by Paramveer S. \n\ - Dhillon in his paper \"Two Step CCA: A new spectral method for estimating \n\ - vector models of words\". " - - ); -} - - - |