Diffstat (limited to 'ml/dlib/dlib/optimization/elastic_net.h')
-rw-r--r--  ml/dlib/dlib/optimization/elastic_net.h  389
1 file changed, 389 insertions(+), 0 deletions(-)
diff --git a/ml/dlib/dlib/optimization/elastic_net.h b/ml/dlib/dlib/optimization/elastic_net.h
new file mode 100644
index 000000000..6c4b6d0b4
--- /dev/null
+++ b/ml/dlib/dlib/optimization/elastic_net.h
@@ -0,0 +1,389 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_ElASTIC_NET_Hh_
+#define DLIB_ElASTIC_NET_Hh_
+
+#include "../matrix.h"
+#include "elastic_net_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class elastic_net
+ {
+ public:
+
+ template <typename EXP>
+ explicit elastic_net(
+ const matrix_exp<EXP>& XX
+ ) : eps(1e-5), max_iterations(50000), verbose(false)
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(XX.size() > 0 &&
+ XX.nr() == XX.nc(),
+ "\t elastic_net::elastic_net(XX)"
+ << " \n\t XX must be a non-empty square matrix."
+ << " \n\t XX.nr(): " << XX.nr()
+ << " \n\t XX.nc(): " << XX.nc()
+ << " \n\t this: " << this
+ );
+
+
+            // If the number of columns in X is big, and in particular bigger than the number of
+            // rows, then we can get rid of them by doing some SVD magic.  Doing this doesn't
+            // change the final results, but it makes all the matrices have dimensions that are
+            // X.nr() in size, which can be much smaller.
+ matrix<double,0,1> s;
+ svd3(XX,u,eig_vals,eig_vects);
+ s = sqrt(eig_vals);
+ X = eig_vects*diagm(s);
+ u = eig_vects*inv(diagm(s));
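+            // At this point, assuming XX is symmetric positive definite, we have
+            // X*trans(X) == XX and trans(u)*X equal to the identity matrix, so X acts
+            // as a square root of XX and u maps quantities like XY into the same
+            // coordinate system (see set_xy() below).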
+
+
+
+ samples.resize(X.nr()*2);
+
+ for (size_t i = 0; i < samples.size(); ++i)
+ index.push_back(i);
+ active_size = index.size();
+
+
+ // setup the training samples used in the SVM optimizer below
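+            // Each variable in the problem gets a pair of samples, one labeled +1 and
+            // one labeled -1.  Roughly speaking, they correspond to the positive and
+            // negative parts of that variable's coefficient in the reduction of the
+            // LASSO constrained problem to an SVM style dual (see how betas is
+            // recovered from alpha at the end of operator()).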
+ for (size_t i = 0; i < samples.size(); ++i)
+ {
+ auto& x = samples[i];
+ const long idx = i/2;
+ if (i%2 == 0)
+ x.label = +1;
+ else
+ x.label = -1;
+
+ x.r = idx%X.nr();
+ }
+ }
+
+ template <typename EXP1, typename EXP2>
+ elastic_net(
+ const matrix_exp<EXP1>& XX,
+ const matrix_exp<EXP2>& XY
+ ) : elastic_net(XX)
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(XX.size() > 0 &&
+ XX.nr() == XX.nc() &&
+ is_col_vector(XY) &&
+ XX.nc() == XY.size() ,
+ "\t elastic_net::elastic_net(XX,XY)"
+ << " \n\t Invalid inputs were given to this function."
+ << " \n\t XX.size(): " << XX.size()
+ << " \n\t is_col_vector(XY): " << is_col_vector(XY)
+ << " \n\t XX.nr(): " << XX.nr()
+ << " \n\t XX.nc(): " << XX.nc()
+ << " \n\t XY.size(): " << XY.size()
+ << " \n\t this: " << this
+ );
+
+ set_xy(XY);
+ }
+
+ long size (
+ ) const { return u.nr(); }
+
+ template <typename EXP>
+ void set_xy(
+ const matrix_exp<EXP>& XY
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_col_vector(XY) &&
+ XY.size() == size(),
+                "\t void elastic_net::set_xy(XY)"
+ << " \n\t Invalid inputs were given to this function."
+ << " \n\t is_col_vector(XY): " << is_col_vector(XY)
+ << " \n\t size(): " << size()
+ << " \n\t XY.size(): " << XY.size()
+ << " \n\t this: " << this
+ );
+
+ Y = trans(u)*XY;
+ // We can use the ynorm after it has been projected because the only place Y
+ // appears in the algorithm is in terms of dot products with w and x vectors.
+ // But those vectors are always in the span of X and therefore we only see the
+ // part of the norm of Y that is in the span of X (and hence u since u and X
+ // have the same span by construction)
+ ynorm = length_squared(Y);
+ xdoty = X*Y;
+ eig_vects_xdoty = trans(eig_vects)*xdoty;
+
+ w.set_size(Y.size());
+ // zero out any memory of previous solutions
+ alpha.assign(X.nr()*2, 0);
+ }
+
+ bool have_target_values (
+ ) const { return Y.size() != 0; }
+
+ void set_epsilon(
+ double eps_
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(eps_ > 0,
+ "\t void elastic_net::set_epsilon()"
+ << " \n\t eps_ must be greater than 0"
+ << " \n\t eps_: " << eps_
+ << " \n\t this: " << this
+ );
+
+ eps = eps_;
+ }
+
+ unsigned long get_max_iterations (
+ ) const { return max_iterations; }
+
+ void set_max_iterations (
+ unsigned long max_iter
+ )
+ {
+ max_iterations = max_iter;
+ }
+
+ void be_verbose (
+ )
+ {
+ verbose = true;
+ }
+
+ void be_quiet (
+ )
+ {
+ verbose = false;
+ }
+
+ double get_epsilon (
+ ) const { return eps; }
+
+ matrix<double,0,1> operator() (
+ double ridge_lambda,
+ double lasso_budget = std::numeric_limits<double>::infinity()
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(have_target_values() &&
+ ridge_lambda > 0 &&
+ lasso_budget > 0 ,
+ "\t matrix<double,0,1> elastic_net::operator()()"
+ << " \n\t Invalid inputs were given to this function."
+ << " \n\t have_target_values(): " << have_target_values()
+ << " \n\t ridge_lambda: " << ridge_lambda
+ << " \n\t lasso_budget: " << lasso_budget
+ << " \n\t this: " << this
+ );
+
+
+ // First check if lasso_budget is so big that it isn't even active. We do this
+ // by doing just ridge regression and checking the result.
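+            // In other words, this computes betas = inv(XX + ridge_lambda*I)*XY, using
+            // the eigendecomposition of XX computed in the constructor rather than an
+            // explicit matrix inverse.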
+ matrix<double,0,1> betas = eig_vects*tmp(inv(diagm(eig_vals + ridge_lambda))*eig_vects_xdoty);
+ if (sum(abs(betas)) <= lasso_budget)
+ return betas;
+
+
+            // Set w back to 0.  We will compute the w corresponding to what is
+            // currently in alpha later on.  This way w and alpha are always in sync.
+ w = 0;
+ wy_mult = 0;
+ wdoty = 0;
+
+
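+            // Both the samples and the weight vector are represented implicitly.  A
+            // sample x stands for the vector trans(rowm(X,x.r)) - (x.label/lasso_budget)*Y
+            // and the current weight vector is w + wy_mult*Y.  The two lambdas below
+            // evaluate dot products against, and additions to, these implicit vectors
+            // without ever forming them explicitly.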
+ // return dot(w,x)
+ auto dot = [&](const matrix<double,0,1>& w, const en_sample2& x)
+ {
+ const double xmul = -x.label*(1/lasso_budget);
+ // Do the base dot product but don't forget to add in the -(1/t)*y part from the svm reduction paper
+ double val = rowm(X,x.r)*w + xmul*wdoty + wy_mult*xdoty(x.r) + xmul*wy_mult*ynorm;
+
+ return val;
+ };
+
+
+ // perform w += scale*x;
+ auto add_to = [&](matrix<double,0,1>& w, double scale, const en_sample2& x)
+ {
+ const double xmul = -x.label*(1/lasso_budget);
+ wy_mult += scale*xmul;
+ wdoty += scale*xdoty(x.r);
+ w += scale*trans(rowm(X,x.r));
+
+ };
+
+ const double Dii = ridge_lambda;
+
+            // Finish setting up the training samples (compute their squared norms) and
+            // rebuild the w corresponding to whatever is currently in alpha.
+ for (size_t i = 0; i < samples.size(); ++i)
+ {
+ auto& x = samples[i];
+
+ const double xmul = -x.label*(1/lasso_budget);
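+                // x.xdotx accumulates the squared norm of the implicitly represented
+                // sample, i.e. length_squared(trans(rowm(X,x.r)) + xmul*Y).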
+ x.xdotx = xmul*xmul*ynorm;
+ for (long c = 0; c < X.nc(); ++c)
+ x.xdotx += std::pow(X(x.r,c)+xmul*Y(c), 2.0) - std::pow(xmul*Y(c),2.0);
+
+ // compute the correct w given whatever might be in alpha.
+ if (alpha[i] != 0)
+ add_to(w, x.label*alpha[i], samples[i]);
+ }
+
+
+ // Now run the optimizer
+ double PG_max_prev = std::numeric_limits<double>::infinity();
+ double PG_min_prev = -std::numeric_limits<double>::infinity();
+
+
+ unsigned int iter;
+ for (iter = 0; iter < max_iterations; ++iter)
+ {
+ // randomly shuffle the indices
+ for (unsigned long i = 0; i < active_size; ++i)
+ {
+ // pick a random index >= i
+ const long j = i + rnd.get_random_32bit_number()%(active_size-i);
+ std::swap(index[i], index[j]);
+ }
+
+ double PG_max = -std::numeric_limits<double>::infinity();
+ double PG_min = std::numeric_limits<double>::infinity();
+ for (size_t ii = 0; ii < active_size; ++ii)
+ {
+ const auto i = index[ii];
+ const auto& x = samples[i];
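+                    // G is the gradient of the dual objective with respect to alpha[i].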
+ double G = x.label*dot(w, x) - 1 + Dii*alpha[i];
+
+ double PG = 0;
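+                    // PG is the projected gradient.  alpha[i] is constrained to be
+                    // non-negative, so when alpha[i] == 0 only a negative gradient can
+                    // move it; variables that look pinned at 0 are temporarily shrunk
+                    // out of the active set.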
+ if (alpha[i] == 0)
+ {
+ if (G > PG_max_prev)
+ {
+ // shrink the active set of training examples
+ --active_size;
+ std::swap(index[ii], index[active_size]);
+ --ii;
+ continue;
+ }
+
+ if (G < 0)
+ PG = G;
+ }
+ else
+ {
+ PG = G;
+ }
+
+ if (PG > PG_max)
+ PG_max = PG;
+ if (PG < PG_min)
+ PG_min = PG;
+
+ // if PG != 0
+ if (std::abs(PG) > 1e-12)
+ {
+ const double alpha_old = alpha[i];
+ alpha[i] = std::max(alpha[i] - G/(x.xdotx+Dii), (double)0.0);
+ const double delta = (alpha[i]-alpha_old)*x.label;
+ add_to(w, delta, x);
+ }
+ }
+
+ if (verbose)
+ {
+ using namespace std;
+ cout << "gap: " << PG_max - PG_min << endl;
+ cout << "active_size: " << active_size << endl;
+ cout << "iter: " << iter << endl;
+ cout << endl;
+ }
+
+ if (PG_max - PG_min <= eps)
+ {
+ // stop if we are within eps tolerance and the last iteration
+ // was over all the samples
+ if (active_size == index.size())
+ break;
+
+ // Turn off shrinking on the next iteration. We will stop if the
+ // tolerance is still <= eps when shrinking is off.
+ active_size = index.size();
+ PG_max_prev = std::numeric_limits<double>::infinity();
+ PG_min_prev = -std::numeric_limits<double>::infinity();
+ }
+ else
+ {
+ PG_max_prev = PG_max;
+ PG_min_prev = PG_min;
+ if (PG_max_prev <= 0)
+ PG_max_prev = std::numeric_limits<double>::infinity();
+ if (PG_min_prev >= 0)
+ PG_min_prev = -std::numeric_limits<double>::infinity();
+ }
+
+
+ // recalculate wdoty every so often to avoid drift.
+ if (iter%100 == 0)
+ wdoty = dlib::dot(Y, w);
+ }
+
+
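+            // Convert the dual variables back into the elastic net coefficients.  Each
+            // variable has a pair of alphas, one for the positive and one for the
+            // negative part of its coefficient, and the result is rescaled so that,
+            // roughly speaking, sum(abs(betas)) lands on the lasso_budget constraint.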
+ betas.set_size(alpha.size()/2);
+ for (long i = 0; i < betas.size(); ++i)
+ betas(i) = lasso_budget*(alpha[2*i] - alpha[2*i+1]);
+ betas /= sum(mat(alpha));
+ return betas;
+ }
+
+
+ private:
+
+ struct en_sample2
+ {
+            // the row in X this sample corresponds to
+ long r;
+
+
+ double label;
+
+ double xdotx;
+ };
+
+ std::vector<en_sample2> samples;
+ std::vector<double> alpha;
+ double ynorm;
+ matrix<double> X;
+ matrix<double,0,1> Y;
+ matrix<double,0,1> xdoty;
+ double wdoty;
+ double wy_mult; // logically, the real w is what is in the w vector + wy_mult*Y
+ matrix<double,0,1> w;
+ std::vector<long> index;
+ unsigned long active_size;
+
+ matrix<double,0,1> eig_vects_xdoty;
+ matrix<double,0,1> eig_vals;
+ matrix<double> eig_vects;
+ matrix<double> u;
+
+ dlib::rand rnd;
+
+
+ double eps;
+ unsigned long max_iterations;
+ bool verbose;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_ElASTIC_NET_Hh_
+
+
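
Below is a minimal usage sketch of the class added above, assuming a data matrix X with
one training sample per row and a column vector Y of targets.  The variable names, the
penalty values, and the include path are illustrative; only the constructor and
operator() signatures shown in the diff are relied on.

    #include <iostream>
    #include <dlib/matrix.h>
    #include <dlib/optimization/elastic_net.h>

    int main()
    {
        using namespace dlib;

        // Some made up regression data: 100 samples with 5 features each.
        matrix<double> X = randm(100,5);
        matrix<double,0,1> Y = randm(100,1);

        // The solver is built from the Gram matrix trans(X)*X and the vector trans(X)*Y.
        elastic_net solver(trans(X)*X, trans(X)*Y);

        // Solve with a ridge penalty of 0.1 and an L1 budget of 3 on the coefficients.
        matrix<double,0,1> betas = solver(0.1, 3.0);

        std::cout << "coefficients: " << trans(betas);
        return 0;
    }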