// Copyright (C) 2010 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. #include "tester.h" #include #include #include #include #include #include #include #include namespace { using namespace test; using namespace dlib; using namespace std; dlib::logger dlog("test.linear_manifold_regularizer"); template void test_find_k_nearest_neighbors_lsh( const samples_type& samples ) { std::vector edges1, edges2; find_k_nearest_neighbors(samples, cosine_distance(), 2, edges1); find_k_nearest_neighbors_lsh(samples, cosine_distance(), hash_type(), 2, 6, edges2, 2); std::sort(edges1.begin(), edges1.end(), order_by_index); std::sort(edges2.begin(), edges2.end(), order_by_index); DLIB_TEST_MSG(edges1.size() == edges2.size(), edges1.size() << " " << edges2.size()); for (unsigned long i = 0; i < edges1.size(); ++i) { DLIB_TEST(edges1[i] == edges2[i]); DLIB_TEST_MSG(std::abs(edges1[i].distance() - edges2[i].distance()) < 1e-7, edges1[i].distance() - edges2[i].distance()); } } template void test_knn_lsh_sparse() { dlib::rand rnd; std::vector > samples; samples.resize(20); for (unsigned int i = 0; i < samples.size(); ++i) { samples[i][0] = rnd.get_random_gaussian(); samples[i][2] = rnd.get_random_gaussian(); } test_find_k_nearest_neighbors_lsh(samples); test_find_k_nearest_neighbors_lsh(samples); test_find_k_nearest_neighbors_lsh(samples); test_find_k_nearest_neighbors_lsh(samples); } template void test_knn_lsh_dense() { dlib::rand rnd; std::vector > samples; samples.resize(20); for (unsigned int i = 0; i < samples.size(); ++i) { samples[i].set_size(2); samples[i](0) = rnd.get_random_gaussian(); samples[i](1) = rnd.get_random_gaussian(); } test_find_k_nearest_neighbors_lsh(samples); test_find_k_nearest_neighbors_lsh(samples); test_find_k_nearest_neighbors_lsh(samples); test_find_k_nearest_neighbors_lsh(samples); } class linear_manifold_regularizer_tester : public tester { /*! WHAT THIS OBJECT REPRESENTS This object represents a unit test. When it is constructed it adds itself into the testing framework. !*/ public: linear_manifold_regularizer_tester ( ) : tester ( "test_linear_manifold_regularizer", // the command line argument name for this test "Run tests on the linear_manifold_regularizer object.", // the command line argument description 0 // the number of command line arguments for this test ) { seed = 1; } dlib::rand rnd; unsigned long seed; typedef matrix sample_type; typedef radial_basis_kernel kernel_type; void do_the_test() { print_spinner(); std::vector samples; // Declare an instance of the kernel we will be using. const kernel_type kern(0.1); const unsigned long num_points = 200; // create a large dataset with two concentric circles. generate_circle(samples, 1, num_points); // circle of radius 1 generate_circle(samples, 5, num_points); // circle of radius 5 std::vector edges; find_percent_shortest_edges_randomly(samples, squared_euclidean_distance(0.1, 4), 1, 10000, "random seed", edges); dlog << LTRACE << "number of edges generated: " << edges.size(); empirical_kernel_map ekm; ekm.load(kern, randomly_subsample(samples, 100)); // Project all the samples into the span of our 50 basis samples for (unsigned long i = 0; i < samples.size(); ++i) samples[i] = ekm.project(samples[i]); // Now create the manifold regularizer. The result is a transformation matrix that // embodies the manifold assumption discussed above. linear_manifold_regularizer lmr; lmr.build(samples, edges, use_gaussian_weights(0.1)); matrix T = lmr.get_transformation_matrix(10000); print_spinner(); // generate the T matrix manually and make sure it matches. The point of this test // is to make sure that the more complex version of this that happens inside the linear_manifold_regularizer // is correct. It uses a tedious block of loops to do it in a way that is a lot faster for sparse // W matrices but isn't super straight forward. matrix X(samples[0].size(), samples.size()); for (unsigned long i = 0; i < samples.size(); ++i) set_colm(X,i) = samples[i]; matrix W(samples.size(), samples.size()); W = 0; for (unsigned long i = 0; i < edges.size(); ++i) { W(edges[i].index1(), edges[i].index2()) = use_gaussian_weights(0.1)(edges[i]); W(edges[i].index2(), edges[i].index1()) = use_gaussian_weights(0.1)(edges[i]); } matrix L = diagm(sum_rows(W)) - W; matrix trueT = inv_lower_triangular(chol(identity_matrix(X.nr()) + (10000.0/sum(lowerm(W)))*X*L*trans(X))); dlog << LTRACE << "T error: "<< max(abs(T - trueT)); DLIB_TEST(max(abs(T - trueT)) < 1e-7); print_spinner(); // Apply the transformation generated by the linear_manifold_regularizer to // all our samples. for (unsigned long i = 0; i < samples.size(); ++i) samples[i] = T*samples[i]; // For convenience, generate a projection_function and merge the transformation // matrix T into it. projection_function proj = ekm.get_projection_function(); proj.weights = T*proj.weights; // Pick 2 different labeled points. One on the inner circle and another on the outer. // For each of these test points we will see if using the single plane that separates // them is a good way to separate the concentric circles. Also do this a bunch // of times with different randomly chosen points so we can see how robust the result is. for (int itr = 0; itr < 10; ++itr) { print_spinner(); std::vector test_points; // generate a random point from the radius 1 circle generate_circle(test_points, 1, 1); // generate a random point from the radius 5 circle generate_circle(test_points, 5, 1); // project the two test points into kernel space. Recall that this projection_function // has the manifold regularizer incorporated into it. const sample_type class1_point = proj(test_points[0]); const sample_type class2_point = proj(test_points[1]); double num_wrong = 0; // Now attempt to classify all the data samples according to which point // they are closest to. The output of this program shows that without manifold // regularization this test will fail but with it it will perfectly classify // all the points. for (unsigned long i = 0; i < samples.size(); ++i) { double distance_to_class1 = length(samples[i] - class1_point); double distance_to_class2 = length(samples[i] - class2_point); bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2); bool really_is_class_1 = (i < num_points); // now count how many times we make a mistake if (predicted_as_class_1 != really_is_class_1) ++num_wrong; } DLIB_TEST_MSG(num_wrong == 0, num_wrong); } } void generate_circle ( std::vector& samples, double radius, const long num ) { sample_type m(2,1); for (long i = 0; i < num; ++i) { double sign = 1; if (rnd.get_random_double() < 0.5) sign = -1; m(0) = 2*radius*rnd.get_random_double()-radius; m(1) = sign*sqrt(radius*radius - m(0)*m(0)); samples.push_back(m); } } void test_knn1() { std::vector > samples; matrix test; test = 0,0; samples.push_back(test); test = 1,1; samples.push_back(test); test = 1,-1; samples.push_back(test); test = -1,1; samples.push_back(test); test = -1,-1; samples.push_back(test); std::vector edges; find_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, edges); DLIB_TEST(edges.size() == 4); std::sort(edges.begin(), edges.end(), &order_by_index); DLIB_TEST(edges[0] == sample_pair(0,1,0)); DLIB_TEST(edges[1] == sample_pair(0,2,0)); DLIB_TEST(edges[2] == sample_pair(0,3,0)); DLIB_TEST(edges[3] == sample_pair(0,4,0)); find_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, edges); DLIB_TEST(edges.size() == 8); find_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, edges); DLIB_TEST(edges.size() == 4); std::sort(edges.begin(), edges.end(), &order_by_index); DLIB_TEST(edges[0] == sample_pair(1,2,0)); DLIB_TEST(edges[1] == sample_pair(1,3,0)); DLIB_TEST(edges[2] == sample_pair(2,4,0)); DLIB_TEST(edges[3] == sample_pair(3,4,0)); find_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, edges); DLIB_TEST(edges.size() == 0); } void test_knn1_approx() { std::vector > samples; matrix test; test = 0,0; samples.push_back(test); test = 1,1; samples.push_back(test); test = 1,-1; samples.push_back(test); test = -1,1; samples.push_back(test); test = -1,-1; samples.push_back(test); std::vector edges; find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges); DLIB_TEST(edges.size() == 4); std::sort(edges.begin(), edges.end(), &order_by_index); DLIB_TEST(edges[0] == sample_pair(0,1,0)); DLIB_TEST(edges[1] == sample_pair(0,2,0)); DLIB_TEST(edges[2] == sample_pair(0,3,0)); DLIB_TEST(edges[3] == sample_pair(0,4,0)); find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, 10000, seed, edges); DLIB_TEST(edges.size() == 8); find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, 10000, seed, edges); DLIB_TEST(edges.size() == 4); std::sort(edges.begin(), edges.end(), &order_by_index); DLIB_TEST(edges[0] == sample_pair(1,2,0)); DLIB_TEST(edges[1] == sample_pair(1,3,0)); DLIB_TEST(edges[2] == sample_pair(2,4,0)); DLIB_TEST(edges[3] == sample_pair(3,4,0)); find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, 10000, seed, edges); DLIB_TEST(edges.size() == 0); } void test_knn2() { std::vector > samples; matrix test; test = 1,1; samples.push_back(test); test = 1,-1; samples.push_back(test); test = -1,1; samples.push_back(test); test = -1,-1; samples.push_back(test); std::vector edges; find_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, edges); DLIB_TEST(edges.size() == 4); std::sort(edges.begin(), edges.end(), &order_by_index); DLIB_TEST(edges[0] == sample_pair(0,1,0)); DLIB_TEST(edges[1] == sample_pair(0,2,0)); DLIB_TEST(edges[2] == sample_pair(1,3,0)); DLIB_TEST(edges[3] == sample_pair(2,3,0)); find_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, edges); DLIB_TEST(edges.size() == 4*3/2); } void test_knn2_approx() { std::vector > samples; matrix test; test = 1,1; samples.push_back(test); test = 1,-1; samples.push_back(test); test = -1,1; samples.push_back(test); test = -1,-1; samples.push_back(test); std::vector edges; // For this simple graph and high number of samples we will do we should obtain the exact // knn solution. find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed, edges); DLIB_TEST(edges.size() == 4); std::sort(edges.begin(), edges.end(), &order_by_index); DLIB_TEST(edges[0] == sample_pair(0,1,0)); DLIB_TEST(edges[1] == sample_pair(0,2,0)); DLIB_TEST(edges[2] == sample_pair(1,3,0)); DLIB_TEST(edges[3] == sample_pair(2,3,0)); find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, 10000, seed, edges); DLIB_TEST(edges.size() == 4*3/2); } void perform_test ( ) { for (int i = 0; i < 5; ++i) { do_the_test(); ++seed; test_knn1_approx(); test_knn2_approx(); } test_knn1(); test_knn2(); test_knn_lsh_sparse(); test_knn_lsh_sparse(); test_knn_lsh_dense(); test_knn_lsh_dense(); } }; // Create an instance of this object. Doing this causes this test // to be automatically inserted into the testing framework whenever this cpp file // is linked into the project. Note that since we are inside an unnamed-namespace // we won't get any linker errors about the symbol a being defined multiple times. linear_manifold_regularizer_tester a; }