summary refs log tree commit diff stats
path: root/ml/dlib/examples/dnn_metric_learning_on_images_ex.cpp
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-03-09 13:19:48 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-03-09 13:20:02 +0000
commit 58daab21cd043e1dc37024a7f99b396788372918 (patch)
tree 96771e43bb69f7c1c2b0b4f7374cb74d7866d0cb /ml/dlib/examples/dnn_metric_learning_on_images_ex.cpp
parent Releasing debian version 1.43.2-1. (diff)
download netdata-58daab21cd043e1dc37024a7f99b396788372918.tar.xz
netdata-58daab21cd043e1dc37024a7f99b396788372918.zip
Merging upstream version 1.44.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ml/dlib/examples/dnn_metric_learning_on_images_ex.cpp')
-rw-r--r-- ml/dlib/examples/dnn_metric_learning_on_images_ex.cpp 340
1 files changed, 340 insertions, 0 deletions
diff --git a/ml/dlib/examples/dnn_metric_learning_on_images_ex.cpp b/ml/dlib/examples/dnn_metric_learning_on_images_ex.cpp
new file mode 100644
index 000000000..4c3856ac6
--- /dev/null
+++ b/ml/dlib/examples/dnn_metric_learning_on_images_ex.cpp
@@ -0,0 +1,340 @@
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+ This is an example illustrating the use of the deep learning tools from the
+ dlib C++ Library. In it, we will show how to use the loss_metric layer to do
+ metric learning on images.
+
+ The main reason you might want to use this kind of algorithm is because you
+ would like to use a k-nearest neighbor classifier or similar algorithm, but
+ you don't know a good way to calculate the distance between two things. A
+ popular example would be face recognition. There are a whole lot of papers
+ that train some kind of deep metric learning algorithm that embeds face
+ images in some vector space where images of the same person are close to each
+ other and images of different people are far apart. Then in that vector
+ space it's very easy to do face recognition with some kind of k-nearest
+ neighbor classifier.
+
+ In this example we will use a version of the ResNet network from the
+ dnn_imagenet_ex.cpp example to learn to map images into some vector space where
+ pictures of the same person are close and pictures of different people are far
+ apart.
+
+ You might want to read the simpler introduction to the deep metric learning
+ API, dnn_metric_learning_ex.cpp, before reading this example. You should
+ also have read the examples that introduce the dlib DNN API before
+ continuing. These are dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp.
+
+*/
+
+#include <dlib/dnn.h>
+#include <dlib/image_io.h>
+#include <dlib/misc_api.h>
+
+using namespace dlib;
+using namespace std;
+
+// ----------------------------------------------------------------------------------------
+
+// We will need to create some functions for loading data. This program will
+// expect to be given a directory structured as follows:
+//    top_level_directory/
+//        person1/
+//            image1.jpg
+//            image2.jpg
+//            image3.jpg
+//        person2/
+//            image4.jpg
+//            image5.jpg
+//            image6.jpg
+//        person3/
+//            image7.jpg
+//            image8.jpg
+//            image9.jpg
+//
+// The specific folder and image names don't matter, nor does the number of folders or
+// images. What does matter is that there is a top level folder, which contains
+// subfolders, and each subfolder contains images of a single person.
+
+// Scans the top level directory `dir` and returns, for every sub-directory
+// that holds at least one file, the list of files found directly inside it.
+// Sub-directories without files are left out, so every inner vector of the
+// result is non-empty.  Files are not filtered by type.  Each entry is the
+// string a dlib::file converts to (its full path, per dlib's dir_nav
+// documentation).
+std::vector<std::vector<string>> load_objects_list (
+    const string& dir
+)
+{
+    std::vector<std::vector<string>> objects;
+    // The directory and file objects are only read, so bind them by const
+    // reference instead of copying one per iteration.
+    for (const auto& subdir : directory(dir).get_dirs())
+    {
+        std::vector<string> imgs;
+        for (const auto& img : subdir.get_files())
+            imgs.push_back(img);
+
+        // Hand the finished list over rather than copying all of its strings.
+        if (!imgs.empty())
+            objects.push_back(std::move(imgs));
+    }
+    return objects;
+}
+
+// Fills `images` and `labels` with a freshly sampled, augmented mini-batch
+// drawn from `objs` (the output of load_objects_list()).  Both output vectors
+// are cleared first and end up holding num_people*samples_per_id entries;
+// labels[k] is the index into objs of the person shown in images[k].
+//
+// People are picked at random without repetition, while the images of a
+// picked person are drawn with replacement, so one file may show up more than
+// once.  Having several images of each person in every mini-batch matters: the
+// metric learning algorithm needs pairs that should be close (same person) as
+// well as pairs that should be far apart (different people) in each training
+// step.
+//
+// DLIB_CASSERT checks require that objs holds at least num_people people, that
+// the resulting batch is not empty, and that all of its images share one size.
+void load_mini_batch (
+    const size_t num_people,     // how many different people to include
+    const size_t samples_per_id, // how many images per person to select.
+    dlib::rand& rnd,
+    const std::vector<std::vector<string>>& objs,
+    std::vector<matrix<rgb_pixel>>& images,
+    std::vector<unsigned long>& labels
+)
+{
+    images.clear();
+    labels.clear();
+    DLIB_CASSERT(num_people <= objs.size(), "The dataset doesn't have that many people in it.");
+
+    const size_t total_people = objs.size();
+    std::vector<bool> taken(total_people, false);
+    matrix<rgb_pixel> buffer;
+    for (size_t person = 0; person != num_people; ++person)
+    {
+        // Keep drawing until we hit a person that is not in the mini-batch yet.
+        size_t id;
+        do
+        {
+            id = rnd.get_random_32bit_number() % total_people;
+        } while (taken[id]);
+        taken[id] = true;
+
+        const std::vector<string>& files = objs[id];
+        for (size_t k = 0; k != samples_per_id; ++k)
+        {
+            const size_t pick = rnd.get_random_32bit_number() % files.size();
+            load_image(buffer, files[pick]);
+            // buffer is refilled by the next load_image() call, so its
+            // contents can be moved into the batch.
+            images.push_back(std::move(buffer));
+            labels.push_back(id);
+        }
+    }
+
+    // Simple data augmentation: always disturb the colors, and jitter most
+    // (but not all) of the crops.
+    for (size_t k = 0; k < images.size(); ++k)
+    {
+        matrix<rgb_pixel>& crop = images[k];
+        disturb_colors(crop,rnd);
+        const bool apply_jitter = rnd.get_random_double() > 0.1;
+        if (apply_jitter)
+            crop = jitter_image(crop,rnd);
+    }
+
+    // Every image that goes into a mini-batch has to have the same size.  And
+    // really, all the images in the entire training dataset should have the
+    // same size for what we are doing to make the most sense.
+    DLIB_CASSERT(images.size() > 0);
+    for (const auto& img : images)
+    {
+        DLIB_CASSERT(img.nr() == images[0].nr() && img.nc() == images[0].nc(),
+                     "All the images in a single mini-batch must be the same size.");
+    }
+}
+
+// ----------------------------------------------------------------------------------------
+
+// The next page of code defines a ResNet network. It's basically copied
+// and pasted from the dnn_imagenet_ex.cpp example, except we replaced the loss
+// layer with loss_metric and made the network somewhat smaller.
+/*
+ residual: instantiates `block` with N filters and stride 1 on top of
+ tag1<SUBNET> and wraps the result in add_prev1.  Per dlib's layer
+ documentation add_prev1 adds the tag1-tagged output (here SUBNET's output)
+ to the block's output, i.e. this is the unit with an identity shortcut.
+*/
+template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
+using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;
+/*
+ residual_down: same idea, but `block` is instantiated with stride 2 and its
+ output is marked with tag2.  skip1 then goes back to the tag1-tagged SUBNET
+ output, which is passed through a 2x2 avg_pool with stride 2, and add_prev2
+ combines that pooled shortcut with the tag2-tagged block output (add_prev /
+ skip semantics as described in dlib's layer documentation).
+*/
+template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
+using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;
+/*
+ block: reading from SUBNET outwards, a 3x3 convolution with N filters and
+ the given stride, BN, relu, a second 3x3 convolution with N filters and
+ stride 1, and BN again.  BN is a template parameter so that the same
+ definition serves both the bn_con and the affine variants below.
+*/
+template <int N, template <typename> class BN, int stride, typename SUBNET>
+using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;
+
+/*
+ Convenience aliases that put a relu on top of a residual unit built from
+ `block`.  res / res_down use bn_con; ares / ares_down are the same units
+ with affine in place of bn_con.
+*/
+template <int N, typename SUBNET> using res = relu<residual<block,N,bn_con,SUBNET>>;
+template <int N, typename SUBNET> using ares = relu<residual<block,N,affine,SUBNET>>;
+template <int N, typename SUBNET> using res_down = relu<residual_down<block,N,bn_con,SUBNET>>;
+template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;
+
+// ----------------------------------------------------------------------------------------
+/*
+ The stages of the network.  Each levelK stacks res units with a fixed filter
+ count (256, 256, 128, 64 and 32 for level0 .. level4).  level0 to level3 each
+ contain one res_down unit, applied before any plain res units of that level;
+ level4 consists of three plain res units only.
+*/
+template <typename SUBNET> using level0 = res_down<256,SUBNET>;
+template <typename SUBNET> using level1 = res<256,res<256,res_down<256,SUBNET>>>;
+template <typename SUBNET> using level2 = res<128,res<128,res_down<128,SUBNET>>>;
+template <typename SUBNET> using level3 = res<64,res<64,res<64,res_down<64,SUBNET>>>>;
+template <typename SUBNET> using level4 = res<32,res<32,res<32,SUBNET>>>;
+// alevelK mirrors levelK one-to-one, using the affine based ares / ares_down units.
+template <typename SUBNET> using alevel0 = ares_down<256,SUBNET>;
+template <typename SUBNET> using alevel1 = ares<256,ares<256,ares_down<256,SUBNET>>>;
+template <typename SUBNET> using alevel2 = ares<128,ares<128,ares_down<128,SUBNET>>>;
+template <typename SUBNET> using alevel3 = ares<64,ares<64,ares<64,ares_down<64,SUBNET>>>>;
+template <typename SUBNET> using alevel4 = ares<32,ares<32,ares<32,SUBNET>>>;
+
+
+/*
+ Training network type.  Reading the nesting from the innermost layer
+ outwards, an input_rgb_image goes through: a 7x7 convolution with 32 filters
+ and stride 2, bn_con, relu, a 3x3 max_pool with stride 2, then level4,
+ level3, level2, level1 and level0, then avg_pool_everything, a fully
+ connected layer without bias that has 128 outputs (the embedding vector),
+ and finally the loss_metric layer.
+*/
+using net_type = loss_metric<fc_no_bias<128,avg_pool_everything<
+ level0<
+ level1<
+ level2<
+ level3<
+ level4<
+ max_pool<3,3,2,2,relu<bn_con<con<32,7,7,2,2,
+ input_rgb_image
+ >>>>>>>>>>>>;
+
+/*
+ Testing network type (replaced batch normalization with fixed affine
+ transforms).  The layout is the same as net_type with every bn_con swapped
+ for affine, i.e. alevelK in place of levelK.  main() creates one of these
+ from the trained net_type when it evaluates the embedding.
+*/
+using anet_type = loss_metric<fc_no_bias<128,avg_pool_everything<
+ alevel0<
+ alevel1<
+ alevel2<
+ alevel3<
+ alevel4<
+ max_pool<3,3,2,2,relu<affine<con<32,7,7,2,2,
+ input_rgb_image
+ >>>>>>>>>>>>;
+
+// ----------------------------------------------------------------------------------------
+
+// Trains the metric learning network on the image folders found under
+// argv[1], saves it to disk, and then reports how well the learned embedding
+// separates one freshly sampled mini-batch of training images.
+//
+// Returns 1 (after printing a message) when the folder argument is missing or
+// when the dataset has fewer people than one mini-batch needs.  Otherwise
+// control reaches the end of main, which means a return value of 0.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        cout << "Give a folder as input. It should contain sub-folders of images and we will " << endl;
+        cout << "learn to distinguish between these sub-folders with metric learning. " << endl;
+        cout << "For example, you can run this program on the very small examples/johns dataset" << endl;
+        cout << "that comes with dlib by running this command:" << endl;
+        cout << " ./dnn_metric_learning_on_images_ex johns" << endl;
+        return 1;
+    }
+
+    // Composition of every mini-batch, and the number of background threads
+    // that produce them.  Set the number of threads relative to the number of
+    // CPU cores you have.  (static, so the lambda below can use them without
+    // capturing anything.)
+    static const size_t people_per_batch = 5;
+    static const size_t samples_per_person = 5;
+    static const int num_loader_threads = 5;
+
+    auto objs = load_objects_list(argv[1]);
+
+    cout << "objs.size(): "<< objs.size() << endl;
+
+    // load_mini_batch() asserts that the dataset has at least as many people
+    // as the mini-batch asks for.  Without this check that assertion would fail
+    // inside every loader thread on every attempt, so no mini-batch would ever
+    // reach the trainer.  Fail early with a clear message instead.
+    if (objs.size() < people_per_batch)
+    {
+        cout << "The dataset must contain at least " << people_per_batch
+             << " sub-folders with images in them, but only " << objs.size()
+             << " were found." << endl;
+        return 1;
+    }
+
+    std::vector<matrix<rgb_pixel>> images;
+    std::vector<unsigned long> labels;
+
+
+    net_type net;
+
+    dnn_trainer<net_type> trainer(net, sgd(0.0001, 0.9));
+    trainer.set_learning_rate(0.1);
+    trainer.be_verbose();
+    trainer.set_synchronization_file("face_metric_sync", std::chrono::minutes(5));
+    // I've set this to something really small to make the example terminate
+    // sooner.  But when you really want to train a good model you should set
+    // this to something like 10000 so training doesn't terminate too early.
+    trainer.set_iterations_without_progress_threshold(300);
+
+    // If you have a lot of data then it might not be reasonable to load it all
+    // into RAM.  So you will need to be sure you are decompressing your images
+    // and loading them fast enough to keep the GPU occupied.  I like to do this
+    // using the following coding pattern: create a bunch of threads that dump
+    // mini-batches into dlib::pipes.
+    //
+    // NOTE(review): images and labels travel through two separate pipes, so
+    // with several producers the k-th image batch and the k-th label batch are
+    // not guaranteed to come from the same load_mini_batch() call.  Every batch
+    // has the same layout (people_per_batch groups of samples_per_person equal
+    // labels), which presumably makes this harmless for loss_metric -- confirm
+    // before changing the batch layout or the meaning of the labels.
+    dlib::pipe<std::vector<matrix<rgb_pixel>>> qimages(4);
+    dlib::pipe<std::vector<unsigned long>> qlabels(4);
+    auto data_loader = [&qimages, &qlabels, &objs](time_t seed)
+    {
+        // Mix the per-thread seed into the clock so each thread gets its own
+        // random sequence.
+        dlib::rand rnd(time(0)+seed);
+        std::vector<matrix<rgb_pixel>> images;
+        std::vector<unsigned long> labels;
+        while(qimages.is_enabled())
+        {
+            try
+            {
+                load_mini_batch(people_per_batch, samples_per_person, rnd, objs, images, labels);
+                qimages.enqueue(images);
+                qlabels.enqueue(labels);
+            }
+            catch(const std::exception& e)
+            {
+                // Best effort: report the problem and try the next batch.
+                cout << "EXCEPTION IN LOADING DATA" << endl;
+                cout << e.what() << endl;
+            }
+        }
+    };
+    // Run the data_loader from num_loader_threads threads, seeded 1, 2, 3, ...
+    std::vector<std::thread> loaders;
+    loaders.reserve(num_loader_threads);
+    for (int seed = 1; seed <= num_loader_threads; ++seed)
+        loaders.emplace_back(data_loader, seed);
+
+
+    // Here we do the training.  We keep passing mini-batches to the trainer
+    // until the learning rate has dropped low enough.
+    while(trainer.get_learning_rate() >= 1e-4)
+    {
+        qimages.dequeue(images);
+        qlabels.dequeue(labels);
+        trainer.train_one_step(images, labels);
+    }
+
+    // Wait for training threads to stop
+    trainer.get_net();
+    cout << "done training" << endl;
+
+    // Save the network to disk.  (The file name is kept exactly as it was so
+    // that anything looking for the existing name keeps working.)
+    net.clean();
+    serialize("metric_network_renset.dat") << net;
+
+    // Stop all the data loading threads and wait for them to terminate.
+    // Disabling the pipes is what makes the loaders' while loop end.
+    qimages.disable();
+    qlabels.disable();
+    for (auto& loader : loaders)
+        loader.join();
+
+
+    // Now, just to show an example of how you would use the network, let's
+    // check how well it performs on the training data.
+    dlib::rand rnd(time(0));
+    load_mini_batch(people_per_batch, samples_per_person, rnd, objs, images, labels);
+
+    // Normally you would use the non-batch-normalized version of the network
+    // to do testing, which is what we do here.
+    anet_type testing_net = net;
+
+    // Run all the images through the network to get their vector embeddings.
+    std::vector<matrix<float,0,1>> embedded = testing_net(images);
+
+    // The loss_metric layer will cause images with the same label to be less
+    // than loss_details().get_distance_threshold() distance from each other.
+    // So we can use that distance value as our testing threshold.  It does not
+    // change during the loop below, so fetch it once.
+    const auto threshold = testing_net.loss_details().get_distance_threshold();
+
+    // Now, check if the embedding puts images with the same labels near each
+    // other and images with different labels far apart.
+    int num_right = 0;
+    int num_wrong = 0;
+    for (size_t i = 0; i < embedded.size(); ++i)
+    {
+        for (size_t j = i+1; j < embedded.size(); ++j)
+        {
+            const auto distance = length(embedded[i]-embedded[j]);
+            if (labels[i] == labels[j])
+            {
+                // Same person: the pair should be closer than the threshold.
+                if (distance < threshold)
+                    ++num_right;
+                else
+                    ++num_wrong;
+            }
+            else
+            {
+                // Different people: the pair should be at least the threshold apart.
+                if (distance >= threshold)
+                    ++num_right;
+                else
+                    ++num_wrong;
+            }
+        }
+    }
+
+    cout << "num_right: "<< num_right << endl;
+    cout << "num_wrong: "<< num_wrong << endl;
+
+}
+
+