diff options
Diffstat (limited to '')
-rw-r--r-- | ml/dlib/examples/object_detector_ex.cpp | 263 |
1 files changed, 263 insertions, 0 deletions
diff --git a/ml/dlib/examples/object_detector_ex.cpp b/ml/dlib/examples/object_detector_ex.cpp new file mode 100644 index 000000000..cda71eb5a --- /dev/null +++ b/ml/dlib/examples/object_detector_ex.cpp @@ -0,0 +1,263 @@ +// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt +/* + + This is an example illustrating the use of dlib's bag-of-visual-word based + tools for detecting objects in images. In this example we will create three + simple images, each containing some white squares. We will then use the + sliding window classifier tools to learn to detect these squares. + + If the objects you want to detect are somewhat rigid in appearance (e.g. + faces, pedestrians, etc.) then you should try the methods shown in the + fhog_object_detector_ex.cpp example program before trying to use the + bag-of-visual-word tools shown in this example. +*/ + + +#include <dlib/svm_threaded.h> +#include <dlib/gui_widgets.h> +#include <dlib/array.h> +#include <dlib/array2d.h> +#include <dlib/image_keypoint.h> +#include <dlib/image_processing.h> + +#include <iostream> +#include <fstream> + + +using namespace std; +using namespace dlib; + +// ---------------------------------------------------------------------------------------- + +template < + typename image_array_type + > +void make_simple_test_data ( + image_array_type& images, + std::vector<std::vector<rectangle> >& object_locations +) +/*! + ensures + - #images.size() == 3 + - #object_locations.size() == 3 + - Creates some simple images to test the object detection routines. In particular, + this function creates images with white 70x70 squares in them. It also stores + the locations of these squares in object_locations. + - for all valid i: + - object_locations[i] == A list of all the white rectangles present in images[i]. +!*/ +{ + images.clear(); + object_locations.clear(); + + images.resize(3); + images[0].set_size(400,400); + images[1].set_size(400,400); + images[2].set_size(400,400); + + // set all the pixel values to black + assign_all_pixels(images[0], 0); + assign_all_pixels(images[1], 0); + assign_all_pixels(images[2], 0); + + // Now make some squares and draw them onto our black images. All the + // squares will be 70 pixels wide and tall. + + std::vector<rectangle> temp; + temp.push_back(centered_rect(point(100,100), 70,70)); + fill_rect(images[0],temp.back(),255); // Paint the square white + temp.push_back(centered_rect(point(200,300), 70,70)); + fill_rect(images[0],temp.back(),255); // Paint the square white + object_locations.push_back(temp); + + temp.clear(); + temp.push_back(centered_rect(point(140,200), 70,70)); + fill_rect(images[1],temp.back(),255); // Paint the square white + temp.push_back(centered_rect(point(303,200), 70,70)); + fill_rect(images[1],temp.back(),255); // Paint the square white + object_locations.push_back(temp); + + temp.clear(); + temp.push_back(centered_rect(point(123,121), 70,70)); + fill_rect(images[2],temp.back(),255); // Paint the square white + object_locations.push_back(temp); + + // corrupt each image with random noise just to make this a little more + // challenging + dlib::rand rnd; + for (unsigned long i = 0; i < images.size(); ++i) + { + for (long r = 0; r < images[i].nr(); ++r) + { + for (long c = 0; c < images[i].nc(); ++c) + { + images[i][r][c] = put_in_range(0,255,images[i][r][c] + 40*rnd.get_random_gaussian()); + } + } + } +} + +// ---------------------------------------------------------------------------------------- + +int main() +{ + try + { + // The first thing we do is create the set of 3 images discussed above. + dlib::array<array2d<unsigned char> > images; + std::vector<std::vector<rectangle> > object_locations; + make_simple_test_data(images, object_locations); + + + /* + This next block of code specifies the type of sliding window classifier we will + be using to detect the white squares. The most important thing here is the + scan_image_pyramid template. Instances of this template represent the core + of a sliding window classifier. To go into more detail, the sliding window + classifiers used by this object have three parts: + 1. The underlying feature extraction. See the dlib documentation for a detailed + discussion of how the hashed_feature_image and hog_image feature extractors + work. However, to understand this example, all you need to know is that the + feature extractor associates a vector with each location in an image. This + vector is supposed to capture information which describes how parts of the + image look. Importantly, it should do this in a way that is relevant to the + problem you are trying to solve. + + 2. A detection template. This is a rectangle which defines the shape of a + sliding window (i.e. the object_box), as well as a set of rectangular feature + extraction regions inside it. This set of regions defines the spatial + structure of the overall feature extraction within a sliding window. In + particular, each location of a sliding window has a feature vector + associated with it. This feature vector is defined as follows: + - Let N denote the number of feature extraction zones. + - Let M denote the dimensionality of the vectors output by Feature_extractor_type + objects. + - Let F(i) == the M dimensional vector which is the sum of all vectors + given by our Feature_extractor_type object inside the ith feature extraction + zone. + - Then the feature vector for a sliding window is an M*N dimensional vector + [F(1) F(2) F(3) ... F(N)] (i.e. it is a concatenation of the N vectors). + This feature vector can be thought of as a collection of N "bags of features", + each bag coming from a spatial location determined by one of the rectangular + feature extraction zones. + + 3. A weight vector and a threshold value. The dot product between the weight + vector and the feature vector for a sliding window location gives the score + of the window. If this score is greater than the threshold value then the + window location is output as a detection. You don't need to determine these + parameters yourself. They are automatically populated by the + structural_object_detection_trainer. + + The sliding window classifiers described above are applied to every level of an + image pyramid. So you need to tell scan_image_pyramid what kind of pyramid you want + to use. In this case we are using pyramid_down<2> which downsamples each pyramid + layer by half (if you want to use a finer image pyramid then just change the + template argument to a larger value. For example, using pyramid_down<5> would + downsample each layer by a ratio of 5 to 4). + + Finally, some of the feature extraction zones are allowed to move freely within the + object box. This means that when we are sliding the classifier over an image, some + feature extraction zones are stationary (i.e. always in the same place relative to + the object box) while others are allowed to move anywhere within the object box. In + particular, the movable regions are placed at the locations that maximize the score + of the classifier. Note further that each of the movable feature extraction zones + must pass a threshold test for it to be included. That is, if the score that a + movable zone would contribute to the overall score for a sliding window location is + not positive then that zone is not included in the feature vector (i.e. its part of + the feature vector is set to zero. This way the length of the feature vector stays + constant). This movable region construction allows us to represent objects with + parts that move around relative to the object box. For example, a human has hands + but they aren't always in the same place relative to a person's bounding box. + However, to keep this example program simple, we will only be using stationary + feature extraction regions. + */ + typedef hashed_feature_image<hog_image<3,3,1,4,hog_signed_gradient,hog_full_interpolation> > feature_extractor_type; + typedef scan_image_pyramid<pyramid_down<2>, feature_extractor_type> image_scanner_type; + image_scanner_type scanner; + + // The hashed_feature_image in the scanner needs to be supplied with a hash function capable + // of hashing the outputs of the hog_image. Calling this function will set it up for us. The + // 10 here indicates that it will hash HOG vectors into the range [0, pow(2,10)). Therefore, + // the feature vectors output by the hashed_feature_image will have dimension pow(2,10). + setup_hashed_features(scanner, images, 10); + // We should also tell the scanner to use the uniform feature weighting scheme + // since it works best on the data in this example. If you don't call this + // function then it will use a slightly different weighting scheme which can give + // improved results on many normal image types. + use_uniform_feature_weights(scanner); + + // We also need to setup the detection templates the scanner will use. It is important that + // we add detection templates which are capable of matching all the output boxes we want to learn. + // For example, if object_locations contained a rectangle with a height to width ratio of 10 but + // we only added square detection templates then it would be impossible to detect this non-square + // rectangle. The setup_grid_detection_templates_verbose() routine will take care of this for us by + // looking at the contents of object_locations and automatically picking an appropriate set. Also, + // the final arguments indicate that we want our detection templates to have 4 feature extraction + // regions laid out in a 2x2 regular grid inside each sliding window. + setup_grid_detection_templates_verbose(scanner, object_locations, 2, 2); + + + // Now that we have defined the kind of sliding window classifier system we want and stored + // the details into the scanner object we are ready to use the structural_object_detection_trainer + // to learn the weight vector and threshold needed to produce a complete object detector. + structural_object_detection_trainer<image_scanner_type> trainer(scanner); + trainer.set_num_threads(4); // Set this to the number of processing cores on your machine. + + + // There are a variety of other useful parameters to the structural_object_detection_trainer. + // Examples of the ones you are most likely to use follow (see dlib documentation for what they do): + //trainer.set_match_eps(0.80); + //trainer.set_c(1.0); + //trainer.set_loss_per_missed_target(1); + //trainer.set_loss_per_false_alarm(1); + + + // Do the actual training and save the results into the detector object. + object_detector<image_scanner_type> detector = trainer.train(images, object_locations); + + // We can easily test the new detector against our training data. This print statement will indicate that it + // has perfect precision and recall on this simple task. It will also print the average precision (AP). + cout << "Test detector (precision,recall,AP): " << test_object_detection_function(detector, images, object_locations) << endl; + + // The cross validation should also indicate perfect precision and recall. + cout << "3-fold cross validation (precision,recall,AP): " + << cross_validate_object_detection_trainer(trainer, images, object_locations, 3) << endl; + + + + + // Let's display the output of the detector along with our training images. + image_window win; + for (unsigned long i = 0; i < images.size(); ++i) + { + // Run the detector on images[i] + const std::vector<rectangle> rects = detector(images[i]); + cout << "Number of detections: "<< rects.size() << endl; + + // Put the image and detections into the window. + win.clear_overlay(); + win.set_image(images[i]); + win.add_overlay(rects, rgb_pixel(255,0,0)); + + cout << "Hit enter to see the next image."; + cin.get(); + } + + + + + // Finally, note that the detector can be serialized to disk just like other dlib objects. + serialize("object_detector.dat") << detector; + + // Recall from disk. + deserialize("object_detector.dat") >> detector; + } + catch (exception& e) + { + cout << "\nexception thrown!" << endl; + cout << e.what() << endl; + } +} + +// ---------------------------------------------------------------------------------------- + |