Diffstat (limited to 'ml/dlib/examples/object_detector_advanced_ex.cpp')
-rw-r--r--  ml/dlib/examples/object_detector_advanced_ex.cpp  302
1 file changed, 302 insertions, 0 deletions
diff --git a/ml/dlib/examples/object_detector_advanced_ex.cpp b/ml/dlib/examples/object_detector_advanced_ex.cpp
new file mode 100644
index 000000000..718994e27
--- /dev/null
+++ b/ml/dlib/examples/object_detector_advanced_ex.cpp
@@ -0,0 +1,302 @@
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+
+    This is an example illustrating the process for defining custom
+    bag-of-visual-word style feature extractors for use with the
+    structural_object_detection_trainer.
+
+    NOTICE: This example assumes you are familiar with the contents of the
+    object_detector_ex.cpp example program.  Also, if the objects you want to
+    detect are somewhat rigid in appearance (e.g. faces, pedestrians, etc.)
+    then you should try the methods shown in the fhog_object_detector_ex.cpp
+    example program before trying to use the bag-of-visual-word tools shown in
+    this example.
+*/
+
+
+#include <dlib/svm_threaded.h>
+#include <dlib/gui_widgets.h>
+#include <dlib/array.h>
+#include <dlib/array2d.h>
+#include <dlib/image_keypoint.h>
+#include <dlib/image_processing.h>
+
+#include <iostream>
+#include <fstream>
+
+
+using namespace std;
+using namespace dlib;
+
+// ----------------------------------------------------------------------------------------
+
+template <
+    typename image_array_type
+    >
+void make_simple_test_data (
+    image_array_type& images,
+    std::vector<std::vector<rectangle> >& object_locations
+)
+/*!
+    ensures
+        - #images.size() == 3
+        - #object_locations.size() == 3
+        - Creates some simple images to test the object detection routines.  In particular,
+          this function creates images with white 70x70 squares in them.  It also stores
+          the locations of these squares in object_locations.
+        - for all valid i:
+            - object_locations[i] == A list of all the white rectangles present in images[i].
+!*/
+{
+    images.clear();
+    object_locations.clear();
+
+    images.resize(3);
+    images[0].set_size(400,400);
+    images[1].set_size(400,400);
+    images[2].set_size(400,400);
+
+    // set all the pixel values to black
+    assign_all_pixels(images[0], 0);
+    assign_all_pixels(images[1], 0);
+    assign_all_pixels(images[2], 0);
+
+    // Now make some squares and draw them onto our black images.  All the
+    // squares will be 70 pixels wide and tall.
+
+    std::vector<rectangle> temp;
+    temp.push_back(centered_rect(point(100,100), 70,70));
+    fill_rect(images[0],temp.back(),255); // Paint the square white
+    temp.push_back(centered_rect(point(200,300), 70,70));
+    fill_rect(images[0],temp.back(),255); // Paint the square white
+    object_locations.push_back(temp);
+
+    temp.clear();
+    temp.push_back(centered_rect(point(140,200), 70,70));
+    fill_rect(images[1],temp.back(),255); // Paint the square white
+    temp.push_back(centered_rect(point(303,200), 70,70));
+    fill_rect(images[1],temp.back(),255); // Paint the square white
+    object_locations.push_back(temp);
+
+    temp.clear();
+    temp.push_back(centered_rect(point(123,121), 70,70));
+    fill_rect(images[2],temp.back(),255); // Paint the square white
+    object_locations.push_back(temp);
+}
+
+// ----------------------------------------------------------------------------------------
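+
+// As an optional sanity check, you could display the synthetic data generated by
+// make_simple_test_data() before using it.  The following is just a sketch of how you
+// might do that; it isn't needed by the rest of this program:
+/*
+    dlib::array<array2d<unsigned char> > images;
+    std::vector<std::vector<rectangle> > object_locations;
+    make_simple_test_data(images, object_locations);
+
+    image_window win(images[0]);                               // Show the first image
+    win.add_overlay(object_locations[0], rgb_pixel(0,255,0));  // Outline its white squares
+    win.wait_until_closed();
+*/
+
+// ----------------------------------------------------------------------------------------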
+
+class very_simple_feature_extractor : noncopyable
+{
+    /*!
+        WHAT THIS OBJECT REPRESENTS
+            This object is a feature extractor which goes to every pixel in an image and
+            produces a 32 dimensional feature vector.  This vector is an indicator vector
+            which records the pattern of pixel values in the 4-connected neighborhood of a
+            pixel (the pixel itself plus its 4 neighbors gives 2^5 = 32 possible on/off
+            patterns).  So it should be able to distinguish basic things like whether or not
+            a location falls on the corner of a white box, on an edge, in the middle, etc.
+
+
+            Note that this object also implements the interface defined in
+            dlib/image_keypoint/hashed_feature_image_abstract.h.  This means all the member
+            functions in this object are supposed to behave as described in the
+            hashed_feature_image specification.  So when you define your own feature
+            extractor objects you should refer to that documentation in addition to reading
+            this example program.
+    !*/
+
+
+public:
+
+    template <
+        typename image_type
+        >
+    inline void load (
+        const image_type& img
+    )
+    {
+        feat_image.set_size(img.nr(), img.nc());
+        assign_all_pixels(feat_image,0);
+        for (long r = 1; r+1 < img.nr(); ++r)
+        {
+            for (long c = 1; c+1 < img.nc(); ++c)
+            {
+                unsigned char f = 0;
+                if (img[r][c])   f |= 0x1;
+                if (img[r][c+1]) f |= 0x2;
+                if (img[r][c-1]) f |= 0x4;
+                if (img[r+1][c]) f |= 0x8;
+                if (img[r-1][c]) f |= 0x10;
+
+                // Store the code value for the pattern of pixel values in the 4-connected
+                // neighborhood around this row and column.
+                feat_image[r][c] = f;
+            }
+        }
+    }
+
+    inline size_t size () const { return feat_image.size(); }
+    inline long nr () const { return feat_image.nr(); }
+    inline long nc () const { return feat_image.nc(); }
+
+    inline long get_num_dimensions (
+    ) const
+    {
+        // Return the dimensionality of the vectors produced by operator()
+        return 32;
+    }
+
+    typedef std::vector<std::pair<unsigned int,double> > descriptor_type;
+
+    inline const descriptor_type& operator() (
+        long row,
+        long col
+    ) const
+    /*!
+        requires
+            - 0 <= row < nr()
+            - 0 <= col < nc()
+        ensures
+            - returns a sparse vector which describes the image at the given row and column.
+              In particular, this is a vector that is 0 everywhere except for one element.
+    !*/
+    {
+        feat.clear();
+        const unsigned long only_nonzero_element_index = feat_image[row][col];
+        feat.push_back(make_pair(only_nonzero_element_index,1.0));
+        return feat;
+    }
+
+    // This block of functions is meant to provide a way to map between the row/col space
+    // taken by this object's operator() function and the images supplied to load().  In this
+    // example it's trivial.  However, in general, you might create feature extractors which
+    // don't perform extraction at every possible image location (e.g. the hog_image) and
+    // thus result in some more complex mapping.
+    inline const rectangle get_block_rect      ( long row, long col) const { return centered_rect(col,row,3,3); }
+    inline const point image_to_feat_space     ( const point& p) const { return p; }
+    inline const rectangle image_to_feat_space ( const rectangle& rect) const { return rect; }
+    inline const point feat_to_image_space     ( const point& p) const { return p; }
+    inline const rectangle feat_to_image_space ( const rectangle& rect) const { return rect; }
+
+    inline friend void serialize   ( const very_simple_feature_extractor& item, std::ostream& out) { serialize(item.feat_image, out); }
+    inline friend void deserialize ( very_simple_feature_extractor& item, std::istream& in ) { deserialize(item.feat_image, in); }
+
+    void copy_configuration ( const very_simple_feature_extractor& item){}
+
+private:
+    array2d<unsigned char> feat_image;
+
+    // This variable doesn't logically contribute to the state of this object.  It is here
+    // only to avoid returning a descriptor_type object by value inside the operator() method.
+    mutable descriptor_type feat;
+};
+
+// ----------------------------------------------------------------------------------------
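+
+// To make the descriptor format concrete: after load() runs, feat_image[r][c] holds a
+// pattern code in the range [0,32), and operator()(r,c) returns a sparse vector whose only
+// nonzero entry is a 1 at that index.  The following is just an illustrative sketch of how
+// the extractor could be exercised on its own:
+/*
+    array2d<unsigned char> img;
+    img.set_size(100,100);
+    assign_all_pixels(img, 0);
+    fill_rect(img, centered_rect(point(50,50), 70,70), 255);
+
+    very_simple_feature_extractor fe;
+    fe.load(img);
+    very_simple_feature_extractor::descriptor_type d = fe(50,50);
+    // d.size() == 1, d[0].first == 31 (all 5 pixels are white), and d[0].second == 1.0
+*/
+
+// ----------------------------------------------------------------------------------------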
+
+int main()
+{
+    try
+    {
+        // Get some data
+        dlib::array<array2d<unsigned char> > images;
+        std::vector<std::vector<rectangle> > object_locations;
+        make_simple_test_data(images, object_locations);
+
+
+        typedef scan_image_pyramid<pyramid_down<5>, very_simple_feature_extractor> image_scanner_type;
+        image_scanner_type scanner;
+        // Instead of using setup_grid_detection_templates() like in object_detector_ex.cpp,
+        // let's manually set up the sliding window box.  We use a window with the same shape
+        // as the white boxes we are trying to detect.
+        const rectangle object_box = compute_box_dimensions(1,    // width/height ratio
+                                                            70*70 // box area
+                                                            );
+        scanner.add_detection_template(object_box, create_grid_detection_template(object_box,2,2));
+
+        // Since our sliding window is already the right size to detect our objects we don't
+        // need to use an image pyramid.  So setting this to 1 turns off the image pyramid.
+        scanner.set_max_pyramid_levels(1);
+
+
+        // While the very_simple_feature_extractor doesn't have any parameters, when you go to
+        // solve real problems you might define a feature extractor which has some non-trivial
+        // parameters that need to be set up before it can be used.  So you need to be able to
+        // pass these parameters to the scanner object somehow.  You can do this using the
+        // copy_configuration() function as shown below.
+        very_simple_feature_extractor fe;
+        /*
+            set up the parameters in the fe object.
+            ...
+        */
+        // The scanner will use very_simple_feature_extractor::copy_configuration() to copy
+        // the state of fe into its internal feature extractor.
+        scanner.copy_configuration(fe);
+
+
+
+
+        // Now that we have defined the kind of sliding window classifier system we want and
+        // stored the details into the scanner object we are ready to use the
+        // structural_object_detection_trainer to learn the weight vector and threshold
+        // needed to produce a complete object detector.
+        structural_object_detection_trainer<image_scanner_type> trainer(scanner);
+        trainer.set_num_threads(4); // Set this to the number of processing cores on your machine.
+
+
+        // The trainer will try to find the detector which minimizes the number of detection
+        // mistakes.  This function controls how it decides if a detection output is a mistake
+        // or not.  The bigger the value given to this function, the stricter it is in
+        // deciding whether the detector is correctly hitting the targets.  Try reducing the
+        // value to 0.001 and observing the results.  You should see that the detections
+        // aren't exactly on top of the white squares anymore.  See the documentation for the
+        // structural_object_detection_trainer and structural_svm_object_detection_problem
+        // objects for a more detailed discussion of this parameter.
+        trainer.set_match_eps(0.95);
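+        // Roughly speaking, two rectangles A and B are considered to match when the overlap
+        // ratio A.intersect(B).area()/(A+B).area() exceeds match_eps.  For the 70x70 boxes
+        // in this example, a detection shifted horizontally by d pixels overlaps its target
+        // with ratio (70-d)/(70+d), so with eps at 0.95 a detection may be off by at most
+        // about 1 pixel and still count as correct.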
+
+
+        object_detector<image_scanner_type> detector = trainer.train(images, object_locations);
+
+        // We can easily test the new detector against our training data.  This print
+        // statement will indicate that it has perfect precision and recall on this simple
+        // task.  It will also print the average precision (AP).
+        cout << "Test detector (precision,recall,AP): "
+             << test_object_detection_function(detector, images, object_locations) << endl;
+
+        // The cross validation should also indicate perfect precision and recall.
+        cout << "3-fold cross validation (precision,recall,AP): "
+             << cross_validate_object_detection_trainer(trainer, images, object_locations, 3) << endl;
+
+
+        /*
+            It is also worth pointing out that you don't have to use dlib::array2d objects to
+            represent your images.  In fact, you can use any object, even something like a
+            struct of many images and other things as the "image".  The only requirements on
+            an image are that it should be possible to pass it to scanner.load().  So if you
+            can say scanner.load(images[0]), for example, then you are good to go.  See the
+            documentation for scan_image_pyramid::load() for more details.
+        */
+
+
+        // Let's display the output of the detector along with our training images.
+        image_window win;
+        for (unsigned long i = 0; i < images.size(); ++i)
+        {
+            // Run the detector on images[i]
+            const std::vector<rectangle> rects = detector(images[i]);
+            cout << "Number of detections: "<< rects.size() << endl;
+
+            // Put the image and detections into the window.
+            win.clear_overlay();
+            win.set_image(images[i]);
+            win.add_overlay(rects, rgb_pixel(255,0,0));
+
+            cout << "Hit enter to see the next image.";
+            cin.get();
+        }
+
+    }
+    catch (exception& e)
+    {
+        cout << "\nexception thrown!" << endl;
+        cout << e.what() << endl;
+    }
+}
+
+// ----------------------------------------------------------------------------------------
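+
+// By the way, if you wanted to keep a trained detector around for later you could save it
+// to disk and reload it like any other dlib object.  A minimal sketch (the filename here is
+// just an example):
+//
+//     serialize("box_detector.svm") << detector;
+//     deserialize("box_detector.svm") >> detector;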