// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This example shows how to train a CNN based object detector using dlib's 
    loss_mmod loss layer.  This loss layer implements the Max-Margin Object
    Detection loss as described in the paper:
        Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
    This is the same loss used by the popular SVM+HOG object detector in dlib
    (see fhog_object_detector_ex.cpp) except here we replace the HOG features
    with a CNN and train the entire detector end-to-end.  This allows us to make
    much more powerful detectors.

    It would be a good idea to become familiar with dlib's DNN tooling before reading this
    example.  So you should read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp, as
    well as the introductory DNN+MMOD example dnn_mmod_ex.cpp, before proceeding.
    

    This example is essentially a more complex version of dnn_mmod_ex.cpp.  In it we train
    a detector that finds the rear ends of motor vehicles.  I will also discuss some
    aspects of data preparation useful when training this kind of detector.  
    
*/


#include <iostream>
#include <dlib/dnn.h>
#include <dlib/data_io.h>

using namespace std;
using namespace dlib;



template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler  = relu<bn_con<con5d<32, relu<bn_con<con5d<32, relu<bn_con<con5d<16,SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5  = relu<bn_con<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
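
// A note on this architecture: the downsampler stack reduces image resolution
// by 8x (three 5x5 conv layers, each with stride 2), which becomes the
// detection "stride" discussed later in this file.  Also, the final con
// layer's filter count (1) is just a placeholder; it gets set at runtime from
// options.detector_windows.size() before training starts.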


// ----------------------------------------------------------------------------------------

int ignore_overlapped_boxes(
    std::vector<mmod_rect>& boxes,
    const test_box_overlap& overlaps
)
/*!
    ensures
        - Whenever two rectangles in boxes overlap, according to overlaps(), we mark the
          smaller of the two boxes as ignore.
        - returns the number of newly ignored boxes.
!*/
{
    int num_ignored = 0;
    for (size_t i = 0; i < boxes.size(); ++i)
    {
        if (boxes[i].ignore)
            continue;
        for (size_t j = i+1; j < boxes.size(); ++j)
        {
            if (boxes[j].ignore)
                continue;
            if (overlaps(boxes[i], boxes[j]))
            {
                ++num_ignored;
                if (boxes[i].rect.area() < boxes[j].rect.area())
                    boxes[i].ignore = true;
                else
                    boxes[j].ignore = true;
            }
        }
    }
    return num_ignored;
}

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 2)
    {
        cout << "Give the path to a folder containing training.xml and testing.xml files." << endl;
        cout << "This example program is specifically designed to run on the dlib vehicle " << endl;
        cout << "detection dataset, which is available at this URL: " << endl;
        cout << "   http://dlib.net/files/data/dlib_rear_end_vehicles_v1.tar" << endl;
        cout << endl;
        cout << "So download that dataset, extract it somewhere, and then run this program" << endl;
        cout << "with the dlib_rear_end_vehicles folder as an argument.  E.g. if you extract" << endl;
        cout << "the dataset to the current folder then you should run this example program" << endl;
        cout << "by typing: " << endl;
        cout << "   ./dnn_mmod_train_find_cars_ex dlib_rear_end_vehicles" << endl;
        cout << endl;
        cout << "It takes about a day to finish if run on a high end GPU like a 1080ti." << endl;
        cout << endl;
        return 0;
    }
    const std::string data_directory = argv[1];


    std::vector<matrix<rgb_pixel>> images_train, images_test;
    std::vector<std::vector<mmod_rect>> boxes_train, boxes_test;
    load_image_dataset(images_train, boxes_train, data_directory+"/training.xml");
    load_image_dataset(images_test,  boxes_test,  data_directory+"/testing.xml");

    // When I was creating the dlib vehicle detection dataset I had to label all the cars
    // in each image.  MMOD requires all cars to be labeled, since any unlabeled part of an
    // image is implicitly assumed to be not a car, and the algorithm will use it as
    // negative training data.  So every car must be labeled, either with a normal
    // rectangle or an "ignore" rectangle that tells MMOD to simply ignore it (i.e. neither
    // treat it as a thing to detect nor as negative training data).  
    // 
    // In our present case, many images contain very tiny cars in the distance, ones that
    // are essentially just dark smudges.  It's not reasonable to expect the CNN
    // architecture we defined to detect such vehicles.  However, I erred on the side of
    // having more complete annotations when creating the dataset.  So when I labeled these
    // images I labeled many of these really difficult cases as vehicles to detect.   
    //
    // So the first thing we are going to do is clean up our dataset a little bit.  In
    // particular, we are going to mark boxes smaller than 35*35 pixels as ignore since
    // only really small and blurry cars appear at those sizes.  We will also mark boxes
    // that are heavily overlapped by another box as ignore.  We do this because we want to
    // allow for stronger non-maximum suppression logic in the learned detector, since that
    // will help make it easier to learn a good detector. 
    // 
    // To explain this non-max suppression idea further, it's important to understand how
    // the detector works.  Essentially, sliding window detectors scan all image locations
    // and ask "is there a car here?".  If there really is a car in a specific location in
    // an image then usually many slightly different sliding window locations will produce
    // high detection scores, indicating that there is a car at those locations.  If we
    // just stopped there then each car would produce multiple detections.  But that isn't
    // what we want.  We want each car to produce just one detection.  So it's common for
    // detectors to include "non-maximum suppression" logic which simply takes the
    // strongest detection and then deletes all detections "close to" the strongest.  This
    // is a simple post-processing step that can eliminate duplicate detections.  However,
    // we have to define what "close to" means.  We can do this by looking at your training
    // data and checking how close the closest target boxes are to each other, and then
    // picking a "close to" measure that doesn't suppress those target boxes but is
    // otherwise as tight as possible.  This is exactly what the mmod_options object does
    // by default.
    //
    // Importantly, this means that if your training dataset contains an image with two
    // target boxes that really overlap a whole lot, then the non-maximum suppression
    // "close to" measure will be configured to allow detections to really overlap a whole
    // lot.  On the other hand, if your dataset didn't contain any overlapped boxes at all,
    // then the non-max suppression logic would be configured to filter out any boxes that
    // overlapped at all, and thus would be performing a much stronger non-max suppression.  
    //
    // Why does this matter?  Well, remember that we want to avoid duplicate detections.
    // If non-max suppression just kills everything in a really wide area around a car then
    // the CNN doesn't really need to learn anything about avoiding duplicate detections.
    // However, if non-max suppression only suppresses a tiny area around each detection
    // then the CNN will need to learn to output small detection scores for those areas of
    // the image not suppressed.  The smaller the non-max suppression region the more the
    // CNN has to learn and the more difficult the learning problem will become.  This is
    // why we remove highly overlapped objects from the training dataset.  That is, we do
    // it so the non-max suppression logic will be able to be reasonably effective.  Here
    // we are ensuring that any boxes that are mostly (95% or more) contained by another
    // are suppressed.  We also ensure that boxes with an intersection over union of 0.5 or
    // greater are suppressed.  This will improve the resulting detector since it will be
    // able to use more aggressive non-max suppression settings.
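    //
    // As a concrete illustration of what test_box_overlap(0.50, 0.95) does (a
    // quick sketch of its semantics): it treats two boxes as overlapping if
    // their intersection over union is greater than 0.50, OR if 95% or more of
    // either box's area is covered by the other.  E.g.
    //    test_box_overlap overlaps(0.50, 0.95);
    //    rectangle big(0,0,99,99);      // a 100x100 box
    //    rectangle inner(10,10,39,39);  // a 30x30 box entirely inside big
    //    // The IoU here is only 0.09, but inner is 100% covered by big, so
    //    // overlaps(big, inner) returns true and ignore_overlapped_boxes()
    //    // below would mark the smaller box as ignore.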

    int num_overlapped_ignored_test = 0;
    for (auto& v : boxes_test)
        num_overlapped_ignored_test += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.95));

    int num_overlapped_ignored = 0;
    int num_additional_ignored = 0;
    for (auto& v : boxes_train)
    {
        num_overlapped_ignored += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.95));
        for (auto& bb : v)
        {
            if (bb.rect.width() < 35 && bb.rect.height() < 35)
            {
                if (!bb.ignore)
                {
                    bb.ignore = true;
                    ++num_additional_ignored;
                }
            }

            // The dlib vehicle detection dataset doesn't contain any detections with
            // really extreme aspect ratios.  However, some datasets do, often because of
            // bad labeling.  So it's a good idea to check for that and either eliminate
            // those boxes or set them to ignore.  Although, this depends on your
            // application.  
            // 
            // For instance, if your dataset has boxes with an aspect ratio
            // of 10 then you should think about what that means for the network
            // architecture.  Does the receptive field even cover the entirety of the box
            // in those cases?  Do you care about these boxes?  Are they labeling errors?
            // I find that many people will download some dataset from the internet and
            // just take it as given.  They run it through some training algorithm and take
            // the dataset as unchallengeable truth.  But many datasets are full of
            // labeling errors.  There are also a lot of datasets that aren't full of
            // errors, but are annotated in a sloppy and inconsistent way.  Fixing those
            // errors and inconsistencies can often greatly improve models trained from
            // such data.  It's almost always worth the time to try and improve your
            // training dataset.   
            //
            // In any case, my point is that there are other types of dataset cleaning you
            // could put here.  What exactly you need depends on your application.  But you
            // should carefully consider it and not take your dataset as a given.  The work
            // of creating a good detector is largely about creating a high quality
            // training dataset.  
        }
    }

    // When modifying a dataset like this, it's a really good idea to print a log of how
    // many boxes you ignored.  It's easy to accidentally ignore a huge block of data, so
    // you should always look and see that things are doing what you expect.
    cout << "num_overlapped_ignored: "<< num_overlapped_ignored << endl;
    cout << "num_additional_ignored: "<< num_additional_ignored << endl;
    cout << "num_overlapped_ignored_test: "<< num_overlapped_ignored_test << endl;


    cout << "num training images: " << images_train.size() << endl;
    cout << "num testing images: " << images_test.size() << endl;


    // Our vehicle detection dataset has basically 3 different types of boxes.  Square
    // boxes, tall and skinny boxes (e.g. semi trucks), and short and wide boxes (e.g.
    // sedans).  Here we are telling the MMOD algorithm that a vehicle is recognizable as
    // long as the longest box side is at least 70 pixels long and the shortest box side is
    // at least 30 pixels long.  mmod_options will use these parameters to decide how large
    // each of the sliding windows needs to be so as to be able to detect all the vehicles.
    // Since our dataset has basically these 3 different aspect ratios, it will decide to
    // use 3 different sliding windows.  This means the final con layer in the network will
    // have 3 filters, one for each of these aspect ratios. 
    //
    // Another thing to consider when setting the sliding window size is the "stride" of
    // your network.  The network we defined above downsamples the image by a factor of 8x
    // in the first few layers.  So when the sliding windows are scanning the image, they
    // are stepping over it with a stride of 8 pixels.  If you set the sliding window size
    // too small then the stride will become an issue.  For instance, if you set the
    // sliding window size to 4 pixels, then it means a 4x4 window will be moved by 8
    // pixels at a time when scanning. This is obviously a problem since 75% of the image
    // won't even be visited by the sliding window.  So you need to set the window size to
    // be big enough relative to the stride of your network.  In our case, the windows are
    // at least 30 pixels in length, so being moved by 8 pixel steps is fine. 
    mmod_options options(boxes_train, 70, 30);
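
    // It's a good idea to print what mmod_options decided.  The lines below
    // list the sliding window sizes it selected; given the discussion above we
    // expect 3 windows, one per aspect ratio cluster in the dataset.
    cout << "num detector windows: "<< options.detector_windows.size() << endl;
    for (auto& w : options.detector_windows)
        cout << "detector window width by height: " << w.width << " x " << w.height << endl;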


    // This setting is very important and dataset specific.  The vehicle detection dataset
    // contains boxes that are marked as "ignore", as we discussed above.  Some of them are
    // ignored because we set ignore to true in the above code.  However, the xml files
    // also contained a lot of ignore boxes.  Some of them are large boxes that encompass
    // large parts of an image and the intention is to have everything inside those boxes
    // be ignored.  Therefore, we need to tell the MMOD algorithm to do that, which we do
    // by setting options.overlaps_ignore appropriately.  
    // 
    // But first, we need to understand exactly what this option does.  The MMOD loss
    // is essentially counting the number of false alarms + missed detections produced by
    // the detector for each image.  During training, the code is running the detector on
    // each image in a mini-batch and looking at its output and counting the number of
    // mistakes.  The optimizer tries to find parameter settings that minimize the number
    // of detector mistakes.
    // 
    // This overlaps_ignore option allows you to tell the loss that some outputs from the
    // detector should be totally ignored, as if they never happened.  In particular, if a
    // detection overlaps a box in the training data with ignore==true then that detection
    // is ignored.  This overlap is determined by calling
    // options.overlaps_ignore(the_detection, the_ignored_training_box).  If it returns
    // true then that detection is ignored.
    // 
    // You should read the documentation for test_box_overlap, the class type of
    // overlaps_ignore, for full details.  However, the gist is that the default behavior is
    // to only consider boxes as overlapping if their intersection over union is > 0.5.
    // However, the dlib vehicle detection dataset contains large boxes that are meant to
    // mask out large areas of an image.  So intersection over union isn't an appropriate
    // way to measure "overlaps with box" in this case.  We want any box that is contained
    // inside one of these big regions to be ignored, even if the detection box is really
    // small.  So we set overlaps_ignore to behave that way with this line.
    options.overlaps_ignore = test_box_overlap(0.5, 0.95);
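
    // Printing the configured thresholds is another cheap sanity check.  The
    // overlaps_nms member was set automatically from the training data as
    // discussed above, while overlaps_ignore is what we just assigned.
    cout << "overlap NMS IOU thresh:             " << options.overlaps_nms.get_iou_thresh() << endl;
    cout << "overlap NMS percent covered thresh: " << options.overlaps_nms.get_percent_covered_thresh() << endl;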

    net_type net(options);

    // The final layer of the network must be a con layer that contains 
    // options.detector_windows.size() filters.  This is because these final filters are
    // what perform the final "sliding window" detection in the network.  For the dlib
    // vehicle dataset, there will be 3 sliding window detectors, so we will be setting
    // num_filters to 3 here.
    net.subnet().layer_details().set_num_filters(options.detector_windows.size());


    dnn_trainer<net_type> trainer(net,sgd(0.0001,0.9));
    trainer.set_learning_rate(0.1);
    trainer.be_verbose();


    // While training, we are going to use early stopping.  That is, we will be checking
    // how good the detector is performing on our test data and when it stops getting
    // better on the test data we will drop the learning rate.  We will keep doing that
    // until the learning rate is less than 1e-4.   These two settings tell the trainer to
    // do that.  Essentially, we set the first threshold so high that it will never trigger, so only the
    // test iterations without progress threshold will matter.  In particular, it says that
    // once we observe 1000 testing mini-batches where the test loss clearly isn't
    // decreasing we will lower the learning rate.
    trainer.set_iterations_without_progress_threshold(50000);
    trainer.set_test_iterations_without_progress_threshold(1000);

    const string sync_filename = "mmod_cars_sync";
    trainer.set_synchronization_file(sync_filename, std::chrono::minutes(5));
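
    // A side effect worth knowing about: because the trainer state is saved to
    // disk every 5 minutes, you can kill this program and rerun it later and
    // training will resume from the last synchronization point.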




    std::vector<matrix<rgb_pixel>> mini_batch_samples;
    std::vector<std::vector<mmod_rect>> mini_batch_labels; 
    random_cropper cropper;
    cropper.set_seed(time(0));
    cropper.set_chip_dims(350, 350);
    // Usually you want to give the cropper whatever min sizes you passed to the
    // mmod_options constructor, or very slightly smaller sizes, which is what we do here.
    cropper.set_min_object_size(69,28); 
    cropper.set_max_rotation_degrees(2);
    dlib::rand rnd;

    // Log the training parameters to the console
    cout << trainer << cropper << endl;

    int cnt = 1;
    // Run the trainer until the learning rate gets small.  
    while(trainer.get_learning_rate() >= 1e-4)
    {
        // Every 30 mini-batches we do a testing mini-batch.  
        if (cnt%30 != 0 || images_test.size() == 0)
        {
            cropper(87, images_train, boxes_train, mini_batch_samples, mini_batch_labels);
            // We can also randomly jitter the colors and that often helps a detector
            // generalize better to new images.
            for (auto&& img : mini_batch_samples)
                disturb_colors(img, rnd);

            // It's a good idea to, at least once, put code here that displays the images
            // and boxes the random cropper is generating.  You should look at them and
            // think about if the output makes sense for your problem.  Most of the time
            // it will be fine, but sometimes you will realize that the pattern of cropping
            // isn't really appropriate for your problem and you will need to make some
            // change to how the mini-batches are being generated.  Maybe you will tweak
            // some of the cropper's settings, or write your own entirely separate code to
            // create mini-batches.  But either way, if you don't look you will never know.
            // An easy way to do this is to create a dlib::image_window to display the
            // images and boxes.
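            // For instance, a minimal sketch (left commented out so the
            // example still runs unattended; it uses dlib's image_window and
            // add_overlay from dlib/gui_widgets.h, so that header would need
            // to be included):
            //
            //    static image_window win;
            //    win.clear_overlay();
            //    win.set_image(mini_batch_samples[0]);
            //    for (auto& b : mini_batch_labels[0])
            //        win.add_overlay(b.rect, b.ignore ? rgb_pixel(255,255,0)
            //                                         : rgb_pixel(255,0,0));
            //    cin.get();  // pause so you can look at the displayed crop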

            trainer.train_one_step(mini_batch_samples, mini_batch_labels);
        }
        else
        {
            cropper(87, images_test, boxes_test, mini_batch_samples, mini_batch_labels);
            // We can also randomly jitter the colors and that often helps a detector
            // generalize better to new images.
            for (auto&& img : mini_batch_samples)
                disturb_colors(img, rnd);

            trainer.test_one_step(mini_batch_samples, mini_batch_labels);
        }
        ++cnt;
    }
    // wait for training threads to stop
    trainer.get_net();
    cout << "done training" << endl;

    // Save the network to disk
    net.clean();
    serialize("mmod_rear_end_vehicle_detector.dat") << net;
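
    // To use the saved detector in another program you would deserialize it
    // and call it like a function (dnn_mmod_find_cars_ex.cpp shows this in
    // full).  A minimal sketch, where "some_image.jpg" is a hypothetical
    // input file:
    //
    //    net_type net2;
    //    deserialize("mmod_rear_end_vehicle_detector.dat") >> net2;
    //    matrix<rgb_pixel> img;
    //    load_image(img, "some_image.jpg");
    //    std::vector<mmod_rect> dets = net2(img);  // one mmod_rect per car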


    // It's a really good idea to print the training parameters.  This is because you will
    // invariably be running multiple rounds of training and should be logging the output
    // to a file.  This print statement will include many of the training parameters in
    // your log.
    cout << trainer << cropper << endl;

    cout << "\nsync_filename: " << sync_filename << endl;
    cout << "num training images: "<< images_train.size() << endl;
    cout << "training results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);
    // Upsampling the data will allow the detector to find smaller cars.  Recall that 
    // we configured it to use a sliding window nominally 70 pixels in size.  So upsampling
    // here will let it find things nominally 35 pixels in size.  However, we include a
    // limit of 1800*1800 pixels here, which means "don't upsample an image if it's already larger
    // than 1800*1800".  We do this so we don't run out of RAM, which is a concern because
    // some of the images in the dlib vehicle dataset are really high resolution.
    upsample_image_dataset<pyramid_down<2>>(images_train, boxes_train, 1800*1800);
    cout << "training upsampled results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);


    cout << "num testing images: "<< images_test.size() << endl;
    cout << "testing results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);
    upsample_image_dataset<pyramid_down<2>>(images_test, boxes_test, 1800*1800);
    cout << "testing upsampled results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);

    /*
        This program takes many hours to execute on a high-end GPU.  It took about a day to
        train on an NVIDIA 1080 Ti.  The resulting model file is available at
            http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
        It should be noted that this file on dlib.net has a dlib::shape_predictor appended
        onto the end of it (see dnn_mmod_find_cars_ex.cpp for an example of its use).  This
        explains why the model file on dlib.net is larger than the
        mmod_rear_end_vehicle_detector.dat output by this program.

        You can see some videos of this vehicle detector running on YouTube:
            https://www.youtube.com/watch?v=4B3bzmxMAZU
            https://www.youtube.com/watch?v=bP2SUo5vSlc

        Also, the training and testing accuracies were:
            num training images: 2217
            training results: 0.990738 0.736431 0.736073 
            training upsampled results: 0.986837 0.937694 0.936912 
            num testing images: 135
            testing results: 0.988827 0.471372 0.470806 
            testing upsampled results: 0.987879 0.651132 0.650399 
    */

    return 0;

}
catch(std::exception& e)
{
    cout << e.what() << endl;
    return 1;
}