// Copyright (C) 2018  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

#include "opaque_types.h"
#include <dlib/python.h>
#include <dlib/data_io.h>
#include <dlib/image_processing.h>
#include <pybind11/stl_bind.h>
#include <pybind11/stl.h>
#include <iostream>

namespace pybind11
{

    // a version of bind_map that doesn't force its own __repr__ on you.
template <typename Map, typename holder_type = std::unique_ptr<Map>, typename... Args>
class_<Map, holder_type> bind_map_no_default_repr(handle scope, const std::string &name, Args&&... args) {
    using KeyType = typename Map::key_type;
    using MappedType = typename Map::mapped_type;
    using Class_ = class_<Map, holder_type>;

    // If either type is a non-module-local bound type then make the map binding non-local as well;
    // otherwise (e.g. both types are either module-local or converting) the map will be
    // module-local.
    auto tinfo = detail::get_type_info(typeid(MappedType));
    bool local = !tinfo || tinfo->module_local;
    if (local) {
        tinfo = detail::get_type_info(typeid(KeyType));
        local = !tinfo || tinfo->module_local;
    }

    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);

    cl.def(init<>());


    cl.def("__bool__",
        [](const Map &m) -> bool { return !m.empty(); },
        "Check whether the map is nonempty"
    );

    cl.def("__iter__",
           [](Map &m) { return make_key_iterator(m.begin(), m.end()); },
           keep_alive<0, 1>() /* Essential: keep map alive while iterator exists */
    );

    cl.def("items",
           [](Map &m) { return make_iterator(m.begin(), m.end()); },
           keep_alive<0, 1>() /* Essential: keep map alive while iterator exists */
    );

    cl.def("__getitem__",
        [](Map &m, const KeyType &k) -> MappedType & {
            auto it = m.find(k);
            if (it == m.end())
                throw key_error();
            return it->second;
        },
        return_value_policy::reference_internal // ref + keepalive
    );

    // Assignment provided only if the type is copyable
    detail::map_assignment<Map, Class_>(cl);

    cl.def("__delitem__",
           [](Map &m, const KeyType &k) {
               auto it = m.find(k);
               if (it == m.end())
                   throw key_error();
               // note: don't return the erased-element iterator; it has no Python binding.
               m.erase(it);
           }
    );

    cl.def("__len__", &Map::size);

    return cl;
}

}
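
// A quick usage sketch of the helper above (it mirrors the "parts" binding
// further down in this file; my_repr stands in for whatever callable you
// want to attach as the repr):
//
//   py::bind_map_no_default_repr<std::map<std::string,point>,
//       std::shared_ptr<std::map<std::string,point>>>(m, "parts")
//       .def("__repr__", my_repr);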

using namespace dlib;
using namespace std;
using namespace dlib::image_dataset_metadata;

namespace py = pybind11;


dataset py_load_image_dataset_metadata(
    const std::string& filename
)
{
    dataset temp;
    load_image_dataset_metadata(temp, filename);
    return temp;
}
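
// From Python, the wrapper above is reached as follows (a minimal sketch;
// the compiled dlib module must be importable and "training.xml" is a
// hypothetical file name):
//
//   import dlib
//   data = dlib.image_dataset_metadata.load_image_dataset_metadata("training.xml")
//   print(len(data.images))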

std::shared_ptr<std::map<std::string,point>> map_from_object(py::dict obj)
{
    auto ret = std::make_shared<std::map<std::string,point>>();
    for (auto& v : obj)
    {
        (*ret)[v.first.cast<std::string>()] = v.second.cast<point>();
    }
    return ret;
}
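
// This lets Python code construct a parts map directly from a dict (a sketch;
// the part name is hypothetical):
//
//   parts = dlib.image_dataset_metadata.parts({"nose": dlib.point(10, 20)})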

// ----------------------------------------------------------------------------------------

image_dataset_metadata::dataset py_make_bounding_box_regression_training_data (
    const image_dataset_metadata::dataset& truth,
    const py::object& detections
)
{
    try
    {
        // if detections is already a std::vector<std::vector<rectangle>> (i.e. a dlib.rectangless) then cast and call directly.
        return make_bounding_box_regression_training_data(truth, detections.cast<const std::vector<std::vector<rectangle>>&>());
    }
    catch (py::cast_error&)
    {
        // otherwise, detections should be a Python list of dlib.rectangles objects.
        py::list dets(detections);
        std::vector<std::vector<rectangle>> temp;
        for (auto& d : dets)
            temp.emplace_back(d.cast<const std::vector<rectangle>&>());
        return make_bounding_box_regression_training_data(truth, temp);
    }
}
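
// Both call forms are therefore accepted from Python (a sketch; detector and
// images are hypothetical, assuming detector returns a dlib.rectangles object
// per image):
//
//   dets = [detector(img) for img in images]   # a list of dlib.rectangles
//   new_data = dlib.make_bounding_box_regression_training_data(truth, dets)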

// ----------------------------------------------------------------------------------------

void bind_image_dataset_metadata(py::module &m_)
{
    auto m = m_.def_submodule("image_dataset_metadata", "Routines and objects for working with dlib's image dataset metadata XML files.");

    auto datasetstr  = [](const dataset& item) { return  "dlib.image_dataset_metadata.dataset: images:" + to_string(item.images.size()) + ", " + item.name; };
    auto datasetrepr = [datasetstr](const dataset& item) { return "<"+datasetstr(item)+">"; };
    py::class_<dataset>(m, "dataset",
                    "This object represents a labeled set of images.  In particular, it contains the filename for each image as well as annotated boxes.")
        .def("__str__", datasetstr)
        .def("__repr__", datasetrepr)
        .def_readwrite("images", &dataset::images)
        .def_readwrite("comment", &dataset::comment)
        .def_readwrite("name", &dataset::name);

    auto imagestr  = [](const image& item) { return  "dlib.image_dataset_metadata.image: boxes:"+to_string(item.boxes.size())+ ", " + item.filename; };
    auto imagerepr = [imagestr](const image& item) { return "<"+imagestr(item)+">"; };
    py::class_<image>(m, "image", "This object represents an annotated image.")
        .def_readwrite("filename", &image::filename)
        .def("__str__", imagestr)
        .def("__repr__", imagerepr)
        .def_readwrite("boxes", &image::boxes);


    auto partsstr = [](const std::map<std::string,point>& item) {
        std::ostringstream sout;
        sout << "{";
        for (auto& v : item) 
            sout << "'" << v.first << "': " << v.second << ", ";
        sout << "}";
        return sout.str();
    };
    auto partsrepr = [](const std::map<std::string,point>& item) {
        std::ostringstream sout;
        sout << "dlib.image_dataset_metadata.parts({\n";
        for (auto& v : item) 
            sout << "'" << v.first << "': dlib.point" << v.second << ",\n";
        sout << "})";
        return sout.str();
    };

    py::bind_map_no_default_repr<std::map<std::string,point>, std::shared_ptr<std::map<std::string,point>> >(m, "parts", 
        "This object is a dictionary mapping string names to object part locations.")
        .def(py::init(&map_from_object))
        .def("__str__", partsstr)
        .def("__repr__", partsrepr);


    auto rectstr = [](const rectangle& r) {
        std::ostringstream sout;
        sout << "dlib.rectangle(" << r.left() << "," << r.top() << "," << r.right() << "," << r.bottom() << ")";
        return sout.str();
    };
    auto boxstr  = [rectstr](const box& item) { return "dlib.image_dataset_metadata.box at " + rectstr(item.rect); }; 
    auto boxrepr = [boxstr](const box& item) { return "<"+boxstr(item)+">"; };
    py::class_<box> pybox(m, "box", 
        "This object represents an annotated rectangular area of an image. \n"
        "It is typically used to mark the location of an object such as a \n"
        "person, car, etc.\n"
        "\n"
        "The main variable of interest is rect.  It gives the location of \n"
        "the box.  All the other variables are optional." ); pybox
        .def("__str__", boxstr)
        .def("__repr__", boxrepr)
        .def_readwrite("rect",            &box::rect)
        .def_readonly("parts",            &box::parts)
        .def_readwrite("label",           &box::label)
        .def_readwrite("difficult",       &box::difficult)
        .def_readwrite("truncated",       &box::truncated)
        .def_readwrite("occluded",        &box::occluded)
        .def_readwrite("ignore",          &box::ignore)
        .def_readwrite("pose",            &box::pose)
        .def_readwrite("detection_score", &box::detection_score)
        .def_readwrite("angle",           &box::angle)
        .def_readwrite("gender",          &box::gender)
        .def_readwrite("age",             &box::age);

    py::enum_<gender_t>(pybox,"gender_type")
        .value("MALE", gender_t::MALE)
        .value("FEMALE", gender_t::FEMALE)
        .value("UNKNOWN", gender_t::UNKNOWN)
        .export_values();
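
    // Since no __init__ is bound for box, instances come from a loaded dataset.
    // A sketch of editing one (the indices and values are hypothetical):
    //
    //   b = data.images[0].boxes[0]
    //   b.label = "face"
    //   b.gender = dlib.image_dataset_metadata.box.gender_type.FEMALE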


    m.def("save_image_dataset_metadata", &save_image_dataset_metadata, py::arg("data"), py::arg("filename"),
        "Writes the contents of the meta object to a file with the given filename.  The file will be in an XML format."
        );
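
    // Round-trip sketch using the two functions bound here (output file name
    // is hypothetical):
    //
    //   data = dlib.image_dataset_metadata.load_image_dataset_metadata("training.xml")
    //   dlib.image_dataset_metadata.save_image_dataset_metadata(data, "out.xml")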

    m.def("load_image_dataset_metadata", &py_load_image_dataset_metadata, py::arg("filename"),
        "Attempts to interpret filename as a file containing XML formatted data as produced "
        "by the save_image_dataset_metadata() function.  The data is loaded and returned as a dlib.image_dataset_metadata.dataset object."
        );

    m_.def("make_bounding_box_regression_training_data", &py_make_bounding_box_regression_training_data, 
        py::arg("truth"), py::arg("detections"),
"requires \n\
    - len(truth.images) == len(detections) \n\
    - detections == A dlib.rectangless object or a list of dlib.rectangles. \n\
ensures \n\
    - Suppose you have an object detector that can roughly locate objects in an \n\
      image.  This means your detector draws boxes around objects, but these are \n\
      *rough* boxes in the sense that they aren't positioned super accurately.  For \n\
      instance, HOG based detectors usually have a stride of 8 pixels.  So the \n\
      positional accuracy is going to be, at best, +/-8 pixels.   \n\
       \n\
      If you want to get better positional accuracy one easy thing to do is train a \n\
      shape_predictor to give you the corners of the object.  The \n\
      make_bounding_box_regression_training_data() routine helps you do this by \n\
      creating an appropriate training dataset.  It does this by taking the dataset \n\
      you used to train your detector (the truth object), and combining that with \n\
      the output of your detector on each image in the training dataset (the \n\
      detections object).  In particular, it will create a new annotated dataset \n\
      where each object box is one of the rectangles from detections and that \n\
      object has 4 part annotations, the corners of the truth rectangle \n\
      corresponding to that detection rectangle.  You can then take the returned \n\
      dataset and train a shape_predictor on it.  The resulting shape_predictor can \n\
      then be used to do bounding box regression. \n\
    - We assume that detections[i] contains object detections corresponding to  \n\
      the image truth.images[i]." 
    /*!
        requires
            - len(truth.images) == len(detections)
            - detections == A dlib.rectangless object or a list of dlib.rectangles.
        ensures
            - Suppose you have an object detector that can roughly locate objects in an
              image.  This means your detector draws boxes around objects, but these are
              *rough* boxes in the sense that they aren't positioned super accurately.  For
              instance, HOG based detectors usually have a stride of 8 pixels.  So the
              positional accuracy is going to be, at best, +/-8 pixels.  
              
              If you want to get better positional accuracy one easy thing to do is train a
              shape_predictor to give you the corners of the object.  The
              make_bounding_box_regression_training_data() routine helps you do this by
              creating an appropriate training dataset.  It does this by taking the dataset
              you used to train your detector (the truth object), and combining that with
              the output of your detector on each image in the training dataset (the
              detections object).  In particular, it will create a new annotated dataset
              where each object box is one of the rectangles from detections and that
              object has 4 part annotations, the corners of the truth rectangle
              corresponding to that detection rectangle.  You can then take the returned
              dataset and train a shape_predictor on it.  The resulting shape_predictor can
              then be used to do bounding box regression.
            - We assume that detections[i] contains object detections corresponding to 
              the image truth.images[i].
    !*/
    );
}