summaryrefslogtreecommitdiffstats
path: root/ml/dlib/examples/one_class_classifiers_ex.cpp
blob: 3394ee76fe3f3dd4c7766408194655be9ca35961 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This is an example illustrating the use of the tools in dlib for doing distribution
    estimation or detecting anomalies using one-class support vector machines. 

    Unlike regular classifiers, these tools take unlabeled points and try to learn what
    parts of the feature space normally contain data samples and which do not.  Typically
    you use these tools when you are interested in finding outliers or otherwise
    identifying "unusual" data samples.

    In this example, we will sample points from the sinc() function to generate our set of
    "typical looking" points.  Then we will train some one-class classifiers and use them
    to predict if new points are unusual or not.  In this case, unusual means a point is
    not from the sinc() curve.
*/

#include <iostream>
#include <vector>
#include <dlib/svm.h>
#include <dlib/gui_widgets.h>
#include <dlib/array2d.h>
#include <dlib/image_transforms.h>

using namespace std;
using namespace dlib;

// The curve the one-class SVMs are asked to learn: a sinc scaled by two,
// i.e. 2*sin(x)/x.  At x == 0 the quotient is undefined, so the limiting
// value of 2 is returned there instead.
double sinc(double x)
{
    const bool at_origin = (x == 0);
    return at_origin ? 2 : 2*sin(x)/x;
}

int main()
{
    // We will use column vectors to store our points.  Here we make a convenient typedef
    // for the kind of vector we will use.
    typedef matrix<double,0,1> sample_type;

    // Then we select the kernel we want to use.  For our present problem the radial basis
    // kernel is quite effective.
    typedef radial_basis_kernel<sample_type> kernel_type;

    // Now make the object responsible for training one-class SVMs.
    svm_one_class_trainer<kernel_type> trainer;
    // Here we set the width of the radial basis kernel to 4.0.  Larger values make the
    // width smaller and give the radial basis kernel more resolution.  If you play with
    // the value and observe the program output you will get a more intuitive feel for what
    // that means.
    trainer.set_kernel(kernel_type(4.0));

    // Now sample some 2D points.  The points will be located on the curve defined by the
    // sinc() function.
    std::vector<sample_type> samples;
    sample_type m(2);
    for (double x = -15; x <= 8; x += 0.3)
    {
        m(0) = x;
        m(1) = sinc(x);
        samples.push_back(m);
    }

    // Now train a one-class SVM.  The result is a function, df(), that outputs large
    // values for points from the sinc() curve and smaller values for points that are
    // anomalous (i.e. not on the sinc() curve in our case).
    decision_function<kernel_type> df = trainer.train(samples);

    // So for example, let's look at the output from some points on the sinc() curve.  
    cout << "Points that are on the sinc function:\n";
    m(0) = -1.5; m(1) = sinc(m(0)); cout << "   " << df(m) << endl;  
    m(0) = -1.5; m(1) = sinc(m(0)); cout << "   " << df(m) << endl;  
    m(0) = -0;   m(1) = sinc(m(0)); cout << "   " << df(m) << endl;  
    m(0) = -0.5; m(1) = sinc(m(0)); cout << "   " << df(m) << endl;  
    m(0) = -4.1; m(1) = sinc(m(0)); cout << "   " << df(m) << endl;  
    m(0) = -1.5; m(1) = sinc(m(0)); cout << "   " << df(m) << endl;  
    m(0) = -0.5; m(1) = sinc(m(0)); cout << "   " << df(m) << endl;  

    cout << endl;
    // Now look at some outputs for points not on the sinc() curve.  You will see that
    // these values are all notably smaller. 
    cout << "Points that are NOT on the sinc function:\n";
    m(0) = -1.5; m(1) = sinc(m(0))+4;   cout << "   " << df(m) << endl;
    m(0) = -1.5; m(1) = sinc(m(0))+3;   cout << "   " << df(m) << endl;
    m(0) = -0;   m(1) = -sinc(m(0));    cout << "   " << df(m) << endl;
    m(0) = -0.5; m(1) = -sinc(m(0));    cout << "   " << df(m) << endl;
    m(0) = -4.1; m(1) = sinc(m(0))+2;   cout << "   " << df(m) << endl;
    m(0) = -1.5; m(1) = sinc(m(0))+0.9; cout << "   " << df(m) << endl;
    m(0) = -0.5; m(1) = sinc(m(0))+1;   cout << "   " << df(m) << endl;

    // The output is as follows:
    /*
    Points that are on the sinc function:
        0.000389691
        0.000389691
        -0.000239037
        -0.000179978
        -0.000178491
        0.000389691
        -0.000179978

    Points that are NOT on the sinc function:
        -0.269389
        -0.269389
        -0.269389
        -0.269389
        -0.269389
        -0.239954
        -0.264318
    */

    // So we can see that in this example the one-class SVM correctly indicates that 
    // the non-sinc points are definitely not points from the sinc() curve.


    // It should be noted that the svm_one_class_trainer becomes very slow when you have
    // more than 10 or 20 thousand training points.  However, dlib comes with very fast SVM
    // tools which you can use instead at the cost of a little more setup.  In particular,
    // it is possible to use one of dlib's very fast linear SVM solvers to train a one
    // class SVM.  This is what we do below.  We will train on 115,000 points and it only
    // takes a few seconds with this tool!
    // 
    // The first step is constructing a feature space that is appropriate for use with a
    // linear SVM.  In general, this is quite problem dependent.  However, if you have
    // under about a hundred dimensions in your vectors then it can often be quite
    // effective to use the empirical_kernel_map as we do below (see the
    // empirical_kernel_map documentation and example program for an extended discussion of
    // what it does).  
    //
    // But putting the empirical_kernel_map aside, the most important step in turning a
    // linear SVM into a one-class SVM is the following.  We append a -1 value onto the end
    // of each feature vector and then tell the trainer to force the weight for this
    // feature to 1.  This means that if the linear SVM assigned all other weights a value
    // of 0 then the output from a learned decision function would always be -1.  The
    // second step is that we ask the SVM to label each training sample with +1.  This
    // causes the SVM to set the other feature weights such that the training samples have
    // positive outputs from the learned decision function.  But the starting bias for all
    // the points in the whole feature space is -1.  The result is that points outside our
    // training set will not be affected, so their outputs from the decision function will
    // remain close to -1.

    empirical_kernel_map<kernel_type> ekm;
    ekm.load(trainer.get_kernel(),samples);

    samples.clear();
    std::vector<double> labels;
    // make a vector with just 1 element in it equal to -1.
    sample_type bias(1);
    bias = -1;
    sample_type augmented;
    // This time sample 115,000 points from the sinc() function.
    for (double x = -15; x <= 8; x += 0.0002)
    {
        m(0) = x;
        m(1) = sinc(x);
        // Apply the empirical_kernel_map transformation and then append the -1 value
        augmented = join_cols(ekm.project(m), bias);
        samples.push_back(augmented);
        labels.push_back(+1);
    }
    cout << "samples.size(): "<< samples.size() << endl;

    // The svm_c_linear_dcd_trainer is a very fast SVM solver which only works with the
    // linear_kernel.  It has the nice feature of supporting this "force_last_weight_to_1"
    // mode we discussed above.
    svm_c_linear_dcd_trainer<linear_kernel<sample_type> > linear_trainer;
    linear_trainer.force_last_weight_to_1(true);

    // Train the SVM
    decision_function<linear_kernel<sample_type> > df2 = linear_trainer.train(samples, labels);

    // Here we test it as before, again we note that points from the sinc() curve have
    // large outputs from the decision function.  Note also that we must remember to
    // transform the points in exactly the same manner used to construct the training set
    // before giving them to df2() or the code will not work.
    cout << "Points that are on the sinc function:\n";
    m(0) = -1.5; m(1) = sinc(m(0)); cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;  
    m(0) = -1.5; m(1) = sinc(m(0)); cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;  
    m(0) = -0;   m(1) = sinc(m(0)); cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;  
    m(0) = -0.5; m(1) = sinc(m(0)); cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;  
    m(0) = -4.1; m(1) = sinc(m(0)); cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;  
    m(0) = -1.5; m(1) = sinc(m(0)); cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;  
    m(0) = -0.5; m(1) = sinc(m(0)); cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;  

    cout << endl;
    // Again, we see here that points not on the sinc() function have small values.
    cout << "Points that are NOT on the sinc function:\n";
    m(0) = -1.5; m(1) = sinc(m(0))+4;   cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;
    m(0) = -1.5; m(1) = sinc(m(0))+3;   cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;
    m(0) = -0;   m(1) = -sinc(m(0));    cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;
    m(0) = -0.5; m(1) = -sinc(m(0));    cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;
    m(0) = -4.1; m(1) = sinc(m(0))+2;   cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;
    m(0) = -1.5; m(1) = sinc(m(0))+0.9; cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;
    m(0) = -0.5; m(1) = sinc(m(0))+1;   cout << "   " << df2(join_cols(ekm.project(m),bias)) << endl;


    // The output is as follows:
    /*
    Points that are on the sinc function:
        1.00454
        1.00454
        1.00022
        1.00007
        1.00371
        1.00454
        1.00007

    Points that are NOT on the sinc function:
        -1
        -1
        -1
        -1
        -0.999998
        -0.781231
        -0.96242
    */


    // Finally, to help you visualize what is happening here we are going to plot the
    // response of the one-class classifiers on the screen.  The code below creates two
    // heatmap images which show the response.  In these images you can clearly see where
    // the algorithms have identified the sinc() curve.  The hotter the pixel looks, the
    // larger the value coming out of the decision function and therefore the more "normal"
    // it is according to the classifier.
    const long size = 500;
    array2d<double> img1(size,size);
    array2d<double> img2(size,size);
    for (long r = 0; r < img1.nr(); ++r)
    {
        for (long c = 0; c < img1.nc(); ++c)
        {
            double x = 30.0*c/size - 19;
            double y = 8.0*r/size - 4;
            m(0) = x;
            m(1) = y;
            img1[r][c] = df(m);
            img2[r][c] = df2(join_cols(ekm.project(m),bias));
        }
    }
    image_window win1(heatmap(img1), "svm_one_class_trainer");
    image_window win2(heatmap(img2), "svm_c_linear_dcd_trainer");
    win1.wait_until_closed();
}