summaryrefslogtreecommitdiffstats
path: root/ml/dlib/dlib/compress_stream/compress_stream_kernel_2.h
blob: e46b23fad874758b2553ac1db1a1006f8ec8b064 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
// Copyright (C) 2003  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_COMPRESS_STREAM_KERNEl_2_
#define DLIB_COMPRESS_STREAM_KERNEl_2_

#include "../algs.h"
#include <iostream>
#include <streambuf>
#include "compress_stream_kernel_abstract.h"

namespace dlib
{

    template <
        typename fce,
        typename fcd,
        typename lz77_buffer,
        typename sliding_buffer,
        typename fce_length,
        typename fcd_length,
        typename fce_index,
        typename fcd_index,
        typename crc32
        >
    class compress_stream_kernel_2
    {
        /*!
            REQUIREMENTS ON fce
                is an implementation of entropy_encoder_model/entropy_encoder_model_kernel_abstract.h
                the alphabet_size of fce must be 257.
                fce and fcd share the same kernel number.

            REQUIREMENTS ON fcd
                is an implementation of entropy_decoder_model/entropy_decoder_model_kernel_abstract.h
                the alphabet_size of fcd must be 257.
                fce and fcd share the same kernel number.

            REQUIREMENTS ON lz77_buffer
                is an implementation of lz77_buffer/lz77_buffer_kernel_abstract.h

            REQUIREMENTS ON sliding_buffer
                is an implementation of sliding_buffer/sliding_buffer_kernel_abstract.h
                is instantiated with T = unsigned char

            REQUIREMENTS ON fce_length
                is an implementation of entropy_encoder_model/entropy_encoder_model_kernel_abstract.h
                the alphabet_size of fce must be 513.  This will be used to encode the length of lz77 matches.
                fce_length and fcd share the same kernel number.

            REQUIREMENTS ON fcd_length
                is an implementation of entropy_decoder_model/entropy_decoder_model_kernel_abstract.h
                the alphabet_size of fcd must be 513.  This will be used to decode the length of lz77 matches.
                fce_length and fcd share the same kernel number.

            REQUIREMENTS ON fce_index
                is an implementation of entropy_encoder_model/entropy_encoder_model_kernel_abstract.h
                the alphabet_size of fce must be 32257.  This will be used to encode the index of lz77 matches.
                fce_index and fcd share the same kernel number.

            REQUIREMENTS ON fcd_index
                is an implementation of entropy_decoder_model/entropy_decoder_model_kernel_abstract.h
                the alphabet_size of fcd must be 32257.  This will be used to decode the index of lz77 matches.
                fce_index and fcd share the same kernel number.

            REQUIREMENTS ON crc32
                is an implementation of crc32/crc32_kernel_abstract.h

            INITIAL VALUE
                this object has no state

            CONVENTION
                this object has no state
        !*/

        const static unsigned long eof_symbol = 256;

    public:

        class decompression_error : public dlib::error 
        { 
            public: 
                decompression_error(
                    const char* i
                ) :
                    dlib::error(std::string(i))
                {}

                decompression_error(
                    const std::string& i
                ) :
                    dlib::error(i)
                {}
        };


        compress_stream_kernel_2 (
        )
        {}

        ~compress_stream_kernel_2 (
        )
        {}

        void compress (
            std::istream& in,
            std::ostream& out
        ) const;

        void decompress (
            std::istream& in,
            std::ostream& out
        ) const;

    private:

        // restricted functions
        compress_stream_kernel_2(compress_stream_kernel_2&);        // copy constructor
        compress_stream_kernel_2& operator=(compress_stream_kernel_2&);    // assignment operator

    };

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
    // member function definitions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    template <
        typename fce,
        typename fcd,
        typename lz77_buffer,
        typename sliding_buffer,
        typename fce_length,
        typename fcd_length,
        typename fce_index,
        typename fcd_index,
        typename crc32
        >
    void compress_stream_kernel_2<fce,fcd,lz77_buffer,sliding_buffer,fce_length,fcd_length,fce_index,fcd_index,crc32>::
    compress (
        std::istream& in_,
        std::ostream& out_
    ) const
    {
        std::streambuf::int_type temp;

        std::streambuf& in = *in_.rdbuf();

        typename fce::entropy_encoder_type coder;
        coder.set_stream(out_);

        fce model(coder);
        fce_length model_length(coder);
        fce_index model_index(coder);

        const unsigned long LOOKAHEAD_LIMIT = 512; 
        lz77_buffer buffer(15,LOOKAHEAD_LIMIT);
        
        crc32 crc;
      

        unsigned long count = 0;

        unsigned long lz77_count = 1;  // number of times we used lz77 to encode
        unsigned long ppm_count = 1;   // number of times we used ppm to encode


        while (true)
        {
            // write out a known value every 20000 symbols
            if (count == 20000)
            {
                count = 0;
                coder.encode(150,151,400);
            }
            ++count;

            // try to fill the lookahead buffer
            if (buffer.get_lookahead_buffer_size() < buffer.get_lookahead_buffer_limit())
            {
                temp = in.sbumpc();
                while (temp != EOF)
                {
                    crc.add(static_cast<unsigned char>(temp));
                    buffer.add(static_cast<unsigned char>(temp));
                    if (buffer.get_lookahead_buffer_size() == buffer.get_lookahead_buffer_limit())
                        break;
                    temp = in.sbumpc();
                }
            }

            // compute the sum of ppm_count and lz77_count but make sure
            // it is less than 65536
            unsigned long sum = ppm_count + lz77_count;
            if (sum >= 65536)
            {
                ppm_count >>= 1;                    
                lz77_count >>= 1;
                ppm_count |= 1;
                lz77_count |= 1;
                sum = ppm_count+lz77_count;                    
            }

            // if there are still more symbols in the lookahead buffer to encode
            if (buffer.get_lookahead_buffer_size() > 0)  
            {
                unsigned long match_index, match_length;
                buffer.find_match(match_index,match_length,6);
                if (match_length != 0)
                {
                  
                    // signal the decoder that we are using lz77
                    coder.encode(0,lz77_count,sum);
                    ++lz77_count;
                    
                    // encode the index and length pair
                    model_index.encode(match_index);                   
                    model_length.encode(match_length);                   

                }
                else
                {

                    // signal the decoder that we are using ppm 
                    coder.encode(lz77_count,sum,sum);
                    ++ppm_count;

                    // encode the symbol using the ppm model
                    model.encode(buffer.lookahead_buffer(0));
                    buffer.shift_buffers(1);                    
                }
            }
            else
            {
                // signal the decoder that we are using ppm 
                coder.encode(lz77_count,sum,sum);
                

                model.encode(eof_symbol);
                // now write the checksum
                unsigned long checksum = crc.get_checksum();
                unsigned char byte1 = static_cast<unsigned char>((checksum>>24)&0xFF);
                unsigned char byte2 = static_cast<unsigned char>((checksum>>16)&0xFF);
                unsigned char byte3 = static_cast<unsigned char>((checksum>>8)&0xFF);
                unsigned char byte4 = static_cast<unsigned char>((checksum)&0xFF);

                model.encode(byte1);
                model.encode(byte2);
                model.encode(byte3);
                model.encode(byte4);

                break;      
            }
        } // while (true)        
    }

// ----------------------------------------------------------------------------------------

    template <
        typename fce,
        typename fcd,
        typename lz77_buffer,
        typename sliding_buffer,
        typename fce_length,
        typename fcd_length,
        typename fce_index,
        typename fcd_index,
        typename crc32
        >
    void compress_stream_kernel_2<fce,fcd,lz77_buffer,sliding_buffer,fce_length,fcd_length,fce_index,fcd_index,crc32>::
    decompress (
        std::istream& in_,
        std::ostream& out_
    ) const
    {

        std::streambuf& out = *out_.rdbuf();

        typename fcd::entropy_decoder_type coder;
        coder.set_stream(in_);

        fcd model(coder);
        fcd_length model_length(coder);
        fcd_index model_index(coder);

        unsigned long symbol;
        unsigned long count = 0;

        sliding_buffer buffer;
        buffer.set_size(15);

        // Initialize the buffer to all zeros.  There is no algorithmic reason to
        // do this.  But doing so avoids a warning from valgrind so that is why
        // I'm doing this.
        for (unsigned long i = 0; i < buffer.size(); ++i)
              buffer[i] = 0;

        crc32 crc;
        
        unsigned long lz77_count = 1;  // number of times we used lz77 to encode
        unsigned long ppm_count = 1;   // number of times we used ppm to encode
        bool next_block_lz77;


        // decode until we hit the marker symbol
        while (true)
        {
            // make sure this is the value we expect
            if (count == 20000)
            {
                if (coder.get_target(400) != 150)
                {
                    throw decompression_error("Error detected in compressed data stream.");
                }
                count = 0;
                coder.decode(150,151);
            }
            ++count;


            // compute the sum of ppm_count and lz77_count but make sure
            // it is less than 65536
            unsigned long sum = ppm_count + lz77_count;
            if (sum >= 65536)
            {
                ppm_count >>= 1;                    
                lz77_count >>= 1;
                ppm_count |= 1;
                lz77_count |= 1;
                sum = ppm_count+lz77_count;                    
            }

            // check if we are decoding a lz77 or ppm block
            if (coder.get_target(sum) < lz77_count)
            {
                coder.decode(0,lz77_count);
                next_block_lz77 = true;
                ++lz77_count;
            }
            else
            {
                coder.decode(lz77_count,sum);
                next_block_lz77 = false;
                ++ppm_count;
            }


            if (next_block_lz77)
            {
                
                unsigned long match_length, match_index;
                // decode the match index
                model_index.decode(match_index);

                // decode the match length
                model_length.decode(match_length);

                
                match_index += match_length;
                buffer.rotate_left(match_length);
                for (unsigned long i = 0; i < match_length; ++i)
                {
                    unsigned char ch = buffer[match_index-i];
                    buffer[match_length-i-1] = ch;

                    crc.add(ch);
                    // write this ch to out
                    if (out.sputc(static_cast<char>(ch)) != static_cast<int>(ch))
                    {
                        throw std::ios::failure("error occurred in compress_stream_kernel_2::decompress");
                    }
                }
                
            }
            else
            {

                // decode the next symbol
                model.decode(symbol);
                if (symbol != eof_symbol)
                {
                    buffer.rotate_left(1);
                    buffer[0] = static_cast<unsigned char>(symbol);
                    

                    crc.add(static_cast<unsigned char>(symbol));
                    // write this symbol to out
                    if (out.sputc(static_cast<char>(symbol)) != static_cast<int>(symbol))
                    {
                        throw std::ios::failure("error occurred in compress_stream_kernel_2::decompress");
                    }
                }
                else
                {
                    // this was the eof marker symbol so we are done.  now check the checksum

                    // now get the checksum and make sure it matches
                    unsigned char byte1;
                    unsigned char byte2;
                    unsigned char byte3;
                    unsigned char byte4;

                    model.decode(symbol); byte1 = static_cast<unsigned char>(symbol);
                    model.decode(symbol); byte2 = static_cast<unsigned char>(symbol);
                    model.decode(symbol); byte3 = static_cast<unsigned char>(symbol);
                    model.decode(symbol); byte4 = static_cast<unsigned char>(symbol);

                    unsigned long checksum = byte1;
                    checksum <<= 8;
                    checksum |= byte2;
                    checksum <<= 8;
                    checksum |= byte3;
                    checksum <<= 8;
                    checksum |= byte4;

                    if (checksum != crc.get_checksum())
                        throw decompression_error("Error detected in compressed data stream.");

                    break;
                }
            }

        } // while (true)
    }

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_COMPRESS_STREAM_KERNEl_2_