summaryrefslogtreecommitdiffstats
path: root/src/lib/util/encode/base_n.cc
blob: e0c37e5f4933db176aace9e18cd24f6c370c57a3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
// Copyright (C) 2010-2022 Internet Systems Consortium, Inc. ("ISC")
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <config.h>

#include <util/encode/base32hex_from_binary.h>
#include <util/encode/binary_from_base32hex.h>
#include <util/encode/base16_from_binary.h>
#include <util/encode/binary_from_base16.h>
#include <util/encode/base32hex.h>
#include <util/encode/base64.h>

#include <exceptions/exceptions.h>
#include <exceptions/isc_assert.h>

#include <boost/archive/iterators/base64_from_binary.hpp>
#include <boost/archive/iterators/binary_from_base64.hpp>
#include <boost/archive/iterators/transform_width.hpp>
#ifdef HAVE_BOOST_INTEGER_COMMON_FACTOR_HPP
#include <boost/integer/common_factor.hpp>
#else
#include <boost/math/common_factor.hpp>
#endif

#include <stdint.h>
#include <stdexcept>
#include <iterator>
#include <string>
#include <vector>

using namespace std;
using namespace boost::archive::iterators;

namespace isc {
namespace util {
namespace encode {

// Some versions of clang cannot handle exceptions in unnamed namespaces
// so this exception is defined in an 'internal' namespace
namespace clang_unnamed_namespace_workaround {
// An internally caught exception to unify a few possible cases of the same
// error.
class IncompleteBaseInput : public std::exception {
};
} // end namespace internal

// In the following anonymous namespace, we provide a generic framework
// to encode/decode baseN format.  We use the following tools:
// - boost base64_from_binary/binary_from_base64: provide mapping table for
//   base64.
//   These classes take another iterator (Base) as a template argument, and
//   their dereference operator (operator*()) first retrieves an input value
//   from Base via Base::operator* and converts the value using their mapping
//   table.  The converted value is returned as their own operator*.
// - base{32hex,16}_from_binary/binary_from_base{32hex,16}: provide mapping
//   table for base32hex and base16.  A straightforward variation of their
//   base64 counterparts.
// - EncodeNormalizer/DecodeNormalizer: supplemental filter handling baseN
//   padding characters (=)
// - boost transform_width: an iterator framework for handling data stream
//   per bit-group.  It takes another iterator (Base) and output/input bit
//   numbers (BitsOut/BitsIn) template arguments.  A transform_width object
//   internally maintains a bit stream, which can be retrieved per BitsOut
//   bits via its dereference operator (operator*()).  It builds the stream
//   by internally iterating over the Base object via Base::operator++ and
//   Base::operator*, using the least BitsIn bits of the result of
//   Base::operator*.  In our usage BitsIn for encoding and BitsOut for
//   decoding are always 8 (# of bits for one byte).
//
// Its dereference operator
//   retrieves BitsIn bits from the result of "*Base" (if necessary it
//   internally calls ++Base)
//
// A conceptual description of how the encoding and decoding work is as
// follows:
// Encoding:
//   input binary data => Normalizer (append sufficient number of 0 bits)
//                     => transform_width (extract bit groups from the original
//                                         stream)
//                     => baseXX_from_binary (convert each bit group to an
//                                            encoded byte using the mapping)
// Decoding:
//   input baseXX text => Normalizer (convert '='s to the encoded characters
//                                    corresponding to 0, e.g. 'A's in base64)
//                     => binary_from_baseXX (convert each encoded byte into
//                                            the original group bit)
//                     => transform_width (build original byte stream by
//                                         concatenating the decoded bit
//                                         stream)
//
// Below, we define a set of templated classes to handle different parameters
// for different encoding algorithms.
namespace {
// Common constants used for all baseN encoding.
const char BASE_PADDING_CHAR = '=';
const uint8_t BINARY_ZERO_CODE = 0;

// EncodeNormalizer is an input iterator intended to be used as a filter
// between the binary stream and baseXX_from_binary translator (via
// transform_width).  An EncodeNormalizer object is configured with two
// iterators (base and base_end), specifying the head and end of the input
// stream.  It internally iterators over the original stream, and return
// each byte of the stream intact via its dereference operator until it
// reaches the end of the stream.  After that the EncodeNormalizer object
// will return 0 no matter how many times it is subsequently incremented.
// This is necessary because the input binary stream may not contain
// sufficient bits for a full encoded text while baseXX_from_binary expects
// a sufficient length of input.
// Note: this class is intended to be used within this implementation file,
// and assumes "base < base_end" on construction without validating the
// arguments.  The behavior is undefined if this assumption doesn't hold.
class EncodeNormalizer {
public:
    // Aliases used to enable iterator behavior on this class
    using iterator_category = input_iterator_tag;
    using value_type = uint8_t;
    using difference_type = ptrdiff_t;
    using pointer = uint8_t*;
    using reference = uint8_t&;

    EncodeNormalizer(const vector<uint8_t>::const_iterator& base,
                     const vector<uint8_t>::const_iterator& base_end) :
        base_(base), base_end_(base_end), in_pad_(false)
    {}
    EncodeNormalizer& operator++() { // prefix version
        increment();
        return (*this);
    }
    EncodeNormalizer operator++(int) { // postfix version
        const EncodeNormalizer copy = *this;
        increment();
        return (copy);
    }
    const uint8_t& operator*() const {
        if (in_pad_) {
            return (BINARY_ZERO_CODE);
        } else {
            return (*base_);
        }
    }
    bool operator==(const EncodeNormalizer& other) const {
        return (base_ == other.base_);
    }
private:
    void increment() {
        if (!in_pad_) {
            ++base_;
        }
        if (base_ == base_end_) {
            in_pad_ = true;
        }
    }
    vector<uint8_t>::const_iterator base_;
    const vector<uint8_t>::const_iterator base_end_;
    bool in_pad_;
};

// DecodeNormalizer is an input iterator intended to be used as a filter
// between the encoded baseX stream and binary_from_baseXX.
// A DecodeNormalizer object is configured with three string iterators
// (base, base_beginpad, and base_end), specifying the head of the string,
// the beginning position of baseX padding (when there's padding), and
// end of the string, respectively.  It internally iterators over the original
// stream, and return each character of the encoded string via its dereference
// operator until it reaches base_beginpad.  After that the DecodeNormalizer
// will return the encoding character corresponding to the all-0 value
// (which is specified on construction via base_zero_code.  see also
// BaseZeroCode below).  This translation is necessary because
// binary_from_baseXX doesn't accept the padding character (i.e. '=').
// Note: this class is intended to be used within this implementation file,
// and for simplicity assumes "base < base_beginpad <= base_end" on
// construction without validating the arguments.  The behavior is undefined
// if this assumption doesn't hold.
class DecodeNormalizer {
public:
    // Aliases used to enable iterator behavior on this class
    using iterator_category = input_iterator_tag;
    using value_type = char;
    using difference_type = ptrdiff_t;
    using pointer = char*;
    using reference = char&;

    DecodeNormalizer(const char base_zero_code,
                     const string::const_iterator& base,
                     const string::const_iterator& base_beginpad,
                     const string::const_iterator& base_end,
                     size_t* char_count) :
        base_zero_code_(base_zero_code),
        base_(base), base_beginpad_(base_beginpad), base_end_(base_end),
        in_pad_(false), char_count_(char_count)
    {
        // Skip beginning spaces, if any.  We need do it here because
        // otherwise the first call to operator*() would be confused.
        skipSpaces();
    }
    DecodeNormalizer& operator++() {
        if (base_ < base_end_) {
            ++*char_count_;
        }
        ++base_;
        skipSpaces();
        if (base_ == base_beginpad_) {
            in_pad_ = true;
        }
        return (*this);
    }
    void skipSpaces() {
        // If (char is signed and) *base_ < 0, on Windows platform with Visual
        // Studio compiler it may trigger _ASSERTE((unsigned)(c + 1) <= 256);
        // so make sure that the parameter of isspace() is larger than 0.
        // We don't simply cast it to unsigned char to avoid confusing the
        // isspace() implementation with a possible extension for values
        // larger than 127.  Also note the check is not ">= 0"; for systems
        // where char is unsigned that would always be true and would possibly
        // trigger a compiler warning that could stop the build.
        while (base_ != base_end_ && *base_ > 0 && isspace(*base_)) {
            ++base_;
        }
    }
    const char& operator*() const {
        if (base_ == base_end_) {
            // binary_from_baseX can call this operator when it needs more bits
            // even if the internal iterator (base_) has reached its end
            // (if that happens it means the input is an incomplete baseX
            // string and should be rejected).  So this is the only point
            // we can catch and reject this type of invalid input.
            //
            // More recent versions of Boost fixed the behavior and the
            // out-of-range call to this operator doesn't happen.  It's good,
            // but in that case we need to catch incomplete baseX input in
            // a different way.  It's done via char_count_ and after the
            // completion of decoding.

            // throw this now and convert it
            throw clang_unnamed_namespace_workaround::IncompleteBaseInput();
        }
        if (*base_ == BASE_PADDING_CHAR) {
            // Padding can only happen at the end of the input string.  We can
            // detect any violation of this by checking in_pad_, which is
            // true iff we are on or after the first valid sequence of padding
            // characters.
            if (in_pad_) {
                return (base_zero_code_);
            } else {
                isc_throw(BadValue, "Intermediate padding found");
            }
        } else {
            return (*base_);
        }
    }
    bool operator==(const DecodeNormalizer& other) const {
        return (base_ == other.base_);
    }
private:
    const char base_zero_code_;
    string::const_iterator base_;
    const string::const_iterator base_beginpad_;
    const string::const_iterator base_end_;
    bool in_pad_;
    // Store number of non-space decoded characters (incl. pad) here.  Define
    // it as a pointer so we can carry it over to any copied objects.
    size_t* char_count_;
};

// BitsPerChunk: number of bits to be converted using the baseN mapping table.
//               e.g. 6 for base64.
// BaseZeroCode: the byte character that represents a value of 0 in
//               the corresponding encoding.  e.g. 'A' for base64.
// Encoder: baseX_from_binary<transform_width<EncodeNormalizer,
//                                            BitsPerChunk, 8> >
// Decoder: transform_width<binary_from_baseX<DecodeNormalizer>,
//                          8, BitsPerChunk>
template <int BitsPerChunk, char BaseZeroCode,
          typename Encoder, typename Decoder>
struct BaseNTransformer {
    static string encode(const vector<uint8_t>& binary);
    static void decode(const char* algorithm,
                       const string& base64, vector<uint8_t>& result);

    // BITS_PER_GROUP is the number of bits for the smallest possible (non
    // empty) bit string that can be converted to a valid baseN encoded text
    // without padding.  It's the least common multiple of 8 and BitsPerChunk,
    // e.g. 24 for base64.
    static const int BITS_PER_GROUP =
#ifdef HAVE_BOOST_INTEGER_COMMON_FACTOR_HPP
        boost::integer::static_lcm<BitsPerChunk, 8>::value;
#else
        boost::math::static_lcm<BitsPerChunk, 8>::value;
#endif

    // MAX_PADDING_CHARS is the maximum number of padding characters
    // that can appear in a valid baseN encoded text.
    // It's group_len - chars_for_byte, where group_len is the number of
    // encoded characters to represent BITS_PER_GROUP bits, and
    // chars_for_byte is the number of encoded character that is needed to
    // represent a single byte, which is ceil(8 / BitsPerChunk).
    // For example, for base64 we need two encoded characters to represent a
    // byte, and each group consists of 4 encoded characters, so
    // MAX_PADDING_CHARS is 4 - 2 = 2.
    static const int MAX_PADDING_CHARS =
        BITS_PER_GROUP / BitsPerChunk -
        (8 / BitsPerChunk + ((8 % BitsPerChunk) == 0 ? 0 : 1));
};

template <int BitsPerChunk, char BaseZeroCode,
          typename Encoder, typename Decoder>
string
BaseNTransformer<BitsPerChunk, BaseZeroCode, Encoder, Decoder>::encode(
    const vector<uint8_t>& binary)
{
    // calculate the resulting length.
    size_t bits = binary.size() * 8;
    if (bits % BITS_PER_GROUP > 0) {
        bits += (BITS_PER_GROUP - (bits % BITS_PER_GROUP));
    }
    const size_t len = bits / BitsPerChunk;

    string result;
    result.reserve(len);
    result.assign(Encoder(EncodeNormalizer(binary.begin(), binary.end())),
                  Encoder(EncodeNormalizer(binary.end(), binary.end())));
    isc_throw_assert(len >= result.length());
    result.append(len - result.length(), BASE_PADDING_CHAR);
    return (result);
}

template <int BitsPerChunk, char BaseZeroCode,
          typename Encoder, typename Decoder>
void
BaseNTransformer<BitsPerChunk, BaseZeroCode, Encoder, Decoder>::decode(
    const char* const algorithm,
    const string& input,
    vector<uint8_t>& result)
{
    // enumerate the number of trailing padding characters (=), ignoring
    // white spaces.  since baseN_from_binary doesn't accept padding,
    // we handle it explicitly.
    size_t padchars = 0;
    string::const_reverse_iterator srit = input.rbegin();
    string::const_reverse_iterator srit_end = input.rend();
    while (srit != srit_end) {
        char ch = *srit;
        if (ch == BASE_PADDING_CHAR) {
            if (++padchars > MAX_PADDING_CHARS) {
                isc_throw(BadValue, "Too many " << algorithm
                          << " padding characters: " << input);
            }
        } else if (!(ch > 0 && isspace(ch))) {
            // see the note for DecodeNormalizer::skipSpaces() above for ch > 0
            break;
        }
        ++srit;
    }
    // then calculate the number of padding bits corresponding to the padding
    // characters.  In general, the padding bits consist of all-zero
    // trailing bits of the last encoded character followed by zero bits
    // represented by the padding characters:
    // 1st pad  2nd pad  3rd pad...
    // +++===== =======  ===...    (+: from encoded chars, =: from pad chars)
    // 0000...0 0......0 000...
    // 0      7 8     15 16.... (bits)
    // The number of bits for the '==...' part is padchars * BitsPerChunk.
    // So the total number of padding bits is the smallest multiple of 8
    // that is >= padchars * BitsPerChunk.
    // (Below, note the common idiom of the bitwise AND with ~7.  It clears the
    // lowest three bits, so has the effect of rounding the result down to the
    // nearest multiple of 8)
    const size_t padbits = (padchars * BitsPerChunk + 7) & ~7;

    // In some encoding algorithm, it could happen that a padding byte would
    // contain a full set of encoded bits, which is not allowed by definition
    // of padding.  For example, if BitsPerChunk is 5, the following
    // representation could happen:
    // ++00000= (+: from encoded chars, 0: encoded char for '0', =: pad chars)
    // 0      7 (bits)
    // This must actually be encoded as follows:
    // ++======
    // 0      7 (bits)
    // The following check rejects this type of invalid encoding.
    if (padbits > BitsPerChunk * (padchars + 1)) {
        isc_throw(BadValue, "Invalid " << algorithm << " padding: " << input);
    }

    // convert the number of bits in bytes for convenience.
    const size_t padbytes = padbits / 8;

    try {
        size_t char_count = 0;
        result.assign(Decoder(DecodeNormalizer(BaseZeroCode, input.begin(),
                                               srit.base(), input.end(),
                                               &char_count)),
                      Decoder(DecodeNormalizer(BaseZeroCode, input.end(),
                                               input.end(), input.end(),
                                               NULL)));

        // Number of bits of the conversion result including padding must be
        // a multiple of 8; otherwise the decoder reaches the end of input
        // with some incomplete bits of data, which is invalid.
        if (((char_count * BitsPerChunk) % 8) != 0) {
            // catch this immediately below
            throw clang_unnamed_namespace_workaround::IncompleteBaseInput();
        }
    } catch (const clang_unnamed_namespace_workaround::IncompleteBaseInput&) {
        // we unify error handling for incomplete input here.
        isc_throw(BadValue, "Incomplete input for " << algorithm
                  << ": " << input);
    } catch (const dataflow_exception& ex) {
        // convert any boost exceptions into our local one.
        isc_throw(BadValue, ex.what());
    }

    // Confirm the original BaseX text is the canonical encoding of the
    // data, that is, that the first byte of padding is indeed 0.
    // (DecodeNormalizer and binary_from_baseXX ensure that the rest of the
    // padding is all zero).
    isc_throw_assert(result.size() >= padbytes);
    if (padbytes > 0 && *(result.end() - padbytes) != 0) {
            isc_throw(BadValue, "Non 0 bits included in " << algorithm
                      << " padding: " << input);
    }

    // strip the padded zero-bit fields
    result.resize(result.size() - padbytes);
}

//
// Instantiation for BASE-64
//
typedef
base64_from_binary<transform_width<EncodeNormalizer, 6, 8> > base64_encoder;
typedef
transform_width<binary_from_base64<DecodeNormalizer>, 8, 6> base64_decoder;
typedef BaseNTransformer<6, 'A', base64_encoder, base64_decoder>
Base64Transformer;

//
// Instantiation for BASE-32HEX
//
typedef
base32hex_from_binary<transform_width<EncodeNormalizer, 5, 8> >
base32hex_encoder;
typedef
transform_width<binary_from_base32hex<DecodeNormalizer>, 8, 5>
base32hex_decoder;
typedef BaseNTransformer<5, '0', base32hex_encoder, base32hex_decoder>
Base32HexTransformer;

//
// Instantiation for BASE-16 (HEX)
//
typedef
base16_from_binary<transform_width<EncodeNormalizer, 4, 8> > base16_encoder;
typedef
transform_width<binary_from_base16<DecodeNormalizer>, 8, 4> base16_decoder;
typedef BaseNTransformer<4, '0', base16_encoder, base16_decoder>
Base16Transformer;
}

string
encodeBase64(const vector<uint8_t>& binary) {
    return (Base64Transformer::encode(binary));
}

void
decodeBase64(const string& input, vector<uint8_t>& result) {
    Base64Transformer::decode("base64", input, result);
}

string
encodeBase32Hex(const vector<uint8_t>& binary) {
    return (Base32HexTransformer::encode(binary));
}

void
decodeBase32Hex(const string& input, vector<uint8_t>& result) {
    Base32HexTransformer::decode("base32hex", input, result);
}

string
encodeHex(const vector<uint8_t>& binary) {
    return (Base16Transformer::encode(binary));
}

void
decodeHex(const string& input, vector<uint8_t>& result) {
    Base16Transformer::decode("base16", input, result);
}

} // namespace encode
} // namespace util
} // namespace isc