diff options
Diffstat (limited to 'web/server/h2o/libh2o/deps/brotli/enc/utf8_util.cc')
-rw-r--r-- | web/server/h2o/libh2o/deps/brotli/enc/utf8_util.cc | 83 |
1 files changed, 83 insertions, 0 deletions
/* Copyright 2013 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/

// Heuristics for deciding about the UTF8-ness of strings.

#include <stddef.h>
#include <stdint.h>

namespace brotli {

namespace {

// Tries to decode a single UTF-8 encoded symbol from the start of |input|.
//
// symbol: out-parameter; receives the decoded code point on success, or the
//         marker value (0x110000 | input[0]) when no valid sequence starts
//         at input[0]. The marker lies above the Unicode code space, so
//         callers can tell the two cases apart with `symbol < 0x110000`.
// input:  bytes to decode. input[0] is read unconditionally, so the caller
//         must pass size >= 1.
// size:   number of bytes available at |input|; longer encodings are only
//         attempted when enough bytes are available.
//
// Returns the number of bytes consumed: 1 to 4 for a decoded symbol, and
// always 1 for the "not UTF-8" marker.
//
// A NUL byte and overlong encodings are reported as "not UTF-8". Code points
// in the surrogate range (0xD800..0xDFFF) are not filtered out by the 3-byte
// branch.
size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
  // ASCII. A zero byte is deliberately not accepted here; it falls through
  // to the marker at the bottom.
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte UTF-8: 110xxxxx 10xxxxxx
  if (size > 1u &&
      (input[0] & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x1f) << 6) |
               (input[1] & 0x3f));
    // Values up to 0x7f fit in one byte, so this would be an overlong form.
    if (*symbol > 0x7f) {
      return 2;
    }
  }
  // 3-byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx
  if (size > 2u &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x0f) << 12) |
               ((input[1] & 0x3f) << 6) |
               (input[2] & 0x3f));
    // Values up to 0x7ff fit in two bytes, so this would be an overlong form.
    if (*symbol > 0x7ff) {
      return 3;
    }
  }
  // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if (size > 3u &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x07) << 18) |
               ((input[1] & 0x3f) << 12) |
               ((input[2] & 0x3f) << 6) |
               (input[3] & 0x3f));
    // Reject overlong forms and anything beyond the last code point.
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
      return 4;
    }
  }
  // Not UTF-8: emit a special symbol above the UTF-8 code space and skip
  // exactly one byte so the caller can resynchronize.
  *symbol = 0x110000 | input[0];
  return 1;
}

}  // namespace

// Returns true if more than |min_fraction| of the |length| bytes examined
// belong to symbols that ParseAsUTF8 accepts. The comparison is strict, so a
// share exactly equal to |min_fraction| yields false, as does length == 0.
//
// data, pos, mask: the i-th symbol is parsed starting at
//                  data[(pos + i) & mask].
// length:          number of bytes to examine.
// min_fraction:    threshold, as a fraction of |length|.
//
// NOTE(review): only the start of each symbol is masked. The continuation
// bytes of a multi-byte sequence are read linearly from that position, so up
// to three bytes past the masked index may be touched. Presumably the caller's
// buffer provides that slack -- confirm against the caller.
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
                  const size_t length, const double min_fraction) {
  size_t size_utf8 = 0;
  size_t i = 0;
  while (i < length) {
    int symbol;
    const size_t bytes_read =
        ParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
    i += bytes_read;
    // Marker symbols (>= 0x110000) are invalid bytes and are not counted.
    if (symbol < 0x110000) size_utf8 += bytes_read;
  }
  return static_cast<double>(size_utf8) >
         min_fraction * static_cast<double>(length);
}

}  // namespace brotli