1 files changed, 496 insertions, 0 deletions
diff --git a/src/fluent-bit/lib/tutf8e/codegen.py b/src/fluent-bit/lib/tutf8e/codegen.py
new file mode 100755
index 000000000..8d7c2c635
--- /dev/null
+++ b/src/fluent-bit/lib/tutf8e/codegen.py
@@ -0,0 +1,496 @@
+#!/usr/bin/env python3
+
+encodings = [
+  'windows-1250', 'windows-1251', 'windows-1252',
+  'windows-1253', 'windows-1254', 'windows-1255',
+  'windows-1256', 'windows-1257', 'windows-1258',
+  'iso-8859-1', 'iso-8859-2',  'iso-8859-3',  'iso-8859-4',
+  'iso-8859-5', 'iso-8859-6',  'iso-8859-7',  'iso-8859-8',
+  'iso-8859-9', 'iso-8859-10', 'iso-8859-11', 'iso-8859-13',
+  'iso-8859-14', 'iso-8859-15', 'iso-8859-16'
+]
+
+with open('include/tutf8e.h', 'w') as include:
+
+  include.write('''
+#ifndef TUTF8E_H
+#define TUTF8E_H
+
+#include <stddef.h>  /* size_t */
+#include <stdint.h>  /* uint16_t */
+
+/*************** Internal API ***************/
+
+/* NUL-terminated C-string API */
+
+extern int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length);
+extern int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length);
+
+/* Known-length buffer API */
+
+extern int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t input_length, const char *invalid, size_t *output_length);
+extern int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length);
+
+/*************** Public API ***************/
+
+/* Opaque handle type */
+
+typedef void *TUTF8encoder;
+
+/* Query encoder by name */
+
+extern TUTF8encoder tutf8e_encoder(const char *encoding);
+
+#define TUTF8E_OK      0 /* Success                    */
+#define TUTF8E_INVALID 1 /* Invalid input character    */
+#define TUTF8E_TOOLONG 2 /* Insufficient output buffer */
+
+/*
+ * tutf8e_encoder_string_length
+ *
+ * Determine the length of input and UTF8 encoded output of NUL-terminated string
+ * Performance: single pass O(n)
+ *
+ * output NUL terminator not counted
+ *
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t *input_length, size_t *output_length)
+{
+  return tutf8e_string_length((const uint16_t *) encoder, input, invalid, input_length, output_length);
+}
+
+/*
+ * tutf8e_encoder_string_encode
+ *
+ * UTF8 encode NUL-terminated string
+ * Performance: two pass O(n)
+ *
+ * output string is NUL terminated
+ * output_length is output buffer size for input
+ * output_length is encoded length for output, including NUL
+ *
+ * - TUTF8E_TOOLONG if output buffer insuficient
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *input, const char *invalid, char *output, size_t *output_length)
+{
+  return tutf8e_string_encode((const uint16_t *) encoder, input, invalid, output, output_length);
+}
+
+/* Known-length buffer API */
+
+/*
+ * tutf8e_encoder_buffer_length
+ *
+ * Determine the length of input and UTF8 encoded output of string
+ * Performance: single pass O(n)
+ *
+ * output NUL terminator not counted
+ *
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t input_length, size_t *length)
+{
+  return tutf8e_buffer_length((const uint16_t *) encoder, input, input_length, invalid, length);
+}
+
+/*
+ * tutf8e_encoder_buffer_encode
+ *
+ * UTF8 encode string
+ * Performance: two pass O(n)
+ *
+ * output string is not NUL terminated
+ *
+ * output_length is output buffer size for input
+ * output_length is encoded length for output
+ *
+ * - TUTF8E_TOOLONG if output buffer insuficient
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length)
+{
+  return tutf8e_buffer_encode((const uint16_t *) encoder, input, input_length, invalid, output, output_length);
+}
+
+''')
+
+  include.write('/* Supported encoders */\n\n')
+  for e in sorted(encodings):
+    name = e.replace('-', '_').lower()
+    include.write('extern const TUTF8encoder tutf8e_encoder_%s;\n'%(name))
+
+  include.write('\n')
+  include.write('#endif\n')
+
+with open('src/tutf8e.c', 'w') as src:
+
+  src.write('''
+#include <tutf8e.h>
+
+#include <string.h>
+
+int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length)
+{
+  const size_t invalid_length = invalid ? strlen(invalid) : 0;
+
+  const unsigned char *i;
+  for (i = (const unsigned char *) input; *i; ++i, (*input_length)++) {
+    const uint16_t c = table[*i];
+    if (c<0x80) {
+      *output_length += 1;
+      continue;
+    }
+    if (c<0x800) {
+      *output_length += 2;
+      continue;
+    }
+    if (c<0xffff) {
+      *output_length += 3;
+      continue;
+    }
+    if (invalid) {
+      *output_length += invalid_length;
+    }
+    else {
+      return TUTF8E_INVALID;
+    }
+  }
+  return TUTF8E_OK;
+}
+
+int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length)
+{
+  int ret;
+  size_t input_length = 0;
+  size_t encoded_length = 0;
+  if (!(ret = tutf8e_string_length(table, input, invalid, &input_length, &encoded_length)))
+  {
+    if (encoded_length+1 > *output_length) return TUTF8E_TOOLONG;
+    if (!(ret = tutf8e_buffer_encode(table, input, input_length, invalid, output, output_length)))
+    {
+      output[encoded_length] = 0;
+      return TUTF8E_OK;
+    }
+  }
+  return ret;
+}
+
+int tutf8e_buffer_length
+(
+  const uint16_t *table,
+  const char *input,
+  size_t input_length,
+  const char *invalid,
+  size_t *length
+)
+{
+  const size_t invalid_length = invalid ? strlen(invalid) : 0;
+
+  const unsigned char *i;
+  for (i = (const unsigned char *) input; input_length; ++i, --input_length) {
+    const uint16_t c = table[*i];
+    if (c<0x80) {
+      ++*length;
+      continue;
+    }
+    if (c<0x800) {
+      *length += 2;
+      continue;
+    }
+    if (c<0xffff) {
+      *length += 3;
+      continue;
+    }
+    if (invalid) {
+      *length += invalid_length;
+    }
+    else {
+      return TUTF8E_INVALID;
+    }
+  }
+  return TUTF8E_OK;
+}
+
+int tutf8e_buffer_encode
+(
+  const uint16_t *table,
+  const char *input,
+  size_t input_length,
+  const char *invalid,
+  char *output,
+  size_t *output_length
+)
+{
+  size_t invalid_length = invalid ? strlen(invalid) : 0;
+
+  size_t left = *output_length;
+  unsigned char *o = (unsigned char *) output;
+  const unsigned char *i;
+  for (i = (const unsigned char *) input; input_length; ++i, --input_length) {
+    const uint16_t c = table[*i];
+    if (c<0x80) {
+      if (left<1) return TUTF8E_TOOLONG;
+      *(o++) = c;
+      left -= 1;
+      continue;
+    }
+    if (c<0x800) {
+      if (left<2) return TUTF8E_TOOLONG;
+      *(o++) = 0xc0 | (c>>6);
+      *(o++) = 0x80 | (c&0x3f);
+      left -= 2;
+      continue;
+    }
+    if (c<0xffff) {
+      if (left<3) return TUTF8E_TOOLONG;
+      *(o++) = 0xe0 | (c>>12);
+      *(o++) = 0x80 | ((c>>6)&0x3f);
+      *(o++) = 0x80 | (c&0x3f);
+      left -= 3;
+      continue;
+    }
+    if (invalid)
+    {
+      if (left<invalid_length) return TUTF8E_TOOLONG;
+      if (invalid_length) {
+        memcpy(o, invalid, invalid_length);
+        o += invalid_length;
+        left -= invalid_length;
+      }
+    }
+    else {
+      return TUTF8E_INVALID;
+    }
+  }
+  *output_length -= left;
+  return TUTF8E_OK;
+}
+''')
+
+  for e in sorted(encodings):
+
+    mapping  = {}
+    domain   = []
+
+    name = e.replace('-', '_').lower()
+
+    v = []
+    for i in range(0,256):
+      try:
+        v.append(ord(bytes([i]).decode(e)[0]))
+      except:
+        v.append(0xffff)
+        pass
+
+    src.write('\n')
+    src.write('const uint16_t tutf8e_%s_utf8[256] =\n'%(name))
+    src.write('{\n')
+    for i in range(0,256,16):
+      src.write('  %s,\n'%(', '.join([ '0x%04x'%(i) for i in v[i:i+16]])))
+    src.write('};\n')
+
+  src.write('\n')
+  for e in sorted(encodings):
+    name = e.replace('-', '_').lower()
+    src.write('const TUTF8encoder tutf8e_encoder_%s = (TUTF8encoder) tutf8e_%s_utf8;\n'%(name, name))
+
+  src.write('''
+TUTF8encoder tutf8e_encoder(const char *encoding)
+{
+''')
+  for e in sorted(encodings):
+    name = e.replace('-', '_').lower()
+    src.write('  if (!strcmp(encoding, "%s")) return tutf8e_encoder_%s;\n'%(e, name))
+  src.write('''
+  return NULL;
+}
+''')
+
+for e in sorted(encodings):
+
+  mapping  = {}
+  domain   = []
+
+  name = e.replace('-', '_').lower()
+  with open('include/tutf8e/%s.h'%(name), 'w') as include:
+
+    include.write('''
+#ifndef TUTF8E_%s_H
+#define TUTF8E_%s_H
+
+#include <tutf8e.h>
+'''%(name.upper(), name.upper()))
+
+    include.write('''
+static inline int tutf8e_%s_string_length(const char *input, const char *invalid, size_t *input_length, size_t *output_length)
+{
+  return tutf8e_encoder_string_length(tutf8e_encoder_%s, input, invalid, input_length, output_length);
+}
+'''%(name, name))
+
+    include.write('''
+static inline int tutf8e_%s_string_encode(const char *input, const char *invalid, char *output, size_t *output_length)
+{
+  return tutf8e_encoder_string_encode(tutf8e_encoder_%s, input, invalid, output, output_length);
+}
+'''%(name, name))
+
+    include.write('''
+static inline int tutf8e_%s_buffer_length(const char *i, size_t input_length, const char *invalid, size_t *length)
+{
+  return tutf8e_encoder_buffer_length(tutf8e_encoder_%s, input, input_length, invalid, length);
+}
+'''%(name, name))
+
+    include.write('''
+static inline int tutf8e_%s_buffer_encode(const char *i, size_t input_length, const char *invalid, char *output, size_t *output_length)
+{
+  return tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, input, input_length, invalid, output, output_length);
+}
+'''%(name, name))
+
+    include.write('\n')
+    include.write('#endif\n')
+
+# TESTS
+#
+# Each entry is (name, source encoding, sample text).  The samples come
+# from the list of pangrams at http://clagnut.com/blog/2380/
+#
+tests = [
+  ('english',  'iso-8859-1',  'A quick brown fox jumps over the lazy dog'),
+  ('finnish',  'iso-8859-1',  'Albert osti fagotin ja töräytti puhkuvan melodian.'),
+  ('czech',    'iso-8859-2',  'Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu, tanga a quickstepu.'),
+  ('turkish',  'iso-8859-3',  'Pijamalı hasta yağız şoföre çabucak güvendi.'),
+  ('estonian', 'iso-8859-4',  'Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis'),
+  ('russian',  'iso-8859-5',  'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!'),
+  ('greek',    'iso-8859-7',  'διαφυλάξτε γενικά τη ζωή σας από βαθειά ψυχικά τραύματα'),
+  ('hebrew',   'iso-8859-8',  'עטלף אבק נס דרך מזגן שהתפוצץ כי חם'),
+  ('turkish2', 'iso-8859-9',  'Pijamalı hasta yağız şoföre çabucak güvendi.'),
+  ('swedish',  'iso-8859-10', 'Flygande bäckasiner söka hwila på mjuka tuvor.'),
+  ('thai',     'iso-8859-11', 'เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ'),
+  ('polish',   'iso-8859-13', 'Jeżu klątw, spłódź Finom część gry hańb!')
+]
+
+with open('test/test.c', 'w') as test:
+
+  test.write('#include <tutf8e.h>\n')
+  test.write('\n')
+  # for e in sorted(encodings):
+  #   name = e.replace('-', '_').lower()
+  #   test.write('#include <tutf8e/%s.h>\n'%(name))
+  # test.write('\n')
+
+  test.write('#include <stdio.h>\n')
+  test.write('#include <string.h>\n')
+  test.write('#include <stdlib.h>\n')
+  test.write('\n')
+  test.write('int main(int argc, char *argv[])\n')
+  test.write('{\n')
+  test.write('  int pass = 0;\n')
+  test.write('  int fail = 0;\n')
+  test.write('  int ret;\n')
+  test.write('  char *copy;\n')
+  test.write('  size_t input_length, output_length;\n')
+  test.write('  char buffer[1024];\n')
+  # test.write('  char *encoded;\n')
+  test.write('\n')
+
+  for i in tests:
+    if i[1] in encodings:
+      test.write('  static const char %s[] = {\n'%(i[0]))
+      data = [i for i in i[2].encode(i[1])] + [ 0 ]
+      for i in range(0, len(data), 24):
+        test.write('    %s,\n'%(', '.join([ '0x%02x'%(j) for j in data[i:i+24]])))
+      test.write('  };\n')
+
+  test.write('\n')
+  for i in tests:
+    if i[1] in encodings:
+      test.write('  static const char %sUTF8[] = {\n'%(i[0]))
+      data = [i for i in i[2].encode('utf-8')] + [ 0 ]
+      for i in range(0, len(data), 24):
+        test.write('    %s,\n'%(', '.join([ '0x%02x'%(j) for j in data[i:i+24]])))
+      test.write('  };\n')
+
+  test.write('\n  /* string encode to UTF8, error on invalid input */\n')
+  for i in tests:
+    if i[1] in encodings:
+      name = i[1].replace('-', '_').lower()
+      test.write('  output_length = sizeof(buffer);\n')
+      test.write('  ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, %s, NULL, buffer, &output_length);\n'%(name, i[0]))
+      test.write('  if (!ret && !strcmp(buffer, %sUTF8)) {\n'%(i[0]))
+      test.write('    printf("%s\\n", buffer);\n')
+      test.write('    pass++;\n')
+      test.write('  } else {\n')
+      test.write('    printf("Failed to encode %s test\\n");\n'%(i[0]))
+      test.write('    fail++;\n')
+      test.write('  }\n')
+      test.write('\n')
+
+  test.write('\n  /* buffer encode to UTF8, error on invalid input */\n')
+  for i in tests:
+    if i[1] in encodings:
+      name = i[1].replace('-', '_').lower()
+      test.write('  input_length = strlen(%s);\n'%(i[0]))
+      test.write('  output_length = sizeof(buffer);\n')
+      test.write('  ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, %s, input_length, NULL, buffer, &output_length);\n'%(name, i[0]))
+      test.write('  if (!ret && (output_length+1)==sizeof(%sUTF8) && !strncmp(buffer, %sUTF8, output_length)) {\n'%(i[0], i[0]))
+      test.write('    pass++;\n')
+      test.write('  } else {\n')
+      test.write('    printf("Failed to encode %s test\\n");\n'%(i[0]))
+      test.write('    fail++;\n')
+      test.write('  }\n')
+      test.write('\n')
+
+  test.write('\n  /* string encode to UTF8, first input character invalid -> ? */\n')
+  for i in tests:
+    if i[1] in ['iso-8859-6', 'iso-8859-7', 'iso-8859-11']:
+      name = i[1].replace('-', '_').lower()
+      test.write('  output_length = sizeof(buffer);\n')
+      test.write('  copy = strdup(%s);\n'%(i[0]))
+      test.write('  copy[0] = 255;\n')
+      test.write('  buffer[0] = 255;\n')
+      test.write('  ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "?", buffer, &output_length);\n'%(name))
+      test.write('  if (!ret && buffer[0]==\'?\') {\n')
+      test.write('    printf("%s\\n", buffer);\n')
+      test.write('    pass++;\n')
+      test.write('  } else {\n')
+      test.write('    printf("Failed to encode %s test\\n");\n'%(i[0]))
+      test.write('    fail++;\n')
+      test.write('  }\n')
+      test.write('  free(copy);\n')
+      test.write('\n')
+
+  test.write('\n  /* string encode to UTF8, first input character invalid -> [INVALID] */\n')
+  for i in tests:
+    if i[1] in ['iso-8859-6', 'iso-8859-7', 'iso-8859-11']:
+      name = i[1].replace('-', '_').lower()
+      test.write('  output_length = sizeof(buffer);\n')
+      test.write('  copy = strdup(%s);\n'%(i[0]))
+      test.write('  copy[0] = 255;\n')
+      test.write('  buffer[0] = 255;\n')
+      test.write('  ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "[INVALID]", buffer, &output_length);\n'%(name))
+      test.write('  if (!ret && !strncmp(buffer, "[INVALID]", 9)) {\n')
+      test.write('    printf("%s\\n", buffer);\n')
+      test.write('    pass++;\n')
+      test.write('  } else {\n')
+      test.write('    printf("Failed to encode %s test\\n");\n'%(i[0]))
+      test.write('    fail++;\n')
+      test.write('  }\n')
+      test.write('  free(copy);\n')
+      test.write('\n')
+
+  test.write('  printf("%d passed, %d failed tests\\n", pass, fail);\n')
+
+  test.write('}\n')