summaryrefslogtreecommitdiffstats
path: root/src/fluent-bit/lib/tutf8e/codegen.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/fluent-bit/lib/tutf8e/codegen.py')
-rwxr-xr-xsrc/fluent-bit/lib/tutf8e/codegen.py496
1 files changed, 496 insertions, 0 deletions
diff --git a/src/fluent-bit/lib/tutf8e/codegen.py b/src/fluent-bit/lib/tutf8e/codegen.py
new file mode 100755
index 000000000..8d7c2c635
--- /dev/null
+++ b/src/fluent-bit/lib/tutf8e/codegen.py
@@ -0,0 +1,496 @@
+#!/usr/bin/env python3
+
+encodings = [
+ 'windows-1250', 'windows-1251', 'windows-1252',
+ 'windows-1253', 'windows-1254', 'windows-1255',
+ 'windows-1256', 'windows-1257', 'windows-1258',
+ 'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4',
+ 'iso-8859-5', 'iso-8859-6', 'iso-8859-7', 'iso-8859-8',
+ 'iso-8859-9', 'iso-8859-10', 'iso-8859-11', 'iso-8859-13',
+ 'iso-8859-14', 'iso-8859-15', 'iso-8859-16'
+]
+
+with open('include/tutf8e.h', 'w') as include:
+
+ include.write('''
+#ifndef TUTF8E_H
+#define TUTF8E_H
+
+#include <stddef.h> /* size_t */
+#include <stdint.h> /* uint16_t */
+
+/*************** Internal API ***************/
+
+/* NUL-terminated C-string API */
+
+extern int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length);
+extern int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length);
+
+/* Known-length buffer API */
+
+extern int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t input_length, const char *invalid, size_t *output_length);
+extern int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length);
+
+/*************** Public API ***************/
+
+/* Opaque handle type */
+
+typedef void *TUTF8encoder;
+
+/* Query encoder by name */
+
+extern TUTF8encoder tutf8e_encoder(const char *encoding);
+
+#define TUTF8E_OK 0 /* Success */
+#define TUTF8E_INVALID 1 /* Invalid input character */
+#define TUTF8E_TOOLONG 2 /* Insufficient output buffer */
+
+/*
+ * tutf8e_encoder_string_length
+ *
+ * Determine the length of input and UTF8 encoded output of NUL-terminated string
+ * Performance: single pass O(n)
+ *
+ * output NUL terminator not counted
+ *
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t *input_length, size_t *output_length)
+{
+ return tutf8e_string_length((const uint16_t *) encoder, input, invalid, input_length, output_length);
+}
+
+/*
+ * tutf8e_encoder_string_encode
+ *
+ * UTF8 encode NUL-terminated string
+ * Performance: two pass O(n)
+ *
+ * output string is NUL terminated
+ * output_length is output buffer size for input
+ * output_length is encoded length for output, including NUL
+ *
+ * - TUTF8E_TOOLONG if output buffer insuficient
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *input, const char *invalid, char *output, size_t *output_length)
+{
+ return tutf8e_string_encode((const uint16_t *) encoder, input, invalid, output, output_length);
+}
+
+/* Known-length buffer API */
+
+/*
+ * tutf8e_encoder_buffer_length
+ *
+ * Determine the length of input and UTF8 encoded output of string
+ * Performance: single pass O(n)
+ *
+ * output NUL terminator not counted
+ *
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t input_length, size_t *length)
+{
+ return tutf8e_buffer_length((const uint16_t *) encoder, input, input_length, invalid, length);
+}
+
+/*
+ * tutf8e_encoder_buffer_encode
+ *
+ * UTF8 encode string
+ * Performance: two pass O(n)
+ *
+ * output string is not NUL terminated
+ *
+ * output_length is output buffer size for input
+ * output_length is encoded length for output
+ *
+ * - TUTF8E_TOOLONG if output buffer insuficient
+ * - TUTF8E_INVALID if input character is not convertable
+ * - TUTF8E_OK for success
+ */
+
+static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length)
+{
+ return tutf8e_buffer_encode((const uint16_t *) encoder, input, input_length, invalid, output, output_length);
+}
+
+''')
+
+ include.write('/* Supported encoders */\n\n')
+ for e in sorted(encodings):
+ name = e.replace('-', '_').lower()
+ include.write('extern const TUTF8encoder tutf8e_encoder_%s;\n'%(name))
+
+ include.write('\n')
+ include.write('#endif\n')
+
+with open('src/tutf8e.c', 'w') as src:
+
+ src.write('''
+#include <tutf8e.h>
+
+#include <string.h>
+
+int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length)
+{
+ const size_t invalid_length = invalid ? strlen(invalid) : 0;
+
+ const unsigned char *i;
+ for (i = (const unsigned char *) input; *i; ++i, (*input_length)++) {
+ const uint16_t c = table[*i];
+ if (c<0x80) {
+ *output_length += 1;
+ continue;
+ }
+ if (c<0x800) {
+ *output_length += 2;
+ continue;
+ }
+ if (c<0xffff) {
+ *output_length += 3;
+ continue;
+ }
+ if (invalid) {
+ *output_length += invalid_length;
+ }
+ else {
+ return TUTF8E_INVALID;
+ }
+ }
+ return TUTF8E_OK;
+}
+
+int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length)
+{
+ int ret;
+ size_t input_length = 0;
+ size_t encoded_length = 0;
+ if (!(ret = tutf8e_string_length(table, input, invalid, &input_length, &encoded_length)))
+ {
+ if (encoded_length+1 > *output_length) return TUTF8E_TOOLONG;
+ if (!(ret = tutf8e_buffer_encode(table, input, input_length, invalid, output, output_length)))
+ {
+ output[encoded_length] = 0;
+ return TUTF8E_OK;
+ }
+ }
+ return ret;
+}
+
+int tutf8e_buffer_length
+(
+ const uint16_t *table,
+ const char *input,
+ size_t input_length,
+ const char *invalid,
+ size_t *length
+)
+{
+ const size_t invalid_length = invalid ? strlen(invalid) : 0;
+
+ const unsigned char *i;
+ for (i = (const unsigned char *) input; input_length; ++i, --input_length) {
+ const uint16_t c = table[*i];
+ if (c<0x80) {
+ ++*length;
+ continue;
+ }
+ if (c<0x800) {
+ *length += 2;
+ continue;
+ }
+ if (c<0xffff) {
+ *length += 3;
+ continue;
+ }
+ if (invalid) {
+ *length += invalid_length;
+ }
+ else {
+ return TUTF8E_INVALID;
+ }
+ }
+ return TUTF8E_OK;
+}
+
+int tutf8e_buffer_encode
+(
+ const uint16_t *table,
+ const char *input,
+ size_t input_length,
+ const char *invalid,
+ char *output,
+ size_t *output_length
+)
+{
+ size_t invalid_length = invalid ? strlen(invalid) : 0;
+
+ size_t left = *output_length;
+ unsigned char *o = (unsigned char *) output;
+ const unsigned char *i;
+ for (i = (const unsigned char *) input; input_length; ++i, --input_length) {
+ const uint16_t c = table[*i];
+ if (c<0x80) {
+ if (left<1) return TUTF8E_TOOLONG;
+ *(o++) = c;
+ left -= 1;
+ continue;
+ }
+ if (c<0x800) {
+ if (left<2) return TUTF8E_TOOLONG;
+ *(o++) = 0xc0 | (c>>6);
+ *(o++) = 0x80 | (c&0x3f);
+ left -= 2;
+ continue;
+ }
+ if (c<0xffff) {
+ if (left<3) return TUTF8E_TOOLONG;
+ *(o++) = 0xe0 | (c>>12);
+ *(o++) = 0x80 | ((c>>6)&0x3f);
+ *(o++) = 0x80 | (c&0x3f);
+ left -= 3;
+ continue;
+ }
+ if (invalid)
+ {
+ if (left<invalid_length) return TUTF8E_TOOLONG;
+ if (invalid_length) {
+ memcpy(o, invalid, invalid_length);
+ o += invalid_length;
+ left -= invalid_length;
+ }
+ }
+ else {
+ return TUTF8E_INVALID;
+ }
+ }
+ *output_length -= left;
+ return TUTF8E_OK;
+}
+''')
+
+ for e in sorted(encodings):
+
+ mapping = {}
+ domain = []
+
+ name = e.replace('-', '_').lower()
+
+ v = []
+ for i in range(0,256):
+ try:
+ v.append(ord(bytes([i]).decode(e)[0]))
+ except:
+ v.append(0xffff)
+ pass
+
+ src.write('\n')
+ src.write('const uint16_t tutf8e_%s_utf8[256] =\n'%(name))
+ src.write('{\n')
+ for i in range(0,256,16):
+ src.write(' %s,\n'%(', '.join([ '0x%04x'%(i) for i in v[i:i+16]])))
+ src.write('};\n')
+
+ src.write('\n')
+ for e in sorted(encodings):
+ name = e.replace('-', '_').lower()
+ src.write('const TUTF8encoder tutf8e_encoder_%s = (TUTF8encoder) tutf8e_%s_utf8;\n'%(name, name))
+
+ src.write('''
+TUTF8encoder tutf8e_encoder(const char *encoding)
+{
+''')
+ for e in sorted(encodings):
+ name = e.replace('-', '_').lower()
+ src.write(' if (!strcmp(encoding, "%s")) return tutf8e_encoder_%s;\n'%(e, name))
+ src.write('''
+ return NULL;
+}
+''')
+
+for e in sorted(encodings):
+
+ mapping = {}
+ domain = []
+
+ name = e.replace('-', '_').lower()
+ with open('include/tutf8e/%s.h'%(name), 'w') as include:
+
+ include.write('''
+#ifndef TUTF8E_%s_H
+#define TUTF8E_%s_H
+
+#include <tutf8e.h>
+'''%(name.upper(), name.upper()))
+
+ include.write('''
+static inline int tutf8e_%s_string_length(const char *input, const char *invalid, size_t *input_length, size_t *output_length)
+{
+ return tutf8e_encoder_string_length(tutf8e_encoder_%s, input, invalid, input_length, output_length);
+}
+'''%(name, name))
+
+ include.write('''
+static inline int tutf8e_%s_string_encode(const char *input, const char *invalid, char *output, size_t *output_length)
+{
+ return tutf8e_encoder_string_encode(tutf8e_encoder_%s, input, invalid, output, output_length);
+}
+'''%(name, name))
+
+ include.write('''
+static inline int tutf8e_%s_buffer_length(const char *i, size_t input_length, const char *invalid, size_t *length)
+{
+ return tutf8e_encoder_buffer_length(tutf8e_encoder_%s, input, input_length, invalid, length);
+}
+'''%(name, name))
+
+ include.write('''
+static inline int tutf8e_%s_buffer_encode(const char *i, size_t input_length, const char *invalid, char *output, size_t *output_length)
+{
+ return tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, input, input_length, invalid, output, output_length);
+}
+'''%(name, name))
+
+ include.write('\n')
+ include.write('#endif\n')
+
+# TESTS
+
+# List of pangrams
+# http://clagnut.com/blog/2380/
+
+tests = [
+ ('english', 'iso-8859-1', 'A quick brown fox jumps over the lazy dog'),
+ ('finnish', 'iso-8859-1', 'Albert osti fagotin ja töräytti puhkuvan melodian.'),
+ ('czech', 'iso-8859-2', 'Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu, tanga a quickstepu.'),
+ ('turkish', 'iso-8859-3', 'Pijamalı hasta yağız şoföre çabucak güvendi.'),
+ ('estonian', 'iso-8859-4', 'Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis'),
+ ('russian', 'iso-8859-5', 'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!'),
+ ('greek', 'iso-8859-7', 'διαφυλάξτε γενικά τη ζωή σας από βαθειά ψυχικά τραύματα'),
+ ('hebrew', 'iso-8859-8', 'עטלף אבק נס דרך מזגן שהתפוצץ כי חם'),
+ ('turkish2', 'iso-8859-9', 'Pijamalı hasta yağız şoföre çabucak güvendi.'),
+ ('swedish', 'iso-8859-10', 'Flygande bäckasiner söka hwila på mjuka tuvor.'),
+ ('thai', 'iso-8859-11', 'เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ'),
+ ('polish', 'iso-8859-13', 'Jeżu klątw, spłódź Finom część gry hańb!')
+]
+
+with open('test/test.c', 'w') as test:
+
+ test.write('#include <tutf8e.h>\n')
+ test.write('\n')
+ # for e in sorted(encodings):
+ # name = e.replace('-', '_').lower()
+ # test.write('#include <tutf8e/%s.h>\n'%(name))
+ # test.write('\n')
+
+ test.write('#include <stdio.h>\n')
+ test.write('#include <string.h>\n')
+ test.write('#include <stdlib.h>\n')
+ test.write('\n')
+ test.write('int main(int argc, char *argv[])\n')
+ test.write('{\n')
+ test.write(' int pass = 0;\n')
+ test.write(' int fail = 0;\n')
+ test.write(' int ret;\n')
+ test.write(' char *copy;\n')
+ test.write(' size_t input_length, output_length;\n')
+ test.write(' char buffer[1024];\n')
+ # test.write(' char *encoded;\n')
+ test.write('\n')
+
+ for i in tests:
+ if i[1] in encodings:
+ test.write(' static const char %s[] = {\n'%(i[0]))
+ data = [i for i in i[2].encode(i[1])] + [ 0 ]
+ for i in range(0, len(data), 24):
+ test.write(' %s,\n'%(', '.join([ '0x%02x'%(j) for j in data[i:i+24]])))
+ test.write(' };\n')
+
+ test.write('\n')
+ for i in tests:
+ if i[1] in encodings:
+ test.write(' static const char %sUTF8[] = {\n'%(i[0]))
+ data = [i for i in i[2].encode('utf-8')] + [ 0 ]
+ for i in range(0, len(data), 24):
+ test.write(' %s,\n'%(', '.join([ '0x%02x'%(j) for j in data[i:i+24]])))
+ test.write(' };\n')
+
+ test.write('\n /* string encode to UTF8, error on invalid input */\n')
+ for i in tests:
+ if i[1] in encodings:
+ name = i[1].replace('-', '_').lower()
+ test.write(' output_length = sizeof(buffer);\n')
+ test.write(' ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, %s, NULL, buffer, &output_length);\n'%(name, i[0]))
+ test.write(' if (!ret && !strcmp(buffer, %sUTF8)) {\n'%(i[0]))
+ test.write(' printf("%s\\n", buffer);\n')
+ test.write(' pass++;\n')
+ test.write(' } else {\n')
+ test.write(' printf("Failed to encode %s test\\n");\n'%(i[0]))
+ test.write(' fail++;\n')
+ test.write(' }\n')
+ test.write('\n')
+
+ test.write('\n /* buffer encode to UTF8, error on invalid input */\n')
+ for i in tests:
+ if i[1] in encodings:
+ name = i[1].replace('-', '_').lower()
+ test.write(' input_length = strlen(%s);\n'%(i[0]))
+ test.write(' output_length = sizeof(buffer);\n')
+ test.write(' ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, %s, input_length, NULL, buffer, &output_length);\n'%(name, i[0]))
+ test.write(' if (!ret && (output_length+1)==sizeof(%sUTF8) && !strncmp(buffer, %sUTF8, output_length)) {\n'%(i[0], i[0]))
+ test.write(' pass++;\n')
+ test.write(' } else {\n')
+ test.write(' printf("Failed to encode %s test\\n");\n'%(i[0]))
+ test.write(' fail++;\n')
+ test.write(' }\n')
+ test.write('\n')
+
+ test.write('\n /* string encode to UTF8, first input character invalid -> ? */\n')
+ for i in tests:
+ if i[1] in ['iso-8859-6', 'iso-8859-7', 'iso-8859-11']:
+ name = i[1].replace('-', '_').lower()
+ test.write(' output_length = sizeof(buffer);\n')
+ test.write(' copy = strdup(%s);\n'%(i[0]))
+ test.write(' copy[0] = 255;\n')
+ test.write(' buffer[0] = 255;\n')
+ test.write(' ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "?", buffer, &output_length);\n'%(name))
+ test.write(' if (!ret && buffer[0]==\'?\') {\n')
+ test.write(' printf("%s\\n", buffer);\n')
+ test.write(' pass++;\n')
+ test.write(' } else {\n')
+ test.write(' printf("Failed to encode %s test\\n");\n'%(i[0]))
+ test.write(' fail++;\n')
+ test.write(' }\n')
+ test.write(' free(copy);\n')
+ test.write('\n')
+
+ test.write('\n /* string encode to UTF8, first input character invalid -> [INVALID] */\n')
+ for i in tests:
+ if i[1] in ['iso-8859-6', 'iso-8859-7', 'iso-8859-11']:
+ name = i[1].replace('-', '_').lower()
+ test.write(' output_length = sizeof(buffer);\n')
+ test.write(' copy = strdup(%s);\n'%(i[0]))
+ test.write(' copy[0] = 255;\n')
+ test.write(' buffer[0] = 255;\n')
+ test.write(' ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "[INVALID]", buffer, &output_length);\n'%(name))
+ test.write(' if (!ret && !strncmp(buffer, "[INVALID]", 9)) {\n')
+ test.write(' printf("%s\\n", buffer);\n')
+ test.write(' pass++;\n')
+ test.write(' } else {\n')
+ test.write(' printf("Failed to encode %s test\\n");\n'%(i[0]))
+ test.write(' fail++;\n')
+ test.write(' }\n')
+ test.write(' free(copy);\n')
+ test.write('\n')
+
+ test.write(' printf("%d passed, %d failed tests\\n", pass, fail);\n')
+
+ test.write('}\n')