#!/usr/bin/env python3
"""Code generator for the tutf8e C library (single-byte encodings -> UTF-8).

Running this script (re)writes, relative to the current directory:

  include/tutf8e.h        public header: table-driven API plus inline
                          "encoder handle" wrappers
  src/tutf8e.c            the four encode/length functions and the lookup
                          tables
  include/tutf8e/<enc>.h  per-encoding convenience wrappers
                          NOTE(review): path reconstructed, see below
  test/test.c             pangram-based test program

The target directories must already exist; the script does not create them.
"""

# Encodings for which a lookup table, an encoder handle and wrappers are emitted.
encodings = [
    'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253',
    'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257',
    'windows-1258',
    'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
    'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-10',
    'iso-8859-11', 'iso-8859-13', 'iso-8859-14', 'iso-8859-15', 'iso-8859-16'
]


def c_name(encoding):
    """Turn an encoding name such as 'iso-8859-1' into a C identifier part."""
    return encoding.replace('-', '_').lower()


def write_c_array(out, identifier, data):
    """Emit `static const char identifier[] = {...};`, 24 byte values per row."""
    out.write('  static const char %s[] = {\n' % identifier)
    for start in range(0, len(data), 24):
        row = ', '.join('0x%02x' % value for value in data[start:start + 24])
        out.write('    %s,\n' % row)
    out.write('  };\n')


# ---------------------------------------------------------------------------
# include/tutf8e.h
# ---------------------------------------------------------------------------

with open('include/tutf8e.h', 'w') as include:

    include.write('''
#ifndef TUTF8E_H
#define TUTF8E_H

#include <stddef.h>  /* size_t */
#include <stdint.h>  /* uint16_t */

/*************** Internal API ***************/

/* NUL-terminated C-string API */

extern int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length);
extern int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length);

/* Known-length buffer API */

extern int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t input_length, const char *invalid, size_t *output_length);
extern int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length);

/*************** Public API ***************/

/* Opaque handle type */

typedef void *TUTF8encoder;

/* Query encoder by name */

extern TUTF8encoder tutf8e_encoder(const char *encoding);

#define TUTF8E_OK      0 /* Success                    */
#define TUTF8E_INVALID 1 /* Invalid input character    */
#define TUTF8E_TOOLONG 2 /* Insufficient output buffer */

/*
 * tutf8e_encoder_string_length
 *
 * Determine the length of input and UTF8 encoded output of NUL-terminated string
 * Performance: single pass O(n)
 *
 * output NUL terminator not counted
 *
 * - TUTF8E_INVALID if input character is not convertable
 * - TUTF8E_OK for success
 */

static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t *input_length, size_t *output_length)
{
  return tutf8e_string_length((const uint16_t *) encoder, input, invalid, input_length, output_length);
}

/*
 * tutf8e_encoder_string_encode
 *
 * UTF8 encode NUL-terminated string
 * Performance: two pass O(n)
 *
 * output string is NUL terminated
 * output_length is output buffer size for input
 * output_length is encoded length for output, including NUL
 *
 * - TUTF8E_TOOLONG if output buffer insuficient
 * - TUTF8E_INVALID if input character is not convertable
 * - TUTF8E_OK for success
 */

static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *input, const char *invalid, char *output, size_t *output_length)
{
  return tutf8e_string_encode((const uint16_t *) encoder, input, invalid, output, output_length);
}

/* Known-length buffer API */

/*
 * tutf8e_encoder_buffer_length
 *
 * Determine the length of input and UTF8 encoded output of string
 * Performance: single pass O(n)
 *
 * output NUL terminator not counted
 *
 * - TUTF8E_INVALID if input character is not convertable
 * - TUTF8E_OK for success
 */

static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t input_length, size_t *length)
{
  return tutf8e_buffer_length((const uint16_t *) encoder, input, input_length, invalid, length);
}

/*
 * tutf8e_encoder_buffer_encode
 *
 * UTF8 encode string
 * Performance: two pass O(n)
 *
 * output string is not NUL terminated
 *
 * output_length is output buffer size for input
 * output_length is encoded length for output
 *
 * - TUTF8E_TOOLONG if output buffer insuficient
 * - TUTF8E_INVALID if input character is not convertable
 * - TUTF8E_OK for success
 */

static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length)
{
  return tutf8e_buffer_encode((const uint16_t *) encoder, input, input_length, invalid, output, output_length);
}

''')

    include.write('/* Supported encoders */\n\n')
    for e in sorted(encodings):
        include.write('extern const TUTF8encoder tutf8e_encoder_%s;\n' % c_name(e))
    include.write('\n')
    include.write('#endif\n')

# ---------------------------------------------------------------------------
# src/tutf8e.c
# ---------------------------------------------------------------------------

with open('src/tutf8e.c', 'w') as src:

    # The generated length functions only ever add to *input_length / *length /
    # *output_length; callers are expected to zero them first, the way the
    # generated tutf8e_string_encode does.
    #
    # A table entry of 0xffff marks a byte with no mapping: it is replaced by
    # the `invalid` string when one is given, otherwise TUTF8E_INVALID is
    # returned.
    src.write('''
#include <tutf8e.h>

#include <string.h>

int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length)
{
  const size_t invalid_length = invalid ? strlen(invalid) : 0;

  const unsigned char *i;
  for (i = (const unsigned char *) input; *i; ++i, (*input_length)++) {
    const uint16_t c = table[*i];
    if (c<0x80) {
      *output_length += 1;
      continue;
    }
    if (c<0x800) {
      *output_length += 2;
      continue;
    }
    if (c<0xffff) {
      *output_length += 3;
      continue;
    }
    if (invalid) {
      *output_length += invalid_length;
    }
    else {
      return TUTF8E_INVALID;
    }
  }
  return TUTF8E_OK;
}

int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length)
{
  int ret;
  size_t input_length = 0;
  size_t encoded_length = 0;
  if (!(ret = tutf8e_string_length(table, input, invalid, &input_length, &encoded_length)))
  {
    if (encoded_length+1 > *output_length) return TUTF8E_TOOLONG;
    if (!(ret = tutf8e_buffer_encode(table, input, input_length, invalid, output, output_length)))
    {
      output[encoded_length] = 0;
      return TUTF8E_OK;
    }
  }
  return ret;
}

int tutf8e_buffer_length
(
  const uint16_t *table,
  const char *input,
  size_t input_length,
  const char *invalid,
  size_t *length
)
{
  const size_t invalid_length = invalid ? strlen(invalid) : 0;

  const unsigned char *i;
  for (i = (const unsigned char *) input; input_length; ++i, --input_length) {
    const uint16_t c = table[*i];
    if (c<0x80) {
      ++*length;
      continue;
    }
    if (c<0x800) {
      *length += 2;
      continue;
    }
    if (c<0xffff) {
      *length += 3;
      continue;
    }
    if (invalid) {
      *length += invalid_length;
    }
    else {
      return TUTF8E_INVALID;
    }
  }
  return TUTF8E_OK;
}

int tutf8e_buffer_encode
(
  const uint16_t *table,
  const char *input,
  size_t input_length,
  const char *invalid,
  char *output,
  size_t *output_length
)
{
  size_t invalid_length = invalid ? strlen(invalid) : 0;

  size_t left = *output_length;
  unsigned char *o = (unsigned char *) output;
  const unsigned char *i;
  for (i = (const unsigned char *) input; input_length; ++i, --input_length) {
    const uint16_t c = table[*i];
    if (c<0x80) {
      if (left<1) return TUTF8E_TOOLONG;
      *(o++) = c;
      left -= 1;
      continue;
    }
    if (c<0x800) {
      if (left<2) return TUTF8E_TOOLONG;
      *(o++) = 0xc0 | (c>>6);
      *(o++) = 0x80 | (c&0x3f);
      left -= 2;
      continue;
    }
    if (c<0xffff) {
      if (left<3) return TUTF8E_TOOLONG;
      *(o++) = 0xe0 | (c>>12);
      *(o++) = 0x80 | ((c>>6)&0x3f);
      *(o++) = 0x80 | (c&0x3f);
      left -= 3;
      continue;
    }
    if (invalid) {
      if (left<invalid_length) return TUTF8E_TOOLONG;
      if (invalid_length) {
        memcpy(o, invalid, invalid_length);
        o += invalid_length;
        left -= invalid_length;
      }
    }
    else {
      return TUTF8E_INVALID;
    }
  }
  *output_length -= left;
  return TUTF8E_OK;
}
''')

    # NOTE(review): in the copy of this script that was reviewed, everything
    # from the `left<invalid_length` check above down to the per-encoding
    # include guard below had been lost. The tail of tutf8e_buffer_encode, the
    # table emission, the encoder handles and tutf8e_encoder() are rebuilt from
    # the interface that the surviving code declares and uses. Confirm against
    # the pristine script before regenerating sources.

    # One 256-entry table per encoding: byte value -> code point, or 0xffff
    # when the byte has no mapping in that encoding.
    for e in sorted(encodings):
        name = c_name(e)
        table = []
        for byte in range(256):
            try:
                table.append(ord(bytes([byte]).decode(e)))
            except UnicodeDecodeError:
                table.append(0xffff)
        src.write('\n')
        src.write('const uint16_t tutf8e_%s_utf8[256] =\n' % name)
        src.write('{\n')
        for start in range(0, 256, 16):
            row = ', '.join('0x%04x' % value for value in table[start:start + 16])
            src.write('  %s,\n' % row)
        src.write('};\n')

    # Definitions for the `extern const TUTF8encoder tutf8e_encoder_<name>`
    # declarations emitted into tutf8e.h.
    src.write('\n')
    for e in sorted(encodings):
        name = c_name(e)
        src.write('const TUTF8encoder tutf8e_encoder_%s = (TUTF8encoder) tutf8e_%s_utf8;\n' % (name, name))

    # Lookup by encoding name, as declared in tutf8e.h.
    src.write('\n')
    src.write('TUTF8encoder tutf8e_encoder(const char *encoding)\n')
    src.write('{\n')
    for e in sorted(encodings):
        src.write('  if (!strcmp(encoding, "%s")) return tutf8e_encoder_%s;\n' % (e, c_name(e)))
    src.write('  return NULL;\n')
    src.write('}\n')

# ---------------------------------------------------------------------------
# Per-encoding convenience headers
# ---------------------------------------------------------------------------

# NOTE(review): the output path and the `#include <tutf8e.h>` line are part of
# the reconstructed region; the path is inferred from the commented-out
# per-encoding include in the test generator below. Verify.
for e in sorted(encodings):
    name = c_name(e)
    with open('include/tutf8e/%s.h' % name, 'w') as include:

        include.write('''
#ifndef TUTF8E_%s_H
#define TUTF8E_%s_H

#include <tutf8e.h>
''' % (name.upper(), name.upper()))

        include.write('''
static inline int tutf8e_%s_string_length(const char *input, const char *invalid, size_t *input_length, size_t *output_length)
{
  return tutf8e_encoder_string_length(tutf8e_encoder_%s, input, invalid, input_length, output_length);
}
''' % (name, name))

        include.write('''
static inline int tutf8e_%s_string_encode(const char *input, const char *invalid, char *output, size_t *output_length)
{
  return tutf8e_encoder_string_encode(tutf8e_encoder_%s, input, invalid, output, output_length);
}
''' % (name, name))

        # Fixes: the parameter was declared as `i` while the body used `input`,
        # and the arguments are now passed in the order
        # tutf8e_encoder_buffer_length declares them: (input, invalid, input_length).
        include.write('''
static inline int tutf8e_%s_buffer_length(const char *input, size_t input_length, const char *invalid, size_t *length)
{
  return tutf8e_encoder_buffer_length(tutf8e_encoder_%s, input, invalid, input_length, length);
}
''' % (name, name))

        # Fix: the parameter was declared as `i` while the body used `input`.
        include.write('''
static inline int tutf8e_%s_buffer_encode(const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length)
{
  return tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, input, input_length, invalid, output, output_length);
}
''' % (name, name))

        include.write('\n')
        include.write('#endif\n')

# TESTS

# List of pangrams
# http://clagnut.com/blog/2380/

# (C identifier, source encoding, text)
tests = [
    ('english',  'iso-8859-1',  'A quick brown fox jumps over the lazy dog'),
    ('finnish',  'iso-8859-1',  'Albert osti fagotin ja töräytti puhkuvan melodian.'),
    ('czech',    'iso-8859-2',  'Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu, tanga a quickstepu.'),
    ('turkish',  'iso-8859-3',  'Pijamalı hasta yağız şoföre çabucak güvendi.'),
    ('estonian', 'iso-8859-4',  'Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis'),
    ('russian',  'iso-8859-5',  'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!'),
    ('greek',    'iso-8859-7',  'διαφυλάξτε γενικά τη ζωή σας από βαθειά ψυχικά τραύματα'),
    ('hebrew',   'iso-8859-8',  'עטלף אבק נס דרך מזגן שהתפוצץ כי חם'),
    ('turkish2', 'iso-8859-9',  'Pijamalı hasta yağız şoföre çabucak güvendi.'),
    ('swedish',  'iso-8859-10', 'Flygande bäckasiner söka hwila på mjuka tuvor.'),
    ('thai',     'iso-8859-11', 'เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ'),
    ('polish',   'iso-8859-13', 'Jeżu klątw, spłódź Finom część gry hańb!')
]

# Encodings whose pangram is reused for the "first byte is invalid" tests.
invalid_first_byte_encodings = ['iso-8859-6', 'iso-8859-7', 'iso-8859-11']

with open('test/test.c', 'w') as test:

    # NOTE(review): the header names in this file were missing from the
    # reviewed copy; they are restored from the functions the generated
    # program calls.
    test.write('#include <tutf8e.h>\n')
    test.write('\n')
    # for e in sorted(encodings):
    #     name = e.replace('-', '_').lower()
    #     test.write('#include <tutf8e/%s.h>\n'%(name))
    # test.write('\n')
    test.write('#include <stdio.h>\n')
    test.write('#include <stdlib.h>\n')
    test.write('#include <string.h>\n')
    test.write('\n')
    test.write('int main(int argc, char *argv[])\n')
    test.write('{\n')
    test.write('  int pass = 0;\n')
    test.write('  int fail = 0;\n')
    test.write('  int ret;\n')
    test.write('  char *copy;\n')
    test.write('  size_t input_length, output_length;\n')
    test.write('  char buffer[1024];\n')
    # test.write('  char *encoded;\n')
    test.write('\n')

    # Only pangrams whose encoding is actually generated take part.
    enabled = [t for t in tests if t[1] in encodings]

    # Each pangram in its native single-byte encoding, NUL terminated.
    for label, encoding, text in enabled:
        write_c_array(test, label, list(text.encode(encoding)) + [0])
        test.write('\n')

    # The expected UTF-8 form of each pangram, NUL terminated.
    for label, encoding, text in enabled:
        write_c_array(test, '%sUTF8' % label, list(text.encode('utf-8')) + [0])

    test.write('\n  /* string encode to UTF8, error on invalid input */\n')
    for label, encoding, text in enabled:
        name = c_name(encoding)
        test.write('  output_length = sizeof(buffer);\n')
        test.write('  ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, %s, NULL, buffer, &output_length);\n' % (name, label))
        test.write('  if (!ret && !strcmp(buffer, %sUTF8)) {\n' % label)
        test.write('    printf("%s\\n", buffer);\n')
        test.write('    pass++;\n')
        test.write('  } else {\n')
        test.write('    printf("Failed to encode %s test\\n");\n' % label)
        test.write('    fail++;\n')
        test.write('  }\n')
        test.write('\n')

    test.write('\n  /* buffer encode to UTF8, error on invalid input */\n')
    for label, encoding, text in enabled:
        name = c_name(encoding)
        test.write('  input_length = strlen(%s);\n' % label)
        test.write('  output_length = sizeof(buffer);\n')
        test.write('  ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, %s, input_length, NULL, buffer, &output_length);\n' % (name, label))
        test.write('  if (!ret && (output_length+1)==sizeof(%sUTF8) && !strncmp(buffer, %sUTF8, output_length)) {\n' % (label, label))
        test.write('    pass++;\n')
        test.write('  } else {\n')
        test.write('    printf("Failed to encode %s test\\n");\n' % label)
        test.write('    fail++;\n')
        test.write('  }\n')
        test.write('\n')

    test.write('\n  /* string encode to UTF8, first input character invalid -> ? */\n')
    for label, encoding, text in tests:
        if encoding in invalid_first_byte_encodings:
            name = c_name(encoding)
            test.write('  output_length = sizeof(buffer);\n')
            test.write('  copy = strdup(%s);\n' % label)
            test.write('  copy[0] = 255;\n')
            test.write('  buffer[0] = 255;\n')
            test.write('  ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "?", buffer, &output_length);\n' % name)
            test.write('  if (!ret && buffer[0]==\'?\') {\n')
            test.write('    printf("%s\\n", buffer);\n')
            test.write('    pass++;\n')
            test.write('  } else {\n')
            test.write('    printf("Failed to encode %s test\\n");\n' % label)
            test.write('    fail++;\n')
            test.write('  }\n')
            test.write('  free(copy);\n')
            test.write('\n')

    test.write('\n  /* string encode to UTF8, first input character invalid -> [INVALID] */\n')
    for label, encoding, text in tests:
        if encoding in invalid_first_byte_encodings:
            name = c_name(encoding)
            test.write('  output_length = sizeof(buffer);\n')
            test.write('  copy = strdup(%s);\n' % label)
            test.write('  copy[0] = 255;\n')
            test.write('  buffer[0] = 255;\n')
            test.write('  ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "[INVALID]", buffer, &output_length);\n' % name)
            test.write('  if (!ret && !strncmp(buffer, "[INVALID]", 9)) {\n')
            test.write('    printf("%s\\n", buffer);\n')
            test.write('    pass++;\n')
            test.write('  } else {\n')
            test.write('    printf("Failed to encode %s test\\n");\n' % label)
            test.write('    fail++;\n')
            test.write('  }\n')
            test.write('  free(copy);\n')
            test.write('\n')

    test.write('  printf("%d passed, %d failed tests\\n", pass, fail);\n')
    # Report failure through the exit status so a build or CI step can see it.
    test.write('  return fail ? 1 : 0;\n')
    test.write('}\n')