diff options
Diffstat (limited to 'third_party/rust/chardetng/src')
-rw-r--r-- | third_party/rust/chardetng/src/data.rs | 1313 | ||||
-rw-r--r-- | third_party/rust/chardetng/src/lib.rs | 3775 | ||||
-rw-r--r-- | third_party/rust/chardetng/src/tld.rs | 340 |
3 files changed, 5428 insertions, 0 deletions
diff --git a/third_party/rust/chardetng/src/data.rs b/third_party/rust/chardetng/src/data.rs new file mode 100644 index 0000000000..5dacd15a8c --- /dev/null +++ b/third_party/rust/chardetng/src/data.rs @@ -0,0 +1,1313 @@ +/* Any copyright is dedicated to the Public Domain. + * https://creativecommons.org/publicdomain/zero/1.0/ */ + +use super::IMPLAUSIBILITY_PENALTY; +use encoding_rs::Encoding; +use encoding_rs::IBM866_INIT; +use encoding_rs::ISO_8859_13_INIT; +use encoding_rs::ISO_8859_2_INIT; +use encoding_rs::ISO_8859_4_INIT; +use encoding_rs::ISO_8859_5_INIT; +use encoding_rs::ISO_8859_6_INIT; +use encoding_rs::ISO_8859_7_INIT; +use encoding_rs::ISO_8859_8_INIT; +use encoding_rs::KOI8_U_INIT; +use encoding_rs::WINDOWS_1250_INIT; +use encoding_rs::WINDOWS_1251_INIT; +use encoding_rs::WINDOWS_1252_INIT; +use encoding_rs::WINDOWS_1253_INIT; +use encoding_rs::WINDOWS_1254_INIT; +use encoding_rs::WINDOWS_1255_INIT; +use encoding_rs::WINDOWS_1256_INIT; +use encoding_rs::WINDOWS_1257_INIT; +use encoding_rs::WINDOWS_1258_INIT; +use encoding_rs::WINDOWS_874_INIT; + +const PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: usize = 0; + +const IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: usize = 1; + +const IMPLAUSIBLE_BEFORE_ALPHABETIC: usize = 2; + +const IMPLAUSIBLE_AFTER_ALPHABETIC: usize = 3; + +const PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: usize = 4; + +const PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: usize = 5; + +const WINDOWS_1256_ZWNJ: usize = 2; + +pub const ASCII_DIGIT: usize = 100; + +#[repr(align(64))] // Align to cache lines +pub struct DetectorData { + pub frequent_simplified: [u16; 128], + pub frequent_kanji: [u16; 128], + pub frequent_hangul: [u16; 128], + latin_ascii: [u8; 128], + non_latin_ascii: [u8; 128], + turkish_ascii: [u8; 128], + windows_1258: [u8; 128], + windows_1250: [u8; 128], + iso_8859_2: [u8; 128], + windows_1251: [u8; 128], + koi8_u: [u8; 128], + iso_8859_5: [u8; 128], + ibm866: [u8; 128], + windows_1252: [u8; 128], + 
windows_1252_icelandic: [u8; 128], + windows_1253: [u8; 128], + iso_8859_7: [u8; 128], + windows_1254: [u8; 128], + windows_1255: [u8; 128], + iso_8859_8: [u8; 128], + windows_1256: [u8; 128], + iso_8859_6: [u8; 128], + windows_1257: [u8; 128], + iso_8859_13: [u8; 128], + iso_8859_4: [u8; 128], + windows_874: [u8; 128], + vietnamese: [u8; 1975], + central: [u8; 3895], + cyrillic: [u8; 2112], + western: [u8; 2752], + icelandic: [u8; 871], + greek: [u8; 1365], + turkish: [u8; 845], + hebrew: [u8; 1292], + arabic: [u8; 2805], + baltic: [u8; 1387], + thai: [u8; 5180], +} + +#[rustfmt::skip] +pub static DETECTOR_DATA: DetectorData = DetectorData { + frequent_simplified: [ + 0x7684, 0x5E74, 0x56FD, 0x65E5, 0x6708, 0x4E2D, 0x4EBA, 0x4E00, 0x5927, 0x4E3A, 0x5728, 0x662F, 0x5B66, 0x6709, 0x884C, 0x4F1A, + 0x65AF, 0x4E8E, 0x5730, 0x533A, 0x6587, 0x548C, 0x5C14, 0x540D, 0x7B2C, 0x516C, 0x65F6, 0x5C0F, 0x90E8, 0x4E0D, 0x5E02, 0x53F0, + 0x4EE5, 0x4E0A, 0x540E, 0x52A8, 0x51FA, 0x4E2A, 0x672C, 0x4F5C, 0x5BB6, 0x65B0, 0x6210, 0x897F, 0x5B9A, 0x91CD, 0x751F, 0x4E4B, + 0x7535, 0x4E3B, 0x5B50, 0x7528, 0x7279, 0x5206, 0x6C11, 0x4E86, 0x4E9A, 0x5458, 0x514B, 0x5357, 0x653F, 0x7AD9, 0x5FB7, 0x4E0E, + 0x7403, 0x4E1C, 0x79D1, 0x91CC, 0x9053, 0x5C71, 0x6CD5, 0x65B9, 0x5317, 0x5411, 0x5929, 0x53D1, 0x7269, 0x6765, 0x5230, 0x673A, + 0x661F, 0x8DEF, 0x76EE, 0x7F8E, 0x6751, 0x9AD8, 0x957F, 0x519B, 0x5229, 0x4E09, 0x62C9, 0x8F66, 0x5DDE, 0x57FA, 0x6D77, 0x81EA, + 0x4E0B, 0x8D5B, 0x9762, 0x52A0, 0x4ED6, 0x9A6C, 0x5176, 0x53C2, 0x53BF, 0x4EE3, 0x5185, 0x7406, 0x4E16, 0x4E8C, 0x7EBF, 0x53CA, + 0x5EFA, 0x8868, 0x4F4D, 0x7F57, 0x7531, 0x7ACB, 0x591A, 0x53EF, 0x534E, 0x6797, 0x7EF4, 0x5EA6, 0x4E8B, 0x5E73, 0x5916, 0x4F53, + ], + frequent_kanji: [ + 0x5E74, 0x65E5, 0x6708, 0x5927, 0x672C, 0x5B66, 0x4EBA, 0x56FD, 0x4F1A, 0x4E2D, 0x51FA, 0x4E00, 0x8005, 0x5E02, 0x4F5C, 0x540D, + 0x90E8, 0x7528, 0x5730, 0x884C, 0x5834, 0x7530, 0x7B2C, 0x751F, 0x5408, 0x5B50, 0x9053, 0x4E0A, 0x6771, 0x6642, 0x770C, 
0x4EE3, + 0x5C71, 0x793E, 0x4E8B, 0x753B, 0x65B0, 0x624B, 0x9AD8, 0x6210, 0x6226, 0x7269, 0x5F8C, 0x767A, 0x9577, 0x7ACB, 0x5206, 0x5DDD, + 0x8A18, 0x6821, 0x9593, 0x696D, 0x95A2, 0x6240, 0x5B9A, 0x9078, 0x5C0F, 0x76EE, 0x52D5, 0x548C, 0x6587, 0x91CE, 0x540C, 0x524D, + 0x5185, 0x958B, 0x7DDA, 0x81EA, 0x53F7, 0x516C, 0x99C5, 0x9001, 0x56DE, 0x753A, 0x9664, 0x4E3B, 0x5BB6, 0x5229, 0x8ECA, 0x901A, + 0x4EAC, 0x8868, 0x5CF6, 0x4E0B, 0x4E16, 0x65B9, 0x6751, 0x66F8, 0x5168, 0x660E, 0x9023, 0x5E73, 0x653E, 0x4F53, 0x7684, 0x5F0F, + 0x756A, 0x5EA6, 0x5317, 0x5165, 0x5916, 0x983C, 0x8A9E, 0x5973, 0x8A71, 0x6A5F, 0x8A2D, 0x539F, 0x4E09, 0x524A, 0x533A, 0x6D77, + 0x4F9D, 0x5F53, 0x73FE, 0x5BFE, 0x4F4D, 0x6570, 0x5316, 0x795E, 0x66F2, 0x7406, 0x6559, 0x7279, 0x7248, 0x5728, 0x6CD5, 0x898B, + ], + frequent_hangul: [ + 0xC774, 0xC758, 0xB2E4, 0xAE30, 0xC5D0, 0xB85C, 0xC0AC, 0xB144, 0xC2A4, 0xB9AC, 0xB294, 0xC77C, 0xD558, 0xAC00, 0xC2DC, 0xC9C0, + 0xB300, 0xC11C, 0xBD84, 0xAD6D, 0xD55C, 0xB3C4, 0xC778, 0xACE0, 0xB958, 0xC790, 0xC8FC, 0xC544, 0xC744, 0xB77C, 0xC218, 0xC81C, + 0xC815, 0xC6D4, 0xB098, 0xAD6C, 0xC804, 0xC5B4, 0xC740, 0xADF8, 0xBD80, 0xB97C, 0xB3D9, 0xC120, 0xC73C, 0xBB38, 0xD2B8, 0xC6A9, + 0xBCF4, 0xC704, 0xB4DC, 0xACFC, 0xAD50, 0xC0C1, 0xB9C8, 0xC7A5, 0xD559, 0xC6D0, 0xC131, 0xD654, 0xC5ED, 0xB2C8, 0xBBF8, 0xACF5, + 0xACBD, 0xD574, 0xC624, 0xC6B0, 0xBA85, 0xC788, 0xD06C, 0xC601, 0xC18C, 0xC870, 0xD68C, 0xC5EC, 0xBBFC, 0xD1A0, 0xBE44, 0xC138, + 0xB974, 0xC720, 0xC2E0, 0xD0A4, 0xC911, 0xACC4, 0xD0C0, 0xC5F0, 0xD504, 0xAD00, 0xB418, 0xC801, 0xCE58, 0xB808, 0xCE74, 0xC9C4, + 0xC640, 0xD130, 0xB4E4, 0xBAA9, 0xACA8, 0xAC8C, 0xAC1C, 0xBC29, 0xD30C, 0xC0B0, 0xD638, 0xCD9C, 0xC74C, 0xB9BC, 0xBA74, 0xC791, + 0xB9CC, 0xB2E8, 0xB118, 0xBAA8, 0xC694, 0xC5C8, 0xC0DD, 0xB0A8, 0xC7AC, 0xBB34, 0xD6C4, 0xD45C, 0xAD70, 0xD3EC, 0xB2F9, 0xB178, + ], + latin_ascii: [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, + 100,100,100,100,100,100,100,100,100,100, 0, 0, 0, 0, 0, 0, + 0,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151,152,153,154, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, + ], + non_latin_ascii: [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100,100,100,100,100,100,100,100,100,100, 0, 0, 0, 0, 0, 0, + 0,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129, + 129,129,129,129,129,129,129,129,129,129,129, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + ], + turkish_ascii: [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100,100,100,100,100,100,100,100,100,100, 0, 0, 0, 0, 0, 0, + 0,129,130,131,132,133,134,135,136,154,137,138,139,140,141,142, + 143,144,145,146,147,148,149,150,151,152,153, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 27, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, + ], + windows_1258: [ + 0,255, 0, 53, 0, 0, 0, 0, 0, 0,255, 0,155,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 27,255,255,179, + 0, 55, 53, 53, 53, 53, 52, 53, 53, 55, 53, 53, 56, 52, 54, 53, + 55, 56, 54, 54, 53, 55, 54, 52, 53, 54, 53, 53, 55, 55, 55, 55, + 159,160,161,162,155,155,155,155,163,164,165,155, 28,167,168,169, + 170,155, 29,172,173,174,155, 56,155,175,176,177,155,178, 30, 27, + 31, 32, 33, 34, 27, 27, 27, 27, 35, 36, 37, 27, 38, 39, 40, 41, + 42, 27, 43, 44, 45, 46, 27, 56, 27, 47, 48, 49, 27, 50, 53, 51, + ], + windows_1250: [ + 0,255, 0,255, 0, 0, 0, 0,255, 0,156, 0,157,158,159,160, + 255, 0, 0, 0, 0, 0, 0, 0,255, 0, 28, 0, 29, 30, 31, 32, + 0, 69, 69,161, 69,162, 68, 69, 69, 71,163, 68, 
69, 68, 70,165, + 71, 69, 69, 33, 69, 71, 70, 68, 69, 34, 35, 68,164, 0, 36, 37, + 166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181, + 182,183,184,185,186,187,188, 72,189,190,191,192,193,194,195, 27, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 72, 61, 62, 63, 64, 65, 66, 67, 69, + ], + iso_8859_2: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,162, 69,161, 69,164,157, 69, 69,156,163,158,160, 68,159,165, + 71, 34, 69, 33, 69, 36, 29, 69, 69, 28, 35, 30, 32, 0, 31, 37, + 166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181, + 182,183,184,185,186,187,188, 72,189,190,191,192,193,194,195, 27, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 72, 61, 62, 63, 64, 65, 66, 67, 69, + ], + windows_1251: [ + 131,130, 0, 2, 0, 0, 0, 0, 0, 0,132, 0,133,130,134,135, + 3, 0, 0, 0, 0, 0, 0, 0,255, 0, 4, 0, 5, 2, 6, 7, + 0,136, 8,140, 47,130, 46, 47,138, 49,139, 49, 50, 46, 48,141, + 49, 50,137, 9, 2, 49, 48, 46, 10, 47, 11, 48, 12,130, 2, 13, + 142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157, + 158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + ], + koi8_u: [ + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 46, 0, 50, 50, 50, 0, 47, 49, 48, 46, 50, + 47, 47, 47, 10, 11, 47, 9, 13, 47, 47, 47, 47, 47, 2, 8, 47, + 47, 47, 47,138,139, 47,137,141, 47, 47, 47, 47, 47,130,136, 49, + 44, 14, 15, 36, 18, 19, 34, 17, 35, 22, 23, 24, 25, 26, 27, 28, + 29, 45, 30, 31, 32, 33, 20, 16, 42, 41, 21, 38, 43, 39, 37, 40, + 172,142,143,164,146,147,162,145,163,150,151,152,153,154,155,156, + 157,173,158,159,160,161,148,144,170,169,149,166,171,167,165,168, + ], + iso_8859_5: [ + 
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,138,131,130,139,130,137,141,140,132,133,134,130, 46,136,135, + 142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157, + 158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 47, 10, 3, 2, 11, 2, 9, 13, 12, 4, 5, 6, 2, 47, 8, 7, + ], + ibm866: [ + 142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157, + 158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 138, 10,139, 11,141, 13,136, 8, 49, 46, 46, 0, 47, 47, 47, 3, + ], + windows_1252: [ + 0,255, 0, 60, 0, 0, 0, 0, 0, 0,156, 0,157,255,185,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 29,255, 57,186, + 0, 62, 60, 60, 60, 60, 59, 60, 60, 62, 60, 59, 63, 59, 61, 60, + 62, 63, 61, 61, 60, 62, 61, 59, 60, 61, 60, 59, 62, 62, 62, 62, + 158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 188,174,175,176,177,178,179, 63,180,181,182,183,184,188,188, 27, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 60, 46, 47, 48, 49, 50, 51, 63, 52, 53, 54, 55, 56, 60, 60, 58, + ], + windows_1252_icelandic: [ + 0,255, 0, 41, 0, 0, 0, 0, 0, 0,155, 0,155,255,155,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 27,255, 27,155, + 0, 43, 41, 41, 41, 41, 40, 41, 41, 43, 41, 41, 44, 40, 42, 41, + 43, 44, 42, 42, 41, 43, 42, 40, 41, 42, 41, 41, 43, 43, 43, 43, + 155,156,155,155,157,155,158,155,155,159,155,155,155,160,155,155, + 161,155,155,162,155,155,163, 
44,164,155,165,155,155,166,167, 27, + 27, 28, 27, 27, 29, 27, 30, 27, 27, 31, 27, 27, 27, 32, 27, 27, + 33, 27, 27, 34, 27, 27, 35, 44, 36, 27, 37, 27, 27, 38, 39, 27, + ], + windows_1253: [ + 38,255, 0, 38, 0, 0, 0, 0,255, 0,255, 0,255,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255,255,255,255, + 0, 38,131, 38, 38, 38, 37, 38, 38, 40,255, 40, 37, 37, 39, 37, + 40, 37, 39, 39, 0, 40, 39, 37,132,133,134, 39,162, 40,163,164, + 2,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149, + 150,151,255,153,154,155,156,157,158,159,160,161, 3, 4, 5, 6, + 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,255, + ], + iso_8859_7: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0, 40, 39, 38, 38, 38, 37, 38, 38, 40, 38, 40, 37, 37,255, 37, + 40, 37, 39, 39, 0, 38,131, 37,132,133,134, 39,162, 40,163,164, + 2,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149, + 150,151,255,153,154,155,156,157,158,159,160,161, 3, 4, 5, 6, + 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,255, + ], + windows_1254: [ + 0,255, 0, 40, 0, 0, 0, 0, 0, 0,156, 0,156,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 28,255,255,156, + 0, 42, 40, 40, 40, 40, 39, 40, 40, 42, 40, 42, 43, 39, 41, 40, + 42, 43, 41, 41, 40, 42, 41, 39, 40, 41, 40, 41, 42, 42, 42, 42, + 156,156,158,156,157,156,156,159,156,156,160,156,156,156,161,156, + 162,156,156,156,156,156,163, 43,156,156,156,164,165,155,166, 28, + 28, 28, 30, 28, 29, 28, 28, 31, 28, 28, 32, 28, 28, 28, 33, 28, + 34, 28, 28, 28, 28, 28, 35, 43, 28, 28, 28, 36, 37, 26, 38, 28, + ], + windows_1255: [ + 0,255, 0, 37, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255, + 0, 39, 37, 37, 37, 37, 36, 37, 37, 39, 40, 40, 40, 36, 38, 37, + 39, 40, 38, 38, 37, 39, 
38, 36, 37, 38, 40, 40, 39, 39, 39, 39, + 2, 2, 2, 2, 2, 2, 2, 3, 4, 2, 2, 2, 5, 2, 36, 37, + 36, 2, 2, 0, 6, 7, 8, 41, 41,255,255,255,255,255,255,255, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,255,255, 37, 37,255, + ], + iso_8859_8: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,255, 37, 37, 37, 37, 36, 37, 37, 39, 40, 40, 40, 36, 38, 37, + 39, 40, 38, 38, 37, 39, 38, 36, 37, 38, 40, 40, 39, 39, 39,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 36, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,255,255, 37, 37,255, + ], + windows_1256: [ + 0, 3, 0, 54, 0, 0, 0, 0, 0, 0, 4, 0,129, 5, 6, 7, + 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 10, 0, 1, 2, 53, 11, + 0, 58, 54, 54, 54, 54, 53, 54, 54, 56, 12, 56, 57, 53, 55, 54, + 56, 57, 55, 55, 54, 56, 55, 53, 54, 55, 58, 55, 56, 56, 56, 58, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 57, 36, 37, 38, 39, 40, 41, 42, 43, + 1, 44, 1, 45, 46, 47, 48, 1, 1, 1, 1, 1, 49, 50, 1, 1, + 51, 51, 51, 51, 1, 51, 51, 57, 51, 1, 51, 1, 1, 54, 54, 52, + ], + iso_8859_6: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,255,255,255, 54,255,255,255,255,255,255,255, 58, 53,255,255, + 255,255,255,255,255,255,255,255,255,255,255, 58,255,255,255, 58, + 255, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,255,255,255,255,255, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 51, 51, 51, 51, + 51, 51, 51,255,255,255,255,255,255,255,255,255,255,255,255,255, + ], + windows_1257: [ + 0,255, 0,255, 0, 0, 0, 0,255, 0,255, 0,255, 47, 47, 47, + 255, 0, 0, 0, 0, 0, 
0, 0,255, 0,255, 0,255, 47, 47,255, + 0,255, 47, 47, 47,255, 46, 47,155, 49,156, 49, 50, 46, 48,155, + 49, 50, 48, 48, 47, 49, 48, 46, 27, 48, 28, 48, 49, 49, 49, 27, + 157,158,159,155,155,155,160,161,162,155,155,163,164,165,166,167, + 168,155,169,155,170,155,155, 50,171,155,155,172,155,155,173, 27, + 29, 30, 31, 27, 27, 27, 32, 33, 34, 27, 27, 35, 36, 37, 38, 39, + 40, 27, 41, 27, 42, 27, 27, 50, 43, 27, 27, 44, 27, 27, 45, 47, + ], + iso_8859_13: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0, 48, 47, 47, 47, 49, 46, 47,155, 49,156, 49, 50, 46, 48,155, + 49, 50, 48, 48, 46, 49, 48, 46, 27, 48, 28, 48, 49, 49, 49, 27, + 157,158,159,155,155,155,160,161,162,155,155,163,164,165,166,167, + 168,155,169,155,170,155,155, 50,171,155,155,172,155,155,173, 27, + 29, 30, 31, 27, 27, 27, 32, 33, 34, 27, 27, 35, 36, 37, 38, 39, + 40, 27, 41, 27, 42, 27, 27, 50, 43, 27, 27, 44, 27, 27, 45, 46, + ], + iso_8859_4: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,157, 27,156, 47,155,167, 47, 47,168,161,164,155, 46,173, 47, + 49, 29, 47, 28, 47, 27, 39, 47, 47, 40, 33, 36, 27,155, 45, 27, + 159,155,155,155,155,155,155,158,162,155,160,155,163,155,155,166, + 155,169,170,165,155,155,155, 50,155,171,155,155,155,155,172, 27, + 31, 27, 27, 27, 27, 27, 27, 30, 34, 27, 32, 27, 35, 27, 27, 38, + 27, 41, 42, 37, 27, 27, 27, 50, 27, 43, 27, 27, 27, 27, 44, 47, + ], + windows_874: [ + 77,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255, + 0, 2, 3, 71, 4, 71, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 71, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,255,255,255,255, 77, + 57, 58, 59, 60, 61, 71, 62, 63, 64, 65, 66, 67, 68, 69, 
70, 77, + 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,255,255,255,255, + ], + vietnamese: [ + 0, 58, 58, 22, 33, 7, 0, 0, 0, 0, 1, 57, 3, 0, 0, 0, 39, 14, 8, 3, 0, 11, 0, 5, 0, // , + 0, 3, 16, 2, 0, 0, 0, 0, 0, 0, 0, 3, 1,255, 0, 17, 2, 1, 0, 0, 1, 0, 0, 1,255, // a, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // b, + 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 51, 12, 0, 0, 0, 48, 0, 0, 0, 0, 2, 0, 0, 0, // c, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // d, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, // e, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, // f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // g, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // h, + 0, 27, 11, 1, 16, 3, 0, 0, 0, 0, 0, 16, 0, 0,255, 10, 34, 0, 2, 1, 0, 0, 0, 0,255, // i, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0,255, // j, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // k, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // l, + 0, 6, 3, 0, 2, 0, 2, 22, 0, 0, 2, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,255, // m, + 0, 59, 23, 10, 19, 22, 18, 5, 0, 0, 28, 61, 6, 0, 0, 0, 39, 4, 20, 9, 6, 1, 0, 1,255, // n, + 0, 5, 13, 0, 5, 3, 0, 0, 0, 0, 0, 3, 0,255, 0, 4, 2, 0, 0,255,255, 0,255, 0,255, // o, + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 13, 0, 0,255, 0, 10, 0, 0, 0, 0, 0, 0, 0,255, // p, + 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0,255, 0, 0,255, 0,255, // q, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // r, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // s, + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0, 0,255, // t, + 0, 12, 3, 1, 1, 0, 4, 0, 0, 0, 6, 4, 0,255,255, 0, 4, 0, 0, 0, 0, 0,255, 1, 0, // 
u, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, // v, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, // w, + 0, 0,255, 0, 0, 0,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, // x, + 0, 18, 2, 0, 18, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0,255, 0, 0,255, 0,255, // y, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, // z, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255,255,255, 0, 0,255,255, // ß, + 0, 95,255,255,255, 1,255,255,255, 11,255,255,255,255,255, 6,255,255,255,255,255, 8,255,255,255, 4,255,255,255,255,255,255,255, 14, 2,255,255, 17,255,255,255,255,255,255,255, 8, 16,255,255,255, 5,255, // ̀, + 0, 39,255,255,255, 0,255,255,255, 5,255,255,255,255,255, 3,255,255,255,255,255, 20,255,255,255, 1,255,255,255,255,255,255,255, 3, 0,255,255, 31,255,255,255,255,255,255,255, 10, 10,255,255,255, 5,255, // ̉, + 0, 12,255,255,255, 1,255,255,255, 5,255,255,255,255,255, 0,255,255,255,255,255, 3,255,255,255, 1,255,255,255,255,255,255,255, 2, 0,255,255, 3,255,255,255,255,255,255,255, 1, 0,255,255,255, 7,255, // ̃, + 0, 0, 3, 0, 0, 0, 0, 12, 15, 0, 0, 0, 17, 1, 6, 15, 0,255, 0, 0, 1, 0, 16, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 1, 0,255,255, 0,255,255,255, 0,255, // à, + 2, 0, 2, 13, 0, 0, 0, 0, 26, 3, 0, 0, 1, 1, 0, 1, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 3,255,255,255,255,255,255,255,255,255, // á, + 3, 0, 1, 7, 5, 0, 0, 1, 23, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 1, 7, 13, 16, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255,255,255, 8, 0,255,255, 0,255, 0,255, 0,255, // â, + 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 23, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0,255,255,255, 0,255,255,255, 1, 0,255,255, 0,255,255,255,255,255, // ă, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255, // è, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255, // é, + 0, 0, 1, 0, 0, 0, 0, 0, 28, 66, 0, 6, 2, 0, 1, 0, 0, 0, 2, 0, 4, 0, 3, 0, 0, 12, 0,255, 0, 0, 0,255,255,255,255, 0, 0, 0, 0, 0,255,255, 13, 0,255, 0,255,255,255,255, 0,255, // ê, + 0, 63,255,255,255, 5,255,255,255, 24,255,255,255,255,255, 22,255,255,255,255,255, 18,255,255,255, 3,255,255,255,255,255,255,255, 17, 6,255,255, 30,255,255,255,255,255,255,255, 23, 20,255,255,255, 10,255, // ́, + 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255, 0, 0, 0,255,255, 0,255,255, 0, 0,255,255,255,255,255, // í, + 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // î, + 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255, 0, 0,255,255, 0, 0, 0,255,255,255,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, // ï, + 97, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255,255,255,255, // đ, + 0, 42,255,255,255, 0,255,255,255, 15,255,255,255,255,255, 18,255,255,255,255,255, 8,255,255,255, 0,255,255,255,255,255,255,255, 34, 4,255,255, 24,255,255,255,255,255,255,255, 41, 17,255,255,255, 17,255, // ̣, + 0, 0, 2, 10, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255, 2,255, 0,255,255,255,255,255, 0,255, // ó, + 2, 0, 7, 8, 0, 0, 0, 2, 18, 0, 0, 0, 1, 17, 2, 0, 0, 0, 1, 6, 5, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255, 0, 
0,255,255,255, 22, 0,255, 0, 0,255,255,255, 0,255, // ô, + 5, 0, 1, 1, 0, 0, 0, 0, 8, 2, 0, 0, 2, 1, 0, 0, 0,255, 1, 2, 0, 0, 4, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0,255,255,255, 0,255,255,255, 2, 0,255, 0, 0,255,255,255, 42,255, // ơ, + 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255,255,255,255,255,255,255,255, 0,255,255,255,255, 0,255,255,255,255, // ù, + 0, 0, 0, 1, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0,255,255,255, // ú, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, 0,255, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // û, + 1, 0, 1, 4, 4,255, 0, 7, 28, 1, 0, 0, 4, 0, 3, 0, 0, 0, 4, 7, 9, 0, 2, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0,255,255,255, 0,255,255,255, 15, 0,255,255, 0,255,255,255, 0,255, // ư, + 0, 0,255,255,255,255,255,255,255,255,255,255, 0,255,255, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ÿ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, ̀, ̉, ̃, à, á, â, ă, è, é, ê, ́, í, î, ï, đ, ̣, ó, ô, ơ, ù, ú, û, ư, ÿ, + ], + central: [ + 0, 42, 11, 51, 30, 3,154, 77, 18, 20, 23, 0,139, 0,254, 1, 0, 79, 0, 70,132,121, 0, 52,241, 14, 5, 7, 17, 8, 74, 0, 58, 0, 8, 33, 36, 9, 1,105, 8, // , + 0, 70, 0, 11,146, 0,115, 0, 11, 23, 12, 0, 0, 0, 2, 0, 0, 66, 0,107, 1, 0, 0, 0, 10, 0, 10, 25, 0, 8, 1, 0, 1, 0, 9, 0, 0, 0, 0, 0, 48, // a, + 0, 0, 0, 0, 18, 0, 1, 1, 0, 4, 2, 0, 32, 0, 1, 0, 0, 0, 0, 2, 11, 5, 0, 0, 2, 0, 0, 0, 0, 0, 8, 0, 4, 6, 1, 0, 10, 0, 0, 1, 0, // b, + 0, 2, 62, 0, 0, 0, 9, 45, 9, 0, 5, 0, 47, 0, 8, 0, 0, 0, 0, 0, 5, 31, 0, 3, 33, 3, 0,255, 14, 0, 8, 0, 0, 0, 0, 0, 13, 0, 1, 48, 0, // c, + 0, 0, 0, 0, 10, 6, 1, 19, 0, 0, 2, 0, 23, 0, 6, 
0,255, 0, 0, 0, 11, 28, 0, 5, 8, 0, 0, 0, 0, 0, 33, 0, 4, 2, 0, 1, 10, 0, 1, 0, 0, // d, + 0, 70, 0, 0,111, 0, 16, 0, 27, 0, 36, 0, 0, 0, 0, 0,255, 61, 0,145, 5, 0, 0, 0, 0, 0, 0,133, 0, 0, 0, 0, 2, 0, 64, 0, 0, 1, 0, 0, 34, // e, + 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // f, + 0, 0, 0, 0, 2, 0, 2, 5, 0, 0, 0,255, 64, 0, 2, 0, 0, 0, 0, 0, 61, 9, 0, 0, 4, 0, 0,255, 0, 0, 7, 0, 1, 2, 0, 0, 13, 0, 4, 0, 0, // g, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 38, 0, 0, 4, 11, 2, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 4, 0, 0, 5, 0, // h, + 0,110, 0, 0,103, 0, 0, 0,157, 0, 0, 0, 2, 1, 7, 0, 0,225, 0,177, 0, 0, 0, 0, 0, 7, 0, 6, 0, 0, 6, 0, 5, 0, 24, 0, 0, 0, 0, 0,242, // i, + 0, 7, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0,255, 0,255, 27, 1, 0, 0, 10, 9, 0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 8, 0, 0, 0, 0, // j, + 0,193, 0, 0,210, 0, 21, 0, 1, 12, 8, 0, 21, 0, 0, 0, 0, 1, 0,186, 42, 8, 0, 8, 19, 5, 0, 0, 0, 0, 13, 0, 9, 10, 2, 0, 1, 2, 5, 2, 2, // k, + 0, 10, 15, 0, 1, 0, 0, 0, 0, 0, 2, 0, 92, 3, 12, 0, 0, 0, 0, 31, 48, 0, 0, 15, 9, 4, 0, 0, 0, 0, 24, 1, 13, 16, 0, 1, 7, 1, 56, 2, 0, // l, + 0, 2, 9, 0, 1, 0, 1, 0, 0, 2, 0, 0, 22, 4, 16, 0, 0, 0, 0, 0, 15, 0, 0, 7, 27, 10, 0, 0, 0, 0, 4, 0, 1, 2, 0, 4, 4, 0, 0, 22, 0, // m, + 0, 37, 15, 0, 39, 6, 14, 0, 1, 5, 20, 0, 76, 90, 10, 0, 0, 6, 0,121, 54, 0, 0, 16, 20,238, 0, 0, 0, 0, 37, 2, 2, 17, 1, 0, 4, 1, 4, 1, 0, // n, + 0, 16, 0, 3, 7, 0, 93, 0, 7, 21, 12, 0, 0, 0, 0, 0, 0, 2, 1, 11, 4, 0, 0, 0, 5, 0, 0, 1, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, // o, + 0, 23, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 9, 0, 3, 0, 1, 0, 0, 0, 44, 38, 0, 3, 2, 0,255, 0, 0, 0, 6, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, // p, + 255, 0,255,255, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0,255,255,255,255, 0, 0,255, 0,255,255, 0,255, 0,255,255, // q, + 0, 1, 11, 0, 1, 5, 4, 0, 0, 0, 0, 0, 96, 14, 70, 1, 0, 0, 0, 15, 68, 0, 0, 8, 20, 4, 0, 0, 0, 0, 
58, 3, 5, 27, 0, 0, 14, 4, 3, 3, 0, // r, + 0, 1, 0, 0, 10, 0, 0, 6, 0, 9, 4, 0,133, 0, 15, 0, 0, 0, 0, 1,147, 9, 0, 11, 27, 2, 0, 0, 66, 0, 11, 5, 13, 12, 3, 4, 9, 6, 2, 5, 0, // s, + 0,254, 0, 0, 0, 0, 3, 10, 74, 0, 0, 0, 63, 7, 46, 5, 0, 0, 0, 6, 30, 23, 0, 22, 45, 3, 0, 0, 0, 0, 6, 1, 5, 13, 0, 0, 7, 0, 3, 2, 0, // t, + 0, 7, 0, 0, 35, 0, 35, 0, 10, 6, 4,255, 1, 9, 17, 0, 0, 16, 0, 22, 0, 0, 0, 0, 0, 0, 0, 42, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, // u, + 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 1, 0, 0, 0, 0, 8, 38, 0, 0, 2, 12, 1, 0, 0,255, 0, 2, 9, 6, 14, 0, 6, 2, 6, 0, 8, 0, // v, + 0, 0, 33, 0, 0, 1, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,113, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // w, + 255, 0,255,255, 0,255, 0,255,255,255,255,255, 0,255, 0, 0,255, 0,255, 0, 3,255, 0, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, // x, + 255, 0, 0, 0, 0, 0, 37, 0, 0, 0, 37,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // y, + 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 46, 0, 9, 2, 0, 0,255, 0, 8, 29, 0, 4, 5, 0, 0,255, 0,255, 4, 1, 8, 31, 0, 1, 42, 1, 3, 3, 0, // z, + 0, 0,255,255,255, 0,255,255, 0, 0,255, 0, 0, 0, 0, 1,255,255, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255, 0,255,255, // ß, + 180, 89, 2, 0, 5, 92, 0, 23, 0,166, 40, 4, 4, 15, 12,133,125, 0, 60, 0, 3, 37, 10, 0, 0, 13, 5,255, 9,255, 0, 8,255,255, 0,255, 0,255, 0, 4, 0, 0, 0, 0, 0,255, 3, 0, 0, 0, 1, 3, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 3,255, // š, + 53, 6, 0, 0, 0, 25, 0, 0, 0, 6, 4, 0, 0, 1, 0, 89, 0, 0, 0, 0, 0, 1, 0, 1,255, 5, 0,255,255, 0,255,255, 0, 0, 0,255,255, 0,255, 0,255,255,255,255, 0,255,255,255, 6,255, 0,255,255,255,255, 0,255, 0, 0,255,255,255,255, 0,255,255,255,255, // ś, + 1, 13, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,255, 1, 28, 0, 0, 0,255,255, 3, 
0,255, 2,255, 0, 0,255,255,255,255,255,255,255, 0,255,255, 1,255,255,255, 0, 0, 0,255, 0, 3,255, 0,255,255, 0, 0, 0,255,255,255, 0, 2,255,255, 0,255, // ť, + 96, 34, 0, 0, 21, 89, 0, 0, 0, 28, 0, 0, 5, 1, 3,221, 0, 0,127, 0, 0, 77, 1, 0, 0, 2, 0,255, 0,255, 0, 0,255,255, 0,255, 0,255, 0, 3,255, 0, 0, 5, 0,255, 0, 1, 0,255, 5, 5,255, 0, 0,255, 0, 0, 4,255,255, 0, 2, 1,255, 0, 1,255, // ž, + 4, 7, 0,255, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0,255,255, 0,255,255, 0, 0, 0,255,255, 0,255,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255, 3, 0,255,255,255,255,255,255,255,255,255, // ź, + 18,129, 4, 3, 21, 21, 0, 23, 1, 42, 0, 15, 0, 3, 0, 40, 11, 0, 2, 40, 2, 56, 0, 19,255, 32, 15,255, 0, 0,255, 0, 0, 0, 4,255,255, 0,255, 0,255, 0, 0,255, 0,255, 0, 0, 2, 0, 0, 0,255,255,255,255,255, 30, 0,255, 0,255, 0,255,255,255,255,255, // ł, + 0, 0, 2, 5, 3, 0, 0, 2, 0, 38, 44, 9, 8, 1, 16, 0, 0, 0, 7, 9, 5, 0, 0, 7, 0, 0, 28,255, 0, 0,255, 0, 0, 7, 0, 0,255, 2,255,255,255,255,255,255, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255,255,255,255, 0, 0, // ą, + 140, 48, 0, 1, 0, 50, 0, 0, 0, 33, 0, 0, 0, 0, 1, 19, 0, 0, 4, 0, 0, 12, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255, 0, 2, 3, 0,255,255,255,255, 0, 0,255,255, 0, 4,255,255,255,255,255,255,255, 0,255,255, 0,255, 0,255, 0, // ş, + 8, 3, 0, 0, 7, 56, 0, 0, 4, 1, 0, 0, 0, 0, 0, 13, 0,255, 0, 0, 0, 2, 0,255,255, 0, 0,255, 0,255,255, 0,255,255,255,255, 0,255,255, 5,255, 0,255, 0,255,255, 0,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255,255, 0,255,255, 0,255, // ľ, + 30, 16, 0, 0, 3, 26,255, 0, 0, 5, 0, 5, 0, 0, 1, 27, 0,255, 1, 0, 0, 24, 0, 0, 0, 10, 0,255,255, 0,255,255, 0, 1, 11,255,255, 0,255,255,255,255, 0,255, 0,255,255, 0, 9, 0,255,255,255, 0,255, 0,255, 5, 0,255,255,255,255,255,255,255,255,255, // ż, + 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255, 0, 0, 0,255, 0,255,255,255, 0,255,255,255,255, 
0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ŕ, + 37, 0, 10, 9, 29, 2, 7, 20, 20, 13, 34, 45, 62, 52,115, 0, 15, 0, 97, 50, 87, 8, 57, 0, 0, 14, 82,255, 0, 0, 0, 3, 0, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0, 0, 7, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 3,255, 0, 0, 0, 0, 0, // á, + 0, 0, 1, 19, 2, 0, 5, 2, 5, 0, 0, 0, 1, 48, 1, 0, 8, 0, 17, 2, 12, 0, 9, 0, 0, 0, 1,255, 0,255,255, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255, 0, 0, 0, 0,255,255,255,255, 0,255, 0,255, 0,255,255,255,255,255,255,255,255, 0,255, 0, // â, + 0, 0, 14,106, 14, 0, 9, 21, 1, 0, 0, 0, 51, 27, 62, 0, 36, 0, 72, 66,133, 7, 12, 0, 1, 0, 28,255, 0,255,255, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255,255, 0, 0,255,255,255,255,255, 0,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 35, // ă, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0,255, 0,255, 0,255,255,255,255, 0,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 0,255,255,255, 0,255,255,255,255, 0, 0,255, // ä, + 0, 0, 0,255, 5, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 1, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ĺ, + 15, 18, 0, 0, 0, 91, 0, 0, 0, 96, 0, 1, 0, 0, 0, 29,156,255, 1, 0, 0, 37, 0, 0, 0, 4, 0,255, 14, 40,255, 0, 0, 0, 1,255,255, 0,255, 0,255,255,255,255, 0,255, 0, 0, 1,255, 0, 0,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255, // ć, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255, 0, 0,255, // ç, + 118,219, 32, 0, 1, 87, 0, 0, 2,176, 3, 1, 3, 12, 26,106, 2, 0, 12, 4, 0, 54, 5, 0, 0, 1, 0,255, 76, 0, 0, 0,255, 
0,255,255, 0,255, 0, 6, 0, 0, 3, 0, 0,255, 0, 0,255,255, 0, 3,255,255, 0,255, 0, 0, 0,255, 0, 0, 0, 10,255, 0, 0,255, // č, + 106, 1, 6, 3, 16, 0, 11, 14, 4, 2, 5, 60, 49, 41, 86, 0, 11, 0, 28, 50, 74, 1, 32, 0, 0, 10, 33, 0, 0,255, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255,255,255, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, // é, + 0, 0, 4, 1, 2, 0, 0, 4, 0,101, 53, 10, 5, 7, 32, 0, 2, 0, 14, 2, 41, 0, 0, 7, 0, 0, 26,255, 0, 0,255, 0, 0, 5, 0, 0, 0, 1,255,255,255,255,255,255, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255, 0, 0, // ę, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255,255,255, 0,255,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ë, + 0, 0, 10, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 36, 54, 0, 6, 0, 0, 0, 23, 0, 37, 0,255, 0, 0,255, 0,255, 0, 0, 0,255,255,255,255,255,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255,255,255,255, 0, // ě, + 11, 1, 6, 25, 14, 2, 1, 2, 5, 3, 21, 5, 12, 21,177, 0, 7, 0, 38, 11, 28, 1, 26, 0, 0, 7, 14,255, 29, 0, 0, 7, 0,255,255, 0, 0,255, 0, 4,255, 0,255,255, 0, 0, 14, 0,255,255, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 51,255, 0, 0, 0,255, 0, // í, + 239, 0, 1, 0, 2, 1, 0, 0, 1, 3, 2, 9, 8, 7, 7, 0, 1, 0, 7, 3, 5, 0, 4, 0, 0, 0, 0,255,255,255,255,255,255,255,255, 1,255,255,255, 0, 0, 0,255,255,255, 0, 0,255, 0, 0,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, // î, + 4, 4, 0,255, 0, 4,255,255,255, 0, 0,255, 0, 0, 0, 0,255,255, 0,255, 0, 0, 1,255,255, 0, 0,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255, 0,255,255,255, 0, 0,255,255, 0, 0,255, 0,255,255, 0, 0,255,255,255,255,255, 0,255,255,255,255, // ď, + 4, 31, 0,255, 0, 41,255, 0, 0, 1, 0, 0, 0, 0, 1, 69, 0,255, 4, 0,255, 3, 0, 0,255, 0, 0,255, 0,255,255, 63,255,255,255,255,255,255,255, 0,255,255,255,255,255,255, 
0,255,255,255,255,255,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255, // đ, + 0, 48,255, 0, 0, 16,255,255, 0, 12, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 1,255, 0,255, 6, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255, 0,255,255,255,255,255,255,255, 0, 0, 0,255, 0,255,255, 0, 0,255, 0, 0,255,255,255, 0, 0,255,255,255,255, // ń, + 0, 1, 0, 0, 3, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255, 0, 0,255,255,255, 0,255,255, 0, 0,255,255, 0, 0,255, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 0,255, // ň, + 3, 0, 8, 9, 10, 2, 3, 53, 3, 17, 6, 22, 42, 5, 7, 0, 26, 0, 52, 7, 32, 0, 2, 31, 0, 2, 20,255, 0, 0,255, 0, 0, 16,255,255,255, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255,255, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, // ó, + 0, 0, 0, 1, 3, 0, 0, 0, 1, 0, 0, 3, 0, 4, 0, 0, 13,255, 1, 0, 0, 0, 1, 0, 0, 0, 0,255,255,255,255, 0,255, 0,255,255, 0, 0,255,255,255,255,255,255,255, 0,255, 0,255,255,255, 0,255,255, 0,255,255, 0, 0,255,255,255,255,255,255,255,255,255, // ô, + 4, 0, 5, 0, 12, 0, 7, 1, 0, 0, 0, 5, 22, 0, 9, 0, 0,255, 9, 13, 33, 0, 2, 0, 0, 3, 23,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255,255,255,255, 0,255, 0,255,255,255, 0,255,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0,255,255, // ő, + 13, 0, 2, 0, 6, 0, 5, 3, 2, 0, 1, 57, 7, 1, 5, 0, 0,255, 8, 2, 22, 0, 1, 0, 0, 2, 15,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0,255,255, 0,255,255, 0, 0, 0,255,255,255, 0,255,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0,255,255, // ö, + 16, 6, 5, 0, 3, 4, 0, 0, 2, 2, 0, 4, 0, 1, 0, 12, 66,255, 0, 0, 19, 2, 1, 0, 0, 2, 1,255, 0,255,255, 0,255,255,255,255,255,255, 0, 9,255,255,255,255,255,255, 0, 0,255, 0, 3, 5,255,255,255,255, 0, 0,255,255,255, 0, 0, 2,255,255, 0,255, // ř, + 0, 0, 0, 2, 5, 0, 0, 0, 2, 0, 1, 5, 4, 3, 3, 0, 7, 0, 10, 1, 4, 0, 3, 0, 0, 0, 1, 0, 0,255, 0, 0,255, 0,255, 0,255,255,255, 0,255,255,255,255, 0,255, 0, 0,255,255,255, 0,255, 0,255,255, 0, 0,255,255, 0, 0, 0, 
0,255, 0, 0,255, // ů, + 27, 0, 0, 36, 2, 0, 0, 1, 1, 4, 26, 8, 4, 2, 8, 0, 1, 0, 14, 40, 14, 0, 2, 0, 0, 1, 4,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0,255, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, // ú, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0,255, 4, 3, 3, 0, 1, 0, 0, 2, 1,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255,255, 0,255,255,255,255,255, 0,255, 0,255,255,255, 0, 0, 0,255,255, // ű, + 5, 0, 2, 0, 0, 0, 3, 1, 1, 0, 0, 8, 2, 2, 1, 0, 15, 0, 14, 4, 4, 0, 1, 1, 0, 3, 13,255, 0,255, 0, 0,255,255,255, 0,255, 0,255, 0,255,255, 0,255,255, 0, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0,255, 0, 0,255,255, 0, 0, 0,255,255, // ü, + 0, 0, 4, 0, 2, 0, 0, 0, 3, 0, 0, 52, 4, 1, 69, 0, 0,255, 13, 0, 11, 0, 56, 0,255, 0, 1,255, 0,255,255, 0,255,255,255,255,255,255,255, 0,255,255, 0,255,255,255, 0, 0,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0, 0, 0, 0,255, 0, 0,255, // ý, + 20,111, 1, 33, 0, 31, 0, 0, 0, 31, 0, 0, 5, 1, 78, 5, 2, 0, 10, 1, 0, 25, 2, 0, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255, 0, 0, 25,255,255,255,255,255, 0,255,255,255, 0, 4,255,255,255,255, 0,255,255,255,255,255, 0,255,255,255, 0, // ţ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, ś, ť, ž, ź, ł, ą, ş, ľ, ż, ŕ, á, â, ă, ä, ĺ, ć, ç, č, é, ę, ë, ě, í, î, ď, đ, ń, ň, ó, ô, ő, ö, ř, ů, ú, ű, ü, ý, ţ, + ], + cyrillic: [ + 0, 0, 0, 0, 1, 0, 16, 38, 0, 2, 5, 10,121, 4, 20, 25, 26, 53, 9, 5, 61, 23, 20, 26, 15, 95, 60, 2, 26, 15, 25, 29, 0, 14, 6, 6, 25, 1, 0, 27, 25, 8, 5, 39, // , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, // ѓ, + 0, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 2, 0, 0, 0, 0, 0, 0, 0, 
0,255, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ђ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 4, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // љ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0,255, 5, 0, 0, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // њ, + 0, 0,255, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ћ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // џ, + 7, 0, 0,255,255,255,255,255, 0, 1, 0,255,255,255, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 1, 0, 0, 0, 1, // ў, + 12, 0, 0,255,255, 0,255,255, 0, 2, 0, 0, 0, 0, 2, 3, 15, 5, 5, 0, 0, 4, 0, 0, 21, 15, 10, 17, 0, 6, 14, 4, 6, 0, 3, 1, 8, 1, 0, 0, 0, 2, 0, 0, 0, 0, // і, + 0, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ё, + 6, 0, 0,255,255,255,255,255, 0, 0,255, 5,255, 0, 1, 7, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // є, + 12, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 1, 0, 0, 0, 2, 0, 0, 20,255, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, // ј, + 9, 0, 0,255,255,255,255,255,255, 5,255, 0, 0, 13, 3, 3, 0, 4, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 1, 3, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ї, + 32, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, 0, 28, 0, 23, 22, 26, 22, 19, 0, 3, 12, 5, 0, 44, 38, 18, 58, 1, 21, 44, 17, 54, 1, 2, 28, 5, 8, 3, 1, 9, 0, 12, 0, 0, 0, // а, + 40, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 7, 0, 0, 0, 1, 7, 0, 1, 1, 0, 0, 7, 4, 1, 9, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // б, + 31, 0, 0, 0, 0, 0, 0, 0, 0, 11, 
0, 3, 0, 0, 19, 0, 0, 1, 1, 6, 0, 2, 6, 0, 1, 0, 1, 0, 32, 0, 2, 2, 23, 9, 0, 0, 0, 1, 0, 0, 1, 1, 0, 3, 0, 2, // в, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 1, 20, 0, 0, 1, 0, 9, 0, 0, 9, 7, 0, 5, 2, 18, 11, 0, 8, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 13, 0, 3, // г, + 26, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 2, 0, 2, 19, 0, 1, 5, 0, 13, 2, 2, 3, 2, 0, 6, 1, 12, 30, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, // д, + 12, 0, 0, 1, 4, 5, 0, 0, 0, 0, 0, 0, 24, 1, 5, 7, 11, 3, 12, 1, 6, 6, 11, 0, 3, 15, 14, 14, 4, 8, 25, 14, 29, 0, 1, 1, 4, 8, 8, 2, 0, 3, 1, 0, 0, 0, // е, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 3, 2, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ж, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 6, 0, 0, 0, 11, 8, 0, 0, 8, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, // з, + 24, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 1, 0, 1, 10, 16, 21, 22, 0, 6, 5, 6, 1, 15, 15, 8, 38, 2, 4, 27, 9, 15, 0, 3, 8, 12, 7, 6, 1, 0, 0, 0, 0, 0, 0, // и, + 6, 0, 0, 0,255,255,255,255, 0, 7, 0, 0,255, 4, 21, 0, 0, 0, 0, 5, 0, 0, 39, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 3, 0, 0, // й, + 54, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 10, 0, 1, 0, 1, 11, 0, 0, 12, 0, 1, 2, 0, 4, 8, 0, 2, 23, 2, 4, 0, 2, 3, 3, 8, 0, 0, 3, 16, 1, 4, 3, // к, + 12, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 6, 0, 4, 29, 12, 4, 5, 2, 18, 0, 0, 17, 4, 5, 11, 0, 0, 21, 2, 3, 4, 1, 15, 1, 0, 0, 0, 0, 0, 4, 3, 2, 12, 0, 2, // л, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 17, 1, 0, 0, 0, 7, 0, 1, 13, 2, 0, 0, 0, 0, 13, 0, 2, 4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 4, 2, 4, 1, 1, // м, + 42, 0, 0, 0, 0, 0, 0, 0, 4, 12, 6, 7, 1, 7, 76, 0, 22, 1, 4, 27, 1, 3, 34, 30, 0, 7, 1, 13, 24, 1, 3, 5, 3, 4, 0, 1, 0, 4, 1, 0, 2, 18, 7, 16, 0, 4, // н, + 37, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 10, 27, 22, 15, 1, 2, 3, 7, 5, 32, 11, 7, 38, 8, 21, 24, 11, 23, 0, 2, 10, 2, 2, 3, 2, 0, 0, 1, 0, 0, 0, // о, + 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
4, 0, 1, 0, 0, 2, 0, 1, 2, 4, 0, 0, 2, 0, 6, 0, 0, 5, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, // п, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 8, 0, 5, 47, 4, 6, 6, 5, 23, 0, 0, 5, 2, 6, 0, 0, 0, 23, 22, 0, 1, 14, 9, 1, 0, 1, 0, 0, 0, 7, 2, 8, 16, 0, 3, // р, + 53, 0, 0, 0, 0, 0, 0, 0, 4, 9, 2, 0, 1, 2, 21, 1, 4, 1, 2, 11, 0, 0, 12, 2, 4, 7, 1, 13, 15, 1, 4, 6, 3, 6, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 0, 1, // с, + 28, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 1, 0, 1, 32, 0, 1, 3, 0, 12, 0, 1, 22, 1, 4, 7, 1, 6, 23, 0, 14, 41, 14, 3, 0, 1, 1, 1, 21, 0, 2, 2, 6, 2, 1, 4, // т, + 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 2, 4, 2, 4, 6, 3, 0, 2, 0, 0, 6, 5, 6, 3, 0, 3, 7, 4, 7, 18, 1, 6, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, // у, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ф, + 41, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 30, 0, 2, 0, 0, 11, 0, 0, 5, 1, 14, 3, 0, 3, 6, 0, 7, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 4, 3, 5, 0, 0, // х, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0, 0, 0, 4, 0, 0, 7, 1, 0, 1, 0, 2, 1, 0, 0, 9, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, // ц, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 1, 5, 0, 2, 0, 0, 6, 0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 1, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, // ч, + 12, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 17, 0, 0, 1, 0, 2, 0, 0, 26, 0, 0, 0, 0, 0, 22, 2, 6, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, // ш, + 2, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, // щ, + 0, 0,255,255,255,255, 0,255, 0, 0, 0,255,255,255, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 11, 0, 1, 0, 0, 2, 2, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ъ, + 1, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 3, 11, 0, 4, 0, 2, 1, 0, 0, 0, 3, 1, 16, 0, 0, 22, 2, 10, 0, 0, 0, 8, 6, 3, 0, 0, 0, 0, 0, 0, 0, // ы, + 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 
7, 3, 0, 1, 13, 7, 7, 0, 35, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, // ь, + 10, 0, 0,255,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 1, 1, 10, 11, 0, 2, 2, 0, 0, 0, 9, 3, 9, 0, 0, 7, 6, 9, 0, 0, 8, 3, 2, 1, 0, 0, 0, 0, 17, 0, 0, // э, + 14, 0, 0, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ю, + 5, 0, 0,255,255,255,255,255, 0, 9, 0, 0,255, 0, 11, 0, 3, 0, 0, 0, 0, 2, 24, 0, 0, 5, 2, 14, 1, 0, 2, 3, 1, 0, 0, 1, 3, 0, 0, 0, 0, 16, 1, 0, 0, 0, // я, + // , a, ѓ, ђ, љ, њ, ћ, џ, ў, і, ё, є, ј, ї, а, б, в, г, д, е, ж, з, и, й, к, л, м, н, о, п, р, с, т, у, ф, х, ц, ч, ш, щ, ъ, ы, ь, э, ю, я, + ], + western: [ + 18, 3, 0,254, 74, 0, 5,254,254, 2, 25,254,149, 4,254, 66,148,254, 0,254,122,238, 8, 1, 20, 13,254, 35, 20, 3, 1, 0, // , + 0, 3, 0, 0, 0, 0, 0, 5, 2, 0, 86, 9, 76, 0, 0, 0,241, 0, 0, 49, 0, 0, 0, 0, 11, 2, 0, 34, 0, 1, 2, 0, // a, + 19, 0, 0, 5, 5, 0, 0, 8, 13, 5, 0, 34, 22, 0, 0, 0, 4, 0, 0, 0, 6, 1, 3, 3, 42, 37, 8, 8, 0, 67, 0, 0, // b, + 0, 0, 0, 9, 6, 1, 0, 22, 10, 1, 0, 19, 54, 1, 0, 1, 18, 3, 1, 2, 40, 7, 0, 0, 6, 0, 3, 5, 1, 34, 0, 0, // c, + 0, 0, 0, 5, 5, 0, 0, 12, 45, 16, 1, 6, 42, 0, 13, 3, 10, 0, 2, 0, 66, 11, 5, 8, 33,104, 3, 4, 0, 19, 0, 0, // d, + 63, 5, 0, 0, 0, 0, 2, 33, 15, 1, 3, 0, 87, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 49, 1, 11, 0, 3, 0, 9, 1, 0, // e, + 0, 0, 0, 8, 8, 0, 0, 10, 2, 7, 0,162, 23, 0, 13, 0, 4, 0, 0, 0, 1, 3, 0, 0, 15, 4, 0, 0, 0, 4, 0, 0, // f, + 1, 0, 0, 14, 16, 24, 0, 29, 11, 41, 0, 13, 86, 0, 14, 9, 3, 0, 0, 0, 20, 8, 7, 7, 13, 37, 14, 0, 0, 12, 0, 0, // g, + 1, 0, 0, 0, 0, 0, 0, 47, 2, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 29, 20, 0, 0, 0, 0, 45, 0, 0, // h, + 5, 4, 0,166,120, 0, 0,144, 0, 2, 3, 88,254, 0, 0, 0, 0, 0, 0, 3, 28,107, 0,112, 8, 2, 44, 32, 0, 3, 3, 0, // i, + 0, 0, 0, 0, 0, 0, 0, 39, 9, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 16, 18, 44, 0, 0, 0, 0, 0,255, // j, + 0, 2, 0, 0, 1, 0, 0, 48, 31, 32, 1, 60, 1, 0, 4, 0, 1, 0, 0, 0, 1, 3, 
0, 2, 20, 47, 0, 0, 0, 20, 0, 0, // k, + 4, 0, 0, 12, 16, 0, 0, 54, 40, 48, 0, 64, 36, 0, 39, 6, 12, 3, 0, 0, 27, 9, 3, 24, 42, 33, 2, 9, 7, 77, 0, 0, // l, + 0, 0, 0, 14, 5, 4, 0, 60, 11, 4, 3, 48, 30, 7, 28, 1, 10, 1, 0, 0, 24, 41, 3, 3, 19, 24, 1, 8, 2, 36, 0, 0, // m, + 1, 1, 0, 24, 91, 16, 0,132, 62, 73, 1, 56, 71, 33, 78, 7, 35, 2, 3, 0, 94,254, 10, 21, 33, 38, 24, 21, 1, 61, 0, 0, // n, + 0, 1, 0, 0, 0, 0,254, 6, 0, 1, 27, 0, 13, 0, 0, 84,127, 0, 0, 62, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, // o, + 0, 0, 0, 5, 2, 0, 0, 9, 15, 0, 0, 4, 34, 0, 6, 0, 6, 0, 0, 0, 20, 12, 9, 28, 10, 22, 0, 3, 0, 7, 0, 0, // p, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 33, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // q, + 0, 0, 0, 83, 62, 1, 0,198,139,125, 0,229, 94, 54,190, 38, 18, 1, 0, 0,176, 24, 16, 29,193,181, 13, 13, 2,131, 0, 0, // r, + 1, 0, 0, 41, 34, 0, 0, 41, 24, 42, 0, 68,113, 15,159, 6, 43, 19, 4, 58, 14, 18, 1, 4, 48, 42, 4, 12, 9, 20, 0, 0, // s, + 7, 1, 0, 14, 20, 8, 0, 56, 37, 31, 0,104, 67, 14,113, 3, 50, 9, 5, 0, 89, 7, 19, 22, 13, 14, 40, 12, 15, 18, 0, 0, // t, + 0, 1, 5, 1, 2, 0, 0, 30, 0, 0, 1, 15, 2, 0, 1, 0, 1, 0, 0, 2, 4, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, // u, + 0, 2, 0, 1, 6, 0, 0, 29, 33, 13, 0, 19, 46, 0, 15, 0, 7, 0, 1, 31, 2, 2, 3, 1, 32, 27, 0, 0, 1, 1, 0, 0, // v, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0,255, // w, + 0, 0, 0, 1, 16, 0, 0, 23, 0, 0, 0, 3, 14, 0, 0, 0, 2, 3, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, // x, + 0, 0, 0, 0, 0, 0, 0, 58, 8, 0, 0, 1, 1, 62, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 82, 0, 0, 0, 0, 0,255, // y, + 0, 0, 0, 0, 2, 0, 0, 0, 14, 0, 0, 7, 3, 0, 6, 0, 3, 5, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, // z, + 0, 29, 0, 0, 0, 15, 0, 0, 0, 11, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 4, 0, 0,255,255, 0,255, 0,255, 0, 0,255,255,255, 0, 0, 0, 8, 0,255, 0, 0, 2, 0, 0, // ß, + 6, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 
0, 0, 10, 1, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // š, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255, 0,255,255, // œ, + 107, 0, 22, 16, 18, 14, 6, 24, 46, 15, 2, 0, 42, 18, 17, 0, 36, 0, 34, 4,254, 1, 2, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, // à, + 41, 0, 10, 8, 21, 34, 5, 5, 60, 18, 5, 1, 29, 42, 26, 2, 16, 0, 27, 9, 43, 28, 7, 0, 0, 1, 4, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0,255, // á, + 24, 0, 1, 2, 0, 0, 0, 0, 7, 0, 0, 0, 3, 1, 0, 0, 0, 0, 2, 0, 5, 0, 1, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 2, 0,255, 0,255, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255, 0,255, // â, + 0, 0, 0, 1, 2, 3, 0, 1, 2, 12, 0, 0, 1, 7, 29, 4, 1,255, 11, 66, 11, 0, 1, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255,127,255,255,255,255,255, 0, 0,255, 0, 0,255,255, 0,255,255,255,255,255,255,255,255, // ã, + 134, 1, 11, 0, 25, 6, 15, 11, 61, 24,123, 95,114, 68, 53, 1, 49, 0, 60, 98,198, 0, 88, 29, 0, 6, 12, 0, 0,255, 0,255, 0, 0,118, 0,255, 0,255, 0,255, 0,255, 0,255,255, 0,255,255, 0,255, 2,255,255,255, 0, 0, 0,255, // ä, + 156, 0, 12, 14, 19, 3, 12, 47, 17, 3, 12, 5, 30, 47, 22, 0,205, 0,184, 70, 19, 0, 22, 8, 0, 6, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // å, + 26, 0, 7, 0, 4, 0, 23, 8, 15, 0, 18, 19, 56, 23, 24, 0, 9, 0, 82, 37, 24, 0, 71, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255, 0,255, // æ, + 17,112, 0, 2, 0, 15, 0, 0, 0, 35, 0, 0, 2, 0, 59, 9, 1, 0, 36, 0, 0, 8, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0,255,255, // ç, + 254, 0, 9, 14, 20, 0, 15, 6, 70,144, 14, 45, 47, 92, 16, 3,123, 0, 38, 23,115, 52, 22, 42, 2, 80, 19,255, 0,255, 0, 0,255,255, 0,255,255, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 1,255,255, // è, + 152, 2, 19, 24, 85, 0, 29, 23, 26, 25, 2, 9, 43, 60, 62, 1, 32, 0,122, 45,169, 15, 13, 30, 7, 4, 8, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, // é, + 5, 0, 0, 3, 7, 0, 0, 10, 2, 3, 0, 26, 6, 6, 20, 1, 2, 0, 20, 1, 11, 5, 5, 2, 0, 0, 1,255, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0,255, // ê, + 36, 2, 23, 15, 36,143, 5, 23, 52, 52, 66, 48, 92, 57,216, 10,125, 35, 89, 58,254, 9, 24, 14, 0, 0, 8,255, 0,255, 0,255,255,255, 0, 0,255, 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0, 0,255, // ë, + 12, 0, 1, 4, 6, 0, 3, 21, 10, 0, 0, 0, 18, 8, 4, 0, 1, 0, 65, 35, 8, 3, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0,255, // ì, + 40, 72, 7, 10, 16, 2, 23, 10, 34, 0, 0, 1, 34, 15, 21, 1, 3, 0,203, 28, 58, 23, 11, 0, 10, 0, 2, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0,255,255, 0,255, 2, 0,255, // í, + 6, 5, 1, 9, 5, 0, 0, 0, 22, 0, 9, 8, 8, 6, 9, 1, 10, 0, 20, 6,182, 0, 13, 0, 0, 24, 1,255, 0,255,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255,255,255, // î, + 0, 6, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, // ï, + 0,254, 0, 0, 0, 26, 0, 0, 0, 61, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255, // ñ, + 20, 0, 56, 43, 8,162, 14, 3, 23, 19, 2,118, 31, 26, 46, 0, 20, 0, 23, 6, 24, 19, 6, 21, 5, 27, 
63,255, 0,255, 0, 0,255,255,255,255,255, 3, 0,255,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, // ò, + 67, 0, 12, 15, 9, 7, 8, 66, 13,254, 3, 23, 14, 16, 16, 0, 8, 0, 29, 11, 26, 0, 5, 5, 1, 10, 13,255, 0,255,255, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0, 0,255, 0, 1, 0, 0, 0, 0,255,255,255, 0,255,255, 0,255, // ó, + 18, 3, 3, 12, 1, 0, 2, 0, 7, 0, 1, 0, 2, 2, 8, 0, 6, 0, 6, 7, 4, 0, 2, 0, 0, 0, 1,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255,255,255, // ô, + 29, 2, 0, 0, 0, 0, 0, 0, 5, 2, 22, 30, 25, 38, 19, 0, 33,255, 4, 39, 24, 0, 88, 0, 0, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 36,255,255,255,255,255, 0,255,255, 0,255, 0, 0, 6, 0,255,255,255, 0, 0, 0,255, // õ, + 44, 0, 33, 0, 25, 0,142, 5, 46, 10, 25, 32, 26, 13, 6, 0, 3, 0, 30, 8, 35, 0, 25, 5, 0, 44, 7, 0, 0,255,255, 0,255,255, 73, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, 39, 0,255,255,255, 0, 0, 0, // ö, + 52, 0, 21, 0, 57, 0,119, 12, 47, 3, 59, 33, 45, 15, 12, 0, 3, 0, 52, 82, 49, 1, 11, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ø, + 25, 0, 4, 3, 53, 0, 0, 2, 12, 72, 0, 0, 30, 0, 0,254, 0, 0, 6, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, // ù, + 19, 2, 1, 7, 9, 1, 12, 5, 9, 41, 1, 0, 10, 7, 9, 0, 8, 0, 12, 28, 8, 0, 0, 0, 0, 1, 0,255, 0,255,255, 0,255,255,255,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255, // ú, + 0, 0, 0, 0, 1, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 45, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0,255, 0, 0,255, 0, // û, + 95, 2, 19, 0, 6, 2,121, 9, 15, 1, 5, 44, 18, 26, 7, 0, 11, 2, 68, 49, 20, 0, 2, 17, 0, 0, 6, 0, 0,255, 0,255,255,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 
0,255, 0, 0, 0, 31, 0, 0, // ü, + 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // ž, + 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, 0,255, 0,255,255,255, 0, 0, 0, // ÿ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, œ, à, á, â, ã, ä, å, æ, ç, è, é, ê, ë, ì, í, î, ï, ñ, ò, ó, ô, õ, ö, ø, ù, ú, û, ü, ž, ÿ, + ], + icelandic: [ + 0, 68, 0, 0, 2,122,156, 5, 1, 1, 5, 1, 0, // , + 0, 2,255, 0, 0, 6, 51, 2, 0, 0, 5, 0, 19, // a, + 0, 1, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, // b, + 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // c, + 0, 0, 0, 2, 0, 2, 0, 0, 0, 5, 0, 1,255, // d, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 22, // e, + 0, 4, 0, 1, 0, 5, 3, 3, 13, 0, 0, 0, 0, // f, + 0, 4, 0, 3, 1, 7, 2, 10, 12, 19, 7, 3, 0, // g, + 0, 2, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, // h, + 0, 1, 0, 0, 0, 0, 69, 0, 0, 0, 1, 2, 4, // i, + 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 8, // j, + 0, 3, 0, 5, 1, 21, 1, 10, 4, 10, 11, 0,255, // k, + 0, 30, 0, 8, 9, 4, 6, 78, 20, 18, 4, 1, 0, // l, + 0, 2, 0, 5, 0, 8, 2, 9, 1, 3, 1, 1, 0, // m, + 0, 9, 4, 4, 0, 11, 2, 18, 11, 6, 13, 3, 0, // n, + 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 3, // o, + 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0,255, // p, + 0,255,255,255, 0, 0,255,255,255,255,255,255,255, // q, + 0, 45, 1, 13, 7, 2, 7, 25, 17, 59, 9, 8, 7, // r, + 0, 8, 1, 2, 1, 37, 13, 5, 0, 1, 9, 9, 0, // s, + 0, 17, 0, 14, 7, 6, 1, 17, 3, 3, 14, 5, 0, // t, + 0, 0, 0, 0, 0, 7, 61, 0, 0, 0, 0, 3, 1, // u, + 0, 5, 0, 2, 0, 3, 4, 3, 0, 9, 0, 0, 6, // v, + 0, 0,255,255,255,255,255, 0, 0, 0,255,255,255, // w, + 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255,255, // x, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // y, + 0, 0,255, 0, 0, 0,255, 0, 0,255, 0, 0,255, // z, + 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0,255, 0,255, 0, // ß, + 83, 0, 2, 0, 4, 0, 2, 3, 6, 0, 16, 3, 9, 23, 5, 0, 2,255, 24, 3, 7, 0, 5, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255,255, 0, 0, 8, // á, + 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,255, 0,255, 0, 0, 0,255, 1, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255, // ä, + 4, 0, 7, 0, 3, 0, 8, 1, 4, 0, 0, 0, 5, 2, 3, 0, 4,255, 18, 5, 8, 0, 8, 0, 0, 0,255,255, 0,255, 0,255, 0, 0, 0,255,255, 0, 0, 2, // æ, + 1, 0, 0, 0, 0, 0, 8, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0,255, 6, 6, 0, 0, 1, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255,255,255,255, 0, // é, + 126, 2, 2, 0, 1, 0, 1, 2, 0, 0, 0, 3, 16, 2, 5, 0, 1, 0, 25, 28, 19, 0, 29, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255,255, 0,255, 0, // í, + 0, 95, 0,255, 0, 24, 5, 5, 0,122, 0, 0, 0, 0, 0, 7, 0,255, 46, 0, 0, 20, 2,255,255, 6,255, 0, 8,255, 23, 0, 36, 0, 13, 7, 21, 1, 2,255, // ð, + 8, 0, 63, 0, 7, 0, 22, 2, 3, 4, 39, 9, 5, 4, 5, 0, 1,255, 13, 8, 21, 0, 5, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 5, // ó, + 9, 0, 1, 0, 0, 0, 1, 2, 8, 0, 18, 2, 10, 4, 3, 0, 0,255, 4, 6, 11, 0, 5, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, // ö, + 17, 0, 3,255, 3, 0, 40, 1, 11, 0, 14, 1, 13, 2, 2, 0, 0,255, 14, 9, 12, 0, 12,255,255, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0,255,255, // ø, + 20, 0, 9, 0, 0, 0, 0, 2, 7, 0, 6, 3, 1, 1, 10, 0, 0,255, 12, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 1, // ú, + 1, 0, 9, 0, 2,255, 0, 0, 0,255, 0, 1, 6, 0, 7,255, 0,255, 1, 4, 9,255, 0,255, 0, 0, 0,255,255,255,255,255,255, 0, 0,255,255, 0,255, 4, // ý, + 87, 1,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255, 0,255, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, // þ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, á, ä, æ, é, í, ð, ó, ö, ø, ú, ý, þ, + ], + greek: [ + 0, 12, 0, 16, 6, 69, 1, 0, 0, 24, 0, 33, 0, 41, 2, 2, 1, 
50, 0, 44, 1, 2,105, 1, 2, 33, 0, 0, 0, 2, 0, 0, 15, 5, 1, // , + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0,255, 0, 0,255,255, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255,255, 0,255,255,255, 0,255, 0,255,255,255, 0,255,255,255,255,255, // ΐ, + 4, 0, 0, 0, 0,255, 0, 0, 1, 1, 0, 0, 0, 0, 0, 4, 7, 4, 3, 3, 0, 0, 3, 6,255, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // ά, + 9, 0,255,255, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 1, 3, 2, 6, 2, 0, 0, 2, 2,255, 0, 5, 0, 1, 1, 0, 0, 0, 0, 0,255,255, // έ, + 3, 0,255,255,255,255, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 10, 3, 1, 2, 0, 0, 0, 2,255, 0, 4, 0, 1, 1, 0, 0, 0,255,255,255,255, // ή, + 0, 0,255, 0, 0,255, 0, 5, 1, 2, 2, 15, 0, 0, 0, 0, 1, 4, 1, 5, 0, 5, 2, 14,255, 3, 4, 0, 1, 1, 0, 0, 0, 0, 0,255,255, // ί, + 46, 0, 0, 0, 1, 0, 22, 0, 4, 3, 4, 0, 1, 0, 2, 20, 34, 8, 15, 23, 1, 0, 13, 18,255, 5, 30, 1, 4, 3, 0, 0, 0, 0, 0, 0, 0, // α, + 10, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // β, + 15, 0, 0, 1, 1, 0, 0, 7, 0, 1, 0, 2, 0, 6, 0, 1, 0, 0, 0, 0, 0, 5, 0, 5,255, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, // γ, + 19, 0, 0, 3, 0, 0, 4, 2, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 3, 0, 4, 0, 0,255, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // δ, + 33, 0, 0, 0, 0, 0, 2, 0, 2, 4, 7, 0, 2, 0, 5, 2, 9, 9, 16, 8, 1, 0, 10, 7,255, 15, 15, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, // ε, + 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ζ, + 10, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 4, 0, 2, 8, 3, 6, 1, 0, 1, 3,255, 14, 41, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, // η, + 5, 0, 0, 1, 0, 1, 0, 5, 0, 0, 0, 2, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,255, 0, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, // θ, + 6, 0, 0, 0, 0, 0, 0, 28, 2, 8, 12, 19, 0, 0, 0, 0, 3, 9, 5, 10, 0, 12, 6, 19,255, 6, 19, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, // ι, + 44, 0, 0, 1, 1, 0, 2, 8, 0, 5, 0, 5, 0, 4, 0, 33, 0, 0, 0, 0, 0, 
3, 0, 2,255, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // κ, + 7, 0, 0, 5, 3, 0, 1, 10, 1, 1, 0, 9, 0, 2, 1, 3, 3, 9, 0, 0, 0, 10, 4, 0,255, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 1, 0, // λ, + 20, 0, 0, 1, 1, 3, 0, 5, 0, 1, 0, 3, 0, 8, 1, 3, 0, 0, 2, 0, 0, 8, 0, 2,255, 5, 0, 4, 0, 0, 0, 2, 0, 0, 3, 1, 0, // μ, + 12, 0, 0, 6, 8, 1, 7, 30, 0, 1, 0, 10, 0, 14, 1, 8, 0, 0, 0, 1, 0, 16, 0, 1,255, 0, 0, 8, 0, 1, 0, 13, 0, 0, 5, 3, 9, // ν, + 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // ξ, + 20, 0, 0, 0, 0, 0, 9, 0, 3, 9, 6, 0, 1, 0, 2, 14, 12, 13, 10, 14, 0, 0, 26, 21,255, 4, 54, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, // ο, + 45, 0, 0, 1, 0, 0, 0, 13, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 5, 0, 0, 6, 0, 0,255, 2, 0, 5, 0, 0, 0, 1, 0, 0, 2, 1, 0, // π, + 3, 0, 0, 5, 5, 1, 2, 17, 3, 4, 4, 19, 0, 3, 1, 4, 5, 0, 0, 0, 0, 14, 12, 0,255, 0, 9, 5, 1, 4, 0, 2, 0, 0, 3, 2, 1, // ρ, + 0, 0, 0, 1, 6, 6, 0, 14, 0, 0, 0, 9, 0, 21, 0, 6, 0, 0, 0, 0, 0, 17, 0, 0,255, 0, 0, 5, 0, 0, 0, 4, 0, 0, 7, 1, 0, // ς, + 43, 0, 0, 3, 2, 3, 4, 10, 0, 0, 0, 4, 0, 6, 0, 13, 0, 0, 0, 1, 0, 7, 0, 0,255, 2, 2, 5, 0, 0, 0, 3, 0, 0, 3, 2, 1, // σ, + 73, 0, 0, 4, 2, 3, 3, 21, 0, 0, 0, 10, 0, 5, 0, 4, 4, 0, 0, 16, 0, 3, 2, 2,255, 35, 0, 5, 0, 0, 0, 3, 0, 0, 7, 1, 1, // τ, + 4, 0,255, 0, 0, 0, 0, 4, 0, 0, 1, 5, 0, 0, 1, 0, 2, 1, 0, 0, 0, 53, 0, 1,255, 7, 1, 0, 1, 0, 0, 0,255,255, 0, 0, 0, // υ, + 6, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0,255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // φ, + 9, 0, 0, 0, 3, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 4,255, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // χ, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, // ψ, + 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 3, 0, 0, 1, 4,255, 1, 6, 0, 1, 1, 0, 0, 0,255, 0, 0, 0, // ω, + 0, 0,255, 0, 0, 0, 0, 1, 0,255,255, 0,255, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 
0,255, 0, 0,255, 0, 0, 0, // ϊ, + 0, 0,255, 0, 0,255, 0, 0,255,255, 0, 0,255,255,255, 0,255, 0,255,255,255, 0,255, 0,255, 0, 0, 0,255,255,255, 0,255,255, 0,255,255, // ϋ, + 6, 0, 0,255, 0,255, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 11, 1, 4, 3, 0, 0, 9, 5,255, 1, 4, 0, 0, 0, 0, 0, 0,255, 0,255,255, // ό, + 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 13, 0, 0,255, 2, 0, 0, 0, 0, 0, 0,255,255, 0,255,255, // ύ, + 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 3,255, 0, 1, 0, 0, 1, 0, 0, 0,255,255,255,255, // ώ, + // , a, ΐ, ά, έ, ή, ί, α, β, γ, δ, ε, ζ, η, θ, ι, κ, λ, μ, ν, ξ, ο, π, ρ, ς, σ, τ, υ, φ, χ, ψ, ω, ϊ, ϋ, ό, ύ, ώ, + ], + turkish: [ + 195,254, 0,140, 0, 12,220,165, 2, 1, 58, 25, 27, // , + 1, 23, 0, 2, 0, 19, 0, 0, 4, 0, 0, 1, 26, // a, + 2, 53, 0, 12, 0, 0, 3, 5, 0, 0, 1, 1, 0, // b, + 13, 31, 0, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, // c, + 7,161, 0, 22, 0, 0, 11, 4, 1, 3, 1, 2, 16, // d, + 0, 18, 0, 0, 0, 22, 0, 0, 6, 0, 1, 0, 14, // e, + 3, 19, 0, 12, 0, 0, 0, 1, 0, 0, 0, 8, 1, // f, + 0, 45, 0, 0, 0, 0, 1, 2, 0, 0, 2, 0, 0, // g, + 0, 27, 0, 21, 0, 0, 1, 2, 0, 0, 0, 2, 0, // h, + 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, // j, + 37,114, 0, 30, 0, 1, 8, 25, 0, 2, 4, 14, 6, // k, + 60,157, 0, 45, 0, 4, 3, 9, 13, 19, 1, 21, 18, // l, + 39,105, 0, 19, 0, 2, 6, 5, 0, 1, 2, 22, 10, // m, + 105,198, 0, 63, 0, 0, 89, 46, 0, 28, 13, 36, 0, // n, + 1, 54, 0, 0, 0, 7, 0, 0, 0, 0,255, 0, 1, // o, + 2, 22, 0, 0, 0, 0, 0, 3, 0, 0, 3, 6, 0, // p, + 8, 12, 0, 6, 0, 0, 0, 0, 0, 0, 0, 1, 1, // q, + 44,125, 0,124, 0, 0, 21, 23, 6, 22, 10, 42, 1, // r, + 18,123, 0, 48, 0, 0, 0, 17, 0, 2, 3, 7, 0, // s, + 5,117, 0, 35, 0, 1, 2, 7, 0, 1, 1, 4, 25, // t, + 0, 3, 0, 0, 0, 1, 0, 0, 23, 0, 0, 0, 7, // u, + 1, 30, 0, 4, 0, 0, 2, 2, 0, 11, 0, 1, 0, // v, + 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, // w, + 3, 9, 0, 3, 0, 0, 3, 0, 0, 0, 0, 1, 0, // x, + 8, 73, 0, 14, 0, 0, 1, 5, 0, 8, 5, 7, 0, // y, + 12, 34, 0, 10, 0, 0, 10, 5, 0, 10, 0, 26, 0, // z, + 25, 
0, 2, 22, 54, 0, 8, 2, 4, 0, 38, 68, 26, 55, 0, 10, 7, 67, 64, 38, 0, 0, 0, 4, 44, 9, 3, 0, 0, 0, 0, 15,255,255, 28, 0,255, 0, 17, // ı, + 228, 15, 82, 26,125, 8, 28, 37, 54, 38,120,164, 71, 79, 6, 17, 7,254,151,193, 4, 68, 61, 9, 15, 20, 0, 2, 0, 0, 0, 26, 0, 0, 22, 0, 0, 0, 36, // i, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, // ß, + 36, 0, 20, 12, 81, 0, 14, 9, 24, 0, 17,101, 55, 24, 0, 1, 12, 35, 31, 42, 0, 36, 0, 7, 21, 21, 0, 1, 0, 0,255, 4,255,255, 0, 0,255, 3, 24, // ä, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0,255, 0,255, 0, // â, + 31, 17, 0, 0, 0, 11, 0, 0, 2, 0, 10, 8, 1, 2, 1, 0, 0, 4, 0, 4, 3, 7, 0, 1, 0, 0, 1, 19, 0, 3, 0, 0, 0, 0, 0, 0, 0, 5, 1, // ç, + 23, 0, 9, 1, 20, 0, 1, 6, 13, 7, 17, 18, 15, 48, 0, 12, 0, 57, 7, 23, 0, 20, 23, 2, 57, 5,255, 0, 0,255,255, 3, 0, 0, 0,255, 0,255, 6, // ê, + 20, 0, 6, 5, 24, 0, 4, 2, 6, 16, 25, 21, 15, 33, 0, 4, 1, 52, 12, 20, 0, 13, 17, 0, 16, 10,255, 0, 0,255, 0, 2, 0, 0, 0,255, 0,255, 7, // î, + 0, 35, 0, 0, 0, 12, 0, 0, 0,255, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 13, 16, 0, 0, 0, 0,255, 0, 0, 2, 0, 2, 0, // ğ, + 30, 0, 11, 0, 6, 0, 0, 22, 1, 0, 10, 0, 2, 4, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 14, 1, 0, 0, 0, 0,255, 1,255,255, 0, 0,255, 0, 0, // ö, + 45, 0, 20, 0, 4, 0, 0, 0, 1, 0, 2, 1, 4, 5, 0, 1, 0, 10, 2, 5, 0, 0, 0, 0, 1, 0,255, 0, 0,255,255, 3, 0, 0, 0,255, 0, 0, 3, // û, + 20, 0, 11, 5, 26, 0, 0, 15, 3, 0, 11, 26, 21, 17, 0, 0, 1, 15, 8, 34, 0, 2, 0, 1, 17, 2, 0, 0, 0, 0, 0, 6,255, 0, 2, 0,255, 0, 3, // ü, + 33, 45, 1, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 14, 0, 0, 12, 0, 1, 5, 0, 0, 28, 33, 0, 11, 0, 0, 6, 5, 0, 0, 2, 6, 0, // ş, + // , a, b, c, d, e, f, g, h, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ı, i, ß, ä, â, ç, ê, î, ğ, ö, û, ü, ş, + ], + hebrew: [ + 0, 1, 0, 0, 0, 0, 0, 28, 29, 17, 24,144, 46, 26, 7, 88, 68, 22, 2, 71,106, 3,200, 0, 
35, 69, 10, 2, 9, 2, 13,104, 19,138, // , + 0,255,255, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, // ְ, + 0, 0, 0, 0,255,255, 0,255, 1, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, // ַ, + 0,255, 0, 0, 0,255, 0,255,255, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, // ָ, + 0,255, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255, 0, 3,255, 0, 0, 0, 0, 0, // ּ, + 1,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, // װ, + 0,255,255,255,255, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255, 0,255, 0,255,255,255, 0,255, 0, 0, 0, 0,255, // ױ, + 0,255, 0, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, // ײ, + 171, 0, 0, 0, 0, 0, 0,255, 0, 0, 20, 14, 15, 21, 22, 5, 0, 22, 15, 0, 2, 17, 0, 16, 0, 16, 6, 2, 0, 27, 0, 3, 25, 30, 5, 3, // א, + 101, 0, 0, 0, 0, 0,255, 0, 0, 11, 2, 2, 2, 7, 22, 0, 4, 1, 15, 0, 3, 6, 0, 5, 0, 2, 1, 7, 0, 0, 0, 3, 5, 16, 6, 4, // ב, + 44, 0, 0, 0, 0, 0,255, 0, 0, 7, 1, 0, 2, 5, 9, 0, 0, 6, 12, 0, 0, 3, 0, 2, 0, 17, 1, 15, 0, 1, 0, 0, 0, 7, 1, 1, // ג, + 70, 0, 0, 0, 0, 0,255, 0, 0, 9, 4, 4, 0, 4, 16, 0, 5, 0, 15, 0, 3, 5, 0, 8, 0, 20, 3, 7, 0, 2,255, 1, 2, 4, 0, 0, // ד, + 158, 0, 0, 0, 0, 0, 0,255,255, 5, 8, 2, 6, 3, 8, 5, 4, 2, 36, 0, 4, 17, 0, 11, 0, 16, 1, 4, 0, 4, 0, 2, 5, 16, 9, 5, // ה, + 42, 0, 0, 0, 0, 0, 0,255,255, 48, 21, 16, 18, 19, 73, 5, 13, 12, 38, 0, 11, 22, 0, 30, 0, 25, 10, 13, 0, 32, 0, 18, 22, 34, 19, 17, // ו, + 32, 0, 0, 0, 0, 0,255, 0, 0, 8, 0, 0, 2, 1, 6, 0, 2, 0, 25,255, 1, 0, 0, 2, 0, 1, 0, 4, 0, 0,255, 0, 0, 0, 0, 0, // ז, + 18, 0, 0, 0, 0, 0,255,255,255, 8, 4, 0, 0, 6, 7, 0, 0, 1, 10, 0, 1, 5, 0, 6, 
0, 2, 0, 0, 0, 1, 0, 1, 0, 4, 4, 5, // ח, + 22, 0, 0, 7, 1, 0,255, 0, 0, 28, 1, 4, 0, 2, 6, 0, 0, 0, 30, 0, 5, 9, 0, 4, 0, 20, 21, 12, 0, 7, 0, 1, 10, 16, 20, 0, // ט, + 31, 0, 0, 0, 0, 0, 0,255,255, 65, 31, 9, 45, 19, 60, 22, 12, 19, 78, 0, 9, 37, 0, 25, 0, 45, 15, 12, 0, 15, 0, 15, 17, 50, 25, 13, // י, + 0,255, 0, 0, 0, 0,255, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 0, 10, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,255, 0, 0, 4, 0, 0, // ך, + 29, 0, 0, 0, 0, 0,255, 0, 0, 5, 2, 0, 0, 4, 7, 2, 0, 0, 5, 0, 0, 4, 0, 4, 0, 1, 1, 3, 0, 0,255, 0, 0, 6, 2, 1, // כ, + 64, 0, 0, 1, 0, 0, 0, 0, 0, 35, 7, 8, 2, 5, 29, 1, 6, 4, 20, 0, 8, 4, 0, 7, 0, 1, 2, 18, 0, 4, 0, 2, 6, 2, 29, 2, // ל, + 0, 0, 0, 0, 0, 0,255, 0, 0, 2, 0, 3, 1, 3, 9, 0, 0, 0, 65, 0, 0, 2, 0, 0, 0, 0, 0, 11,255, 0,255, 0, 0, 0, 5, 1, // ם, + 89, 0, 0, 0, 0, 0,255, 0, 0, 12, 11, 1, 2, 21, 15, 2, 3, 1, 10, 0, 2, 8, 0, 5, 0, 1, 2, 7, 0, 0, 0, 2, 1, 5, 7, 10, // מ, + 1,255, 0, 0, 0, 0, 0, 0, 0, 13, 8, 3, 3, 1, 48, 1, 0, 12, 37, 0, 3, 3, 0, 3, 0, 0, 2, 31, 0, 3,255, 1, 1, 15, 5, 1, // ן, + 24, 0, 0, 2, 0, 0, 0, 0, 0, 35, 8, 2, 1, 5, 26, 0, 1, 1, 35, 0, 4, 4, 0, 8, 0, 0, 1, 30, 0, 5, 0, 0, 2, 3, 13, 2, // נ, + 22, 0, 0, 0, 0, 0, 0, 0, 0, 16, 4, 0, 0, 6, 14, 0, 2, 1, 17, 0, 1, 2, 0, 6, 0, 5, 0, 12, 0, 2,255, 0, 3, 5, 0, 0, // ס, + 33, 0, 0, 0, 0, 1, 0, 0, 0, 0, 16, 44, 38, 14, 25, 8, 0, 46, 36, 0, 5, 20, 0, 25, 0, 25, 5, 1, 0, 11, 0, 5, 15, 19, 18, 1, // ע, + 0,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 1, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, // ף, + 76, 0, 0, 0, 0, 0,255, 0, 0, 13, 2, 0, 0, 8, 12, 0, 0, 0, 9, 0, 1, 4, 0, 5, 0, 2, 10, 8, 0, 0,255, 1, 1, 4, 5, 3, // פ, + 0, 0, 0, 0, 0, 0,255,255,255, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ץ, + 24, 0, 0, 0, 0, 0,255,255, 0, 4, 1, 0, 0, 5, 7, 0, 0, 0, 7, 0, 0, 1, 0, 5, 0, 5, 0, 2, 0, 0,255, 0, 2, 3, 0, 0, // צ, + 39, 0, 0, 0, 0, 0, 0, 0, 0, 7, 3, 0, 1, 6, 9, 0, 4, 0, 17, 0, 0, 5, 0, 7, 0, 5, 3, 9,255, 3,255, 0, 0, 
5, 1, 4, // ק, + 27, 0, 0, 3, 7, 0,255, 0, 0, 60, 25, 8, 9, 10, 49, 3, 9, 9, 22, 0, 3, 2, 0, 11, 0, 2, 4,118, 0, 18, 0, 5, 10, 0, 11, 9, // ר, + 75, 0, 0, 0, 0, 0,255, 0, 0, 8, 7, 0, 2, 10, 9, 4, 2, 7, 32, 0, 1, 4, 0, 26, 0, 3, 0, 6, 0, 2, 0, 0, 2, 7, 0, 2, // ש, + 21, 0, 0, 0, 0, 0,255,255, 0, 17, 8, 1, 2, 9, 65, 0, 6, 1, 34, 0, 5, 5, 0, 8, 0, 8, 2, 4,255, 5, 0, 1, 1, 10, 11, 1, // ת, + // , a, ְ, ַ, ָ, ּ, װ, ױ, ײ, א, ב, ג, ד, ה, ו, ז, ח, ט, י, ך, כ, ל, ם, מ, ן, נ, ס, ע, ף, פ, ץ, צ, ק, ר, ש, ת, + ], + arabic: [ + 0, 8, 11, 2, 0, 9, 7, 40, 0, 34, 6, 74, 8, 1, 0, 0, 0, 10, 74, 21, 87, 40, 2, 3, 3, 5, 50, 0, 78, 23, 29, 9, 3, 2, 5, 5, 13, 7, 0, 22, 7, 4, 35, 42, 69, 85, 37, 11, 44, 0, 49, // , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 12, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 4, 9, 0, 0, 0, 0, 0, // , + 20, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, // پ, + 8, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 2, 0,255,255,255, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 0, 1,255, 0, 0, 0, // ٹ, + 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,255, 0, 0, 0, // چ, + 2, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255,255, 0,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // ژ, + 8, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 1, 0,255, 0,255, 0,255, 0, 0,255, 0, 0, 0, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, // ڈ, + 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,255, 1, 0, 0, 0, 
0, 3, 0, 3, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, // گ, + 79, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 4, 7, 1, 0, 1, 0, 0, 0, 0, // ک, + 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0,255, 0,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // ڑ, + 27,255,255,255, 0,255,255, 0,255, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 2,255,255, 0,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255,255,255,255,255, 0,255,255,255,255, 0, 0, 0, 4, 0, 0, 0, 0, // ں, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,255, 0, 0,255, 0, 0, 0,255, 0, 0, 3,255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255, // ھ, + 27, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 3, 2, 0, 3, 0, 0, 0, 0, 5, 0, 25, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 9, 1, 2, 0, 2,255, 0, 0, 0, // ہ, + 7, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 4, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, // ء, + 14, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // آ, + 24, 0, 0,255,255, 0,255, 0, 0, 0,255,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0,255, // أ, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // ؤ, + 11, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,255, // إ, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 1, 0, 0, 0, 0, // ئ, + 124, 0, 5, 6, 5, 0, 0, 1, 7, 17, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 24, 0, 18, 2, 9, 6, 2, 13, 1, 23, 4, 11, 4, 2, 2, 2, 1, 7, 1, 0, 5, 7, 7, 19, 13, 14, 21, 18, 0, 15, 0, 0, // ا, + 50, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 11, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 2, 4, 0, 0, 0, 1, 0, 3, 0, 0, 0, 1, 1, 6, 2, 1, 0, 7, 0, 3, 0, 0, // ب, + 0, 0, 0, 0,255, 0, 0,255, 0, 0,255,255, 0,255, 0, 0, 0, 0,255, 0, 1, 3, 0, 0, 0, 1, 1, 0, 4, 0, 10, 0, 1, 0, 0, 0, 1, 0, 3, 1, 0, 2, 3, 1, 8, 4, 4, 0, 0, 0, 22, 0,255, // ة, + 38, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 1, 0, 0, 0, 0, 1, 2, 0, 0, 3, 0, 24, 3, 0, 0, 0, 0, 1, 0, 0, 2, 1, 2, 9, 7, 5, 0, 5, 0, 4, 0, 0, // ت, + 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, // ث, + 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 1, 0, 3, 0, 1, 0, 0, // ج, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 3, 0, 0, 1, 0, 1, 0, 0, // ح, + 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, // خ, + 42, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 2, 0, 0, 0, 1, 4, 1, 2, 0, 27, 1, 1, 5, 1, 0, 0, 0, 4, 0, 0, 0, 4, 0, 5, 4, 15, 1, 8, 0, 6, 0, 0, // د, + 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, // ذ, + 61, 0, 0, 6, 2, 0, 0, 0, 5, 9, 0, 0, 0, 6, 0, 1, 2, 0, 0, 1, 32, 12, 0, 7, 0, 1, 1, 2, 18, 1, 4, 0, 2, 4, 1, 0, 1, 0, 3, 2, 0, 5, 2, 4, 3, 27, 1, 8, 15, 0, 8, 0, 0, // ر, + 30, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 1, 0, 0, // ز, + 31, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 17, 1, 0, 1, 0, 0, 1, 0, 1, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 6, 9, 3, 1, 5, 0, 4, 0, 0, // س, + 22, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 2, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3, 0, 1, 0, 0, 0, 0, // ش, + 8, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 14, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, // ص, + 1, 0, 0, 0,255,255,255,255, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, // ض, + 2, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 0, // ط, + 0, 0, 0, 0,255, 0,255,255, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,255, 0, 0,255, // ظ, + 18, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 2, 0, 0, 0, 5, 2, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 9, 6, 0, 0, 5, 0, 1, 0, 0, // ع, + 2, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, // غ, + 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ـ, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1, 0, 3, 0, 16, 0, 0, // ف, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 3, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 2, 1, 0, 4, 0, 3, 0, 0, // ق, + 13, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 1, 0, 0, 1, 0, 4, 0, 0, // ك, + 17, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0,255, 0, 0, 0, 1, 1, 0, 3, 0,123, 6, 0, 1, 1, 1, 1, 2, 1, 0, 0, 0, 3, 0, 2, 1, 1, 0, 10, 0, 0, 1, 3, 2, 5, 8, 0, 2, 12, 0, 10, 0, 0, // ل, + 76, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 16, 0, 0, 2, 0, 3, 2, 0, 4, 0, 8, 22, 6, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 27, 1, 1, 4, 6, 0, 5, 0, 0, // م, + 38, 0, 1, 1, 0, 0, 0, 0, 1, 4, 0, 0, 0, 1, 0, 2, 3, 4, 3, 1, 55, 2, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 3, 2, 14, 0, 1, 0, 3, 0, 0, 0, 0, 2, 4, 14, 1, 2, 10, 0, 11, 0, 0, // ن, + 16, 0, 11, 0, 0, 0, 0, 0, 0, 4, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 10, 13, 0, 4, 0, 1, 0, 0, 30, 0, 6, 2, 1, 5, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 4, 3, 4, 0, 1, 0, 1, 0, 0, // ه, + 36, 0, 0, 1, 0, 0, 0, 0, 2, 6, 0, 0, 1, 6, 0, 0, 4, 0, 0, 0, 10, 9, 0, 5, 0, 9, 4, 4, 5, 0, 13, 1, 3, 3, 3, 0, 1, 0, 1, 0, 0, 2, 1, 4, 9, 6, 10, 3, 1, 0, 8, 0, 0, // و, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,255, // ى, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 1, 2, 5, 8, 0, 5, 0, 2, 2, 1, 12, 1, 22, 2, 8, 2, 2, 1, 1, 0, 2, 1, 0, 19, 3, 4, 14, 7, 24, 2, 10, 0, 1, 0, 0, // ي, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ً, + 2, 0,255, 0, 0, 1, 0, 0, 0, 12, 0, 0, 0, 6, 0, 0,255, 0,255, 5, 0, 0,255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0, 0,255, 0, 0, 0, // ے, + // , a, , پ, ٹ, چ, ژ, ڈ, گ, ک, ڑ, ں, ھ, ہ, ء, آ, أ, ؤ, إ, ئ, ا, ب, ة, ت, ث, ج, ح, خ, د, ذ, ر, ز, س, ش, ص, ض, ط, ظ, ع, غ, ـ, ف, ق, ك, ل, م, ن, ه, و, ى, ي, ً, ے, + ], + baltic: [ + 0, 0, 57, 42,135, 14, 20, 3,119, 0, 0, 18, 1, 18, 0, 0,205, 1, 4, // , + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 16, 39, 14, 0, 0, 0, 16, // 
a, + 0,255, 0, 0, 3, 0, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0, 0, // b, + 0,255, 0, 0, 19, 0, 9, 0, 0,255, 0, 6, 0, 0, 0, 0, 0, 1, 0, // c, + 0,255, 0, 0, 17, 0, 6, 0, 6,255, 0, 14, 0, 0, 0, 0, 0, 6, 3, // d, + 0, 0, 0, 0, 0, 0, 0, 9, 0, 5, 3, 0, 1, 7, 3, 0, 0, 0, 21, // e, + 0,255, 0, 0, 4, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, // f, + 0,255, 0, 1, 1, 0, 1, 0, 3, 0, 0, 27, 0, 0, 0, 0, 0, 1, 0, // g, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // h, + 0, 0, 0, 0, 0, 0, 0, 72, 0, 13, 9, 0, 5, 41, 7, 0, 0, 0, 56, // i, + 0,255, 6, 0, 30, 0, 32, 0, 41, 0,255, 12, 0, 0, 0, 0, 3, 0, 0, // j, + 0, 0, 0, 4, 32, 0, 8, 0, 2, 0, 0, 3, 0, 36, 0, 0, 0, 6, 1, // k, + 0,255, 0, 0, 29, 0, 36, 1, 24, 0, 0, 4, 0, 5, 0, 0, 0, 2, 0, // l, + 0, 0, 0, 1, 16, 0, 11, 0, 6, 0, 0, 15, 0, 2, 0, 0, 0, 1, 6, // m, + 0,255, 0, 0, 19, 0, 7, 0, 10, 0, 0, 12, 0, 8, 0, 0, 0, 16, 6, // n, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 10, 2,255, 0, 0, 6, // o, + 0,255, 0, 0, 3, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, // p, + 0,255, 0, 0, 0, 0,255, 0, 0,255,255, 0,255, 0,255,255, 0, 0, 0, // q, + 0,255, 2, 2, 59, 0, 23, 0, 2, 0, 0, 6, 0, 3, 0, 0, 0, 23, 0, // r, + 0,255, 2, 7, 50, 7, 9, 1, 88, 0, 0, 7, 0, 4, 0, 0, 0, 5, 1, // s, + 0,255, 0, 2, 33, 0, 31, 0, 10, 0, 0, 21, 0, 22, 0, 0, 0, 6, 1, // t, + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 13, 11, 12, 0, 0, 0, 7, // u, + 0,255, 0, 5, 10, 0, 2, 0, 3, 0, 0, 21, 0, 12, 0, 0, 0, 1, 3, // v, + 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0,255, 0, 0, 0, 0, // w, + 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0,255,255, 0, 0, 0, // x, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 5,255, 0, 0, 0, 7, // y, + 0,255, 0, 0, 4, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, // z, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0,255,255, 0, 0, // ß, + 0, 0, 0,255,255, 0,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 0,255, 
0,255,255,255,255,255,255,255,255,255,255, 0,255, // ŗ, + 1, 0, 1, 0, 1, 0, 0, 1, 0, 4, 9, 4, 3, 10, 5, 0, 0, 0, 5, 9, 9, 0, 1, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 0, // ą, + 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 5, 0, 0, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 0, // į, + 9, 0, 10, 2, 28, 3, 0, 13, 1, 6, 45, 45, 27, 28, 50, 0, 30, 0, 40, 13, 73, 2, 28, 0, 0, 0, 5, 0, 0,255,255, 0,255,255, 0,255, 0, 0, 0, 2, 4, 1,255,255,255, 3, // ā, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 4, 0, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 0, // ę, + 4, 0, 3, 6, 12, 0, 0, 0, 0, 0, 2, 0, 20, 16, 8, 0, 35,255, 15, 19, 28, 0, 26, 0,255, 0, 5,255, 0,255,255, 0,255, 0, 0,255, 0, 1,255, 0, 1, 3,255,255, 0, 0, // ē, + 12, 9, 0, 0, 0, 9, 0, 0, 0, 15, 0, 0, 1, 0, 8, 2, 0, 0, 4, 7, 0, 3, 0, 0, 0, 6, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 7, 0, 0, 0, 1, 0, // č, + 1, 0, 20, 0, 27, 0, 0, 9, 0, 0, 3, 27, 33, 22, 68, 0, 12, 0, 25, 12, 29, 0, 20, 0, 0, 0, 1,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 2,255,255, 0, 0, 4, // ė, + 6, 1,255, 0, 0, 2,255, 0,255, 1,255, 0, 0, 0, 0, 3, 0,255, 1, 0, 0, 1, 0,255,255,255, 0,255,255,255,255, 0,255, 1,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0, // ģ, + 4, 1, 0, 0, 0, 1,255, 0,255, 3, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 5, 2,255,255, 0, 0, // ķ, + 4, 0, 5, 11, 17, 0, 0, 0, 0, 0, 0, 0, 32, 4, 17, 0, 2, 0, 44, 6, 35, 0, 7, 0, 0, 0, 25, 0, 0,255,255, 0,255, 0, 0,255, 0, 2, 0, 0, 3, 0,255,255, 0, 0, // ī, + 2, 11, 0,255, 0, 10, 0, 2, 0, 3,255, 11, 0, 0,255, 2, 0,255, 0, 0, 0, 2, 0,255,255,255, 0,255,255,255,255, 1,255, 2,255,255, 0, 0, 0, 0, 0,255, 0,255, 0, 0, // ļ, + 75, 31, 0, 0, 0, 15, 0, 1, 0, 71, 0, 18, 1, 1, 1, 13, 2, 0, 7, 0, 1, 10, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 4, 0, 0,255, 0, 3, 1, 0, 5, 0, 0, 3, 0, // š, + 1, 5, 0, 0, 0, 6,255, 0, 
0, 24, 0, 0, 0, 0, 0, 2, 0,255, 1, 0, 0, 1, 0,255,255,255, 2,255,255,255,255, 3,255, 0, 0,255,255,255, 1, 0, 0, 0, 0,255, 0, 0, // ņ, + 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255, 0,255,255,255,255,255, 0,255, 0,255,255, 0, // ō, + 0, 0, 3, 0, 4, 0, 0, 4, 0, 76, 18, 13, 8, 11, 19, 0, 0, 0, 9, 4, 28, 0, 3, 0, 0, 0, 0,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 1, // ų, + 2, 0, 9, 3, 1, 0, 0, 1, 0, 12, 8, 8, 3, 5, 0, 0, 1, 0, 9, 1, 12, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 1, 0, 0,255, 0, 0, 0, // ū, + 46, 17, 0, 0, 30, 11, 0, 1, 0, 3, 0, 0, 1, 3, 1, 2, 0, 0, 8, 0, 0, 16, 0, 0, 0, 2, 0, 0,255, 1, 1, 0, 0, 0, 0, 3,255,255, 0, 0, 0, 0,255, 0, 0, 0, // ž, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, ŗ, ą, į, ā, ę, ē, č, ė, ģ, ķ, ī, ļ, š, ņ, ō, ų, ū, ž, + ], + thai: [ + 6, 0, 2, 0, 14, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 2, 0, 1, 0, 21, 3, 1, 0, 0, 5, 0, 0, 9, 9, 4, 0, 4, 2, 6, 0, 2, 0, 0, 4, 0, 0, 4, 0, 11, 0, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 3, 0, 0, 16, 0, 0, 0, // , + 0, 0, 0,255, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // a, + 8, 0, 6, 0, 4, 0, 91, 3, 0, 13, 1, 0, 1, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 17, 6, 3, 0, 0, 0, 0, 0, 8, 13, 10, 0, 6, 5, 0, 0, 4, 1, 0, 28, 0, 0, 13, 40, 50, 4, 26, 13, 9, 0, 23, 33, 0, 80, 28, 16, 2, 3, 0, 17, 12, 13, 3, 0, 19, 0,255, 0, // ก, + 1, 0, 8, 0, 1, 0,119, 2, 0, 0, 0,255, 3, 0, 0, 0, 0, 0, 0, 7, 1, 0, 0, 0, 14, 3, 0, 0, 0, 1, 0, 0, 6, 4, 4,255, 5, 4, 0, 0, 0, 0, 0, 5, 0, 0, 3, 0, 17, 2, 5, 6, 0, 0, 16, 1, 0,137, 33, 4, 0, 12, 0, 0, 12, 9, 0, 0, 15, 0,255, 0, // ข, + 7, 0, 23, 0, 61, 0,254, 2, 0, 9, 3,255, 2, 0, 0, 0, 0,255, 1, 10, 1, 0, 16, 0, 51, 21, 1, 0, 0, 1, 0, 1, 14, 13, 32, 0, 5, 5, 0, 
0, 4, 0, 0, 16, 0, 0, 31, 11,120, 11, 16, 21, 0, 0, 61, 5, 0,173, 54,110, 7, 6, 0, 14, 29, 37, 2, 0, 42, 0,255, 0, // ค, + 0,255, 0,255, 2, 0, 8, 0,255, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 5,255, 0,255, 0, 0, 0,255,255, 0, 0,255,255, // ฆ, + 0, 0, 6, 1, 20, 0, 24, 6, 0, 1, 3, 0, 0, 0, 0, 0,255,255, 0, 22, 0, 0, 0, 5, 4, 2, 0, 0, 0, 6, 0, 0, 7, 39, 53, 0, 41, 56, 0, 0, 20, 4, 0,254, 0,255, 1,167,254, 4,100, 0, 46, 0, 55, 34, 0, 26, 1, 2,255, 1, 0, 20,254,254, 2, 1, 1, 0,255, 0, // ง, + 4, 0, 9, 1, 3, 0, 74, 23, 0, 1, 0,255, 9, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 14, 2, 1, 0, 0, 2, 0, 0, 5, 3, 7, 0, 4, 5, 4, 0, 1, 0, 0, 4, 0, 0, 14, 9, 32, 1, 17, 3, 0, 0, 5, 4, 0,137, 10, 15, 31, 0, 0, 44, 24, 23, 0, 0, 15, 0,255, 0, // จ, + 0,255, 1, 0, 0, 0, 5, 4, 0, 0, 0,255, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 1, 0, 0, 0,255,255, 0, 0, 0, 47, 1, 3,255, 0, 0, 0, 1, 0, 0, 0, 1,255,255,255, // ฉ, + 3, 0, 8, 0, 11, 0,139, 1, 0, 17, 1, 0, 32, 0, 0, 0, 0, 0, 0, 8, 3, 0, 1, 0, 31, 5, 1, 0, 0, 7, 0, 0, 18, 22, 8, 0, 7, 7, 0, 0, 1, 0, 0, 10, 0, 0, 31, 16,141, 2, 33, 15, 0, 3, 11, 13, 0,208, 45, 22,170, 11, 0, 2, 21, 32, 0, 0, 29, 0,255, 0, // ช, + 2, 0, 16, 0, 6, 0, 47, 0, 0, 3, 5, 0, 0, 0, 0, 0,255, 0, 0, 2, 2, 0, 1, 0, 13, 2, 2,255,255, 0, 2,255, 3, 2, 4,255, 3, 2, 0, 0, 6, 0,255, 5, 0, 0, 4, 0, 16, 10, 7, 9, 0, 0, 4, 9, 0,178, 19, 44, 0, 40, 0, 0, 14, 4, 0, 0, 19,255,255,255, // ซ, + 0,255, 0,255, 0,255, 0, 0,255, 2,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 1, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255,255,255, // ฌ, + 0,255, 0, 0, 0,255, 3, 1,255, 13, 0, 0, 83,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,255, 1, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 37, 0, 1,255,255, 0, 48, 7, 
0, 16, 0, 0, 0, 44, 11, 0, 0, 0, 1, 0, 0,255, 3, 1, 0, 0, 0, 2, 0,255,255, // ญ, + 0,255, 11,255, 0, 0, 0,255,255, 0,255,255,255, 0,255, 0,255,255, 0,255, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 7, 0, 0, 0,255, 0,255,255,255,255, 0, 0,255,255, 0, 0,255,255, // ฎ, + 0,255, 4,255, 0, 0, 0, 0,255, 0,255,255, 0,255, 0, 0,255,255, 0,255, 0,255, 0,255, 0, 0, 9, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255, 0, 2, 1,255, 0, 0,255,255, 2, 0, 0, 0,255, 0,255,255,255,255, 0,255,255,255, 0,255,255,255, // ฏ, + 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,255, 0,255,255, 0, 11, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255, 0, 0,255,255, // ฐ, + 0,255, 0,255, 0,255, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 2, 0, 0,255, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255,255,255,255,255,255, 0,255,255,255, // ฑ, + 0,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255, 0, 0,255,255,255,255,255, 0,255,255,255,255, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255, 4, 0,255, 0, 0,255,255, 2, 0, 0, 0,255, 0,255,255,255,255,255,255,255,255, 0, 0,255,255, // ฒ, + 0,255, 0, 2, 16, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 1, 0, 10, 0, 0, 1, 0, 3, 0, 0,255, 0,255,255, 0, 2, 6, 0, 0, 0,255,255, 11, 0, 0, 0,255, 0,255,255,255,255, 0, 0,255,255, 0, 0,255,255, // ณ, + 2, 0, 3, 0, 4, 0, 15, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 12, 2, 1, 0, 0, 0, 0, 0, 3, 2, 5, 3, 3, 17, 0, 0, 7, 0, 0, 11, 0, 0, 5, 26, 11, 1, 21, 2, 0, 0, 32, 7, 0, 75, 9, 54, 2, 75, 0, 8, 4, 5, 0, 0, 10, 0,255, 0, // ด, + 4, 0, 6, 7, 8, 0, 32, 1, 0, 2, 7, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 0, 19, 1, 1, 0, 0, 0, 0, 0, 4, 2, 4, 0, 2, 1, 1, 0, 14, 4, 0, 7, 0, 
0, 4, 16, 17, 1, 18, 3, 0, 0, 27, 6, 0, 36, 34, 12, 9, 5, 0, 5, 11, 6, 0, 0, 14, 0,255, 0, // ต, + 1, 0, 0, 0, 0,255, 7, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 1, 0, 0,255,255, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 1, 3, 0,255, 0, 0, 0, 3, 3, 0, 0, 0,255,255,255, // ถ, + 4, 0, 4, 1, 3, 0, 50, 1, 0, 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 32, 8, 2, 0, 0, 2, 0, 1, 5, 4, 6, 0, 3, 2, 0, 1, 1, 0, 0, 3, 0, 0, 8, 3, 13, 1, 16, 3, 0, 0, 18, 2, 0, 83, 12, 15, 0, 37, 0, 1, 13, 9, 0, 0, 23, 0,255, 0, // ท, + 1,255, 0, 0, 0, 0, 2, 0,255, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 6, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 3, 0,255, 2, 0, 5, 0, 3, 0, 0, 0, 4, 0, 0, 11, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 2, 0,255,255, // ธ, + 4, 0, 4, 7, 41, 0, 24, 11, 0, 43, 9, 0, 0, 0, 0, 0, 0, 3, 0, 6, 3, 1, 3, 1, 10, 3, 1, 1, 0, 1, 0, 0, 9, 30, 3, 0, 5, 14, 2, 0, 10, 32, 0, 41, 0, 0, 5, 69, 60, 8, 35, 9, 0, 2, 20, 10, 0, 38, 13, 14,150, 5, 0,124, 80, 92, 0, 0, 10, 0,255, 0, // น, + 3, 0, 1, 1, 2, 0, 15, 2, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1, 0, 5, 10, 0, 0, 0, 2, 0, 0, 2, 2, 4, 0, 4, 1, 1, 0, 1, 0, 0, 15, 0, 0, 7, 48, 8, 5, 7, 2, 0, 1, 12, 1, 0, 30, 33, 8, 3, 1, 0, 10, 5, 4, 0, 0, 9, 0,255, 0, // บ, + 5, 0, 1, 0, 3, 0, 29, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 2, 0, 0,255, 0, 0, 0, 5, 1, 5, 0, 2, 0, 0, 0, 1, 0, 0, 3, 0, 0, 3, 1, 4, 1, 3, 2, 0, 0, 6, 11, 0,163, 8, 11, 0, 45, 0, 0, 15, 6, 0, 0, 6, 0,255,255, // ป, + 2, 0, 0, 0, 0,255, 7, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 1, 0, 1, 0, 0, 1, 0,255, 2, 0, 0, 4, 8, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0,255,255,255, // ผ, + 0, 0, 0, 0, 0, 0, 2, 0,255, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,255,255,255, // ฝ, + 9, 
0, 1, 0, 1, 0, 23, 7, 5, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 6, 1, 0,255,255, 0, 0, 0, 6, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 4, 4, 20, 1, 5, 4, 0, 0, 3, 1, 0, 63, 11, 4, 0, 3, 0, 0, 6, 4, 0, 0, 4, 0,255, 0, // พ, + 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 1, 0, 0,255,255, 0, 1,255, 0, 0, 2,255, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 7, 4, 1, 0, 29, 0, 0, 0, 0, 0, 0, 3,255,255, 0, // ฟ, + 2, 0, 0, 0, 0, 0, 5, 0,255, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,255,255, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 3, 0, 0, 1,255, 0, 0, 0, 3, 0, 1, 0,255, 0, 2, 0, 0, 15, 0, 0, 0, 0, 0, 0, 1, 2,255,255, 1,255,255, 0, // ภ, + 5, 0, 5, 1, 63, 0, 29, 3, 0, 17, 3, 0, 1, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 7, 1, 0, 0, 0, 1, 0, 0, 4, 7, 17, 0, 2, 7, 0, 0, 18, 59, 0, 21, 0, 0, 8, 6, 52, 1, 12, 6, 0, 0, 22, 9, 0, 82, 27, 21, 0, 43, 0, 8, 59, 21, 0, 0, 10, 0, 0, 0, // ม, + 1, 0, 1, 1, 6, 0, 10, 0, 0, 6, 1, 0, 1, 0, 0, 0, 0, 0, 0, 13, 2, 0, 18, 1, 5, 0, 1, 0, 0, 5, 0, 0, 1, 1, 3,255, 4, 13, 0, 1, 1, 2, 0, 27, 0, 0, 4, 16, 74, 0, 7, 61, 0, 0, 6, 0, 0, 16, 5, 8, 0, 0, 0, 1, 50, 8, 0, 0, 3, 0, 0, 0, // ย, + 6, 0, 35, 0,148, 0, 29, 9, 0, 7, 1, 0, 1, 5, 0, 0, 0, 0, 1, 4, 27, 0, 14, 5, 8, 14, 42, 0, 1, 32, 2, 1, 8, 5, 23, 0, 1, 8, 4, 1, 16, 14, 0, 33, 0, 0, 8, 0,121, 3, 8, 5, 0, 0, 20, 14, 0, 85, 17, 40, 0, 8, 0, 0, 8, 18, 0, 0, 7, 0, 0, 0, // ร, + 0,255, 3, 0, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 3, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255,255,255,255,255,255, 0, 0,255,255, 0,255,255,255, // ฤ, + 2, 0, 19, 1, 40, 0, 15, 1, 5, 5, 6, 0, 3, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 3, 6, 14, 5, 0, 10, 6, 0, 2, 2, 1,255, 3, 5, 0, 0, 2, 26, 0, 19, 0, 0, 2, 9, 20, 2, 12, 2, 0, 0, 17, 25, 0, 73,127, 25, 0, 5, 0, 0, 4, 2, 0, 0, 7, 0, 0, 0, // ล, + 4, 0, 8, 4, 80, 0, 23, 1, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 12, 2, 0, 0, 0, 
1, 0, 0, 18, 8, 9, 0, 4, 0, 1, 0, 5, 12, 0, 5, 0, 0, 8, 16, 32, 1, 8, 7, 0, 0, 4, 1, 0, 48, 3, 1, 0, 8, 0, 2, 60, 50, 0, 1, 4, 0, 0, 0, // ว, + 6, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 9, 0, 5, 0, 0, 0, 1, 0, 0, 12,255, 0,255, 0, 0, 0, 0, 0, 0,255, 3, 0,255,255, // ศ, + 0,255, 16, 0, 0, 1, 4, 0,255, 0,255,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0, 1, 0, 3, 5, 0, 0, 1, 0, 0, 0,255, 0,255,255, 0, 0, 9,255, 4, 0, 0,255, 6, 0, 0, 1,255, 0,255, 0,255,255, 0, 0,255,255, 0, 0,255, 0, // ษ, + 9, 0, 8, 4, 4, 0, 75, 3, 0, 12, 1, 0, 1, 0, 0, 0, 0, 0, 1, 5, 2, 0, 1, 0, 19, 5, 1, 0, 0, 0, 0, 0, 12, 6, 9, 0, 3, 2, 2, 0, 1, 0, 0, 13, 0, 0, 11, 9, 26, 1, 16, 7, 0, 0, 8, 2, 0, 55, 26, 2, 3, 0, 0, 1, 29, 16, 0, 0, 19, 0,255, 0, // ส, + 7, 0, 2, 0, 1, 0, 77, 2, 0, 0, 0,255, 4, 4, 0, 0, 0, 0, 0, 21, 0, 0, 2, 0, 9, 2, 0, 0,255, 0, 0, 0, 13, 3, 2, 0, 1, 2, 0, 0, 5, 0, 0, 4, 0, 0, 8, 0, 12, 7, 3, 1, 0, 0, 2, 4, 0, 62, 60, 4, 82, 6, 0, 0, 11, 10, 0, 0, 3, 0, 0, 0, // ห, + 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255,255,255, 0,255,255,255,255,255,255,255, 0, 0, 0,255,255, 0,255,255, 0, 0,255, 0,255,255,255, 0, 0,255, 0, 5,255,255, 2, 0,255, 0,255, 0,255,255,255,255, 0,255, 0,255,255,255,255,255, // ฬ, + 8, 0, 13, 98, 32, 0, 68, 12, 0, 23, 18, 0, 1, 0, 0, 1, 0, 0, 0, 10, 10, 0, 6, 4, 20, 17, 3, 0, 0, 2, 2, 4, 9, 9, 21,255, 11, 8, 1, 0, 7, 1, 0, 21, 2, 0, 9, 0, 10, 4, 2, 5, 0, 57, 0, 2, 0, 84, 22, 36, 0, 8, 0, 19,254,144, 1, 1, 11, 0,255, 0, // อ, + 0, 0, 0, 0, 0,255, 2, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 1, 0, 0, 0, 0,255, 0, 0, 0, 8, 3, 4,255, 3, 0, 0, 0, 0, 0, 0, 1,255,255,255, // ฮ, + 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 
0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, 0,255,255,255, 0, 0, 0,255,255, 0, 0,255,255, // ฯ, + 0,255, 3, 0, 8, 0, 2, 33, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 7, 0, 5, 0, 2, 0, 7, 0, 1, 0,255, 0, 0, 0, 3, 3, 93,255, 44, 1, 0, 0, 3, 0, 0, 5, 0, 0, 0, 0, 8, 0, 0, 0,255,255, 0,255,255, 0,255, 0,255,255,255, 0, 0, 0, 1, 0, 0,255,255,255, // ะ, + 0,255, 33, 13, 29, 0, 0, 54, 5, 48, 13, 0, 3, 0, 0, 0, 0, 0, 0, 12, 23, 1, 18, 2, 29, 17, 7, 0, 1, 11, 1, 3, 12, 8, 50,255, 26, 39, 4, 4, 21, 6, 0, 22, 2,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ั, + 0, 0, 97, 28, 43, 1, 47, 62, 8,139, 31, 0, 33, 9, 0, 3, 0, 0, 3, 11, 15, 8, 23, 5, 41, 17, 5, 0, 0, 6, 1, 34, 49, 29, 63, 0, 28, 30, 10, 18, 31, 30, 6, 33, 3,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255, 0,255,255,255,255,255, 0,224,254, 1, 0, 0, 0,255, 0, // า, + 0,255, 5, 0, 35,255, 0, 21, 0, 1, 0,255,255,255,255,255,255,255, 0, 2, 8, 0, 10, 0, 4, 0, 0, 0,255, 0, 0,255, 0, 0, 0,255, 3, 0,255,255, 10, 0,255, 7, 0,255, 0, 0, 0, 0,255,255,255,255,255, 0,255, 0,255,255,255, 0,255,255, 3, 33, 0, 0, 0,255,255, 0, // ำ, + 0,255, 20, 0, 22, 0, 8, 20, 2, 65, 34, 0, 17, 0, 12, 0, 1, 0, 1, 15, 28, 1, 5, 14, 18, 6, 7, 1, 0, 14, 2, 1, 14, 3, 35, 0, 18, 32, 4, 0, 20, 2, 0, 23, 2,255, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0, 0,255,255, 0,255,255, 0, // ิ, + 0,255, 10, 7, 9, 0, 0, 12, 4, 56, 51, 0, 15, 3, 0, 0, 0, 0, 1, 19, 4, 0, 86, 2, 23, 5, 18, 0, 0, 6, 1, 0, 34, 3, 37,255, 23, 7, 0, 0, 16, 0, 0, 9, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0, // ี, + 0,255, 0, 17, 0,255, 0, 9, 0, 0, 44, 0, 0, 0, 0,255,255,255, 0, 0, 0, 6, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 5,255, 1, 0,255, 0, 0,255,255,255,255,255, 0, 0, 0,255, 0,255,255,255,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // ึ, + 0,255, 0, 0, 29,255, 0, 0, 0, 66, 3, 0, 0,255, 0,255,255, 0, 0, 2, 0, 1, 0, 0, 6, 0, 0, 0, 0, 9, 0,255, 18, 1, 
19,255, 7, 0, 0, 0, 6, 0, 0, 6, 0,255,255,255, 0,255, 0,255,255, 0,255,255,255, 0,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0, // ื, + 0,255, 20, 9, 88, 0, 1, 45, 1, 55, 17, 0, 0, 0, 0, 0, 0,255, 1, 4, 25, 6, 11, 7, 22, 29, 10, 0, 0, 8, 19, 0, 12, 28, 29,255, 17, 4, 1, 0, 53, 3, 0, 22, 1,255,255, 0, 0,255,255, 0, 0,255, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255, 0, // ุ, + 0,255, 6, 0, 37, 0, 3, 9, 0, 15, 28, 1, 1, 0, 0, 0, 0,255, 0, 26, 8, 15, 3, 0, 4, 7, 2, 48, 0, 7, 2, 10, 77, 38, 26, 0, 13, 1, 3, 0, 26, 1,255, 4, 0,255,255,255,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, // ู, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255, // ฺ, + 20, 0, 41, 1, 31, 0,254, 12, 0, 16, 23, 0, 12, 1, 1, 1, 0, 0, 1, 30, 10, 2, 7, 1,110, 23, 9, 0, 0, 4, 4, 0, 52, 35, 41, 0, 29, 20, 5, 0, 30, 0, 0, 67, 2, 0,130, 0, 84, 35, 40, 50, 0, 0, 20, 56, 0, 0, 0, 0, 0, 0, 0, 8,138,119, 0, 0,142, 0,255, 0, // เ, + 10, 0, 19, 0, 9, 0,170, 5, 0, 3, 2, 0, 4, 0, 0, 0, 0, 0, 0, 9, 3, 0, 2, 0, 36, 8, 6, 0, 0, 1, 0, 0, 12, 13, 20,255, 6, 5, 1, 0, 6, 0, 0, 9, 0, 0, 16, 0, 26, 16, 4, 13, 0, 0, 2, 11, 0, 0, 0, 0,255, 0, 0, 1, 52, 49, 0, 0, 68, 0,255, 0, // แ, + 6, 0, 7, 0, 29, 0, 72, 3, 0, 9, 11, 0, 2, 0, 0, 0, 0, 0, 0, 4, 3, 0, 2, 0, 27, 5, 2, 0,255, 1, 0, 0, 9, 8, 8, 0, 8, 6, 1, 0, 20, 0, 0, 8, 1, 0, 12, 0, 22, 2, 21, 13, 0, 0, 26, 14, 1, 0, 0, 0,255, 0, 0, 0, 11, 10, 0, 0, 33, 0,255, 0, // โ, + 3, 0, 9, 0, 4, 0,114, 4, 0, 1, 0, 0, 6, 0, 2, 0, 0, 0, 0, 10, 3, 1, 0, 0, 27, 5, 2, 0,255, 1, 0, 0, 7, 16, 10,255, 6, 5, 2, 0, 1, 0, 0, 9, 0, 0, 8, 0, 22, 14, 3, 7, 0, 0, 2, 3, 0, 0,255,255, 0, 0, 0, 1, 55, 53, 0, 0, 24,255,255, 0, // ใ, + 5, 0, 8, 0, 4, 0,116, 3, 0, 3, 1, 0, 1, 1, 0, 0, 0, 0, 0, 5, 1, 4, 0, 0, 28, 10, 3, 0,255, 0, 0, 0, 6, 
9, 5,255, 3, 9, 8, 0, 3, 0, 0, 10, 0, 0, 15, 0, 23, 5, 5, 4, 0, 0, 9, 15, 0, 0,255,255,255, 0, 0, 12, 51, 30, 0, 0, 34, 0,255, 0, // ไ, + 0,255, 0,255, 0,255, 12,255,255,255,255,255, 0,255,255,255,255,255, 0, 0, 0, 0, 0, 0, 2, 0, 0,255,255, 0,255,255, 0, 0, 0,255, 0, 0,255, 0, 0,255,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0,255,255,255,255,255,255, 0,255, 0, 0, 0, 0, 0,255,255,255, // ๆ, + 0,255, 25, 6, 4, 0, 0, 7, 0, 7, 21, 0, 0, 0, 0,255,255,255, 0, 16, 5, 0, 2, 0, 2, 1,112, 0, 0, 1, 0, 0, 3, 2, 11,255, 12, 6, 0,255, 0, 7,255, 13, 0,255,255,255,255,255, 0,255,255,255,255, 0, 0, 0,255,255,255,255,255, 0,255,255,255,255,255,255,255,255, // ็, + 0, 0, 28, 36, 44, 5, 7, 4, 0,166, 14,255, 53,255,255,255,255, 0, 0, 3, 64, 4, 15, 0, 19, 8, 5, 12, 4, 4, 1, 0, 69, 28, 23,255, 45, 56, 0,255, 35, 40, 0, 6, 1,255, 0, 35, 0, 0, 43,254, 46,156,119,254, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255, 0, // ่, + 0, 0, 17,126, 42, 0, 0, 91, 0,177, 69, 0, 2,255,255,255,255,255,255,105, 38, 4, 11, 0, 56, 12, 6, 1, 0, 3, 8, 0, 10, 7, 40,255, 32, 11, 0, 0, 7, 61,255, 26, 0,255, 0,125, 0, 0, 14, 49, 25, 28, 16,211,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0,255, 0, // ้, + 0,255, 2, 0, 0,255, 0, 2,255, 0, 0,255, 0, 0,255,255,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0, 0,255,255, 0,255,255, 1, 0, 0, 0, 6, 1,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255, // ๊, + 0,255, 0, 0, 0, 0,255, 1,255, 0, 0,255, 0,255,255,255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0,255,255,255,255, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0, 0,255,255, 0,255,255, 0, 1, 0, 0, 0, 1, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255,255, // ๋, + 0, 0, 6, 1,105, 5, 0, 2, 0, 9, 44, 0, 3, 0, 0, 1, 2, 0, 24, 19, 28, 1, 11, 12, 24, 0, 8,255,255, 4, 1, 0, 2, 46,168,255, 28, 8, 13, 10, 39, 14,255, 0, 1,255, 0,255,255,255, 21, 0,255, 0, 14, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255, // ์, + 0,255, 0, 0, 0,255,255, 0,255, 
/// Translates a pair of character classes into an offset within a flattened
/// pair-probability table.
///
/// The visible caller (`SingleByteData::score`) passes the previous class as
/// `x` and the current class as `y`.
///
/// Layout implied by the arithmetic below:
///
/// * Rows with `y < ascii_classes` hold `non_ascii_classes` entries each and
///   are addressed by `x - ascii_classes`; pairs in which both classes are
///   below `ascii_classes` are not stored at all.
/// * Rows with `y >= ascii_classes` come after those and hold
///   `ascii_classes + non_ascii_classes` entries each, addressed by `x`.
///
/// Returns `None` when both classes are zero or both are below
/// `ascii_classes`, i.e. when the table has no entry for the pair.
#[inline(always)]
fn compute_index(
    x: usize,
    y: usize,
    ascii_classes: usize,
    non_ascii_classes: usize,
) -> Option<usize> {
    let both_zero = x == 0 && y == 0;
    let both_ascii = x < ascii_classes && y < ascii_classes;
    if both_zero || both_ascii {
        return None;
    }

    let offset = if y >= ascii_classes {
        // Skip the narrow rows, then step over whole wide rows.
        let narrow_region = ascii_classes * non_ascii_classes;
        let wide_row = ascii_classes + non_ascii_classes;
        narrow_region + wide_row * (y - ascii_classes) + x
    } else {
        // Narrow row: `x` is known to be >= `ascii_classes` here.
        y * non_ascii_classes + x - ascii_classes
    };
    Some(offset)
}
current_usize == ASCII_DIGIT + || (is_windows_1256 && current_usize == WINDOWS_1256_ZWNJ) + { + // Current is space-like + 0 + } else { + // Current is alphabetic + let previous_unstored = previous_usize - stored_boundary; + match previous_unstored { + PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => 0, + IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => IMPLAUSIBILITY_PENALTY, + IMPLAUSIBLE_BEFORE_ALPHABETIC => IMPLAUSIBILITY_PENALTY, + IMPLAUSIBLE_AFTER_ALPHABETIC => 0, + PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if current_usize < self.ascii { + IMPLAUSIBILITY_PENALTY + } else { + 0 + } + } + PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if current_usize < self.ascii { + 0 + } else { + IMPLAUSIBILITY_PENALTY + } + } + _ => { + debug_assert_eq!(previous_usize, ASCII_DIGIT); + 0 + } + } + } + } + } else { + if previous_usize < stored_boundary { + // Current above, prev below + if previous_usize == 0 + || previous_usize == ASCII_DIGIT + || (is_windows_1256 && previous_usize == WINDOWS_1256_ZWNJ) + { + // Previous is space-like + 0 + } else { + // Current is alphabetic + let current_unstored = current_usize - stored_boundary; + match current_unstored { + PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => 0, + IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => IMPLAUSIBILITY_PENALTY, + IMPLAUSIBLE_BEFORE_ALPHABETIC => 0, + IMPLAUSIBLE_AFTER_ALPHABETIC => IMPLAUSIBILITY_PENALTY, + PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if previous_usize < self.ascii { + IMPLAUSIBILITY_PENALTY + } else { + 0 + } + } + PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if previous_usize < self.ascii { + 0 + } else { + IMPLAUSIBILITY_PENALTY + } + } + _ => { + debug_assert_eq!(current_usize, ASCII_DIGIT); + 0 + } + } + } + } else if current_usize == ASCII_DIGIT || previous_usize == ASCII_DIGIT { + 0 + } else { + // Both above + IMPLAUSIBILITY_PENALTY + } + } + } +} + +impl PartialEq for SingleByteData { + #[inline] + fn eq(&self, other: 
&SingleByteData) -> bool { + (self as *const SingleByteData) == (other as *const SingleByteData) + } +} + +pub static SINGLE_BYTE_DATA: [SingleByteData; 20] = [ + SingleByteData { + encoding: &WINDOWS_1258_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1258, + probabilities: &DETECTOR_DATA.vietnamese, + ascii: VIETNAMESE_ASCII, + non_ascii: VIETNAMESE_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1250_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1250, + probabilities: &DETECTOR_DATA.central, + ascii: CENTRAL_ASCII, + non_ascii: CENTRAL_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_2_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.iso_8859_2, + probabilities: &DETECTOR_DATA.central, + ascii: CENTRAL_ASCII, + non_ascii: CENTRAL_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1251_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1251, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &KOI8_U_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.koi8_u, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_5_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_5, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &IBM866_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.ibm866, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1252_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1252, + probabilities: &DETECTOR_DATA.western, + ascii: WESTERN_ASCII, + non_ascii: WESTERN_NON_ASCII, + }, + 
SingleByteData { + encoding: &WINDOWS_1252_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1252_icelandic, + probabilities: &DETECTOR_DATA.icelandic, + ascii: ICELANDIC_ASCII, + non_ascii: ICELANDIC_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1253_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1253, + probabilities: &DETECTOR_DATA.greek, + ascii: GREEK_ASCII, + non_ascii: GREEK_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_7_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_7, + probabilities: &DETECTOR_DATA.greek, + ascii: GREEK_ASCII, + non_ascii: GREEK_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1254_INIT, + lower: &DETECTOR_DATA.turkish_ascii, + upper: &DETECTOR_DATA.windows_1254, + probabilities: &DETECTOR_DATA.turkish, + ascii: TURKISH_ASCII, + non_ascii: TURKISH_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1255_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1255, + probabilities: &DETECTOR_DATA.hebrew, + ascii: HEBREW_ASCII, + non_ascii: HEBREW_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_8_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_8, + probabilities: &DETECTOR_DATA.hebrew, + ascii: HEBREW_ASCII, + non_ascii: HEBREW_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1256_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1256, + probabilities: &DETECTOR_DATA.arabic, + ascii: ARABIC_ASCII, + non_ascii: ARABIC_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_6_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_6, + probabilities: &DETECTOR_DATA.arabic, + ascii: ARABIC_ASCII, + non_ascii: ARABIC_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1257_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1257, + probabilities: &DETECTOR_DATA.baltic, + 
// Positions of the individual entries inside `SINGLE_BYTE_DATA`.
//
// Each constant is the array index of the entry for the encoding it names,
// so the values must be kept in sync with the order of the array literal.
// `WINDOWS_1252_ICELANDIC_INDEX` refers to the second windows-1252 entry,
// the one that uses the Icelandic class and probability tables.
pub const WINDOWS_1258_INDEX: usize = 0;
pub const WINDOWS_1250_INDEX: usize = 1;
pub const ISO_8859_2_INDEX: usize = 2;
pub const WINDOWS_1251_INDEX: usize = 3;
pub const KOI8_U_INDEX: usize = 4;
pub const ISO_8859_5_INDEX: usize = 5;
pub const IBM866_INDEX: usize = 6;
pub const WINDOWS_1252_INDEX: usize = 7;
pub const WINDOWS_1252_ICELANDIC_INDEX: usize = 8;
pub const WINDOWS_1253_INDEX: usize = 9;
pub const ISO_8859_7_INDEX: usize = 10;
pub const WINDOWS_1254_INDEX: usize = 11;
pub const WINDOWS_1255_INDEX: usize = 12;
pub const ISO_8859_8_INDEX: usize = 13;
pub const WINDOWS_1256_INDEX: usize = 14;
pub const ISO_8859_6_INDEX: usize = 15;
pub const WINDOWS_1257_INDEX: usize = 16;
pub const ISO_8859_13_INDEX: usize = 17;
pub const ISO_8859_4_INDEX: usize = 18;
pub const WINDOWS_874_INDEX: usize = 19;
+// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! `chardetng` is a character encoding detector for legacy Web content. +//! +//! It is optimized for binary size in applications that already depend +//! on `encoding_rs` for other reasons. + +use encoding_rs::Decoder; +use encoding_rs::DecoderResult; +use encoding_rs::Encoding; +use encoding_rs::BIG5; +use encoding_rs::EUC_JP; +use encoding_rs::EUC_KR; +use encoding_rs::GBK; +use encoding_rs::ISO_2022_JP; +use encoding_rs::ISO_8859_8; +use encoding_rs::SHIFT_JIS; +use encoding_rs::UTF_8; +use encoding_rs::WINDOWS_1255; + +mod data; +mod tld; +use data::*; +use tld::classify_tld; +use tld::Tld; + +const LATIN_ADJACENCY_PENALTY: i64 = -50; + +const IMPLAUSIBILITY_PENALTY: i64 = -220; + +const ORDINAL_BONUS: i64 = 300; + +/// Must match the ISO-8859-2 score for " Š ". Note: There +/// are four Slovenian Wikipedia list page titles where the +/// list is split by letter so that Š stands alone for the +/// list part for Š. Let's assume that's a special case not +/// worth detecting even though the copyright sign detection +/// makes Slovenian title detection round to one percentage +/// point worse. 
+const COPYRIGHT_BONUS: i64 = 222; + +const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180; + +const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40; + +const NON_LATIN_ALL_CAPS_PENALTY: i64 = -40; + +const NON_LATIN_MIXED_CASE_PENALTY: i64 = -20; + +// Manually calibrated relative to windows-1256 Arabic +const CJK_BASE_SCORE: i64 = 41; + +const CJK_SECONDARY_BASE_SCORE: i64 = 20; // Was 20 + +const SHIFT_JIS_SCORE_PER_KANA: i64 = 20; + +const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE; + +const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE; + +// Manually calibrated relative to windows-1256 Persian and Urdu +const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75; + +const HALF_WIDTH_KATAKANA_SCORE: i64 = 1; + +// Unclear if this is a good idea; seems not harmful, but can't be sure. +const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10; + +const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger? + +const SHIFT_JIS_EXTENSION_PENALTY: i64 = SHIFT_JIS_PUA_PENALTY * 2; + +const SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY: i64 = SHIFT_JIS_EXTENSION_PENALTY; + +const EUC_JP_SCORE_PER_KANA: i64 = CJK_BASE_SCORE + (CJK_BASE_SCORE / 3); // Relative to Big5 + +const EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA: i64 = CJK_BASE_SCORE - 1; + +const EUC_JP_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE; + +const EUC_JP_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE; + +const EUC_JP_SCORE_PER_OTHER_KANJI: i64 = CJK_SECONDARY_BASE_SCORE / 4; + +const EUC_JP_INITIAL_KANA_PENALTY: i64 = -((CJK_BASE_SCORE / 3) + 1); + +const EUC_JP_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 50); // Needs to be more severe than for Shift_JIS to avoid misdetecting EUC-KR! + +const BIG5_SCORE_PER_LEVEL_1_HANZI: i64 = CJK_BASE_SCORE; + +const BIG5_SCORE_PER_OTHER_HANZI: i64 = CJK_SECONDARY_BASE_SCORE; + +const BIG5_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 30); // More severe than other PUA penalties to avoid misdetecting EUC-KR! 
/// Reports whether `label` contains any byte that is non-ASCII (`>= 0x80`),
/// an ASCII period (`.`), or an ASCII upper-case letter (`A`–`Z`).
///
/// Returns `false` for an empty slice.
// NOTE(review): `label` is presumably a top-level-domain label that is
// expected to arrive lower-cased and without dots — confirm against callers.
fn contains_upper_case_period_or_non_ascii(label: &[u8]) -> bool {
    label
        .iter()
        .any(|&b| b >= 0x80 || b == b'.' || b.is_ascii_uppercase())
}
/// Per-word casing state tracked by `NonLatinCasedCandidate::feed`.
///
/// Transitions implemented by `feed`:
/// * `Space`: no non-Latin letter seen in the current word yet.
/// * `Upper`: exactly one upper-case letter so far.
/// * `Lower`: the word started with a lower-case letter.
/// * `UpperLower`: one upper-case letter followed by lower-case letters.
/// * `AllCaps`: two or more upper-case letters and nothing else.
/// * `Mix`: any other combination, or the word contains an ASCII Latin letter.
#[derive(PartialEq)]
enum NonLatinCaseState {
    Space,
    Upper,
    Lower,
    UpperLower,
    AllCaps,
    Mix,
}

/// Scoring state for one single-byte candidate encoding that is handled with
/// the non-Latin casing rules in `feed`.
struct NonLatinCasedCandidate {
    // Classification and pair-probability tables of the candidate encoding.
    data: &'static SingleByteData,
    // Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    // Casing state of the word currently being read.
    case_state: NonLatinCaseState,
    // Whether the previously fed byte was below 0x80 (true before any input).
    prev_ascii: bool,
    // Number of consecutive non-Latin alphabetic bytes in the current word.
    current_word_len: u64,
    // Largest `current_word_len` observed at a word end so far.
    longest_word: u64,
    // True when `data` is the IBM866 entry of `SINGLE_BYTE_DATA`.
    ibm866: bool,
    prev_was_a0: bool, // Only used with IBM866
}

impl NonLatinCasedCandidate {
    /// Creates a candidate in its initial state: no previous byte, casing
    /// state `Space`, and zeroed word-length counters.
    fn new(data: &'static SingleByteData) -> Self {
        NonLatinCasedCandidate {
            data: data,
            prev: 0,
            case_state: NonLatinCaseState::Space,
            prev_ascii: true,
            current_word_len: 0,
            longest_word: 0,
            // `SingleByteData` equality is pointer identity, so this checks
            // whether `data` is the IBM866 table entry itself.
            ibm866: data == &SINGLE_BYTE_DATA[IBM866_INDEX],
            prev_was_a0: false,
        }
    }

    /// Feeds `buffer` to the candidate and returns the score contributed by
    /// these bytes.
    ///
    /// The returned value starts from zero on every call; the state carried
    /// between calls lives in `self`. Returns `None` as soon as a byte
    /// classifies as 255; bytes before it have already updated `self`.
    // NOTE(review): class 255 presumably marks a byte that cannot occur in
    // this encoding — confirm against the classification tables.
    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
        let mut score = 0i64;
        for &b in buffer {
            let class = self.data.classify(b);
            if class == 255 {
                return None;
            }
            // The low seven bits are the class; the high bit is the case flag
            // (clear means lower case, see the branches below).
            let caseless_class = class & 0x7F;

            let ascii = b < 0x80;
            let ascii_pair = self.prev_ascii && ascii;

            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);

            // The purpose of this state machine is to avoid misdetecting Greek as
            // Cyrillic by:
            //
            // * Giving a small bonus to words that start with an upper-case letter
            //   and are lower-case for the rest.
            // * Giving a large penalty to start with one lower-case letter followed
            //   by all upper-case (obviously upper and lower case inverted, which
            //   unfortunately is possible due to KOI8-U).
            // * Giving a small per-word penalty to all-uppercase KOI8-U (to favor
            //   all-lowercase Greek over all-caps KOI8-U).
            // * Giving large penalties for mixed-case other than initial upper-case.
            //   This also helps relative to non-cased encodings.

            // ASCII doesn't participate in non-Latin casing.
            if caseless_class == LATIN_LETTER {
                // Latin
                // Mark this word as a mess. If there end up being non-Latin
                // letters in this word, the ASCII-adjacency penalty gets
                // applied to Latin/non-Latin pairs and the mix penalty
                // to non-Latin/non-Latin pairs.
                // XXX Apply penalty here
                self.case_state = NonLatinCaseState::Mix;
            } else if !non_ascii_alphabetic {
                // Space
                // A word just ended: settle its casing bonus or penalty. This
                // runs before `current_word_len` is reset further down, so
                // the length used here is that of the word that just ended.
                match self.case_state {
                    NonLatinCaseState::Space
                    | NonLatinCaseState::Upper
                    | NonLatinCaseState::Lower => {}
                    NonLatinCaseState::UpperLower => {
                        // Intentionally applied only once per word.
                        score += NON_LATIN_CAPITALIZATION_BONUS;
                    }
                    NonLatinCaseState::AllCaps => {
                        // Intentionally applied only once per word.
                        if self.data == &SINGLE_BYTE_DATA[KOI8_U_INDEX] {
                            // Apply only to KOI8-U.
                            score += NON_LATIN_ALL_CAPS_PENALTY;
                        }
                    }
                    NonLatinCaseState::Mix => {
                        // Per letter
                        score += NON_LATIN_MIXED_CASE_PENALTY * (self.current_word_len as i64);
                    }
                }
                self.case_state = NonLatinCaseState::Space;
            } else if (class >> 7) == 0 {
                // Lower case
                match self.case_state {
                    NonLatinCaseState::Space => {
                        self.case_state = NonLatinCaseState::Lower;
                    }
                    NonLatinCaseState::Upper => {
                        self.case_state = NonLatinCaseState::UpperLower;
                    }
                    NonLatinCaseState::Lower
                    | NonLatinCaseState::UpperLower
                    | NonLatinCaseState::Mix => {}
                    NonLatinCaseState::AllCaps => {
                        self.case_state = NonLatinCaseState::Mix;
                    }
                }
            } else {
                // Upper case
                match self.case_state {
                    NonLatinCaseState::Space => {
                        self.case_state = NonLatinCaseState::Upper;
                    }
                    NonLatinCaseState::Upper => {
                        self.case_state = NonLatinCaseState::AllCaps;
                    }
                    NonLatinCaseState::Lower | NonLatinCaseState::UpperLower => {
                        self.case_state = NonLatinCaseState::Mix;
                    }
                    NonLatinCaseState::AllCaps | NonLatinCaseState::Mix => {}
                }
            }

            // XXX Apply penalty if > 16
            // Word-length bookkeeping: only non-Latin letters are counted.
            if non_ascii_alphabetic {
                self.current_word_len += 1;
            } else {
                if self.current_word_len > self.longest_word {
                    self.longest_word = self.current_word_len;
                }
                self.current_word_len = 0;
            }

            let is_a0 = b == 0xA0;
            // Pairs where both bytes are ASCII never contribute to the score.
            if !ascii_pair {
                // 0xA0 is no-break space in many other encodings, so avoid
                // assigning score to IBM866 when 0xA0 occurs next to itself
                // or a space-like byte.
                if !(self.ibm866
                    && ((is_a0 && (self.prev_was_a0 || self.prev == 0))
                        || caseless_class == 0 && self.prev_was_a0))
                {
                    score += self.data.score(caseless_class, self.prev, false);
                }

                // Penalize an ASCII Latin letter directly next to a non-Latin
                // letter, in either order.
                if self.prev == LATIN_LETTER && non_ascii_alphabetic {
                    score += LATIN_ADJACENCY_PENALTY;
                } else if caseless_class == LATIN_LETTER
                    && self.data.is_non_latin_alphabetic(self.prev, false)
                {
                    score += LATIN_ADJACENCY_PENALTY;
                }
            }

            self.prev_ascii = ascii;
            self.prev = caseless_class;
            self.prev_was_a0 = is_a0;
        }
        Some(score)
    }
}
Vietnamese-only characters and word length > 7, + // apply penalty + + if !self.data.is_latin_alphabetic(caseless_class) { + self.case_state = LatinCaseState::Space; + } else if (class >> 7) == 0 { + // Penalizing lower case after two upper case + // is important for avoiding misdetecting + // windows-1250 as windows-1252 (byte 0x9F). + if self.case_state == LatinCaseState::AllCaps && !ascii_pair { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + self.case_state = LatinCaseState::Lower; + } else { + match self.case_state { + LatinCaseState::Space => { + self.case_state = LatinCaseState::Upper; + } + LatinCaseState::Upper | LatinCaseState::AllCaps => { + self.case_state = LatinCaseState::AllCaps; + } + LatinCaseState::Lower => { + if !ascii_pair { + // XXX How bad is this for Irish Gaelic? + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + self.case_state = LatinCaseState::Upper; + } + } + } + + // Treat pairing space-like, which can be non-ASCII, with ASCII as + // ASCIIish enough not to get a score in order to avoid giving + // ASCII i and I in windows-1254 next to windows-125x apostrophe/quote + // a score. This avoids detecting English I’ as Turkish. 
+ let ascii_ish_pair = ascii_pair + || (ascii && self.prev == 0) + || (caseless_class == 0 && self.prev_non_ascii == 0); + + if !ascii_ish_pair { + score += self.data.score(caseless_class, self.prev, false); + } + + if self.windows1252 { + // This state machine assigns score to the sequences + // * " º " (Spanish) + // * " ª " (Spanish) + // * ".ª " (Spanish) + // * ".º " (Spanish) + // * "n.º1" (Spanish) + // * " Mª " (Spanish) + // * " Dª " (Spanish) + // * " Nª " (Spanish) + // * " Sª " (Spanish) + // * " 3º " (Italian, where 3 is an ASCII digit) + // * " 3ª " (Italian, where 3 is an ASCII digit) + // * " Xº " (Italian, where X is a small Roman numeral) + // * " Xª " (Italian, where X is a small Roman numeral) + // * " Nº1" (Italian, where 1 is an ASCII digit) + // * " Nº " (Italian) + // * " © " (otherwise ASCII-only) + // which are problematic to deal with by pairwise scoring + // without messing up Romanian detection. + // Initial sc + match self.ordinal_state { + OrdinalState::Other => { + if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } + } + OrdinalState::Space => { + if caseless_class == 0 { + // pass + } else if b == 0xAA || b == 0xBA { + self.ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if b == b'M' || b == b'D' || b == b'S' { + self.ordinal_state = OrdinalState::FeminineAbbreviationStartLetter; + } else if b == b'N' { + // numero or Nuestra + self.ordinal_state = OrdinalState::UpperN; + } else if b == b'n' { + // numero + self.ordinal_state = OrdinalState::LowerN; + } else if caseless_class == (ASCII_DIGIT as u8) { + self.ordinal_state = OrdinalState::Digit; + } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 + /* X */ + { + self.ordinal_state = OrdinalState::Roman; + } else if b == 0xA9 { + self.ordinal_state = OrdinalState::Copyright; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpace => { + if caseless_class == 0 { + 
score += ORDINAL_BONUS; + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpaceUndoImplausibility => { + if caseless_class == 0 { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpaceOrDigit => { + if caseless_class == 0 { + score += ORDINAL_BONUS; + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == (ASCII_DIGIT as u8) { + score += ORDINAL_BONUS; + // Deliberately set to `Other` + self.ordinal_state = OrdinalState::Other; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily => { + if caseless_class == 0 { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == (ASCII_DIGIT as u8) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + // Deliberately set to `Other` + self.ordinal_state = OrdinalState::Other; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::UpperN => { + if b == 0xAA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if b == 0xBA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if b == b'.' { + self.ordinal_state = OrdinalState::PeriodAfterN; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::LowerN => { + if b == 0xBA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if b == b'.' 
{ + self.ordinal_state = OrdinalState::PeriodAfterN; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::FeminineAbbreviationStartLetter => { + if b == 0xAA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::Digit => { + if b == 0xAA || b == 0xBA { + self.ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == (ASCII_DIGIT as u8) { + // pass + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::Roman => { + if b == 0xAA || b == 0xBA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 + /* X */ + { + // pass + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::PeriodAfterN => { + if b == 0xBA { + self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::Copyright => { + if caseless_class == 0 { + score += COPYRIGHT_BONUS; + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + } + } + + if ascii { + self.prev_non_ascii = 0; + } else { + self.prev_non_ascii += 1; + } + self.prev = caseless_class; + } + Some(score) + } +} + +struct ArabicFrenchCandidate { + data: &'static SingleByteData, + prev: u8, + case_state: LatinCaseState, + prev_ascii: bool, + current_word_len: u64, + longest_word: u64, +} + +impl 
ArabicFrenchCandidate {
+    /// Creates a candidate with empty history: no previous class, the
+    /// previous byte treated as ASCII, and the case tracker in the `Space`
+    /// state.
+    fn new(data: &'static SingleByteData) -> Self {
+        Self {
+            data,
+            prev: 0,
+            prev_ascii: true,
+            case_state: LatinCaseState::Space,
+            longest_word: 0,
+            current_word_len: 0,
+        }
+    }
+
+    /// Scores `buffer` byte by byte and returns the score accumulated for
+    /// this call.
+    ///
+    /// Returns `None` as soon as `classify` reports class 255 for a byte.
+    /// The previous class, the ASCII flag, the case state and the word-length
+    /// counters persist between calls.
+    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
+        let mut total = 0i64;
+        for &byte in buffer {
+            let class = self.data.classify(byte);
+            if class == 255 {
+                return None;
+            }
+            // The low seven bits are the class proper; the top bit flags
+            // upper case.
+            let caseless_class = class & 0x7F;
+            let upper_case = (class >> 7) != 0;
+
+            let ascii = byte < 0x80;
+            let ascii_pair = ascii && self.prev_ascii;
+
+            // Case penalties are computed for the Latin (French) letters
+            // only; anything else resets the tracker.
+            let mut implausible_case = false;
+            self.case_state = if caseless_class != LATIN_LETTER {
+                LatinCaseState::Space
+            } else if !upper_case {
+                // Lower case directly after an all-caps run.
+                implausible_case = self.case_state == LatinCaseState::AllCaps;
+                LatinCaseState::Lower
+            } else {
+                match self.case_state {
+                    LatinCaseState::Space => LatinCaseState::Upper,
+                    LatinCaseState::Upper | LatinCaseState::AllCaps => LatinCaseState::AllCaps,
+                    LatinCaseState::Lower => {
+                        // Upper case directly after lower case.
+                        implausible_case = true;
+                        LatinCaseState::Upper
+                    }
+                }
+            };
+            if implausible_case && !ascii_pair {
+                total += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
+            }
+
+            // Only the Arabic word length is counted; French is ignored.
+            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, true);
+            // XXX apply penalty if > 23
+            if non_ascii_alphabetic {
+                self.current_word_len += 1;
+            } else {
+                // A word just ended: remember it if it is the longest so far.
+                self.longest_word = self.longest_word.max(self.current_word_len);
+                self.current_word_len = 0;
+            }
+
+            if !ascii_pair {
+                total += self.data.score(caseless_class, self.prev, true);
+
+                // A Latin letter directly next to a non-Latin letter, in
+                // either order, is penalized once.
+                let latin_then_non_latin = self.prev == LATIN_LETTER && non_ascii_alphabetic;
+                if latin_then_non_latin
+                    || (caseless_class == LATIN_LETTER
+                        && self.data.is_non_latin_alphabetic(self.prev, true))
+                {
+                    total += LATIN_ADJACENCY_PENALTY;
+                }
+            }
+
+            self.prev = caseless_class;
+            self.prev_ascii = ascii;
+        }
+        Some(total)
+    }
+}
+
+/// Scoring state for a single-byte candidate that ignores the case bit of
+/// each byte's class.
+struct CaselessCandidate {
+    data: &'static SingleByteData,
+    prev: u8,
+    prev_ascii: bool,
+    current_word_len: u64,
+    longest_word: u64,
+}
+
+impl CaselessCandidate {
+    /// Creates a candidate with empty history (previous class 0, previous
+    /// byte treated as ASCII, no word seen yet).
+    fn new(data: &'static SingleByteData) -> Self {
+        Self {
+            data,
+            prev: 0,
+            prev_ascii: true,
+            longest_word: 0,
+            current_word_len: 0,
+        }
+    }
+
+    /// Scores `buffer` and returns the score accumulated for this call, or
+    /// `None` as soon as `classify` reports class 255 for a byte.
+    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
+        let mut total = 0i64;
+        for &byte in buffer {
+            let class = self.data.classify(byte);
+            if class == 255 {
+                return None;
+            }
+            let caseless_class = class & 0x7F;
+
+            let is_ascii = byte < 0x80;
+            let both_ascii = is_ascii && self.prev_ascii;
+
+            let in_word = self.data.is_non_latin_alphabetic(caseless_class, false);
+            // Apply penalty if > 23 and not Thai
+            if in_word {
+                self.current_word_len += 1;
+            } else {
+                // The word (if any) just ended; keep the longest length seen.
+                self.longest_word = self.longest_word.max(self.current_word_len);
+                self.current_word_len = 0;
+            }
+
+            if !both_ascii {
+                total += self.data.score(caseless_class, self.prev, false);
+
+                // Latin letter next to a non-Latin letter, in either order.
+                let mixes_scripts = (self.prev == LATIN_LETTER && in_word)
+                    || (caseless_class == LATIN_LETTER
+                        && self.data.is_non_latin_alphabetic(self.prev, false));
+                if mixes_scripts {
+                    total += LATIN_ADJACENCY_PENALTY;
+                }
+            }
+
+            self.prev = caseless_class;
+            self.prev_ascii = is_ascii;
+        }
+        Some(total)
+    }
+}
+
+/// Returns `true` for the ASCII bytes `.`, `,`, `:`, `;`, `?` and `!`.
+fn is_ascii_punctuation(byte: u8) -> bool {
+    b".,:;?!".contains(&byte)
+}
+
+/// Like `CaselessCandidate`, but additionally counts ASCII punctuation that
+/// directly follows a non-Latin letter.
+struct LogicalCandidate {
+    data: &'static SingleByteData,
+    prev: u8,
+    prev_ascii: bool,
+    plausible_punctuation: u64,
+    current_word_len: u64,
+    longest_word: u64,
+}
+
+impl LogicalCandidate {
+    /// Creates a candidate with empty history and all counters at zero.
+    fn new(data: &'static SingleByteData) -> Self {
+        Self {
+            data,
+            prev: 0,
+            prev_ascii: true,
+            plausible_punctuation: 0,
+            longest_word: 0,
+            current_word_len: 0,
+        }
+    }
+
+    /// Scores `buffer` and returns the score accumulated for this call, or
+    /// `None` as soon as `classify` reports class 255 for a byte.
+    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
+        let mut total = 0i64;
+        for &byte in buffer {
+            let class = self.data.classify(byte);
+            if class == 255 {
+                return None;
+            }
+            let caseless_class = class & 0x7F;
+
+            let is_ascii = byte < 0x80;
+            let both_ascii = is_ascii && self.prev_ascii;
+
+            let in_word = self.data.is_non_latin_alphabetic(caseless_class, false);
+            // XXX apply penalty if > 22
+            if in_word {
+                self.current_word_len += 1;
+            } else {
+                self.longest_word = self.longest_word.max(self.current_word_len);
+                self.current_word_len = 0;
+            }
+
+            if !both_ascii {
+                total += self.data.score(caseless_class, self.prev, false);
+
+                let prev_in_word = self.data.is_non_latin_alphabetic(self.prev, false);
+                // Punctuation right after a non-Latin letter.
+                if caseless_class == 0 && prev_in_word && is_ascii_punctuation(byte) {
+                    self.plausible_punctuation += 1;
+                }
+
+                // Latin letter next to a non-Latin letter, in either order.
+                let mixes_scripts = (self.prev == LATIN_LETTER && in_word)
+                    || (caseless_class == LATIN_LETTER && prev_in_word);
+                if mixes_scripts {
+                    total += LATIN_ADJACENCY_PENALTY;
+                }
+            }
+
+            self.prev = caseless_class;
+            self.prev_ascii = is_ascii;
+        }
+        Some(total)
+    }
+}
+
+/// Like `CaselessCandidate`, but additionally counts non-Latin letters that
+/// directly follow ASCII punctuation.
+struct VisualCandidate {
+    data: &'static SingleByteData,
+    prev: u8,
+    prev_ascii: bool,
+    prev_punctuation: bool,
+    plausible_punctuation: u64,
+    current_word_len: u64,
+    longest_word: u64,
+}
+
+impl VisualCandidate {
+    /// Creates a candidate with empty history and all counters at zero.
+    fn new(data: &'static SingleByteData) -> Self {
+        Self {
+            data,
+            prev: 0,
+            prev_ascii: true,
+            prev_punctuation: false,
+            plausible_punctuation: 0,
+            longest_word: 0,
+            current_word_len: 0,
+        }
+    }
+
+    /// Scores `buffer` and returns the score accumulated for this call, or
+    /// `None` as soon as `classify` reports class 255 for a byte.
+    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
+        let mut total = 0i64;
+        for &byte in buffer {
+            let class = self.data.classify(byte);
+            if class == 255 {
+                return None;
+            }
+            let caseless_class = class & 0x7F;
+
+            let is_ascii = byte < 0x80;
+            let both_ascii = is_ascii && self.prev_ascii;
+
+            let in_word = self.data.is_non_latin_alphabetic(caseless_class, false);
+            // XXX apply penalty if > 22
+            if in_word {
+                self.current_word_len += 1;
+            } else {
+                self.longest_word = self.longest_word.max(self.current_word_len);
+                self.current_word_len = 0;
+            }
+
+            if !both_ascii {
+                total += self.data.score(caseless_class, self.prev, false);
+
+                // A non-Latin letter right after punctuation.
+                if in_word && self.prev_punctuation {
+                    self.plausible_punctuation += 1;
+                }
+
+                // Latin letter next to a non-Latin letter, in either order.
+                let mixes_scripts = (self.prev == LATIN_LETTER && in_word)
+                    || (caseless_class == LATIN_LETTER
+                        && self.data.is_non_latin_alphabetic(self.prev, false));
+                if mixes_scripts {
+                    total += LATIN_ADJACENCY_PENALTY;
+                }
+            }
+
+            self.prev = caseless_class;
+            self.prev_ascii = is_ascii;
+            self.prev_punctuation = caseless_class == 0 && is_ascii_punctuation(byte);
+        }
+        Some(total)
+    }
+}
+
+/// Candidate that only checks whether its decoder accepts the input; it
+/// never contributes a score.
+struct Utf8Candidate {
+    decoder: Decoder,
+}
+
+impl Utf8Candidate {
+    /// Runs `buffer` through the decoder, discarding the output.
+    ///
+    /// Returns `Some(0)` once the input is consumed and `None` if the
+    /// decoder reports a malformed sequence.
+    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
+        let mut scratch = [0u8; 1024];
+        let mut consumed = 0;
+        loop {
+            let (status, read, _) = self.decoder.decode_to_utf8_without_replacement(
+                &buffer[consumed..],
+                &mut scratch,
+                last,
+            );
+            consumed += read;
+            match status {
+                // The scratch buffer filled up; go around again with the
+                // remaining input.
+                DecoderResult::OutputFull => {}
+                DecoderResult::InputEmpty => return Some(0),
+                DecoderResult::Malformed(_, _) => return None,
+            }
+        }
+    }
+}
+
+struct Iso2022Candidate {
+    decoder: Decoder,
+}
+
+impl Iso2022Candidate {
+    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
+        let mut dst = [0u16; 1024];
+        let mut total_read = 0;
+        // Only validity matters here: the decoded output in `dst` is never
+        // read and the score is always zero.
+        loop {
+            let (result, read, _) = self.decoder.decode_to_utf16_without_replacement(
+                &buffer[total_read..],
+                &mut dst,
+                last,
+            );
+            total_read += read;
+            match result {
+                DecoderResult::InputEmpty => {
+                    return Some(0);
+                }
+                DecoderResult::Malformed(_, _) => {
+                    return None;
+                }
+                DecoderResult::OutputFull => {
+                    // `dst` filled up; decode the rest of the input.
+                    continue;
+                }
+            }
+        }
+    }
+}
+
+/// Coarse class of the previously decoded character, used by the Chinese and
+/// Japanese candidates to penalize ASCII letters directly next to CJ text.
+#[derive(PartialEq)]
+enum LatinCj {
+    AsciiLetter,
+    Cj,
+    Other,
+}
+
+/// Which voicing marks (U+FF9E / U+FF9F) are accepted after the previously
+/// decoded character without an implausibility penalty.
+#[derive(PartialEq, Copy, Clone)]
+enum HalfWidthKatakana {
+    DakutenForbidden,
+    DakutenAllowed,
+    DakutenOrHandakutenAllowed,
+}
+
+/// Coarse class of the previously decoded character for the Korean
+/// candidate.
+#[derive(PartialEq)]
+enum LatinKorean {
+    AsciiLetter,
+    Hangul,
+    Hanja,
+    Other,
+}
+
+/// Returns a bonus for `u` based on its position in `table`.
+///
+/// The bonus is `(128 - position) / 16`, i.e. 8 for the first entry falling
+/// to 0 for the last sixteen entries, and 0 when `u` is not in the table.
+fn cjk_extra_score(u: u16, table: &'static [u16; 128]) -> i64 {
+    if let Some(pos) = table.iter().position(|&x| x == u) {
+        ((128 - pos) / 16) as i64
+    } else {
+        0
+    }
+}
+
+/// Scoring state for the GBK candidate.
+struct GbkCandidate {
+    decoder: Decoder,
+    // The input byte processed in the previous loop iteration.
+    prev_byte: u8,
+    // Class of the previously scored character.
+    prev: LatinCj,
+    // Score withheld by `maybe_set_as_pending` until the next character
+    // either confirms it (added to the total) or discards it.
+    pending_score: Option<i64>,
+}
+
+impl GbkCandidate {
+    /// Returns `s` right away when the previous character was CJ or the
+    /// previous byte is not a `more_problematic_lead`; otherwise stores `s`
+    /// as the pending score and returns 0.
+    ///
+    /// Panics if a pending score is already stored.
+    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
+        assert!(self.pending_score.is_none());
+        if self.prev == LatinCj::Cj || !more_problematic_lead(self.prev_byte) {
+            s
+        } else {
+            self.pending_score = Some(s);
+            0
+        }
+    }
+
+    /// Feeds `buffer` to the decoder one byte at a time and returns the
+    /// score accumulated for this call.
+    ///
+    /// Returns `None` when the decoder reports a malformed sequence that is
+    /// not one of the tolerated single-byte extension patterns, or when
+    /// `last` is set and flushing the decoder reports an error.
+    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
+        let mut score = 0i64;
+        let mut src = [0u8];
+        let mut dst = [0u16; 2];
+        for &b in buffer {
+            src[0] = b;
+            let (result, read, written) = self
+                .decoder
+                .decode_to_utf16_without_replacement(&src, &mut dst, false);
+            if written == 1 {
+                let u = dst[0];
+                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
+                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
+                {
+                    self.pending_score = None; // Discard pending score
+                    if self.prev == LatinCj::Cj {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::AsciiLetter;
+                } else if u == 0x20AC {
+                    // euro sign
+                    self.pending_score = None; // Discard pending score
+                    // Should there even be a penalty?
+                    self.prev = LatinCj::Other;
+                } else if u >= 0x4E00 && u <= 0x9FA5 {
+                    if let Some(pending) = self.pending_score {
+                        score += pending;
+                        self.pending_score = None;
+                    }
+                    if b >= 0xA1 && b <= 0xFE {
+                        // The trail byte is in 0xA1..=0xFE; the lead byte
+                        // then selects the score tier.
+                        match self.prev_byte {
+                            0xA1..=0xD7 => {
+                                score += GBK_SCORE_PER_LEVEL_1;
+                                score +=
+                                    cjk_extra_score(u, &data::DETECTOR_DATA.frequent_simplified);
+                            }
+                            0xD8..=0xFE => score += GBK_SCORE_PER_LEVEL_2,
+                            _ => {
+                                score += GBK_SCORE_PER_NON_EUC;
+                            }
+                        }
+                    } else {
+                        score += self.maybe_set_as_pending(GBK_SCORE_PER_NON_EUC);
+                    }
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
+                    if let Some(pending) = self.pending_score {
+                        score += pending;
+                        self.pending_score = None;
+                    }
+                    // XXX score?
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else if u >= 0xE000 && u < 0xF900 {
+                    if let Some(pending) = self.pending_score {
+                        score += pending;
+                        self.pending_score = None;
+                    }
+                    // Treat the GB18030-required PUA mappings as non-EUC ideographs.
+                    match u {
+                        0xE78D..=0xE796
+                        | 0xE816..=0xE818
+                        | 0xE81E
+                        | 0xE826
+                        | 0xE82B
+                        | 0xE82C
+                        | 0xE831
+                        | 0xE832
+                        | 0xE83B
+                        | 0xE843
+                        | 0xE854
+                        | 0xE855
+                        | 0xE864 => {
+                            score += GBK_SCORE_PER_NON_EUC;
+                            if self.prev == LatinCj::AsciiLetter {
+                                score += CJK_LATIN_ADJACENCY_PENALTY;
+                            }
+                            self.prev = LatinCj::Cj;
+                        }
+                        _ => {
+                            score += GBK_PUA_PENALTY;
+                            self.prev = LatinCj::Other;
+                        }
+                    }
+                } else {
+                    match u {
+                        0x3000 // Distinct from Korean, space
+                        | 0x3001 // Distinct from Korean, enumeration comma
+                        | 0x3002 // Distinct from Korean, full stop
+                        | 0xFF08 // Distinct from Korean, parenthesis
+                        | 0xFF09 // Distinct from Korean, parenthesis
+                        | 0xFF01 // Distinct from Japanese, exclamation
+                        | 0xFF0C // Distinct from Japanese, comma
+                        | 0xFF1B // Distinct from Japanese, semicolon
+                        | 0xFF1F // Distinct from Japanese, question
+                        => {
+                            if let Some(pending) = self.pending_score {
+                                score += pending;
+                                self.pending_score = None;
+                            }
+                            score += CJ_PUNCTUATION;
+                        }
+                        0..=0x7F => {
+                            self.pending_score = None; // Discard pending score
+                        }
+                        _ => {
+                            if let Some(pending) = self.pending_score {
+                                score += pending;
+                                self.pending_score = None;
+                            }
+                            score += CJK_OTHER;
+                        }
+                    }
+                    self.prev = LatinCj::Other;
+                }
+            } else if written == 2 {
+                // Two UTF-16 code units were written; `dst[0]` is presumably
+                // the high surrogate of an astral character.
+                if let Some(pending) = self.pending_score {
+                    score += pending;
+                    self.pending_score = None;
+                }
+                let u = dst[0];
+                if u >= 0xDB80 && u <= 0xDBFF {
+                    score += GBK_PUA_PENALTY;
+                    self.prev = LatinCj::Other;
+                // NOTE(review): high surrogates start at 0xD800, so the lower
+                // bound 0xD480 admits every high surrogate below 0xD880; if
+                // only plane 2 (0xD840..0xD880) was intended this looks like
+                // a digit transposition - confirm.
+                } else if u >= 0xD480 && u < 0xD880 {
+                    score += GBK_SCORE_PER_NON_EUC;
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else {
+                    score += CJK_OTHER;
+                    self.prev = LatinCj::Other;
+                }
+            }
+            match result {
+                DecoderResult::InputEmpty => {
+                    assert_eq!(read, 1);
+                }
+                DecoderResult::Malformed(malformed_len, _) => {
+                    if (self.prev_byte == 0xA0 || self.prev_byte == 0xFE || self.prev_byte == 0xFD)
+                        && (b < 0x80 || b == 0xFF)
+                    {
+                        // Mac OS Chinese Simplified single-byte that conflicts with code page GBK lead byte
+                        // followed by ASCII or a non-conflicting single-byte extension.
+                        self.pending_score = None; // Just in case
+                        score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
+                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
+                            self.prev = LatinCj::AsciiLetter;
+                        } else if b == 0xFF {
+                            // Both bytes are single-byte extensions, so the
+                            // penalty is applied a second time.
+                            score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
+                            self.prev = LatinCj::Other;
+                        } else {
+                            self.prev = LatinCj::Other;
+                        }
+                        // The GBK decoder has the pending ASCII concept, which is
+                        // a problem with this trickery, so let's reset the state.
+                        self.decoder = GBK.new_decoder_without_bom_handling();
+                    } else if malformed_len == 1 && b == 0xFF {
+                        // Mac OS Chinese Simplified single-byte extension that doesn't conflict with lead bytes
+                        self.pending_score = None; // Just in case
+                        score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
+                        self.prev = LatinCj::Other;
+                        // The GBK decoder has the pending ASCII concept, which is
+                        // a problem with this trickery, so let's reset the state.
+                        self.decoder = GBK.new_decoder_without_bom_handling();
+                    } else {
+                        return None;
+                    }
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+            self.prev_byte = b;
+        }
+        if last {
+            // Flush the decoder so that a truncated trailing sequence is
+            // reported as malformed.
+            let (result, _, _) = self
+                .decoder
+                .decode_to_utf16_without_replacement(b"", &mut dst, true);
+            match result {
+                DecoderResult::InputEmpty => {}
+                DecoderResult::Malformed(_, _) => {
+                    return None;
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+        }
+        Some(score)
+    }
+}
+
+// Shift_JIS and Big5
+/// Returns `true` for the byte values after which the Shift_JIS and Big5
+/// candidates withhold a score as pending instead of granting it at once.
+fn problematic_lead(b: u8) -> bool {
+    match b {
+        0x91..=0x97 | 0x9A | 0x8A | 0x9B | 0x8B | 0x9E | 0x8E | 0xB0 => true,
+        _ => false,
+    }
+}
+
+// GBK and EUC-KR
+/// Same as `problematic_lead`, extended with 0x82, 0x84, 0x85 and 0xA0; used
+/// by the GBK and EUC-KR candidates.
+fn more_problematic_lead(b: u8) -> bool {
+    problematic_lead(b) || b == 0x82 || b == 0x84 || b == 0x85 || b == 0xA0
+}
+
+/// Scoring state for the Shift_JIS candidate.
+struct ShiftJisCandidate {
+    decoder: Decoder,
+    // Set once the first character in U+FF61..=U+FF9F has been decoded.
+    half_width_katakana_seen: bool,
+    // Voicing marks accepted after the previously decoded character.
+    half_width_katakana_state: HalfWidthKatakana,
+    // Class of the previously scored character.
+    prev: LatinCj,
+    // The input byte processed in the previous loop iteration.
+    prev_byte: u8,
+    // Score withheld by `maybe_set_as_pending` until the next character
+    // either confirms or discards it.
+    pending_score: Option<i64>,
+}
+
+impl ShiftJisCandidate {
+    /// Returns `s` right away when the previous character was CJ or the
+    /// previous byte is not a `problematic_lead`; otherwise stores `s` as
+    /// the pending score and returns 0.
+    ///
+    /// Panics if a pending score is already stored.
+    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
+        assert!(self.pending_score.is_none());
+        if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
+            s
+        } else {
+            self.pending_score = Some(s);
+            0
+        }
+    }
+
+    /// Feeds `buffer` to the decoder one byte at a time and returns the
+    /// score accumulated for this call, or `None` when the input is
+    /// rejected.
+    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
+        let mut score = 0i64;
+        let mut src = [0u8];
+        let mut dst = [0u16; 2];
+        for &b in buffer {
+            src[0] = b;
+            let (result, read, written) = self
+                .decoder
+                .decode_to_utf16_without_replacement(&src, &mut dst, false);
+            if written > 0 {
+                // Remember the voicing state left by the previous character
+                // and reset it; the half-width katakana branch sets it again.
+                let half_width_katakana_state = self.half_width_katakana_state;
+                self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
+                let u = dst[0];
+                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
+                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
+                {
+                    self.pending_score = None; // Discard pending score
+                    if self.prev == LatinCj::Cj {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::AsciiLetter;
+                } else if u >= 0xFF61
&& u <= 0xFF9F {
+                    // Half-width katakana block (U+FF61..=U+FF9F).
+                    if !self.half_width_katakana_seen {
+                        self.half_width_katakana_seen = true;
+                        // To avoid misdetecting title-length inputs
+                        score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY;
+                    }
+                    self.pending_score = None; // Discard pending score
+                    score += HALF_WIDTH_KATAKANA_SCORE;
+
+                    // Record which voicing marks may follow this character,
+                    // or, for a voicing mark itself, check it against what
+                    // the previous character allowed.
+                    if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
+                        self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
+                    } else if u >= 0xFF8A && u <= 0xFF8E {
+                        self.half_width_katakana_state =
+                            HalfWidthKatakana::DakutenOrHandakutenAllowed;
+                    } else if u == 0xFF9E {
+                        if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
+                            score += IMPLAUSIBILITY_PENALTY;
+                        } else {
+                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
+                        }
+                    } else if u == 0xFF9F {
+                        if half_width_katakana_state
+                            != HalfWidthKatakana::DakutenOrHandakutenAllowed
+                        {
+                            score += IMPLAUSIBILITY_PENALTY;
+                        } else {
+                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
+                        }
+                    }
+
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else if u >= 0x3040 && u < 0x3100 {
+                    // Hiragana and katakana blocks.
+                    if let Some(pending) = self.pending_score {
+                        score += pending;
+                        self.pending_score = None;
+                    }
+                    score += SHIFT_JIS_SCORE_PER_KANA;
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
+                    if let Some(pending) = self.pending_score {
+                        score += pending;
+                        self.pending_score = None;
+                    }
+                    // Byte pairs below 0x98 0x73 get the level 1 score plus
+                    // the frequency bonus; everything else gets level 2.
+                    if self.prev_byte < 0x98 || (self.prev_byte == 0x98 && b < 0x73) {
+                        score += self.maybe_set_as_pending(
+                            SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI
+                                + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji),
+                        );
+                    } else {
+                        score += self.maybe_set_as_pending(SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI);
+                    }
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else if u >= 0xE000 && u < 0xF900 {
+                    if let Some(pending) = self.pending_score {
+                        score += pending;
+                        self.pending_score = None;
+                    }
+                    score += SHIFT_JIS_PUA_PENALTY;
+                    self.prev = LatinCj::Other;
+                } else {
+                    match u {
+                        0x3000 // Distinct from Korean, space
+                        | 0x3001 // Distinct from Korean, enumeration comma
+                        | 0x3002 // Distinct from Korean, full stop
+                        | 0xFF08 // Distinct from Korean, parenthesis
+                        | 0xFF09 // Distinct from Korean, parenthesis
+                        => {
+                            if let Some(pending) = self.pending_score {
+                                score += pending;
+                                self.pending_score = None;
+                            }
+                            // Not really needed for CJK distinction
+                            // but let's give non-zero score for these
+                            // common byte pairs anyway.
+                            score += CJ_PUNCTUATION;
+                        }
+                        0..=0x7F => {
+                            self.pending_score = None; // Discard pending score
+                        }
+                        0x80 => {
+                            // This is a control character that overlaps euro
+                            // in windows-1252 and happens to be a non-error
+                            // in Shift_JIS.
+                            self.pending_score = None; // Discard pending score
+                            score += IMPLAUSIBILITY_PENALTY;
+                        }
+                        _ => {
+                            if let Some(pending) = self.pending_score {
+                                score += pending;
+                                self.pending_score = None;
+                            }
+                            score += CJK_OTHER;
+                        }
+                    }
+                    self.prev = LatinCj::Other;
+                }
+            }
+            match result {
+                DecoderResult::InputEmpty => {
+                    assert_eq!(read, 1);
+                }
+                DecoderResult::Malformed(malformed_len, _) => {
+                    // Tolerate byte pairs with a lead in 0x81..=0x9F or
+                    // 0xE0..=0xFC and a trail in 0x40..=0x7E or 0x80..=0xFC,
+                    // except for the sub-ranges excluded below.
+                    if (((self.prev_byte >= 0x81 && self.prev_byte <= 0x9F)
+                        || (self.prev_byte >= 0xE0 && self.prev_byte <= 0xFC))
+                        && ((b >= 0x40 && b <= 0x7E) || (b >= 0x80 && b <= 0xFC)))
+                        && !((self.prev_byte == 0x82 && b >= 0xFA)
+                            || (self.prev_byte == 0x84 && ((b >= 0xDD && b <= 0xE4) || b >= 0xFB))
+                            || (self.prev_byte == 0x86 && b >= 0xF2 && b <= 0xFA)
+                            || (self.prev_byte == 0x87 && b >= 0x77 && b <= 0x7D)
+                            || (self.prev_byte == 0xFC && b >= 0xF5))
+                    {
+                        // Shift_JIS2004 or MacJapanese
+                        if let Some(pending) = self.pending_score {
+                            score += pending;
+                            self.pending_score = None;
+                        }
+                        score += SHIFT_JIS_EXTENSION_PENALTY;
+                        // Approximate boundary
+                        if self.prev_byte < 0x87 {
+                            self.prev = LatinCj::Other;
+                        } else {
+                            if self.prev == LatinCj::AsciiLetter {
+                                score += CJK_LATIN_ADJACENCY_PENALTY;
+                            }
+                            self.prev = LatinCj::Cj;
+                        }
+                    } else if malformed_len == 1 && (b == 0xA0 || b >= 0xFD) {
+                        // A lone 0xA0 or 0xFD..=0xFF byte is tolerated with a
+                        // single-byte extension penalty.
+                        self.pending_score = None; // Just in case
+                        score += SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY;
+                        self.prev = LatinCj::Other;
+                    } else {
+                        return None;
+                    }
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+            self.prev_byte = b;
+        }
+        if last {
+            // Flush the decoder so that a truncated trailing sequence is
+            // reported as malformed.
+            let (result, _, _) = self
+                .decoder
+                .decode_to_utf16_without_replacement(b"", &mut dst, true);
+            match result {
+                DecoderResult::InputEmpty => {}
+                DecoderResult::Malformed(_, _) => {
+                    return None;
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+        }
+        Some(score)
+    }
+}
+
+/// Scoring state for the EUC-JP candidate.
+struct EucJpCandidate {
+    decoder: Decoder,
+    // Set once the first character at or above U+0080 has been decoded.
+    non_ascii_seen: bool,
+    // Voicing marks accepted after the previously decoded character.
+    half_width_katakana_state: HalfWidthKatakana,
+    // Class of the previously scored character.
+    prev: LatinCj,
+    // The input byte processed in the previous loop iteration.
+    prev_byte: u8,
+    // The input byte processed two iterations ago; compared against 0x8F to
+    // recognize three-byte sequences.
+    prev_prev_byte: u8,
+}
+
+impl EucJpCandidate {
+    /// Feeds `buffer` to the decoder one byte at a time and returns the
+    /// score accumulated for this call, or `None` when the input is
+    /// rejected.
+    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
+        let mut score = 0i64;
+        let mut src = [0u8];
+        let mut dst = [0u16; 2];
+        for &b in buffer {
+            src[0] = b;
+            let (result, read, written) = self
+                .decoder
+                .decode_to_utf16_without_replacement(&src, &mut dst, false);
+            if written > 0 {
+                // Remember the voicing state left by the previous character
+                // and reset it; the half-width katakana branch sets it again.
+                let half_width_katakana_state = self.half_width_katakana_state;
+                self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
+                let u = dst[0];
+                if !self.non_ascii_seen && u >= 0x80 {
+                    self.non_ascii_seen = true;
+                    if u >= 0x3040 && u < 0x3100 {
+                        // Remove the kana advantage over initial Big5
+                        // hanzi.
+                        score += EUC_JP_INITIAL_KANA_PENALTY;
+                    }
+                }
+                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
+                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
+                {
+                    if self.prev == LatinCj::Cj {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::AsciiLetter;
+                } else if u >= 0xFF61 && u <= 0xFF9F {
+                    // Half-width katakana block (U+FF61..=U+FF9F).
+                    score += HALF_WIDTH_KATAKANA_SCORE;
+
+                    // Record which voicing marks may follow this character,
+                    // or, for a voicing mark itself, check it against what
+                    // the previous character allowed.
+                    if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
+                        self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
+                    } else if u >= 0xFF8A && u <= 0xFF8E {
+                        self.half_width_katakana_state =
+                            HalfWidthKatakana::DakutenOrHandakutenAllowed;
+                    } else if u == 0xFF9E {
+                        if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
+                            score += IMPLAUSIBILITY_PENALTY;
+                        } else {
+                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
+                        }
+                    } else if u == 0xFF9F {
+                        if half_width_katakana_state
+                            != HalfWidthKatakana::DakutenOrHandakutenAllowed
+                        {
+                            score += IMPLAUSIBILITY_PENALTY;
+                        } else {
+                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
+                        }
+                    }
+
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    // NOTE(review): the Shift_JIS candidate sets `Cj` after
+                    // half-width katakana while this one sets `Other` -
+                    // confirm the difference is intended.
+                    self.prev = LatinCj::Other;
+                } else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) {
+                    match u {
+                        0x3090 // hiragana wi
+                        | 0x3091 // hiragana we
+                        | 0x30F0 // katakana wi
+                        | 0x30F1 // katakana we
+                        => {
+                            // Remove advantage over Big5 Hanzi
+                            score += EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA;
+                        }
+                        _ => {
+                            score += EUC_JP_SCORE_PER_KANA;
+                        }
+                    }
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
+                    // Three-byte sequences introduced by 0x8F get the "other
+                    // kanji" score; two-byte sequences are split into level 1
+                    // (lead below 0xD0, plus frequency bonus) and level 2.
+                    if self.prev_prev_byte == 0x8F {
+                        score += EUC_JP_SCORE_PER_OTHER_KANJI;
+                    } else if self.prev_byte < 0xD0 {
+                        score += EUC_JP_SCORE_PER_LEVEL_1_KANJI;
+                        score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji);
+                    } else {
+                        score += EUC_JP_SCORE_PER_LEVEL_2_KANJI;
+                    }
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else {
+                    match u {
+                        0x3000 // Distinct from Korean, space
+                        | 0x3001 // Distinct from Korean, enumeration comma
+                        | 0x3002 // Distinct from Korean, full stop
+                        | 0xFF08 // Distinct from Korean, parenthesis
+                        | 0xFF09 // Distinct from Korean, parenthesis
+                        => {
+                            score += CJ_PUNCTUATION;
+                        }
+                        0..=0x7F => {}
+                        _ => {
+                            score += CJK_OTHER;
+                        }
+                    }
+                    self.prev = LatinCj::Other;
+                }
+            }
+            match result {
+                DecoderResult::InputEmpty => {
+                    assert_eq!(read, 1);
+                }
+                DecoderResult::Malformed(_, _) => {
+                    // Tolerate byte pairs with both bytes in 0xA1..=0xFE,
+                    // with an extension penalty, except for the sub-ranges
+                    // excluded below; the exclusions differ depending on
+                    // whether the pair follows a 0x8F byte.
+                    if b >= 0xA1
+                        && b <= 0xFE
+                        && self.prev_byte >= 0xA1
+                        && self.prev_byte <= 0xFE
+                        && ((self.prev_prev_byte != 0x8F
+                            && !(self.prev_byte == 0xA8 && b >= 0xDF && b <= 0xE6)
+                            && !(self.prev_byte == 0xAC && b >= 0xF4 && b <= 0xFC)
+                            && !(self.prev_byte == 0xAD && b >= 0xD8 && b <= 0xDE))
+                            || (self.prev_prev_byte == 0x8F
+                                && self.prev_byte != 0xA2
+                                && self.prev_byte != 0xA6
+                                && self.prev_byte != 0xA7
+                                && self.prev_byte != 0xA9
+                                && self.prev_byte != 0xAA
+                                && self.prev_byte != 0xAB
+                                && self.prev_byte != 0xED
+                                && !(self.prev_byte == 0xFE && b >= 0xF7)))
+                    {
+                        score += EUC_JP_EXTENSION_PENALTY;
+                        if self.prev == LatinCj::AsciiLetter {
+                            score += CJK_LATIN_ADJACENCY_PENALTY;
+                        }
+                        self.prev = LatinCj::Cj;
+                    } else {
+                        return None;
+                    }
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+            self.prev_prev_byte = self.prev_byte;
+            self.prev_byte = b;
+        }
+        if last {
+            // Flush the decoder so that a truncated trailing sequence is
+            // reported as malformed.
+            let (result, _, _) = self
+                .decoder
+                .decode_to_utf16_without_replacement(b"", &mut dst, true);
+            match result {
+                DecoderResult::InputEmpty => {}
+                DecoderResult::Malformed(_, _) => {
+                    return None;
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+        }
+        Some(score)
+    }
+}
+
+/// Scoring state for the Big5 candidate.
+struct Big5Candidate {
+    decoder: Decoder,
+    // Class of the previously scored character.
+    prev: LatinCj,
+    // The input byte processed in the previous loop iteration.
+    prev_byte: u8,
+    // Score withheld by `maybe_set_as_pending` until the next character
+    // either confirms or discards it.
+    pending_score: Option<i64>,
+}
+
+impl Big5Candidate {
+    /// Returns `s` right away when the previous character was CJ or the
+    /// previous byte is not a `problematic_lead`; otherwise stores `s` as
+    /// the pending score and returns 0.
+    ///
+    /// Panics if a pending score is already stored.
+    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
+        assert!(self.pending_score.is_none());
+        if self.prev
== LatinCj::Cj || !problematic_lead(self.prev_byte) {
+            s
+        } else {
+            self.pending_score = Some(s);
+            0
+        }
+    }
+
+    /// Feeds `buffer` to the decoder one byte at a time and returns the
+    /// score accumulated for this call.
+    ///
+    /// Returns `None` when the decoder reports a malformed sequence that is
+    /// not one of the tolerated patterns, or when `last` is set and flushing
+    /// the decoder reports an error.
+    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
+        let mut score = 0i64;
+        let mut src = [0u8];
+        let mut dst = [0u16; 2];
+        for &b in buffer {
+            src[0] = b;
+            let (result, read, written) = self
+                .decoder
+                .decode_to_utf16_without_replacement(&src, &mut dst, false);
+            if written == 1 {
+                let u = dst[0];
+                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
+                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
+                {
+                    self.pending_score = None; // Discard pending score
+                    if self.prev == LatinCj::Cj {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::AsciiLetter;
+                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
+                    if let Some(pending) = self.pending_score {
+                        score += pending;
+                        self.pending_score = None;
+                    }
+                    // Lead bytes 0xA4..=0xC6 get the level 1 score; all
+                    // other leads get the "other hanzi" score.
+                    match self.prev_byte {
+                        0xA4..=0xC6 => {
+                            score += self.maybe_set_as_pending(BIG5_SCORE_PER_LEVEL_1_HANZI);
+                            // score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_traditional);
+                        }
+                        _ => {
+                            score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
+                        }
+                    }
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                } else {
+                    match u {
+                        0x3000 // Distinct from Korean, space
+                        | 0x3001 // Distinct from Korean, enumeration comma
+                        | 0x3002 // Distinct from Korean, full stop
+                        | 0xFF08 // Distinct from Korean, parenthesis
+                        | 0xFF09 // Distinct from Korean, parenthesis
+                        | 0xFF01 // Distinct from Japanese, exclamation
+                        | 0xFF0C // Distinct from Japanese, comma
+                        | 0xFF1B // Distinct from Japanese, semicolon
+                        | 0xFF1F // Distinct from Japanese, question
+                        => {
+                            if let Some(pending) = self.pending_score {
+                                score += pending;
+                                self.pending_score = None;
+                            }
+                            // Not really needed for CJK distinction
+                            // but let's give non-zero score for these
+                            // common byte pairs anyway.
+                            score += CJ_PUNCTUATION;
+                        }
+                        0..=0x7F => {
+                            self.pending_score = None; // Discard pending score
+                        }
+                        _ => {
+                            if let Some(pending) = self.pending_score {
+                                score += pending;
+                                self.pending_score = None;
+                            }
+                            score += CJK_OTHER;
+                        }
+                    }
+                    self.prev = LatinCj::Other;
+                }
+            } else if written == 2 {
+                if let Some(pending) = self.pending_score {
+                    score += pending;
+                    self.pending_score = None;
+                }
+                // Two code units starting with U+00CA or U+00EA are scored
+                // as "other"; anything else is asserted (debug builds only)
+                // to start with a code unit in 0xD480..0xD880.
+                if dst[0] == 0xCA || dst[0] == 0xEA {
+                    score += CJK_OTHER;
+                    self.prev = LatinCj::Other;
+                } else {
+                    // NOTE(review): high surrogates start at 0xD800, so the
+                    // lower bound 0xD480 may be a transposition of 0xD840 -
+                    // confirm.
+                    debug_assert!(dst[0] >= 0xD480 && dst[0] < 0xD880);
+                    score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
+                    if self.prev == LatinCj::AsciiLetter {
+                        score += CJK_LATIN_ADJACENCY_PENALTY;
+                    }
+                    self.prev = LatinCj::Cj;
+                }
+            }
+            match result {
+                DecoderResult::InputEmpty => {
+                    assert_eq!(read, 1);
+                }
+                DecoderResult::Malformed(malformed_len, _) => {
+                    if self.prev_byte >= 0x81
+                        && self.prev_byte <= 0xFE
+                        && ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE))
+                    {
+                        // The byte pair is in the Big5 range but unmapped.
+                        // Treat as PUA to avoid rejecting Big5-UAO, etc.
+                        // We don't reprocess `b` even if ASCII, since it's
+                        // logically part of the pair.
+                        if let Some(pending) = self.pending_score {
+                            score += pending;
+                            self.pending_score = None;
+                        }
+                        score += BIG5_PUA_PENALTY;
+                        // Assume Hanzi semantics
+                        if self.prev == LatinCj::AsciiLetter {
+                            score += CJK_LATIN_ADJACENCY_PENALTY;
+                        }
+                        self.prev = LatinCj::Cj;
+                    } else if (self.prev_byte == 0xA0
+                        || self.prev_byte == 0xFD
+                        || self.prev_byte == 0xFE)
+                        && (b < 0x80 || b == 0xFF)
+                    {
+                        // Mac OS Chinese Traditional single-byte that conflicts with code page Big5 lead byte
+                        // followed by ASCII or a non-conflicting single-byte extension.
+                        self.pending_score = None; // Just in case
+                        score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
+                        // NOTE(review): the branch above already matches any
+                        // previous byte in 0x81..=0xFE followed by
+                        // 0x40..=0x7E, which includes all ASCII letters, so
+                        // this letter arm looks unreachable - confirm.
+                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
+                            self.prev = LatinCj::AsciiLetter;
+                        } else if b == 0xFF {
+                            // Both bytes are single-byte extensions, so the
+                            // penalty is applied a second time.
+                            score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
+                            self.prev = LatinCj::Other;
+                        } else {
+                            self.prev = LatinCj::Other;
+                        }
+                    } else if malformed_len == 1 && b == 0xFF {
+                        // Mac OS Chinese Traditional single-byte extension that doesn't conflict with lead bytes
+                        self.pending_score = None; // Just in case
+                        score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
+                        self.prev = LatinCj::Other;
+                    } else {
+                        return None;
+                    }
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+            self.prev_byte = b;
+        }
+        if last {
+            // Flush the decoder so that a truncated trailing sequence is
+            // reported as malformed.
+            let (result, _, _) = self
+                .decoder
+                .decode_to_utf16_without_replacement(b"", &mut dst, true);
+            match result {
+                DecoderResult::InputEmpty => {}
+                DecoderResult::Malformed(_, _) => {
+                    return None;
+                }
+                DecoderResult::OutputFull => {
+                    unreachable!();
+                }
+            }
+        }
+        Some(score)
+    }
+}
+
+/// Scoring state for the EUC-KR candidate.
+struct EucKrCandidate {
+    decoder: Decoder,
+    prev_byte: u8,
+    prev_was_euc_range: bool,
+    prev: LatinKorean,
+    current_word_len: u64,
+    pending_score: Option<i64>,
+}
+
+impl EucKrCandidate {
+    /// Returns `s` right away when the previous character was Hangul or the
+    /// previous byte is not a `more_problematic_lead`; otherwise stores `s`
+    /// as the pending score and returns 0.
+    ///
+    /// Panics if a pending score is already stored.
+    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
+        assert!(self.pending_score.is_none());
+        if self.prev == LatinKorean::Hangul || !more_problematic_lead(self.prev_byte) {
+            s
+        } else {
+            self.pending_score = Some(s);
+            0
+        }
+    }
+
+    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
+        let mut score = 0i64;
+        let mut src = [0u8];
+        let mut dst = [0u16; 2];
+        for &b in buffer {
+            let in_euc_range = b >= 0xA1 && b <= 0xFE;
+            src[0] = b;
+            let (result, read, written) = self
+                .decoder
+                .decode_to_utf16_without_replacement(&src, &mut dst, false);
+            if written > 0 {
+                let u = dst[0];
+                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
+                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
+                {
+                    self.pending_score = None; // Discard pending score
+                    match self.prev {
+ LatinKorean::Hangul | LatinKorean::Hanja => { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + _ => {} + } + self.prev = LatinKorean::AsciiLetter; + self.current_word_len = 0; + } else if u >= 0xAC00 && u <= 0xD7A3 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + if self.prev_was_euc_range && in_euc_range { + score += EUC_KR_SCORE_PER_EUC_HANGUL; + score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_hangul); + } else { + score += self.maybe_set_as_pending(EUC_KR_SCORE_PER_NON_EUC_HANGUL); + } + if self.prev == LatinKorean::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinKorean::Hangul; + self.current_word_len += 1; + if self.current_word_len > 5 { + score += EUC_KR_LONG_WORD_PENALTY; + } + } else if (u >= 0x4E00 && u < 0xAC00) || (u >= 0xF900 && u <= 0xFA0B) { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += EUC_KR_SCORE_PER_HANJA; + match self.prev { + LatinKorean::AsciiLetter => { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + LatinKorean::Hangul => { + score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY; + } + _ => {} + } + self.prev = LatinKorean::Hanja; + self.current_word_len += 1; + if self.current_word_len > 5 { + score += EUC_KR_LONG_WORD_PENALTY; + } + } else { + if u >= 0x80 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += CJK_OTHER; + } else { + self.pending_score = None; // Discard pending score + } + self.prev = LatinKorean::Other; + self.current_word_len = 0; + } + } + match result { + DecoderResult::InputEmpty => { + assert_eq!(read, 1); + } + DecoderResult::Malformed(malformed_len, _) => { + if (self.prev_byte == 0xC9 || self.prev_byte == 0xFE) && b >= 0xA1 && b <= 0xFE + { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + // The byte pair is in code page 949 EUDC range + score += EUC_KR_PUA_PENALTY; 
+ // Assume Hanja semantics + match self.prev { + LatinKorean::AsciiLetter => { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + LatinKorean::Hangul => { + score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY; + } + _ => {} + } + self.prev = LatinKorean::Hanja; + self.current_word_len += 1; + if self.current_word_len > 5 { + score += EUC_KR_LONG_WORD_PENALTY; + } + } else if (self.prev_byte == 0xA1 + || (self.prev_byte >= 0xA3 && self.prev_byte <= 0xA8) + || (self.prev_byte >= 0xAA && self.prev_byte <= 0xAD)) + && (b >= 0x7B && b <= 0x7D) + { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + // MacKorean symbols in range not part of code page 949 + score += EUC_KR_MAC_KOREAN_PENALTY; + self.prev = LatinKorean::Other; + self.current_word_len = 0; + } else if (self.prev_byte >= 0x81 && self.prev_byte <= 0x84) + && (b <= 0x80 || b == 0xFF) + { + // MacKorean single-byte that conflicts with code page 949 lead byte + // followed by ASCII or a non-conflicting single-byte extension. 
+ self.pending_score = None; // Just in case + score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; + if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') { + self.prev = LatinKorean::AsciiLetter; + } else if b == 0x80 || b == 0xFF { + score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; + self.prev = LatinKorean::Other; + } else { + self.prev = LatinKorean::Other; + } + self.current_word_len = 0; + } else if malformed_len == 1 && (b == 0x80 || b == 0xFF) { + // MacKorean single-byte extensions that don't conflict with lead bytes + self.pending_score = None; // Just in case + score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; + self.prev = LatinKorean::Other; + self.current_word_len = 0; + } else { + return None; + } + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + self.prev_was_euc_range = in_euc_range; + self.prev_byte = b; + } + if last { + let (result, _, _) = self + .decoder + .decode_to_utf16_without_replacement(b"", &mut dst, true); + match result { + DecoderResult::InputEmpty => {} + DecoderResult::Malformed(_, _) => { + return None; + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + } + Some(score) + } +} + +enum InnerCandidate { + Latin(LatinCandidate), + NonLatinCased(NonLatinCasedCandidate), + Caseless(CaselessCandidate), + ArabicFrench(ArabicFrenchCandidate), + Logical(LogicalCandidate), + Visual(VisualCandidate), + Utf8(Utf8Candidate), + Iso2022(Iso2022Candidate), + Shift(ShiftJisCandidate), + EucJp(EucJpCandidate), + EucKr(EucKrCandidate), + Big5(Big5Candidate), + Gbk(GbkCandidate), +} + +impl InnerCandidate { + fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> { + match self { + InnerCandidate::Latin(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::NonLatinCased(c) => { + if let Some(new_score) = 
c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Caseless(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::ArabicFrench(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Logical(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Visual(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Utf8(c) => c.feed(buffer, last), + InnerCandidate::Iso2022(c) => c.feed(buffer, last), + InnerCandidate::Shift(c) => c.feed(buffer, last), + InnerCandidate::EucJp(c) => c.feed(buffer, last), + InnerCandidate::EucKr(c) => c.feed(buffer, last), + InnerCandidate::Big5(c) => c.feed(buffer, last), + InnerCandidate::Gbk(c) => c.feed(buffer, last), + } + } +} + +fn encoding_for_tld(tld: Tld) -> usize { + match tld { + Tld::CentralWindows | Tld::CentralCyrillic => EncodingDetector::CENTRAL_WINDOWS_INDEX, + Tld::Cyrillic => EncodingDetector::CYRILLIC_WINDOWS_INDEX, + Tld::Generic | Tld::Western | 
Tld::WesternCyrillic | Tld::WesternArabic | Tld::Eu => { + EncodingDetector::WESTERN_INDEX + } + Tld::IcelandicFaroese => EncodingDetector::ICELANDIC_INDEX, + Tld::Greek => EncodingDetector::GREEK_ISO_INDEX, + Tld::TurkishAzeri => EncodingDetector::TURKISH_INDEX, + Tld::Hebrew => EncodingDetector::LOGICAL_INDEX, + Tld::Arabic => EncodingDetector::ARABIC_WINDOWS_INDEX, + Tld::Baltic => EncodingDetector::BALTIC_WINDOWS_INDEX, + Tld::Vietnamese => EncodingDetector::VIETNAMESE_INDEX, + Tld::Thai => EncodingDetector::THAI_INDEX, + Tld::Simplified | Tld::SimplifiedTraditional => EncodingDetector::GBK_INDEX, + Tld::Traditional | Tld::TraditionalSimplified => EncodingDetector::BIG5_INDEX, + Tld::Japanese => EncodingDetector::SHIFT_JIS_INDEX, + Tld::Korean => EncodingDetector::EUC_KR_INDEX, + Tld::CentralIso => EncodingDetector::CENTRAL_ISO_INDEX, + } +} + +fn encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool { + match tld { + Tld::CentralWindows => encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX, + Tld::Cyrillic => { + encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + } + Tld::Western => encoding == EncodingDetector::WESTERN_INDEX, + Tld::Greek => { + encoding == EncodingDetector::GREEK_WINDOWS_INDEX + || encoding == EncodingDetector::GREEK_ISO_INDEX + } + Tld::TurkishAzeri => encoding == EncodingDetector::TURKISH_INDEX, + Tld::Hebrew => encoding == EncodingDetector::LOGICAL_INDEX, + Tld::Arabic => { + encoding == EncodingDetector::ARABIC_WINDOWS_INDEX + || encoding == EncodingDetector::ARABIC_ISO_INDEX + } + Tld::Baltic => { + encoding == EncodingDetector::BALTIC_WINDOWS_INDEX + || encoding == EncodingDetector::BALTIC_ISO13_INDEX + || encoding == EncodingDetector::BALTIC_ISO4_INDEX + } + Tld::Vietnamese => encoding == EncodingDetector::VIETNAMESE_INDEX, + Tld::Thai => encoding == 
EncodingDetector::THAI_INDEX, + Tld::Simplified => encoding == EncodingDetector::GBK_INDEX, + Tld::Traditional => encoding == EncodingDetector::BIG5_INDEX, + Tld::Japanese => { + encoding == EncodingDetector::SHIFT_JIS_INDEX + || encoding == EncodingDetector::EUC_JP_INDEX + } + Tld::Korean => encoding == EncodingDetector::EUC_KR_INDEX, + Tld::SimplifiedTraditional | Tld::TraditionalSimplified => { + encoding == EncodingDetector::GBK_INDEX || encoding == EncodingDetector::BIG5_INDEX + } + Tld::CentralIso => encoding == EncodingDetector::CENTRAL_ISO_INDEX, + Tld::IcelandicFaroese => encoding == EncodingDetector::ICELANDIC_INDEX, + Tld::WesternCyrillic => { + encoding == EncodingDetector::WESTERN_INDEX + || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + } + Tld::CentralCyrillic => { + encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX + || encoding == EncodingDetector::CENTRAL_ISO_INDEX + || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + } + Tld::WesternArabic => { + encoding == EncodingDetector::WESTERN_INDEX + || encoding == EncodingDetector::ARABIC_WINDOWS_INDEX + || encoding == EncodingDetector::ARABIC_ISO_INDEX + } + Tld::Eu => { + encoding == EncodingDetector::WESTERN_INDEX + || encoding == EncodingDetector::ICELANDIC_INDEX + || encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX + || encoding == EncodingDetector::CENTRAL_ISO_INDEX + || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + || encoding == EncodingDetector::GREEK_WINDOWS_INDEX + || encoding == 
EncodingDetector::GREEK_ISO_INDEX + || encoding == EncodingDetector::BALTIC_WINDOWS_INDEX + || encoding == EncodingDetector::BALTIC_ISO13_INDEX + || encoding == EncodingDetector::BALTIC_ISO4_INDEX + } + Tld::Generic => false, + } +} + +fn score_adjustment(score: i64, encoding: usize, tld: Tld) -> i64 { + if score < 1 { + return 0; + } + // This is the most ad hoc part of this library. + let (divisor, constant) = match tld { + Tld::Generic => { + unreachable!(); + } + Tld::CentralWindows | Tld::CentralIso => { + match encoding { + EncodingDetector::WESTERN_INDEX + | EncodingDetector::ICELANDIC_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Cyrillic => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::Greek => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::TurkishAzeri => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::ICELANDIC_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Hebrew => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::Arabic => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::EUC_KR_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Baltic => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::ICELANDIC_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Vietnamese => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::ICELANDIC_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::Thai => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::EUC_KR_INDEX + | EncodingDetector::SHIFT_JIS_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::ARABIC_WINDOWS_INDEX + | EncodingDetector::ARABIC_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Simplified + | Tld::Traditional + | Tld::TraditionalSimplified + | Tld::SimplifiedTraditional + | Tld::Japanese + | Tld::Korean => { + // If TLD default is valid, everything else scores zero + return score; + } + Tld::IcelandicFaroese => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::CentralCyrillic => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Eu => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + }; + (score / divisor) + constant +} + +struct Candidate { + inner: InnerCandidate, + score: Option<i64>, +} + +impl Candidate { + fn feed(&mut self, buffer: &[u8], last: bool) { + if let Some(old_score) = self.score { + if let Some(new_score) = self.inner.feed(buffer, last) { + self.score = Some(old_score + new_score); + } else { + self.score = None; + } + } + } + + fn new_latin(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Latin(LatinCandidate::new(data)), + score: Some(0), + } + } + + fn new_non_latin_cased(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::NonLatinCased(NonLatinCasedCandidate::new(data)), + score: Some(0), + } + } + + fn new_caseless(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Caseless(CaselessCandidate::new(data)), + score: Some(0), + } + } + + fn new_arabic_french(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::ArabicFrench(ArabicFrenchCandidate::new(data)), + score: Some(0), + } + } + + fn 
new_logical(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Logical(LogicalCandidate::new(data)), + score: Some(0), + } + } + + fn new_visual(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Visual(VisualCandidate::new(data)), + score: Some(0), + } + } + + fn new_utf_8() -> Self { + Candidate { + inner: InnerCandidate::Utf8(Utf8Candidate { + decoder: UTF_8.new_decoder_without_bom_handling(), + }), + score: Some(0), + } + } + + fn new_iso_2022_jp() -> Self { + Candidate { + inner: InnerCandidate::Iso2022(Iso2022Candidate { + decoder: ISO_2022_JP.new_decoder_without_bom_handling(), + }), + score: Some(0), + } + } + + fn new_shift_jis() -> Self { + Candidate { + inner: InnerCandidate::Shift(ShiftJisCandidate { + decoder: SHIFT_JIS.new_decoder_without_bom_handling(), + half_width_katakana_seen: false, + half_width_katakana_state: HalfWidthKatakana::DakutenForbidden, + prev: LatinCj::Other, + prev_byte: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn new_euc_jp() -> Self { + Candidate { + inner: InnerCandidate::EucJp(EucJpCandidate { + decoder: EUC_JP.new_decoder_without_bom_handling(), + non_ascii_seen: false, + half_width_katakana_state: HalfWidthKatakana::DakutenForbidden, + prev: LatinCj::Other, + prev_byte: 0, + prev_prev_byte: 0, + }), + score: Some(0), + } + } + + fn new_euc_kr() -> Self { + Candidate { + inner: InnerCandidate::EucKr(EucKrCandidate { + decoder: EUC_KR.new_decoder_without_bom_handling(), + prev_byte: 0, + prev_was_euc_range: false, + prev: LatinKorean::Other, + current_word_len: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn new_big5() -> Self { + Candidate { + inner: InnerCandidate::Big5(Big5Candidate { + decoder: BIG5.new_decoder_without_bom_handling(), + prev: LatinCj::Other, + prev_byte: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn new_gbk() -> Self { + Candidate { + inner: InnerCandidate::Gbk(GbkCandidate { + decoder: 
GBK.new_decoder_without_bom_handling(), + prev: LatinCj::Other, + prev_byte: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64> { + match &self.inner { + InnerCandidate::NonLatinCased(c) => { + if c.longest_word < 2 { + return None; + } + } + InnerCandidate::Caseless(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + InnerCandidate::ArabicFrench(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + InnerCandidate::Logical(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + InnerCandidate::Visual(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + _ => {} + } + if tld == Tld::Generic { + return self.score; + } + if let Some(score) = self.score { + if encoding == encoding_for_tld(tld) { + return Some(score + 1); + } + if encoding_is_native_to_tld(tld, encoding) { + return Some(score); + } + if expectation_is_valid { + return Some(score - score_adjustment(score, encoding, tld)); + } + // If expectation is no longer valid, fall back to + // generic behavior. 
/// Counts the bytes in `buffer` that are outside the ASCII range
/// (that is, `>= 0x80`).
fn count_non_ascii(buffer: &[u8]) -> u64 {
    // `usize` to `u64` is a widening (or same-width) conversion on targets
    // whose pointers are at most 64 bits wide, so the cast does not truncate.
    buffer.iter().filter(|b| !b.is_ascii()).count() as u64
}
BeforeNonAscii::Two(first) => { + let arr = [first[1], buffer[0]]; + *self = BeforeNonAscii::Two(arr); + } + } + } + } +} + +/// A Web browser-oriented detector for guessing what character +/// encoding a stream of bytes is encoded in. +/// +/// The bytes are fed to the detector incrementally using the `feed` +/// method. The current guess of the detector can be queried using +/// the `guess` method. The guessing parameters are arguments to the +/// `guess` method rather than arguments to the constructor in order +/// to enable the application to check if the arguments affect the +/// guessing outcome. (The specific use case is to disable UI for +/// re-running the detector with UTF-8 allowed and the top-level +/// domain name ignored if those arguments don't change the guess.) +pub struct EncodingDetector { + candidates: [Candidate; 27], + non_ascii_seen: u64, + // We need to feed up to two bytes of context before non-ASCII + // thanks to Spanish n.º. + last_before_non_ascii: BeforeNonAscii, + esc_seen: bool, + closed: bool, +} + +impl EncodingDetector { + fn feed_impl(&mut self, buffer: &[u8], last: bool) { + for candidate in self.candidates.iter_mut() { + candidate.feed(buffer, last); + } + self.non_ascii_seen += count_non_ascii(buffer); + } + + /// Inform the detector of a chunk of input. + /// + /// The byte stream is represented as a sequence of calls to this + /// method such that the concatenation of the arguments to this + /// method form the byte stream. It does not matter how the application + /// chooses to chunk the stream. It is OK to call this method with + /// a zero-length byte slice. + /// + /// The end of the stream is indicated by calling this method with + /// `last` set to `true`. In that case, the end of the stream is + /// considered to occur after the last byte of the `buffer` (which + /// may be zero-length) passed in the same call. Once this method + /// has been called with `last` set to `true` this method must not + /// be called again. 
+ /// + /// If you want to perform detection on just the prefix of a longer + /// stream, do not pass `last=true` after the prefix if the stream + /// actually still continues. + /// + /// Returns `true` if after processing `buffer` the stream has + /// contained at least one non-ASCII byte and `false` if only + /// ASCII has been seen so far. + /// + /// # Panics + /// + /// If this method has previously been called with `last` set to `true`. + pub fn feed(&mut self, buffer: &[u8], last: bool) -> bool { + assert!( + !self.closed, + "Must not feed again after feeding with last equaling true." + ); + if last { + self.closed = true; + } + let start = if self.non_ascii_seen == 0 && !self.esc_seen { + let up_to = Encoding::ascii_valid_up_to(buffer); + let start = if let Some(escape) = memchr::memchr(0x1B, &buffer[..up_to]) { + self.esc_seen = true; + escape + } else { + up_to + }; + if start == buffer.len() { + self.last_before_non_ascii.push(buffer); + return self.non_ascii_seen != 0; + } + if start == 0 || start == 1 { + let last_before = self.last_before_non_ascii; + self.last_before_non_ascii = BeforeNonAscii::None; + self.feed_impl(last_before.as_slice(), false); + 0 + } else { + start - 2 + } + } else { + 0 + }; + self.feed_impl(&buffer[start..], last); + self.non_ascii_seen != 0 + } + + /// Guess the encoding given the bytes pushed to the detector so far + /// (via `feed()`), the top-level domain name from which the bytes were + /// loaded, and an indication of whether to consider UTF-8 as a permissible + /// guess. + /// + /// The `tld` argument takes the rightmost DNS label of the hostname of the + /// host the stream was loaded from in lower-case ASCII form. That is, if + /// the label is an internationalized top-level domain name, it must be + /// provided in its Punycode form. If the TLD that the stream was loaded + /// from is unavalable, `None` may be passed instead, which is equivalent + /// to passing `Some(b"com")`. 
+ /// + /// If the `allow_utf8` argument is set to `false`, the return value of + /// this method won't be `encoding_rs::UTF_8`. When performing detection + /// on `text/html` on non-`file:` URLs, Web browsers must pass `false`, + /// unless the user has taken a specific contextual action to request an + /// override. This way, Web developers cannot start depending on UTF-8 + /// detection. Such reliance would make the Web Platform more brittle. + /// + /// Returns the guessed encoding. + /// + /// # Panics + /// + /// If `tld` contains non-ASCII, period, or upper-case letters. (The panic + /// condition is intentionally limited to signs of failing to extract the + /// label correctly, failing to provide it in its Punycode form, and failure + /// to lower-case it. Full DNS label validation is intentionally not performed + /// to avoid panics when the reality doesn't match the specs.) + pub fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding { + let mut tld_type = tld.map_or(Tld::Generic, |tld| { + assert!(!contains_upper_case_period_or_non_ascii(tld)); + classify_tld(tld) + }); + + if self.non_ascii_seen == 0 + && self.esc_seen + && self.candidates[Self::ISO_2022_JP_INDEX].score.is_some() + { + return ISO_2022_JP; + } + + if self.candidates[Self::UTF_8_INDEX].score.is_some() { + if allow_utf8 { + return UTF_8; + } + // Various test cases that prohibit UTF-8 detection want to + // see windows-1252 specifically. These tests run on generic + // domains. However, if we returned windows-1252 on + // some non-generic domains, we'd cause reloads. 
+ return self.candidates[encoding_for_tld(tld_type)].encoding(); + } + + let mut encoding = self.candidates[encoding_for_tld(tld_type)].encoding(); + let mut max = 0i64; + let mut expectation_is_valid = false; + if tld_type != Tld::Generic { + for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) { + if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() { + expectation_is_valid = true; + break; + } + } + } + if !expectation_is_valid { + // Flip Chinese and Central around + match tld_type { + Tld::Simplified => { + if self.candidates[Self::BIG5_INDEX].score.is_some() { + tld_type = Tld::Traditional; + expectation_is_valid = true; + } + } + Tld::Traditional => { + if self.candidates[Self::GBK_INDEX].score.is_some() { + tld_type = Tld::Simplified; + expectation_is_valid = true; + } + } + Tld::CentralWindows => { + if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() { + tld_type = Tld::CentralIso; + expectation_is_valid = true; + } + } + Tld::CentralIso => { + if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() { + tld_type = Tld::CentralWindows; + expectation_is_valid = true; + } + } + _ => {} + } + } + for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) { + if let Some(score) = candidate.score(i, tld_type, expectation_is_valid) { + if score > max { + max = score; + encoding = candidate.encoding(); + } + } + } + let visual = &self.candidates[Self::VISUAL_INDEX]; + if let Some(visual_score) = visual.score(Self::VISUAL_INDEX, tld_type, expectation_is_valid) + { + if (visual_score > max || encoding == WINDOWS_1255) + && visual.plausible_punctuation() + > self.candidates[Self::LOGICAL_INDEX].plausible_punctuation() + { + // max = visual_score; + encoding = ISO_8859_8; + } + } + + encoding + } + + // XXX Test-only API + #[cfg(feature = "testing-only-no-semver-guarantees-do-not-use")] + pub fn find_score(&self, encoding: &'static Encoding) -> Option<i64> { + let mut tld_type = 
Tld::Generic; + let mut expectation_is_valid = false; + if tld_type != Tld::Generic { + for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) { + if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() { + expectation_is_valid = true; + break; + } + } + } + if !expectation_is_valid { + // Flip Chinese and Central around + match tld_type { + Tld::Simplified => { + if self.candidates[Self::BIG5_INDEX].score.is_some() { + tld_type = Tld::Traditional; + expectation_is_valid = true; + } + } + Tld::Traditional => { + if self.candidates[Self::GBK_INDEX].score.is_some() { + tld_type = Tld::Simplified; + expectation_is_valid = true; + } + } + Tld::CentralWindows => { + if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() { + tld_type = Tld::CentralIso; + expectation_is_valid = true; + } + } + Tld::CentralIso => { + if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() { + tld_type = Tld::CentralWindows; + expectation_is_valid = true; + } + } + _ => {} + } + } + for (i, candidate) in self.candidates.iter().enumerate() { + if encoding == candidate.encoding() { + return candidate.score(i, tld_type, expectation_is_valid); + } + } + Some(0) + } + + const FIRST_NORMAL: usize = 3; + + const UTF_8_INDEX: usize = 0; + + const ISO_2022_JP_INDEX: usize = 1; + + const VISUAL_INDEX: usize = 2; + + const GBK_INDEX: usize = 3; + + const EUC_JP_INDEX: usize = 4; + + const EUC_KR_INDEX: usize = 5; + + const SHIFT_JIS_INDEX: usize = 6; + + const BIG5_INDEX: usize = 7; + + const WESTERN_INDEX: usize = 8; + + const CYRILLIC_WINDOWS_INDEX: usize = 9; + + const CENTRAL_WINDOWS_INDEX: usize = 10; + + const CENTRAL_ISO_INDEX: usize = 11; + + const ARABIC_WINDOWS_INDEX: usize = 12; + + const ICELANDIC_INDEX: usize = 13; + + const TURKISH_INDEX: usize = 14; + + const THAI_INDEX: usize = 15; + + const LOGICAL_INDEX: usize = 16; + + const GREEK_WINDOWS_INDEX: usize = 17; + + const GREEK_ISO_INDEX: usize = 18; + + const BALTIC_WINDOWS_INDEX: 
usize = 19; + + const BALTIC_ISO13_INDEX: usize = 20; + + const CYRILLIC_KOI_INDEX: usize = 21; + + const CYRILLIC_IBM_INDEX: usize = 22; + + const ARABIC_ISO_INDEX: usize = 23; + + const VIETNAMESE_INDEX: usize = 24; + + const BALTIC_ISO4_INDEX: usize = 25; + + const CYRILLIC_ISO_INDEX: usize = 26; + + /// Creates a new instance of the detector. + pub fn new() -> Self { + EncodingDetector { + candidates: [ + Candidate::new_utf_8(), // 0 + Candidate::new_iso_2022_jp(), // 1 + Candidate::new_visual(&SINGLE_BYTE_DATA[ISO_8859_8_INDEX]), // 2 + Candidate::new_gbk(), // 3 + Candidate::new_euc_jp(), // 4 + Candidate::new_euc_kr(), // 5 + Candidate::new_shift_jis(), // 6 + Candidate::new_big5(), // 7 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_INDEX]), // 8 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1251_INDEX]), // 9 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1250_INDEX]), // 10 + Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_2_INDEX]), // 11 + Candidate::new_arabic_french(&SINGLE_BYTE_DATA[WINDOWS_1256_INDEX]), // 12 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_ICELANDIC_INDEX]), // 13 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1254_INDEX]), // 14 + Candidate::new_caseless(&SINGLE_BYTE_DATA[WINDOWS_874_INDEX]), // 15 + Candidate::new_logical(&SINGLE_BYTE_DATA[WINDOWS_1255_INDEX]), // 16 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1253_INDEX]), // 17 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_7_INDEX]), // 18 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1257_INDEX]), // 19 + Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_13_INDEX]), // 20 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[KOI8_U_INDEX]), // 21 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[IBM866_INDEX]), // 22 + Candidate::new_caseless(&SINGLE_BYTE_DATA[ISO_8859_6_INDEX]), // 23 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1258_INDEX]), // 24 + Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_4_INDEX]), // 25 + 
Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_5_INDEX]), // 26 + ], + non_ascii_seen: 0, + last_before_non_ascii: BeforeNonAscii::None, + esc_seen: false, + closed: false, + } + } + + /// Queries whether the TLD is considered non-generic and could affect the guess. + pub fn tld_may_affect_guess(tld: Option<&[u8]>) -> bool { + if let Some(tld) = tld { + classify_tld(tld) != Tld::Generic + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use detone::IterDecomposeVietnamese; + use encoding_rs::IBM866; + use encoding_rs::ISO_8859_2; + use encoding_rs::ISO_8859_4; + use encoding_rs::ISO_8859_5; + use encoding_rs::ISO_8859_6; + use encoding_rs::ISO_8859_7; + use encoding_rs::KOI8_U; + use encoding_rs::WINDOWS_1250; + use encoding_rs::WINDOWS_1251; + use encoding_rs::WINDOWS_1252; + use encoding_rs::WINDOWS_1253; + use encoding_rs::WINDOWS_1254; + use encoding_rs::WINDOWS_1256; + use encoding_rs::WINDOWS_1257; + use encoding_rs::WINDOWS_1258; + use encoding_rs::WINDOWS_874; + + fn check_bytes(bytes: &[u8], encoding: &'static Encoding) { + let mut det = EncodingDetector::new(); + det.feed(bytes, true); + let enc = det.guess(None, false); + let (decoded, _) = enc.decode_without_bom_handling(bytes); + println!("{:?}", decoded); + assert_eq!(enc, encoding); + } + + fn check(input: &str, encoding: &'static Encoding) { + let orthographic; + let (bytes, _, _) = if encoding == WINDOWS_1258 { + orthographic = input + .chars() + .decompose_vietnamese_tones(true) + .collect::<String>(); + encoding.encode(&orthographic) + } else { + encoding.encode(input) + }; + check_bytes(&bytes, encoding); + } + + #[test] + fn test_i_apostrophe() { + let mut det = EncodingDetector::new(); + det.feed(b"I\x92", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_one_by_one() { + let mut det = EncodingDetector::new(); + det.feed(b"n", false); + det.feed(b".", false); + det.feed(b"\xBA", false); + 
det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_two_together() { + let mut det = EncodingDetector::new(); + det.feed(b"n.", false); + det.feed(b"\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_one_by_one_extra_before() { + let mut det = EncodingDetector::new(); + det.feed(b" n", false); + det.feed(b".", false); + det.feed(b"\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_one_before() { + let mut det = EncodingDetector::new(); + det.feed(b"n", false); + det.feed(b".\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_longer_first_buffer() { + let mut det = EncodingDetector::new(); + det.feed(b"rrn.", false); + det.feed(b"\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_empty() { + let mut det = EncodingDetector::new(); + let seen_non_ascii = det.feed(b"", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + assert!(!seen_non_ascii); + } + + #[test] + fn test_fi() { + check("Ääni", WINDOWS_1252); + } + + #[test] + fn test_fi_bis() { + check("Tämä", WINDOWS_1252); + } + + #[test] + fn test_pt() { + check( + "Este é um teste de codificação de caracteres.", + WINDOWS_1252, + ); + } + + #[test] + fn test_is() { + check("Þetta er kóðunarpróf á staf. 
Fyrir sum tungumál sem nota latneska stafi þurfum við meira inntak til að taka ákvörðunina.", WINDOWS_1252); + } + + #[test] + fn test_ru_short() { + check("Русский", WINDOWS_1251); + } + + #[test] + fn test_ru() { + check("Это тест кодировки символов.", WINDOWS_1251); + } + + #[test] + fn test_ru_iso() { + check("Это тест кодировки символов.", ISO_8859_5); + } + + #[test] + fn test_ru_ibm() { + check("Это тест кодировки символов.", IBM866); + } + + #[test] + fn test_ru_koi() { + check("Это тест кодировки символов.", KOI8_U); + } + + #[test] + fn test_uk() { + check("Це тест на кодування символів.", WINDOWS_1251); + } + + #[test] + fn test_uk_koi() { + check("Це тест на кодування символів.", KOI8_U); + } + + #[test] + fn test_el_short() { + check("Ελληνικά", WINDOWS_1253); + } + + #[test] + fn test_el() { + check( + "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης", + WINDOWS_1253, + ); + } + + #[test] + fn test_el_iso() { + check( + "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης", + ISO_8859_7, + ); + } + + #[test] + fn test_de() { + check("Straße", WINDOWS_1252); + } + + #[test] + fn test_he() { + check("\u{5E2}\u{5D1}\u{5E8}\u{5D9}\u{5EA}", WINDOWS_1255); + } + + #[test] + fn test_2022() { + check("日本語", ISO_2022_JP); + } + + #[test] + fn test_th() { + check("นี่คือการทดสอบการเข้ารหัสอักขระ", WINDOWS_874); + } + + #[test] + fn test_vi() { + check("Đây là một thử nghiệm mã hóa ký tự.", WINDOWS_1258); + } + + #[test] + fn test_tr() { + check("Bu bir karakter kodlama testidir. 
Latince karakterleri kullanan bazı dillerde karar vermek için daha fazla girdiye ihtiyacımız var.", WINDOWS_1254); + } + + #[test] + fn test_simplified() { + check("这是一个字符编码测试。", GBK); + } + + #[test] + fn test_traditional() { + check("這是一個字符編碼測試。", BIG5); + } + + #[test] + fn test_ko() { + check("이것은 문자 인코딩 테스트입니다.", EUC_KR); + } + + #[test] + fn test_shift() { + check("これは文字実験です。", SHIFT_JIS); + } + + #[test] + fn test_euc() { + check("これは文字実験です。", EUC_JP); + } + + #[test] + fn test_ar() { + check("هذا هو اختبار ترميز الأحرف.", WINDOWS_1256); + } + + #[test] + fn test_ar_iso() { + check("هذا هو اختبار ترميز الأحرف.", ISO_8859_6); + } + + #[test] + fn test_fa() { + check("این یک تست رمزگذاری کاراکتر است.", WINDOWS_1256); + } + + #[test] + fn test_visual() { + check(".םיוות דודיק ןחבמ והז", ISO_8859_8); + } + + #[test] + fn test_yi() { + check("דאָס איז אַ טעסט פֿאַר קאָדירונג פון כאַראַקטער.", WINDOWS_1255); + } + + #[test] + fn test_it() { + check("è", WINDOWS_1252); + } + + #[test] + fn test_en() { + check("isn’t", WINDOWS_1252); + } + + #[test] + fn test_en_bis() { + check("Rock ’n Roll", WINDOWS_1252); + } + + #[test] + fn test_ca() { + check("Codificació de caràcters", WINDOWS_1252); + } + + #[test] + fn test_et() { + check("või", WINDOWS_1252); + } + + #[test] + fn test_pl_iso() { + check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", ISO_8859_2); + } + + #[test] + fn test_pl() { + check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", WINDOWS_1250); + } + + #[test] + fn test_lt() { + check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", WINDOWS_1257); + } + + // TODO: Detected as ISO-8859-2. 
+ // #[test] + // fn test_lt_windows_iso_8859_4() { + // check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", ISO_8859_4); + // } + + #[test] + fn test_lv() { + check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", WINDOWS_1257); + } + + #[test] + fn test_lv_iso_8859_4() { + check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", ISO_8859_4); + } + + #[test] + fn test_a0() { + // Test that this isn't IBM866. TODO: What about GBK with fully paired 0xA0? + check("\u{A0}\u{A0} \u{A0}", WINDOWS_1252); + } + + #[test] + fn test_a0a0() { + // Test that this isn't GBK or EUC-KR. + check("\u{A0}\u{A0}", WINDOWS_1252); + } + + #[test] + fn test_space_copyright_space() { + check(" © ", WINDOWS_1252); + } + + #[test] + fn test_space_masculine_space() { + check(" º ", WINDOWS_1252); + } + + #[test] + fn test_space_feminine_space() { + check(" ª ", WINDOWS_1252); + } + + #[test] + fn test_period_masculine_space() { + check(".º ", WINDOWS_1252); + } + + #[test] + fn test_period_feminine_space() { + check(".ª ", WINDOWS_1252); + } + + #[test] + fn test_maria() { + check(" Mª ", WINDOWS_1252); + } + + #[test] + fn test_dona() { + check(" Dª ", WINDOWS_1252); + } + + #[test] + fn test_nuestra() { + check(" Nª ", WINDOWS_1252); + } + + #[test] + fn test_senora() { + check(" Sª ", WINDOWS_1252); + } + + #[test] + fn test_digit_feminine() { + check(" 42ª ", WINDOWS_1252); + } + + #[test] + fn test_digit_masculine() { + check(" 42º ", WINDOWS_1252); + } + + #[test] + fn test_roman_feminine() { + check(" XIVª ", WINDOWS_1252); + } + + #[test] + fn test_roman_masculine() { + check(" XIVº ", WINDOWS_1252); + } + + #[test] + fn test_numero_uno() { + check("Nº1", 
WINDOWS_1252); + } + + #[test] + fn test_numero() { + check("Nº", WINDOWS_1252); + } + + #[test] + fn test_euro() { + check(" €9", WINDOWS_1252); + } + + #[test] + fn test_shift_jis_half_width_katakana() { + check("ハードウェアハードウェアハードウェアハードウェアハードウェア", SHIFT_JIS); + } + + #[test] + fn test_big5_pua() { + let mut v = Vec::new(); + for _ in 0..40 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xA4\x40"); + check_bytes(&v, BIG5); + } + + #[test] + fn test_big5_single_byte_a0() { + let mut v = Vec::new(); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xA0 "); + check_bytes(&v, BIG5); + } + + #[test] + fn test_big5_single_byte_ff() { + let mut v = Vec::new(); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xFF "); + check_bytes(&v, BIG5); + } + + #[test] + fn test_not_big5() { + let mut v = Vec::new(); + for _ in 0..40 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xA0\xA0"); + check_bytes(&v, IBM866); + } + + #[test] + fn test_euc_kr_pua() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xC9\xA1\xB0\xA1 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_pua_bis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFE\xA1\xB0\xA1 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_single_byte_ff() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFF "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_single_byte_81() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x81 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. 
"); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_single_byte_84() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x84 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_not_euc_kr() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xC9\xA0\xB0\xA1 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_shift_jis_x0213() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x87\xE5"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, SHIFT_JIS); + } + + #[test] + fn test_shift_jis_single_byte_fd() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFD"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, SHIFT_JIS); + } + + #[test] + fn test_not_shift_jis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x84\xE0"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_not_shift_jis_bis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x87\x7D"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_euc_jp_x0213() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xAD\xBF"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, EUC_JP); + } + + #[test] + fn test_euc_jp_x0213_other_plane() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x8F\xFE\xF6"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, EUC_JP); + } + + #[test] + fn test_not_euc_jp() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x8F\xFE\xF7"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, 
WINDOWS_1252); + } + + #[test] + fn test_not_euc_jp_bis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xA8\xDF"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, BIG5); + } + + #[test] + fn test_gbk_single_byte_ff() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFF"); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_gbk_single_byte_a0() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xA0 "); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_gbk_single_byte_fe() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFE "); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_not_gbk_single_byte_fc() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFC "); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, ISO_8859_5); + } +} diff --git a/third_party/rust/chardetng/src/tld.rs b/third_party/rust/chardetng/src/tld.rs new file mode 100644 index 0000000000..9f43af92d9 --- /dev/null +++ b/third_party/rust/chardetng/src/tld.rs @@ -0,0 +1,340 @@ +/* Any copyright is dedicated to the Public Domain. 
// https://creativecommons.org/publicdomain/zero/1.0/ */

/// Category assigned to a top-level domain label by [`classify_tld`].
///
/// The variant order is kept exactly as it was; do not reorder without
/// checking every user of this enum.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Tld {
    CentralWindows,
    Cyrillic,
    Western,
    Greek,
    TurkishAzeri,
    Hebrew,
    Arabic,
    Baltic,
    Vietnamese,
    Thai,
    Simplified,
    Traditional,
    Japanese,
    Korean,
    SimplifiedTraditional,
    TraditionalSimplified,
    CentralIso,
    IcelandicFaroese,
    WesternCyrillic,
    CentralCyrillic,
    WesternArabic,
    Generic,
    Eu,
}

/// Classifies a top-level domain label (without any period).
///
/// Matching is byte-exact, so the label must be given in lower case and,
/// for internationalized TLDs, in its Punycode (`xn--`) form.
///
/// * Two-byte labels are looked up in `TWO_LETTER_KEYS`; labels that are
///   not listed there are classified as `Tld::Western`.
/// * Three-byte labels are `Tld::Western` for `edu`, `gov` and `mil`, and
///   `Tld::Generic` otherwise.
/// * Labels of at least eight bytes that start with `xn--` are looked up
///   (minus that prefix) in `PUNYCODE_KEYS`; labels that are not listed
///   there are `Tld::Generic`.
/// * Everything else, including the empty label, is `Tld::Generic`.
pub fn classify_tld(tld: &[u8]) -> Tld {
    if tld.len() == 2 {
        let key = [tld[0], tld[1]];
        if let Ok(i) = TWO_LETTER_KEYS.binary_search(&key) {
            TWO_LETTER_VALUES[i]
        } else {
            Tld::Western
        }
    } else if tld.len() == 3 {
        match tld {
            b"edu" | b"gov" | b"mil" => Tld::Western,
            _ => Tld::Generic,
        }
    } else if tld.starts_with(b"xn--") && tld.len() >= 8 {
        // It's unclear if including the IDNs here is a good idea.
        // Clearly, they are an anachronism relative to the era
        // of legacy encodings. The idea, consistent with the previous
        // approach in Firefox, is to address the case where one
        // of these TLDs is configured as an alternative name for
        // a server that also serves the same content from a
        // two-ASCII-letter TLD. This makes the detection result
        // the same either way even though otherwise this thing
        // does not make much sense.
        if let Ok(i) = PUNYCODE_KEYS.binary_search(&&tld[4..]) {
            PUNYCODE_VALUES[i]
        } else {
            Tld::Generic
        }
    } else {
        Tld::Generic
    }
}

/// Classification for each entry of `TWO_LETTER_KEYS`, at the same index.
static TWO_LETTER_VALUES: [Tld; 87] = [
    Tld::Generic,               // ac
    Tld::Arabic,                // ae
    Tld::Arabic,                // af
    Tld::Generic,               // ai
    Tld::WesternCyrillic,       // am
    Tld::TurkishAzeri,          // az
    Tld::CentralCyrillic,       // ba
    Tld::Cyrillic,              // bg
    Tld::Arabic,                // bh
    Tld::Cyrillic,              // by
    Tld::Generic,               // bz
    Tld::Generic,               // cb
    Tld::Generic,               // cc
    Tld::Generic,               // cd
    Tld::Simplified,            // cn
    Tld::Generic,               // cx
    Tld::Greek,                 // cy
    Tld::CentralWindows,        // cz
    Tld::Generic,               // dj
    Tld::Arabic,                // dz
    Tld::Arabic,                // eg
    Tld::Eu,                    // eu
    Tld::Generic,               // fm
    Tld::IcelandicFaroese,      // fo
    Tld::WesternCyrillic,       // ge
    Tld::Greek,                 // gr
    Tld::TraditionalSimplified, // hk
    Tld::CentralWindows,        // hr
    Tld::CentralIso,            // hu
    Tld::Hebrew,                // il
    Tld::Generic,               // in
    Tld::Arabic,                // iq
    Tld::Arabic,                // ir
    Tld::IcelandicFaroese,      // is
    Tld::Arabic,                // jo
    Tld::Japanese,              // jp
    Tld::Cyrillic,              // kg
    Tld::Korean,                // kp
    Tld::Korean,                // kr
    Tld::Arabic,                // kw
    Tld::Cyrillic,              // kz
    Tld::Generic,               // la
    Tld::Arabic,                // lb
    Tld::Baltic,                // lt
    Tld::Baltic,                // lv
    Tld::Arabic,                // ly
    Tld::Arabic,                // ma
    Tld::Cyrillic,              // md
    Tld::Generic,               // me
    Tld::Cyrillic,              // mk
    Tld::Cyrillic,              // mn
    Tld::TraditionalSimplified, // mo
    Tld::Arabic,                // mr
    Tld::Generic,               // ms
    Tld::WesternArabic,         // my
    Tld::Generic,               // nu
    Tld::Arabic,                // om
    Tld::Arabic,                // pk
    Tld::CentralIso,            // pl
    Tld::Arabic,                // ps
    Tld::Arabic,                // qa
    Tld::CentralWindows,        // ro
    Tld::Cyrillic,              // rs
    Tld::Cyrillic,              // ru
    Tld::Arabic,                // sa
    Tld::Arabic,                // sd
    Tld::SimplifiedTraditional, // sg
    Tld::CentralIso,            // si
    Tld::CentralWindows,        // sk
    Tld::Generic,               // st
    Tld::Cyrillic,              // su
    Tld::Arabic,                // sy
    Tld::Thai,                  // th
    Tld::Cyrillic,              // tj
    Tld::Generic,               // tk
    Tld::Cyrillic,              // tm
    Tld::Arabic,                // tn
    Tld::Generic,               // to
    Tld::TurkishAzeri,          // tr
    Tld::Generic,               // tv
    Tld::Traditional,           // tw
    Tld::Cyrillic,              // ua
    Tld::Cyrillic,              // uz
    Tld::Generic,               // vc
    Tld::Vietnamese,            // vn
    Tld::Generic,               // vu
    Tld::Arabic,                // ye
];

/// Two-letter TLDs with an explicit classification.
///
/// Must stay sorted in ascending byte order because `classify_tld` uses
/// binary search, and must stay index-parallel with `TWO_LETTER_VALUES`.
static TWO_LETTER_KEYS: [[u8; 2]; 87] = [
    [b'a', b'c'], // Generic
    [b'a', b'e'], // Arabic
    [b'a', b'f'], // Arabic
    [b'a', b'i'], // Generic
    [b'a', b'm'], // WesternCyrillic
    [b'a', b'z'], // TurkishAzeri
    [b'b', b'a'], // CentralCyrillic
    [b'b', b'g'], // Cyrillic
    [b'b', b'h'], // Arabic
    [b'b', b'y'], // Cyrillic
    [b'b', b'z'], // Generic
    [b'c', b'b'], // Generic
    [b'c', b'c'], // Generic
    [b'c', b'd'], // Generic
    [b'c', b'n'], // Simplified
    [b'c', b'x'], // Generic
    [b'c', b'y'], // Greek
    [b'c', b'z'], // CentralWindows
    [b'd', b'j'], // Generic
    [b'd', b'z'], // Arabic
    [b'e', b'g'], // Arabic
    [b'e', b'u'], // Eu
    [b'f', b'm'], // Generic
    [b'f', b'o'], // IcelandicFaroese
    [b'g', b'e'], // WesternCyrillic
    [b'g', b'r'], // Greek
    [b'h', b'k'], // TraditionalSimplified
    [b'h', b'r'], // CentralWindows
    [b'h', b'u'], // CentralIso
    [b'i', b'l'], // Hebrew
    [b'i', b'n'], // Generic
    [b'i', b'q'], // Arabic
    [b'i', b'r'], // Arabic
    [b'i', b's'], // IcelandicFaroese
    [b'j', b'o'], // Arabic
    [b'j', b'p'], // Japanese
    [b'k', b'g'], // Cyrillic
    [b'k', b'p'], // Korean
    [b'k', b'r'], // Korean
    [b'k', b'w'], // Arabic
    [b'k', b'z'], // Cyrillic
    [b'l', b'a'], // Generic
    [b'l', b'b'], // Arabic
    [b'l', b't'], // Baltic
    [b'l', b'v'], // Baltic
    [b'l', b'y'], // Arabic
    [b'm', b'a'], // Arabic
    [b'm', b'd'], // Cyrillic
    [b'm', b'e'], // Generic
    [b'm', b'k'], // Cyrillic
    [b'm', b'n'], // Cyrillic
    [b'm', b'o'], // TraditionalSimplified
    [b'm', b'r'], // Arabic
    [b'm', b's'], // Generic
    [b'm', b'y'], // WesternArabic
    [b'n', b'u'], // Generic
    [b'o', b'm'], // Arabic
    [b'p', b'k'], // Arabic
    [b'p', b'l'], // CentralIso
    [b'p', b's'], // Arabic
    [b'q', b'a'], // Arabic
    [b'r', b'o'], // CentralWindows
    [b'r', b's'], // Cyrillic
    [b'r', b'u'], // Cyrillic
    [b's', b'a'], // Arabic
    [b's', b'd'], // Arabic
    [b's', b'g'], // SimplifiedTraditional
    [b's', b'i'], // CentralIso
    [b's', b'k'], // CentralWindows
    [b's', b't'], // Generic
    [b's', b'u'], // Cyrillic
    [b's', b'y'], // Arabic
    [b't', b'h'], // Thai
    [b't', b'j'], // Cyrillic
    [b't', b'k'], // Generic
    [b't', b'm'], // Cyrillic
    [b't', b'n'], // Arabic
    [b't', b'o'], // Generic
    [b't', b'r'], // TurkishAzeri
    [b't', b'v'], // Generic
    [b't', b'w'], // Traditional
    [b'u', b'a'], // Cyrillic
    [b'u', b'z'], // Cyrillic
    [b'v', b'c'], // Generic
    [b'v', b'n'], // Vietnamese
    [b'v', b'u'], // Generic
    [b'y', b'e'], // Arabic
];

/// Internationalized TLDs with an explicit classification, stored as the
/// part of the Punycode label that follows the `xn--` prefix.
///
/// Keys are lower case because lookup is byte-exact and callers supply
/// lower-cased labels. Must stay sorted in ascending byte order because
/// `classify_tld` uses binary search, and must stay index-parallel with
/// `PUNYCODE_VALUES`.
static PUNYCODE_KEYS: [&[u8]; 46] = [
    b"3e0b707e",           // Korean
    b"54b7fta0cc",         // Western
    b"80ao21a",            // Cyrillic
    b"90a3ac",             // Cyrillic
    b"90ae",               // Cyrillic
    b"90ais",              // Cyrillic
    b"clchc0ea0b2g2a9gcd", // SimplifiedTraditional
    b"d1alf",              // Cyrillic
    b"e1a4c",              // Eu
    b"fiqs8s",             // Simplified
    b"fiqz9s",             // Simplified
    b"fzc2c9e2c",          // Western
    b"j1amh",              // Cyrillic
    b"j6w193g",            // TraditionalSimplified
    b"kprw13d",            // Traditional
    b"kpry57d",            // Traditional
    b"l1acc",              // Cyrillic
    b"lgbbat1ad8j",        // Arabic
    b"mgb2ddes",           // Arabic
    b"mgb9awbf",           // Arabic
    b"mgba3a4f16a",        // Arabic
    b"mgbaam7a8h",         // Arabic
    b"mgbah1a3hjkrd",      // Arabic
    b"mgbai9azgqp6j",      // Arabic
    b"mgbayh7gpa",         // Arabic
    b"mgbc0a9azcg",        // Arabic
    b"mgbcpq6gpa1a",       // Arabic
    b"mgberp4a5d4ar",      // Arabic
    b"mgbpl2fh",           // Arabic
    b"mgbtx2b",            // Arabic
    b"mgbx4cd0ab",         // WesternArabic
    b"mix891f",            // TraditionalSimplified
    b"node",               // WesternCyrillic
    b"o3cw4h",             // Thai
    b"ogbpf8fl",           // Arabic
    b"p1ai",               // Cyrillic
    b"pgbs0dh",            // Arabic
    b"q7ce6a",             // Arabic
    b"qxa6a",              // Eu
    b"qxam",               // Greek
    b"wgbh1c",             // Arabic
    b"wgbl6a",             // Arabic
    b"xkc2al3hye2a",       // Western
    b"y9a3aq",             // WesternCyrillic
    b"yfro4i67o",          // SimplifiedTraditional
    b"ygbi2ammx",          // Arabic
];

/// Classification for each entry of `PUNYCODE_KEYS`, at the same index.
static PUNYCODE_VALUES: [Tld; 46] = [
    Tld::Korean,                // 3e0b707e
    Tld::Western,               // 54b7fta0cc
    Tld::Cyrillic,              // 80ao21a
    Tld::Cyrillic,              // 90a3ac
    Tld::Cyrillic,              // 90ae
    Tld::Cyrillic,              // 90ais
    Tld::SimplifiedTraditional, // clchc0ea0b2g2a9gcd
    Tld::Cyrillic,              // d1alf
    Tld::Eu,                    // e1a4c
    Tld::Simplified,            // fiqs8s
    Tld::Simplified,            // fiqz9s
    Tld::Western,               // fzc2c9e2c
    Tld::Cyrillic,              // j1amh
    Tld::TraditionalSimplified, // j6w193g
    Tld::Traditional,           // kprw13d
    Tld::Traditional,           // kpry57d
    Tld::Cyrillic,              // l1acc
    Tld::Arabic,                // lgbbat1ad8j
    Tld::Arabic,                // mgb2ddes
    Tld::Arabic,                // mgb9awbf
    Tld::Arabic,                // mgba3a4f16a
    Tld::Arabic,                // mgbaam7a8h
    Tld::Arabic,                // mgbah1a3hjkrd
    Tld::Arabic,                // mgbai9azgqp6j
    Tld::Arabic,                // mgbayh7gpa
    Tld::Arabic,                // mgbc0a9azcg
    Tld::Arabic,                // mgbcpq6gpa1a
    Tld::Arabic,                // mgberp4a5d4ar
    Tld::Arabic,                // mgbpl2fh
    Tld::Arabic,                // mgbtx2b
    Tld::WesternArabic,         // mgbx4cd0ab
    Tld::TraditionalSimplified, // mix891f
    Tld::WesternCyrillic,       // node
    Tld::Thai,                  // o3cw4h
    Tld::Arabic,                // ogbpf8fl
    Tld::Cyrillic,              // p1ai
    Tld::Arabic,                // pgbs0dh
    Tld::Arabic,                // q7ce6a
    Tld::Eu,                    // qxa6a
    Tld::Greek,                 // qxam
    Tld::Arabic,                // wgbh1c
    Tld::Arabic,                // wgbl6a
    Tld::Western,               // xkc2al3hye2a
    Tld::WesternCyrillic,       // y9a3aq
    Tld::SimplifiedTraditional, // yfro4i67o
    Tld::Arabic,                // ygbi2ammx
];