diff options
Diffstat (limited to 'third_party/rust/chardetng')
-rw-r--r-- | third_party/rust/chardetng/.cargo-checksum.json | 1 | ||||
-rw-r--r-- | third_party/rust/chardetng/CONTRIBUTING.md | 38 | ||||
-rw-r--r-- | third_party/rust/chardetng/COPYRIGHT | 9 | ||||
-rw-r--r-- | third_party/rust/chardetng/Cargo.toml | 44 | ||||
-rw-r--r-- | third_party/rust/chardetng/LICENSE-APACHE | 202 | ||||
-rw-r--r-- | third_party/rust/chardetng/LICENSE-MIT | 25 | ||||
-rw-r--r-- | third_party/rust/chardetng/README.md | 177 | ||||
-rw-r--r-- | third_party/rust/chardetng/src/data.rs | 1313 | ||||
-rw-r--r-- | third_party/rust/chardetng/src/lib.rs | 3775 | ||||
-rw-r--r-- | third_party/rust/chardetng/src/tld.rs | 340 |
10 files changed, 5924 insertions, 0 deletions
diff --git a/third_party/rust/chardetng/.cargo-checksum.json b/third_party/rust/chardetng/.cargo-checksum.json new file mode 100644 index 0000000000..1d6743d4d5 --- /dev/null +++ b/third_party/rust/chardetng/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"78c3797cfa83e17d06cc9ae9d2603d6fda0ed9f6b371b4479885d4b382a124a9","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"16c7c78a56ec917e92db97263f4adddcf3749acaf7de88d088d1f5c86c278107","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
\ No newline at end of file diff --git a/third_party/rust/chardetng/CONTRIBUTING.md b/third_party/rust/chardetng/CONTRIBUTING.md new file mode 100644 index 0000000000..1d41d4c60e --- /dev/null +++ b/third_party/rust/chardetng/CONTRIBUTING.md @@ -0,0 +1,38 @@ +If you send a pull request / patch, please observe the following. + +## Licensing + +Since this crate is dual-licensed, +[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions) +is considered to apply in the sense of Contributions being automatically +under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file). +That is, by the act of offering a Contribution, you place your Contribution +under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT` +file. Please do not contribute if you aren't willing or allowed to license your +contributions in this manner. + +You are encouraged to dedicate test code that you contribute to the Public +Domain using the CC0 dedication. If you contribute test code that is not +dedicated to the Public Domain, please be sure not to put it in a part of +source code that the comments designate as being dedicated to the Public +Domain. + +## Copyright Notices + +If you require the addition of your copyright notice, it's up to you to edit in +your notice as part of your Contribution. Not adding a copyright notice is +taken as a waiver of copyright notice. + +## Compatibility with Stable Rust + +Please ensure that your Contribution compiles with the latest stable-channel +rustc. + +## rustfmt + +The `rustfmt` version used for this code is `rustfmt-nightly`. Please either +use that version or avoid using `rustfmt` (so as not to reformat all the code). + +## Unit tests + +Please ensure that `cargo test` succeeds. 
diff --git a/third_party/rust/chardetng/COPYRIGHT b/third_party/rust/chardetng/COPYRIGHT new file mode 100644 index 0000000000..eac1f4b3b5 --- /dev/null +++ b/third_party/rust/chardetng/COPYRIGHT @@ -0,0 +1,9 @@ +chardetng is copyright 2019 Mozilla Foundation. + +Licensed under the Apache License, Version 2.0 +<LICENSE-APACHE or +https://www.apache.org/licenses/LICENSE-2.0> or the MIT +license <LICENSE-MIT or https://opensource.org/licenses/MIT>, +at your option. All files in the project carrying such +notice may not be copied, modified, or distributed except +according to those terms. diff --git a/third_party/rust/chardetng/Cargo.toml b/third_party/rust/chardetng/Cargo.toml new file mode 100644 index 0000000000..4f5110a092 --- /dev/null +++ b/third_party/rust/chardetng/Cargo.toml @@ -0,0 +1,44 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2018" +name = "chardetng" +version = "0.1.9" +authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"] +description = "A character encoding detector for legacy Web content" +homepage = "https://docs.rs/chardetng/" +documentation = "https://docs.rs/chardetng/" +readme = "README.md" +keywords = [ + "encoding", + "web", + "unicode", + "charset", +] +categories = [ + "text-processing", + "encoding", + "web-programming", + "internationalization", +] +license = "Apache-2.0 OR MIT" +repository = "https://github.com/hsivonen/chardetng" + +[dependencies] +encoding_rs = "0.8.17" +memchr = "2.2.0" + +[dev-dependencies] +detone = "1.0.0" + +[features] +testing-only-no-semver-guarantees-do-not-use = [] diff --git a/third_party/rust/chardetng/LICENSE-APACHE b/third_party/rust/chardetng/LICENSE-APACHE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/third_party/rust/chardetng/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/rust/chardetng/LICENSE-MIT b/third_party/rust/chardetng/LICENSE-MIT new file mode 100644 index 0000000000..b4850c9520 --- /dev/null +++ b/third_party/rust/chardetng/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2019 Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/third_party/rust/chardetng/README.md b/third_party/rust/chardetng/README.md new file mode 100644 index 0000000000..4856ce5e3b --- /dev/null +++ b/third_party/rust/chardetng/README.md @@ -0,0 +1,177 @@ +# chardetng + +[![crates.io](https://meritbadge.herokuapp.com/chardetng)](https://crates.io/crates/chardetng) +[![docs.rs](https://docs.rs/chardetng/badge.svg)](https://docs.rs/chardetng/) +[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/chardetng/blob/master/COPYRIGHT) + +A character encoding detector for legacy Web content. + +## Licensing + +Please see the file named +[COPYRIGHT](https://github.com/hsivonen/chardetng/blob/master/COPYRIGHT). + +## Documentation + +Generated [API documentation](https://docs.rs/chardetng/) is available +online. + +## Purpose + +The purpose of this detector is user retention for Firefox by ensuring that the long tail of the legacy Web is not more convenient to use in Chrome than in Firefox. (Chrome deployed [ced](https://github.com/google/compact_enc_det/), which left Firefox less convenient to use until the deployment of this detector.) + +## About the Name + +`chardet` was the name of Mozilla's old encoding detector. I named this one `chardetng`, because this is the next generation of encoding detector in Firefox. There is no code reuse from the old `chardet`. + +## Optimization Goals + +This crate aims to be more accurate than ICU, more complete than `chardet`, more explainable and modifiable than `compact_enc_det` (aka. ced), and, in an application that already depends on `encoding_rs` for other reasons, smaller in added binary footprint than `compact_enc_det`. + +## Principle of Operation + +In general `chardetng` prefers to do negative matching (rule out possibilities from the set of plausible encodings) rather than to do positive matching. Since negative matching is insufficient, there is positive matching, too. 
+ +* Except for ISO-2022-JP, pairs of ASCII bytes never contribute to the detection, which has the effect of ignoring HTML syntax without an HTML-aware state machine. +* A single encoding error disqualifies an encoding from the set of possible outcomes. Notably, as the length of the input increases, it becomes increasingly improbable for the input to be valid according to a legacy CJK encoding without being intended as such. Also, there are single-byte encodings that have unmapped bytes in areas that are in active use by other encodings, so such bytes narrow the set of possibilities very effectively. +* A single occurrence of a C1 control character disqualifies an encoding from possible outcomes. +* The first non-ASCII character being a half-width katakana character disqualifies an encoding. (This is _very_ effective for deciding between Shift_JIS and EUC-JP.) +* For single-byte encodings, character pairs are given scores according to their relative frequencies in the applicable Wikipedias. +* There's a variety of smaller penalty rules, such as: + - For encodings for bicameral scripts, having an upper-case letter follow a lower-case letter is penalized. + - For Latin encodings, having three non-ASCII letters in a row is penalized a little and having four or more is penalized a lot. + - For non-Latin encodings, having a non-Latin letter right next to a Latin letter is penalized. + - For single-byte encodings, having a character pair (excluding pairs where both characters are ASCII) that never occurs in the Wikipedias for the applicable languages is heavily penalized. + +## Notes About Encodings + +<dl> +<dt>UTF-8</dt> +<dd>Detected only if explicitly permitted by the argument to the `guess` method. 
It's harmful for Web browsers to detect UTF-8 without requiring user action, such as choosing a menu item, because Web developers would start relying on the detection.</dd> +<dt>UTF-16[BE|LE]</dt> +<dd>Not detected: Detecting these belongs on the BOM layer.</dd> +<dt>x-user-defined</dt> +<dd>Not detected: This encoding is for XHR. <code><meta charset=x-user-defined></code> in HTML is not unlabeled and means windows-1252.</dd> +<dt>Replacement</dt> +<dd>Not detected.</dd> +<dt>GB18030</dt> +<dd>Detected as GBK.</dd> +<dt>GBK</dt> +<dt>Big5</dt> +<dt>EUC-KR</dt> +<dt>Shift_JIS</dt> +<dt>windows-1250</dt> +<dt>windows-1251</dt> +<dt>windows-1252</dt> +<dt>windows-1253</dt> +<dt>windows-1254</dt> +<dt>windows-1255</dt> +<dt>windows-1256</dt> +<dt>windows-1257</dt> +<dt>windows-1258</dt> +<dt>windows-874</dt> +<dt>ISO-8859-2</dt> +<dt>ISO-8859-7</dt> +<dd>Detected: Historical locale-specific fallbacks.</dd> +<dt>EUC-JP</dt> +<dt>ISO-2022-JP</dt> +<dt>KOI8-U</dt> +<dt>ISO-8859-5</dt> +<dt>IBM866</dt> +<dd>Detected: Detected by multiple browsers past and present.</dd> +<dt>KOI8-R</dt> +<dd>Detected as KOI8-U. (Always guessing the U variant is less likely to corrupt non-box drawing characters.)</dd> +<dt>ISO-8859-8-I</dt> +<dd>Detected as windows-1255.</dd> +<dt>ISO-8859-4</dt> +<dd>Detected: Detected by IE and Chrome; in menu in IE and Firefox.</dd> +<dt>ISO-8859-6</dt> +<dd>Detected: Detected by IE and Chrome.</dd> +<dt>ISO-8859-8</dt> +<dd>Detected: Available in menu in IE and Firefox.</dd> +<dt>ISO-8859-13</dt> +<dd>Detected: Detected by Chrome. This encoding is so similar to windows-1257 that menu items for windows-1257 can be considered to accommodate this one in IE and Firefox. Due to the mechanics of this detector, if this wasn't included as a separate item, the windows-1257 detection wouldn't catch the cases that use curly quotes and are invalid as windows-1257.</dd> +<dt>x-mac-cyrillic</dt> +<dd>Not detected: Not detected by IE and Chrome. 
(Was previously detected by Firefox.)</dd> +<dt>ISO-8859-3</dt> +<dt>ISO-8859-10</dt> +<dt>ISO-8859-14</dt> +<dt>ISO-8859-15</dt> +<dt>ISO-8859-16</dt> +<dt>macintosh</dt> +<dd>Not detected: These encodings have never been a locale-specific fallback in a major browser or a menu item in IE.</dd> +</dl> + +## Known Problems + +* GBK detection is less accurate than in ced for short titles consisting of fewer than six hanzi. This is mostly due to the design that prioritizes optimizing binary size over accuracy on very short inputs. +* Thai detection is inaccurate for short inputs. +* windows-1257 detection is very inaccurate. (This detector currently doesn't use trigrams. ced uses 8 KB of trigram data to solve this.) +* On non-generic domains, some encodings that are confusable with the legacy encodings native to the TLD are excluded from guesses outright unless the input is invalid according to all the TLD-native encodings. + +## Roadmap + +- [ ] Investigate parallelizing the `feed` method using Rayon. +- [x] Improve windows-874 detection for short inputs. +- [ ] Improve GBK detection for short inputs. +- [ ] Reorganize the frequency data for telling short GBK, EUC-JP, and EUC-KR inputs apart. +- [ ] Make Lithuanian and Latvian detection on generic domains a lot more accurate (likely requires looking at trigrams). +- [x] Tune Central European detection. +- [ ] Tune the penalties applied to confusable encodings on non-generic TLDs to make detection of confusable encodings possible on non-generic TLDs. +- [x] Reduce the binary size by not storing the scoring for implausible-next-to-alphabetic character classes. +- [ ] ~Reduce the binary size by classifying ASCII algorithmically.~ +- [ ] Reduce the binary size by not storing the scores for C1 controls. + +## Release Notes + +### 0.1.9 + +* Fix a bug in ASCII prefix skipping. (Was introduced in 0.1.7.) + +### 0.1.8 + +* Avoid detecting English with no-break spaces as GBK or EUC-KR. 
+ +### 0.1.7 + +* Avoid misdetecting windows-1252 English as windows-1254. +* Avoid misdetecting windows-1252 English as IBM866. +* Improve Chinese and Japanese detection by not giving single-byte encodings score for letter next to digit. +* Improve Italian, Portuguese, Castilian, Catalan, and Galician detection by taking into account ordinal indicator use. +* Reduce lookup table size. + +### 0.1.6 + +* Tune Central European detection. + +### 0.1.5 + +* Improve Thai accuracy a lot. +* Improve accuracy of some languages a bit. +* Remove unused Hebrew ASCII table. + +### 0.1.4 + +* Properly take into account non-ASCII bytes at word boundaries for windows-1252. (Especially relevant for Italian and Catalan.) +* Move Estonian from the Baltic model to the Western model. This improves overall Estonian detection but causes š and ž encoded as windows-1257, ISO-8859-13, or ISO-8859-4 to get misdecoded. (It would be possible to add a post-processing step to adjust for š and ž, but this would cause reloads given the way chardetng is integrated with Firefox.) +* Properly classify letters that ISO-8859-4 has but windows-1257 doesn't have in order to avoid misdetecting non-ISO-8859-4 input as ISO-8859-4. +* Improve character classification of windows-1254. +* Avoid classifying byte 0xA1 or above as space-like. +* Reduce binary size by collapsing similar character classes. + +### 0.1.3 + +* Return TLD-affiliated encoding if UTF-8 is valid but prohibited. + +### 0.1.2 + +* Return UTF-8 if valid and allowed even if all-ASCII. +* Return windows-1252 if UTF-8 valid and prohibited, because various test cases require this. + +### 0.1.1 + +* Detect Visual Hebrew more often. + +### 0.1.0 + +* Initial release. diff --git a/third_party/rust/chardetng/src/data.rs b/third_party/rust/chardetng/src/data.rs new file mode 100644 index 0000000000..5dacd15a8c --- /dev/null +++ b/third_party/rust/chardetng/src/data.rs @@ -0,0 +1,1313 @@ +/* Any copyright is dedicated to the Public Domain. 
+ * https://creativecommons.org/publicdomain/zero/1.0/ */ + +use super::IMPLAUSIBILITY_PENALTY; +use encoding_rs::Encoding; +use encoding_rs::IBM866_INIT; +use encoding_rs::ISO_8859_13_INIT; +use encoding_rs::ISO_8859_2_INIT; +use encoding_rs::ISO_8859_4_INIT; +use encoding_rs::ISO_8859_5_INIT; +use encoding_rs::ISO_8859_6_INIT; +use encoding_rs::ISO_8859_7_INIT; +use encoding_rs::ISO_8859_8_INIT; +use encoding_rs::KOI8_U_INIT; +use encoding_rs::WINDOWS_1250_INIT; +use encoding_rs::WINDOWS_1251_INIT; +use encoding_rs::WINDOWS_1252_INIT; +use encoding_rs::WINDOWS_1253_INIT; +use encoding_rs::WINDOWS_1254_INIT; +use encoding_rs::WINDOWS_1255_INIT; +use encoding_rs::WINDOWS_1256_INIT; +use encoding_rs::WINDOWS_1257_INIT; +use encoding_rs::WINDOWS_1258_INIT; +use encoding_rs::WINDOWS_874_INIT; + +const PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: usize = 0; + +const IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE: usize = 1; + +const IMPLAUSIBLE_BEFORE_ALPHABETIC: usize = 2; + +const IMPLAUSIBLE_AFTER_ALPHABETIC: usize = 3; + +const PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE: usize = 4; + +const PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE: usize = 5; + +const WINDOWS_1256_ZWNJ: usize = 2; + +pub const ASCII_DIGIT: usize = 100; + +#[repr(align(64))] // Align to cache lines +pub struct DetectorData { + pub frequent_simplified: [u16; 128], + pub frequent_kanji: [u16; 128], + pub frequent_hangul: [u16; 128], + latin_ascii: [u8; 128], + non_latin_ascii: [u8; 128], + turkish_ascii: [u8; 128], + windows_1258: [u8; 128], + windows_1250: [u8; 128], + iso_8859_2: [u8; 128], + windows_1251: [u8; 128], + koi8_u: [u8; 128], + iso_8859_5: [u8; 128], + ibm866: [u8; 128], + windows_1252: [u8; 128], + windows_1252_icelandic: [u8; 128], + windows_1253: [u8; 128], + iso_8859_7: [u8; 128], + windows_1254: [u8; 128], + windows_1255: [u8; 128], + iso_8859_8: [u8; 128], + windows_1256: [u8; 128], + iso_8859_6: [u8; 128], + windows_1257: [u8; 128], + iso_8859_13: [u8; 128], + 
iso_8859_4: [u8; 128], + windows_874: [u8; 128], + vietnamese: [u8; 1975], + central: [u8; 3895], + cyrillic: [u8; 2112], + western: [u8; 2752], + icelandic: [u8; 871], + greek: [u8; 1365], + turkish: [u8; 845], + hebrew: [u8; 1292], + arabic: [u8; 2805], + baltic: [u8; 1387], + thai: [u8; 5180], +} + +#[rustfmt::skip] +pub static DETECTOR_DATA: DetectorData = DetectorData { + frequent_simplified: [ + 0x7684, 0x5E74, 0x56FD, 0x65E5, 0x6708, 0x4E2D, 0x4EBA, 0x4E00, 0x5927, 0x4E3A, 0x5728, 0x662F, 0x5B66, 0x6709, 0x884C, 0x4F1A, + 0x65AF, 0x4E8E, 0x5730, 0x533A, 0x6587, 0x548C, 0x5C14, 0x540D, 0x7B2C, 0x516C, 0x65F6, 0x5C0F, 0x90E8, 0x4E0D, 0x5E02, 0x53F0, + 0x4EE5, 0x4E0A, 0x540E, 0x52A8, 0x51FA, 0x4E2A, 0x672C, 0x4F5C, 0x5BB6, 0x65B0, 0x6210, 0x897F, 0x5B9A, 0x91CD, 0x751F, 0x4E4B, + 0x7535, 0x4E3B, 0x5B50, 0x7528, 0x7279, 0x5206, 0x6C11, 0x4E86, 0x4E9A, 0x5458, 0x514B, 0x5357, 0x653F, 0x7AD9, 0x5FB7, 0x4E0E, + 0x7403, 0x4E1C, 0x79D1, 0x91CC, 0x9053, 0x5C71, 0x6CD5, 0x65B9, 0x5317, 0x5411, 0x5929, 0x53D1, 0x7269, 0x6765, 0x5230, 0x673A, + 0x661F, 0x8DEF, 0x76EE, 0x7F8E, 0x6751, 0x9AD8, 0x957F, 0x519B, 0x5229, 0x4E09, 0x62C9, 0x8F66, 0x5DDE, 0x57FA, 0x6D77, 0x81EA, + 0x4E0B, 0x8D5B, 0x9762, 0x52A0, 0x4ED6, 0x9A6C, 0x5176, 0x53C2, 0x53BF, 0x4EE3, 0x5185, 0x7406, 0x4E16, 0x4E8C, 0x7EBF, 0x53CA, + 0x5EFA, 0x8868, 0x4F4D, 0x7F57, 0x7531, 0x7ACB, 0x591A, 0x53EF, 0x534E, 0x6797, 0x7EF4, 0x5EA6, 0x4E8B, 0x5E73, 0x5916, 0x4F53, + ], + frequent_kanji: [ + 0x5E74, 0x65E5, 0x6708, 0x5927, 0x672C, 0x5B66, 0x4EBA, 0x56FD, 0x4F1A, 0x4E2D, 0x51FA, 0x4E00, 0x8005, 0x5E02, 0x4F5C, 0x540D, + 0x90E8, 0x7528, 0x5730, 0x884C, 0x5834, 0x7530, 0x7B2C, 0x751F, 0x5408, 0x5B50, 0x9053, 0x4E0A, 0x6771, 0x6642, 0x770C, 0x4EE3, + 0x5C71, 0x793E, 0x4E8B, 0x753B, 0x65B0, 0x624B, 0x9AD8, 0x6210, 0x6226, 0x7269, 0x5F8C, 0x767A, 0x9577, 0x7ACB, 0x5206, 0x5DDD, + 0x8A18, 0x6821, 0x9593, 0x696D, 0x95A2, 0x6240, 0x5B9A, 0x9078, 0x5C0F, 0x76EE, 0x52D5, 0x548C, 0x6587, 0x91CE, 0x540C, 0x524D, + 0x5185, 
0x958B, 0x7DDA, 0x81EA, 0x53F7, 0x516C, 0x99C5, 0x9001, 0x56DE, 0x753A, 0x9664, 0x4E3B, 0x5BB6, 0x5229, 0x8ECA, 0x901A, + 0x4EAC, 0x8868, 0x5CF6, 0x4E0B, 0x4E16, 0x65B9, 0x6751, 0x66F8, 0x5168, 0x660E, 0x9023, 0x5E73, 0x653E, 0x4F53, 0x7684, 0x5F0F, + 0x756A, 0x5EA6, 0x5317, 0x5165, 0x5916, 0x983C, 0x8A9E, 0x5973, 0x8A71, 0x6A5F, 0x8A2D, 0x539F, 0x4E09, 0x524A, 0x533A, 0x6D77, + 0x4F9D, 0x5F53, 0x73FE, 0x5BFE, 0x4F4D, 0x6570, 0x5316, 0x795E, 0x66F2, 0x7406, 0x6559, 0x7279, 0x7248, 0x5728, 0x6CD5, 0x898B, + ], + frequent_hangul: [ + 0xC774, 0xC758, 0xB2E4, 0xAE30, 0xC5D0, 0xB85C, 0xC0AC, 0xB144, 0xC2A4, 0xB9AC, 0xB294, 0xC77C, 0xD558, 0xAC00, 0xC2DC, 0xC9C0, + 0xB300, 0xC11C, 0xBD84, 0xAD6D, 0xD55C, 0xB3C4, 0xC778, 0xACE0, 0xB958, 0xC790, 0xC8FC, 0xC544, 0xC744, 0xB77C, 0xC218, 0xC81C, + 0xC815, 0xC6D4, 0xB098, 0xAD6C, 0xC804, 0xC5B4, 0xC740, 0xADF8, 0xBD80, 0xB97C, 0xB3D9, 0xC120, 0xC73C, 0xBB38, 0xD2B8, 0xC6A9, + 0xBCF4, 0xC704, 0xB4DC, 0xACFC, 0xAD50, 0xC0C1, 0xB9C8, 0xC7A5, 0xD559, 0xC6D0, 0xC131, 0xD654, 0xC5ED, 0xB2C8, 0xBBF8, 0xACF5, + 0xACBD, 0xD574, 0xC624, 0xC6B0, 0xBA85, 0xC788, 0xD06C, 0xC601, 0xC18C, 0xC870, 0xD68C, 0xC5EC, 0xBBFC, 0xD1A0, 0xBE44, 0xC138, + 0xB974, 0xC720, 0xC2E0, 0xD0A4, 0xC911, 0xACC4, 0xD0C0, 0xC5F0, 0xD504, 0xAD00, 0xB418, 0xC801, 0xCE58, 0xB808, 0xCE74, 0xC9C4, + 0xC640, 0xD130, 0xB4E4, 0xBAA9, 0xACA8, 0xAC8C, 0xAC1C, 0xBC29, 0xD30C, 0xC0B0, 0xD638, 0xCD9C, 0xC74C, 0xB9BC, 0xBA74, 0xC791, + 0xB9CC, 0xB2E8, 0xB118, 0xBAA8, 0xC694, 0xC5C8, 0xC0DD, 0xB0A8, 0xC7AC, 0xBB34, 0xD6C4, 0xD45C, 0xAD70, 0xD3EC, 0xB2F9, 0xB178, + ], + latin_ascii: [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100,100,100,100,100,100,100,100,100,100, 0, 0, 0, 0, 0, 0, + 0,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151,152,153,154, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, + ], + non_latin_ascii: [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100,100,100,100,100,100,100,100,100,100, 0, 0, 0, 0, 0, 0, + 0,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129, + 129,129,129,129,129,129,129,129,129,129,129, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + ], + turkish_ascii: [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100,100,100,100,100,100,100,100,100,100, 0, 0, 0, 0, 0, 0, + 0,129,130,131,132,133,134,135,136,154,137,138,139,140,141,142, + 143,144,145,146,147,148,149,150,151,152,153, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 27, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, + ], + windows_1258: [ + 0,255, 0, 53, 0, 0, 0, 0, 0, 0,255, 0,155,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 27,255,255,179, + 0, 55, 53, 53, 53, 53, 52, 53, 53, 55, 53, 53, 56, 52, 54, 53, + 55, 56, 54, 54, 53, 55, 54, 52, 53, 54, 53, 53, 55, 55, 55, 55, + 159,160,161,162,155,155,155,155,163,164,165,155, 28,167,168,169, + 170,155, 29,172,173,174,155, 56,155,175,176,177,155,178, 30, 27, + 31, 32, 33, 34, 27, 27, 27, 27, 35, 36, 37, 27, 38, 39, 40, 41, + 42, 27, 43, 44, 45, 46, 27, 56, 27, 47, 48, 49, 27, 50, 53, 51, + ], + windows_1250: [ + 0,255, 0,255, 0, 0, 0, 0,255, 0,156, 0,157,158,159,160, + 255, 0, 0, 0, 0, 0, 0, 0,255, 0, 28, 0, 29, 30, 31, 32, + 0, 69, 69,161, 69,162, 68, 69, 69, 71,163, 68, 69, 68, 70,165, + 71, 69, 69, 33, 69, 71, 70, 68, 69, 34, 35, 68,164, 0, 36, 37, + 166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181, + 182,183,184,185,186,187,188, 72,189,190,191,192,193,194,195, 27, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 
53, + 54, 55, 56, 57, 58, 59, 60, 72, 61, 62, 63, 64, 65, 66, 67, 69, + ], + iso_8859_2: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,162, 69,161, 69,164,157, 69, 69,156,163,158,160, 68,159,165, + 71, 34, 69, 33, 69, 36, 29, 69, 69, 28, 35, 30, 32, 0, 31, 37, + 166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181, + 182,183,184,185,186,187,188, 72,189,190,191,192,193,194,195, 27, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 72, 61, 62, 63, 64, 65, 66, 67, 69, + ], + windows_1251: [ + 131,130, 0, 2, 0, 0, 0, 0, 0, 0,132, 0,133,130,134,135, + 3, 0, 0, 0, 0, 0, 0, 0,255, 0, 4, 0, 5, 2, 6, 7, + 0,136, 8,140, 47,130, 46, 47,138, 49,139, 49, 50, 46, 48,141, + 49, 50,137, 9, 2, 49, 48, 46, 10, 47, 11, 48, 12,130, 2, 13, + 142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157, + 158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + ], + koi8_u: [ + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 46, 0, 50, 50, 50, 0, 47, 49, 48, 46, 50, + 47, 47, 47, 10, 11, 47, 9, 13, 47, 47, 47, 47, 47, 2, 8, 47, + 47, 47, 47,138,139, 47,137,141, 47, 47, 47, 47, 47,130,136, 49, + 44, 14, 15, 36, 18, 19, 34, 17, 35, 22, 23, 24, 25, 26, 27, 28, + 29, 45, 30, 31, 32, 33, 20, 16, 42, 41, 21, 38, 43, 39, 37, 40, + 172,142,143,164,146,147,162,145,163,150,151,152,153,154,155,156, + 157,173,158,159,160,161,148,144,170,169,149,166,171,167,165,168, + ], + iso_8859_5: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,138,131,130,139,130,137,141,140,132,133,134,130, 46,136,135, + 142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157, + 
158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 47, 10, 3, 2, 11, 2, 9, 13, 12, 4, 5, 6, 2, 47, 8, 7, + ], + ibm866: [ + 142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157, + 158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 138, 10,139, 11,141, 13,136, 8, 49, 46, 46, 0, 47, 47, 47, 3, + ], + windows_1252: [ + 0,255, 0, 60, 0, 0, 0, 0, 0, 0,156, 0,157,255,185,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 29,255, 57,186, + 0, 62, 60, 60, 60, 60, 59, 60, 60, 62, 60, 59, 63, 59, 61, 60, + 62, 63, 61, 61, 60, 62, 61, 59, 60, 61, 60, 59, 62, 62, 62, 62, + 158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173, + 188,174,175,176,177,178,179, 63,180,181,182,183,184,188,188, 27, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 60, 46, 47, 48, 49, 50, 51, 63, 52, 53, 54, 55, 56, 60, 60, 58, + ], + windows_1252_icelandic: [ + 0,255, 0, 41, 0, 0, 0, 0, 0, 0,155, 0,155,255,155,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 27,255, 27,155, + 0, 43, 41, 41, 41, 41, 40, 41, 41, 43, 41, 41, 44, 40, 42, 41, + 43, 44, 42, 42, 41, 43, 42, 40, 41, 42, 41, 41, 43, 43, 43, 43, + 155,156,155,155,157,155,158,155,155,159,155,155,155,160,155,155, + 161,155,155,162,155,155,163, 44,164,155,165,155,155,166,167, 27, + 27, 28, 27, 27, 29, 27, 30, 27, 27, 31, 27, 27, 27, 32, 27, 27, + 33, 27, 27, 34, 27, 27, 35, 44, 36, 27, 37, 27, 27, 38, 39, 27, + ], + windows_1253: [ + 38,255, 0, 38, 0, 0, 0, 0,255, 0,255, 0,255,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 
0,255,255,255,255, + 0, 38,131, 38, 38, 38, 37, 38, 38, 40,255, 40, 37, 37, 39, 37, + 40, 37, 39, 39, 0, 40, 39, 37,132,133,134, 39,162, 40,163,164, + 2,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149, + 150,151,255,153,154,155,156,157,158,159,160,161, 3, 4, 5, 6, + 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,255, + ], + iso_8859_7: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0, 40, 39, 38, 38, 38, 37, 38, 38, 40, 38, 40, 37, 37,255, 37, + 40, 37, 39, 39, 0, 38,131, 37,132,133,134, 39,162, 40,163,164, + 2,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149, + 150,151,255,153,154,155,156,157,158,159,160,161, 3, 4, 5, 6, + 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,255, + ], + windows_1254: [ + 0,255, 0, 40, 0, 0, 0, 0, 0, 0,156, 0,156,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 28,255,255,156, + 0, 42, 40, 40, 40, 40, 39, 40, 40, 42, 40, 42, 43, 39, 41, 40, + 42, 43, 41, 41, 40, 42, 41, 39, 40, 41, 40, 41, 42, 42, 42, 42, + 156,156,158,156,157,156,156,159,156,156,160,156,156,156,161,156, + 162,156,156,156,156,156,163, 43,156,156,156,164,165,155,166, 28, + 28, 28, 30, 28, 29, 28, 28, 31, 28, 28, 32, 28, 28, 28, 33, 28, + 34, 28, 28, 28, 28, 28, 35, 43, 28, 28, 28, 36, 37, 26, 38, 28, + ], + windows_1255: [ + 0,255, 0, 37, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255, + 0, 39, 37, 37, 37, 37, 36, 37, 37, 39, 40, 40, 40, 36, 38, 37, + 39, 40, 38, 38, 37, 39, 38, 36, 37, 38, 40, 40, 39, 39, 39, 39, + 2, 2, 2, 2, 2, 2, 2, 3, 4, 2, 2, 2, 5, 2, 36, 37, + 36, 2, 2, 0, 6, 7, 8, 41, 41,255,255,255,255,255,255,255, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,255,255, 37, 37,255, + ], + 
iso_8859_8: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,255, 37, 37, 37, 37, 36, 37, 37, 39, 40, 40, 40, 36, 38, 37, + 39, 40, 38, 38, 37, 39, 38, 36, 37, 38, 40, 40, 39, 39, 39,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 36, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,255,255, 37, 37,255, + ], + windows_1256: [ + 0, 3, 0, 54, 0, 0, 0, 0, 0, 0, 4, 0,129, 5, 6, 7, + 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 10, 0, 1, 2, 53, 11, + 0, 58, 54, 54, 54, 54, 53, 54, 54, 56, 12, 56, 57, 53, 55, 54, + 56, 57, 55, 55, 54, 56, 55, 53, 54, 55, 58, 55, 56, 56, 56, 58, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 57, 36, 37, 38, 39, 40, 41, 42, 43, + 1, 44, 1, 45, 46, 47, 48, 1, 1, 1, 1, 1, 49, 50, 1, 1, + 51, 51, 51, 51, 1, 51, 51, 57, 51, 1, 51, 1, 1, 54, 54, 52, + ], + iso_8859_6: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,255,255,255, 54,255,255,255,255,255,255,255, 58, 53,255,255, + 255,255,255,255,255,255,255,255,255,255,255, 58,255,255,255, 58, + 255, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,255,255,255,255,255, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 51, 51, 51, 51, + 51, 51, 51,255,255,255,255,255,255,255,255,255,255,255,255,255, + ], + windows_1257: [ + 0,255, 0,255, 0, 0, 0, 0,255, 0,255, 0,255, 47, 47, 47, + 255, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 47, 47,255, + 0,255, 47, 47, 47,255, 46, 47,155, 49,156, 49, 50, 46, 48,155, + 49, 50, 48, 48, 47, 49, 48, 46, 27, 48, 28, 48, 49, 49, 49, 27, + 157,158,159,155,155,155,160,161,162,155,155,163,164,165,166,167, + 168,155,169,155,170,155,155, 
50,171,155,155,172,155,155,173, 27, + 29, 30, 31, 27, 27, 27, 32, 33, 34, 27, 27, 35, 36, 37, 38, 39, + 40, 27, 41, 27, 42, 27, 27, 50, 43, 27, 27, 44, 27, 27, 45, 47, + ], + iso_8859_13: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0, 48, 47, 47, 47, 49, 46, 47,155, 49,156, 49, 50, 46, 48,155, + 49, 50, 48, 48, 46, 49, 48, 46, 27, 48, 28, 48, 49, 49, 49, 27, + 157,158,159,155,155,155,160,161,162,155,155,163,164,165,166,167, + 168,155,169,155,170,155,155, 50,171,155,155,172,155,155,173, 27, + 29, 30, 31, 27, 27, 27, 32, 33, 34, 27, 27, 35, 36, 37, 38, 39, + 40, 27, 41, 27, 42, 27, 27, 50, 43, 27, 27, 44, 27, 27, 45, 46, + ], + iso_8859_4: [ + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, + 0,157, 27,156, 47,155,167, 47, 47,168,161,164,155, 46,173, 47, + 49, 29, 47, 28, 47, 27, 39, 47, 47, 40, 33, 36, 27,155, 45, 27, + 159,155,155,155,155,155,155,158,162,155,160,155,163,155,155,166, + 155,169,170,165,155,155,155, 50,155,171,155,155,155,155,172, 27, + 31, 27, 27, 27, 27, 27, 27, 30, 34, 27, 32, 27, 35, 27, 27, 38, + 27, 41, 42, 37, 27, 27, 27, 50, 27, 43, 27, 27, 27, 27, 44, 47, + ], + windows_874: [ + 77,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255, + 255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255, + 0, 2, 3, 71, 4, 71, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 71, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,255,255,255,255, 77, + 57, 58, 59, 60, 61, 71, 62, 63, 64, 65, 66, 67, 68, 69, 70, 77, + 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,255,255,255,255, + ], + vietnamese: [ + 0, 58, 58, 22, 33, 7, 0, 0, 0, 0, 1, 57, 3, 0, 0, 0, 39, 14, 8, 3, 0, 11, 0, 5, 0, // , + 0, 3, 16, 2, 0, 0, 0, 0, 0, 0, 0, 3, 1,255, 0, 17, 2, 1, 0, 0, 1, 0, 0, 1,255, 
// a, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // b, + 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 51, 12, 0, 0, 0, 48, 0, 0, 0, 0, 2, 0, 0, 0, // c, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // d, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, // e, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, // f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // g, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // h, + 0, 27, 11, 1, 16, 3, 0, 0, 0, 0, 0, 16, 0, 0,255, 10, 34, 0, 2, 1, 0, 0, 0, 0,255, // i, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0,255, // j, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // k, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // l, + 0, 6, 3, 0, 2, 0, 2, 22, 0, 0, 2, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,255, // m, + 0, 59, 23, 10, 19, 22, 18, 5, 0, 0, 28, 61, 6, 0, 0, 0, 39, 4, 20, 9, 6, 1, 0, 1,255, // n, + 0, 5, 13, 0, 5, 3, 0, 0, 0, 0, 0, 3, 0,255, 0, 4, 2, 0, 0,255,255, 0,255, 0,255, // o, + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 13, 0, 0,255, 0, 10, 0, 0, 0, 0, 0, 0, 0,255, // p, + 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0,255, 0, 0,255, 0,255, // q, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // r, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // s, + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0, 0,255, // t, + 0, 12, 3, 1, 1, 0, 4, 0, 0, 0, 6, 4, 0,255,255, 0, 4, 0, 0, 0, 0, 0,255, 1, 0, // u, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, // v, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, // w, + 0, 0,255, 0, 0, 0,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, // x, + 
0, 18, 2, 0, 18, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0,255, 0, 0,255, 0,255, // y, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, // z, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255,255,255, 0, 0,255,255, // ß, + 0, 95,255,255,255, 1,255,255,255, 11,255,255,255,255,255, 6,255,255,255,255,255, 8,255,255,255, 4,255,255,255,255,255,255,255, 14, 2,255,255, 17,255,255,255,255,255,255,255, 8, 16,255,255,255, 5,255, // ̀, + 0, 39,255,255,255, 0,255,255,255, 5,255,255,255,255,255, 3,255,255,255,255,255, 20,255,255,255, 1,255,255,255,255,255,255,255, 3, 0,255,255, 31,255,255,255,255,255,255,255, 10, 10,255,255,255, 5,255, // ̉, + 0, 12,255,255,255, 1,255,255,255, 5,255,255,255,255,255, 0,255,255,255,255,255, 3,255,255,255, 1,255,255,255,255,255,255,255, 2, 0,255,255, 3,255,255,255,255,255,255,255, 1, 0,255,255,255, 7,255, // ̃, + 0, 0, 3, 0, 0, 0, 0, 12, 15, 0, 0, 0, 17, 1, 6, 15, 0,255, 0, 0, 1, 0, 16, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 1, 0,255,255, 0,255,255,255, 0,255, // à, + 2, 0, 2, 13, 0, 0, 0, 0, 26, 3, 0, 0, 1, 1, 0, 1, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 3,255,255,255,255,255,255,255,255,255, // á, + 3, 0, 1, 7, 5, 0, 0, 1, 23, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 1, 7, 13, 16, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255,255,255, 8, 0,255,255, 0,255, 0,255, 0,255, // â, + 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 23, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0,255,255,255, 0,255,255,255, 1, 0,255,255, 0,255,255,255,255,255, // ă, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255, // è, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 
0,255,255,255,255, 0,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255, // é, + 0, 0, 1, 0, 0, 0, 0, 0, 28, 66, 0, 6, 2, 0, 1, 0, 0, 0, 2, 0, 4, 0, 3, 0, 0, 12, 0,255, 0, 0, 0,255,255,255,255, 0, 0, 0, 0, 0,255,255, 13, 0,255, 0,255,255,255,255, 0,255, // ê, + 0, 63,255,255,255, 5,255,255,255, 24,255,255,255,255,255, 22,255,255,255,255,255, 18,255,255,255, 3,255,255,255,255,255,255,255, 17, 6,255,255, 30,255,255,255,255,255,255,255, 23, 20,255,255,255, 10,255, // ́, + 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255, 0, 0, 0,255,255, 0,255,255, 0, 0,255,255,255,255,255, // í, + 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // î, + 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255, 0, 0,255,255, 0, 0, 0,255,255,255,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, // ï, + 97, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255,255,255,255, // đ, + 0, 42,255,255,255, 0,255,255,255, 15,255,255,255,255,255, 18,255,255,255,255,255, 8,255,255,255, 0,255,255,255,255,255,255,255, 34, 4,255,255, 24,255,255,255,255,255,255,255, 41, 17,255,255,255, 17,255, // ̣, + 0, 0, 2, 10, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255, 2,255, 0,255,255,255,255,255, 0,255, // ó, + 2, 0, 7, 8, 0, 0, 0, 2, 18, 0, 0, 0, 1, 17, 2, 0, 0, 0, 1, 6, 5, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255, 0, 0,255,255,255, 22, 0,255, 0, 0,255,255,255, 0,255, // ô, + 5, 0, 1, 1, 0, 0, 0, 0, 8, 2, 0, 0, 2, 1, 0, 0, 0,255, 1, 2, 0, 0, 4, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0,255,255,255, 0,255,255,255, 2, 0,255, 0, 0,255,255,255, 42,255, // ơ, + 0, 0, 0, 1, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255,255,255,255,255,255,255,255, 0,255,255,255,255, 0,255,255,255,255, // ù, + 0, 0, 0, 1, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0,255,255,255, // ú, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, 0,255, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // û, + 1, 0, 1, 4, 4,255, 0, 7, 28, 1, 0, 0, 4, 0, 3, 0, 0, 0, 4, 7, 9, 0, 2, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0,255,255,255, 0,255,255,255, 15, 0,255,255, 0,255,255,255, 0,255, // ư, + 0, 0,255,255,255,255,255,255,255,255,255,255, 0,255,255, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ÿ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, ̀, ̉, ̃, à, á, â, ă, è, é, ê, ́, í, î, ï, đ, ̣, ó, ô, ơ, ù, ú, û, ư, ÿ, + ], + central: [ + 0, 42, 11, 51, 30, 3,154, 77, 18, 20, 23, 0,139, 0,254, 1, 0, 79, 0, 70,132,121, 0, 52,241, 14, 5, 7, 17, 8, 74, 0, 58, 0, 8, 33, 36, 9, 1,105, 8, // , + 0, 70, 0, 11,146, 0,115, 0, 11, 23, 12, 0, 0, 0, 2, 0, 0, 66, 0,107, 1, 0, 0, 0, 10, 0, 10, 25, 0, 8, 1, 0, 1, 0, 9, 0, 0, 0, 0, 0, 48, // a, + 0, 0, 0, 0, 18, 0, 1, 1, 0, 4, 2, 0, 32, 0, 1, 0, 0, 0, 0, 2, 11, 5, 0, 0, 2, 0, 0, 0, 0, 0, 8, 0, 4, 6, 1, 0, 10, 0, 0, 1, 0, // b, + 0, 2, 62, 0, 0, 0, 9, 45, 9, 0, 5, 0, 47, 0, 8, 0, 0, 0, 0, 0, 5, 31, 0, 3, 33, 3, 0,255, 14, 0, 8, 0, 0, 0, 0, 0, 13, 0, 1, 48, 0, // c, + 0, 0, 0, 0, 10, 6, 1, 19, 0, 0, 2, 0, 23, 0, 6, 0,255, 0, 0, 0, 11, 28, 0, 5, 8, 0, 0, 0, 0, 0, 33, 0, 4, 2, 0, 1, 10, 0, 1, 0, 0, // d, + 0, 70, 0, 0,111, 0, 16, 0, 27, 0, 36, 0, 0, 0, 0, 0,255, 61, 0,145, 5, 0, 0, 0, 0, 0, 0,133, 0, 0, 0, 0, 2, 0, 64, 0, 0, 1, 0, 0, 34, // e, + 0, 0, 0, 0, 0,255, 0, 
0, 0, 0, 0,255, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // f, + 0, 0, 0, 0, 2, 0, 2, 5, 0, 0, 0,255, 64, 0, 2, 0, 0, 0, 0, 0, 61, 9, 0, 0, 4, 0, 0,255, 0, 0, 7, 0, 1, 2, 0, 0, 13, 0, 4, 0, 0, // g, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 38, 0, 0, 4, 11, 2, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 4, 0, 0, 5, 0, // h, + 0,110, 0, 0,103, 0, 0, 0,157, 0, 0, 0, 2, 1, 7, 0, 0,225, 0,177, 0, 0, 0, 0, 0, 7, 0, 6, 0, 0, 6, 0, 5, 0, 24, 0, 0, 0, 0, 0,242, // i, + 0, 7, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0,255, 0,255, 27, 1, 0, 0, 10, 9, 0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 8, 0, 0, 0, 0, // j, + 0,193, 0, 0,210, 0, 21, 0, 1, 12, 8, 0, 21, 0, 0, 0, 0, 1, 0,186, 42, 8, 0, 8, 19, 5, 0, 0, 0, 0, 13, 0, 9, 10, 2, 0, 1, 2, 5, 2, 2, // k, + 0, 10, 15, 0, 1, 0, 0, 0, 0, 0, 2, 0, 92, 3, 12, 0, 0, 0, 0, 31, 48, 0, 0, 15, 9, 4, 0, 0, 0, 0, 24, 1, 13, 16, 0, 1, 7, 1, 56, 2, 0, // l, + 0, 2, 9, 0, 1, 0, 1, 0, 0, 2, 0, 0, 22, 4, 16, 0, 0, 0, 0, 0, 15, 0, 0, 7, 27, 10, 0, 0, 0, 0, 4, 0, 1, 2, 0, 4, 4, 0, 0, 22, 0, // m, + 0, 37, 15, 0, 39, 6, 14, 0, 1, 5, 20, 0, 76, 90, 10, 0, 0, 6, 0,121, 54, 0, 0, 16, 20,238, 0, 0, 0, 0, 37, 2, 2, 17, 1, 0, 4, 1, 4, 1, 0, // n, + 0, 16, 0, 3, 7, 0, 93, 0, 7, 21, 12, 0, 0, 0, 0, 0, 0, 2, 1, 11, 4, 0, 0, 0, 5, 0, 0, 1, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, // o, + 0, 23, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 9, 0, 3, 0, 1, 0, 0, 0, 44, 38, 0, 3, 2, 0,255, 0, 0, 0, 6, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, // p, + 255, 0,255,255, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0,255,255,255,255, 0, 0,255, 0,255,255, 0,255, 0,255,255, // q, + 0, 1, 11, 0, 1, 5, 4, 0, 0, 0, 0, 0, 96, 14, 70, 1, 0, 0, 0, 15, 68, 0, 0, 8, 20, 4, 0, 0, 0, 0, 58, 3, 5, 27, 0, 0, 14, 4, 3, 3, 0, // r, + 0, 1, 0, 0, 10, 0, 0, 6, 0, 9, 4, 0,133, 0, 15, 0, 0, 0, 0, 1,147, 9, 0, 11, 27, 2, 0, 0, 66, 0, 11, 5, 13, 12, 3, 4, 9, 6, 2, 5, 0, // s, + 0,254, 0, 0, 0, 0, 3, 10, 74, 0, 0, 0, 63, 7, 46, 5, 0, 0, 0, 6, 30, 
23, 0, 22, 45, 3, 0, 0, 0, 0, 6, 1, 5, 13, 0, 0, 7, 0, 3, 2, 0, // t, + 0, 7, 0, 0, 35, 0, 35, 0, 10, 6, 4,255, 1, 9, 17, 0, 0, 16, 0, 22, 0, 0, 0, 0, 0, 0, 0, 42, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, // u, + 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 1, 0, 0, 0, 0, 8, 38, 0, 0, 2, 12, 1, 0, 0,255, 0, 2, 9, 6, 14, 0, 6, 2, 6, 0, 8, 0, // v, + 0, 0, 33, 0, 0, 1, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,113, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // w, + 255, 0,255,255, 0,255, 0,255,255,255,255,255, 0,255, 0, 0,255, 0,255, 0, 3,255, 0, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, // x, + 255, 0, 0, 0, 0, 0, 37, 0, 0, 0, 37,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // y, + 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 46, 0, 9, 2, 0, 0,255, 0, 8, 29, 0, 4, 5, 0, 0,255, 0,255, 4, 1, 8, 31, 0, 1, 42, 1, 3, 3, 0, // z, + 0, 0,255,255,255, 0,255,255, 0, 0,255, 0, 0, 0, 0, 1,255,255, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255, 0,255,255, // ß, + 180, 89, 2, 0, 5, 92, 0, 23, 0,166, 40, 4, 4, 15, 12,133,125, 0, 60, 0, 3, 37, 10, 0, 0, 13, 5,255, 9,255, 0, 8,255,255, 0,255, 0,255, 0, 4, 0, 0, 0, 0, 0,255, 3, 0, 0, 0, 1, 3, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 3,255, // š, + 53, 6, 0, 0, 0, 25, 0, 0, 0, 6, 4, 0, 0, 1, 0, 89, 0, 0, 0, 0, 0, 1, 0, 1,255, 5, 0,255,255, 0,255,255, 0, 0, 0,255,255, 0,255, 0,255,255,255,255, 0,255,255,255, 6,255, 0,255,255,255,255, 0,255, 0, 0,255,255,255,255, 0,255,255,255,255, // ś, + 1, 13, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,255, 1, 28, 0, 0, 0,255,255, 3, 0,255, 2,255, 0, 0,255,255,255,255,255,255,255, 0,255,255, 1,255,255,255, 0, 0, 0,255, 0, 3,255, 0,255,255, 0, 0, 0,255,255,255, 0, 2,255,255, 0,255, // ť, + 96, 34, 0, 0, 21, 89, 0, 0, 0, 28, 0, 0, 5, 1, 3,221, 0, 0,127, 0, 0, 77, 1, 0, 0, 2, 0,255, 
0,255, 0, 0,255,255, 0,255, 0,255, 0, 3,255, 0, 0, 5, 0,255, 0, 1, 0,255, 5, 5,255, 0, 0,255, 0, 0, 4,255,255, 0, 2, 1,255, 0, 1,255, // ž, + 4, 7, 0,255, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0,255,255, 0,255,255, 0, 0, 0,255,255, 0,255,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255, 3, 0,255,255,255,255,255,255,255,255,255, // ź, + 18,129, 4, 3, 21, 21, 0, 23, 1, 42, 0, 15, 0, 3, 0, 40, 11, 0, 2, 40, 2, 56, 0, 19,255, 32, 15,255, 0, 0,255, 0, 0, 0, 4,255,255, 0,255, 0,255, 0, 0,255, 0,255, 0, 0, 2, 0, 0, 0,255,255,255,255,255, 30, 0,255, 0,255, 0,255,255,255,255,255, // ł, + 0, 0, 2, 5, 3, 0, 0, 2, 0, 38, 44, 9, 8, 1, 16, 0, 0, 0, 7, 9, 5, 0, 0, 7, 0, 0, 28,255, 0, 0,255, 0, 0, 7, 0, 0,255, 2,255,255,255,255,255,255, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255,255,255,255, 0, 0, // ą, + 140, 48, 0, 1, 0, 50, 0, 0, 0, 33, 0, 0, 0, 0, 1, 19, 0, 0, 4, 0, 0, 12, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255, 0, 2, 3, 0,255,255,255,255, 0, 0,255,255, 0, 4,255,255,255,255,255,255,255, 0,255,255, 0,255, 0,255, 0, // ş, + 8, 3, 0, 0, 7, 56, 0, 0, 4, 1, 0, 0, 0, 0, 0, 13, 0,255, 0, 0, 0, 2, 0,255,255, 0, 0,255, 0,255,255, 0,255,255,255,255, 0,255,255, 5,255, 0,255, 0,255,255, 0,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255,255, 0,255,255, 0,255, // ľ, + 30, 16, 0, 0, 3, 26,255, 0, 0, 5, 0, 5, 0, 0, 1, 27, 0,255, 1, 0, 0, 24, 0, 0, 0, 10, 0,255,255, 0,255,255, 0, 1, 11,255,255, 0,255,255,255,255, 0,255, 0,255,255, 0, 9, 0,255,255,255, 0,255, 0,255, 5, 0,255,255,255,255,255,255,255,255,255, // ż, + 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255, 0, 0, 0,255, 0,255,255,255, 0,255,255,255,255, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ŕ, + 37, 0, 10, 9, 29, 2, 7, 20, 20, 13, 34, 45, 62, 52,115, 0, 15, 0, 97, 50, 87, 8, 57, 0, 0, 14, 82,255, 0, 0, 0, 3, 0, 
0,255, 0, 0, 0, 0, 0,255,255,255,255, 0, 0, 7, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 3,255, 0, 0, 0, 0, 0, // á, + 0, 0, 1, 19, 2, 0, 5, 2, 5, 0, 0, 0, 1, 48, 1, 0, 8, 0, 17, 2, 12, 0, 9, 0, 0, 0, 1,255, 0,255,255, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255, 0, 0, 0, 0,255,255,255,255, 0,255, 0,255, 0,255,255,255,255,255,255,255,255, 0,255, 0, // â, + 0, 0, 14,106, 14, 0, 9, 21, 1, 0, 0, 0, 51, 27, 62, 0, 36, 0, 72, 66,133, 7, 12, 0, 1, 0, 28,255, 0,255,255, 0,255,255,255, 0,255,255,255,255, 0, 0,255,255,255, 0, 0,255,255,255,255,255, 0,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 35, // ă, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0,255, 0,255, 0,255,255,255,255, 0,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 0,255,255,255, 0,255,255,255,255, 0, 0,255, // ä, + 0, 0, 0,255, 5, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 1, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ĺ, + 15, 18, 0, 0, 0, 91, 0, 0, 0, 96, 0, 1, 0, 0, 0, 29,156,255, 1, 0, 0, 37, 0, 0, 0, 4, 0,255, 14, 40,255, 0, 0, 0, 1,255,255, 0,255, 0,255,255,255,255, 0,255, 0, 0, 1,255, 0, 0,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255, // ć, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255, 0, 0,255, // ç, + 118,219, 32, 0, 1, 87, 0, 0, 2,176, 3, 1, 3, 12, 26,106, 2, 0, 12, 4, 0, 54, 5, 0, 0, 1, 0,255, 76, 0, 0, 0,255, 0,255,255, 0,255, 0, 6, 0, 0, 3, 0, 0,255, 0, 0,255,255, 0, 3,255,255, 0,255, 0, 0, 0,255, 0, 0, 0, 10,255, 0, 0,255, // č, + 106, 1, 6, 3, 16, 0, 11, 14, 4, 2, 5, 60, 49, 41, 86, 0, 11, 0, 28, 50, 74, 1, 32, 0, 0, 10, 33, 0, 0,255, 0, 0, 0, 0,255, 0,255, 0,255, 
0,255,255,255,255, 0, 0, 0, 0,255,255,255, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, // é, + 0, 0, 4, 1, 2, 0, 0, 4, 0,101, 53, 10, 5, 7, 32, 0, 2, 0, 14, 2, 41, 0, 0, 7, 0, 0, 26,255, 0, 0,255, 0, 0, 5, 0, 0, 0, 1,255,255,255,255,255,255, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255, 0, 0, // ę, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255,255,255, 0,255,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ë, + 0, 0, 10, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 36, 54, 0, 6, 0, 0, 0, 23, 0, 37, 0,255, 0, 0,255, 0,255, 0, 0, 0,255,255,255,255,255,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255,255,255,255, 0, // ě, + 11, 1, 6, 25, 14, 2, 1, 2, 5, 3, 21, 5, 12, 21,177, 0, 7, 0, 38, 11, 28, 1, 26, 0, 0, 7, 14,255, 29, 0, 0, 7, 0,255,255, 0, 0,255, 0, 4,255, 0,255,255, 0, 0, 14, 0,255,255, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 51,255, 0, 0, 0,255, 0, // í, + 239, 0, 1, 0, 2, 1, 0, 0, 1, 3, 2, 9, 8, 7, 7, 0, 1, 0, 7, 3, 5, 0, 4, 0, 0, 0, 0,255,255,255,255,255,255,255,255, 1,255,255,255, 0, 0, 0,255,255,255, 0, 0,255, 0, 0,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, // î, + 4, 4, 0,255, 0, 4,255,255,255, 0, 0,255, 0, 0, 0, 0,255,255, 0,255, 0, 0, 1,255,255, 0, 0,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255, 0,255,255,255, 0, 0,255,255, 0, 0,255, 0,255,255, 0, 0,255,255,255,255,255, 0,255,255,255,255, // ď, + 4, 31, 0,255, 0, 41,255, 0, 0, 1, 0, 0, 0, 0, 1, 69, 0,255, 4, 0,255, 3, 0, 0,255, 0, 0,255, 0,255,255, 63,255,255,255,255,255,255,255, 0,255,255,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255, // đ, + 0, 48,255, 0, 0, 16,255,255, 0, 12, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 1,255, 0,255, 6, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255, 0,255,255,255,255,255,255,255, 0, 0, 
0,255, 0,255,255, 0, 0,255, 0, 0,255,255,255, 0, 0,255,255,255,255, // ń, + 0, 1, 0, 0, 3, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255, 0, 0,255,255,255, 0,255,255, 0, 0,255,255, 0, 0,255, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 0,255, // ň, + 3, 0, 8, 9, 10, 2, 3, 53, 3, 17, 6, 22, 42, 5, 7, 0, 26, 0, 52, 7, 32, 0, 2, 31, 0, 2, 20,255, 0, 0,255, 0, 0, 16,255,255,255, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255,255, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, // ó, + 0, 0, 0, 1, 3, 0, 0, 0, 1, 0, 0, 3, 0, 4, 0, 0, 13,255, 1, 0, 0, 0, 1, 0, 0, 0, 0,255,255,255,255, 0,255, 0,255,255, 0, 0,255,255,255,255,255,255,255, 0,255, 0,255,255,255, 0,255,255, 0,255,255, 0, 0,255,255,255,255,255,255,255,255,255, // ô, + 4, 0, 5, 0, 12, 0, 7, 1, 0, 0, 0, 5, 22, 0, 9, 0, 0,255, 9, 13, 33, 0, 2, 0, 0, 3, 23,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255,255,255,255, 0,255, 0,255,255,255, 0,255,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0,255,255, // ő, + 13, 0, 2, 0, 6, 0, 5, 3, 2, 0, 1, 57, 7, 1, 5, 0, 0,255, 8, 2, 22, 0, 1, 0, 0, 2, 15,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0,255,255, 0,255,255, 0, 0, 0,255,255,255, 0,255,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0,255,255, // ö, + 16, 6, 5, 0, 3, 4, 0, 0, 2, 2, 0, 4, 0, 1, 0, 12, 66,255, 0, 0, 19, 2, 1, 0, 0, 2, 1,255, 0,255,255, 0,255,255,255,255,255,255, 0, 9,255,255,255,255,255,255, 0, 0,255, 0, 3, 5,255,255,255,255, 0, 0,255,255,255, 0, 0, 2,255,255, 0,255, // ř, + 0, 0, 0, 2, 5, 0, 0, 0, 2, 0, 1, 5, 4, 3, 3, 0, 7, 0, 10, 1, 4, 0, 3, 0, 0, 0, 1, 0, 0,255, 0, 0,255, 0,255, 0,255,255,255, 0,255,255,255,255, 0,255, 0, 0,255,255,255, 0,255, 0,255,255, 0, 0,255,255, 0, 0, 0, 0,255, 0, 0,255, // ů, + 27, 0, 0, 36, 2, 0, 0, 1, 1, 4, 26, 8, 4, 2, 8, 0, 1, 0, 14, 40, 14, 0, 2, 0, 0, 1, 4,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0,255, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, // ú, + 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0,255, 4, 3, 3, 0, 1, 0, 0, 2, 1,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255,255, 0,255,255,255,255,255, 0,255, 0,255,255,255, 0, 0, 0,255,255, // ű, + 5, 0, 2, 0, 0, 0, 3, 1, 1, 0, 0, 8, 2, 2, 1, 0, 15, 0, 14, 4, 4, 0, 1, 1, 0, 3, 13,255, 0,255, 0, 0,255,255,255, 0,255, 0,255, 0,255,255, 0,255,255, 0, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0,255, 0, 0,255,255, 0, 0, 0,255,255, // ü, + 0, 0, 4, 0, 2, 0, 0, 0, 3, 0, 0, 52, 4, 1, 69, 0, 0,255, 13, 0, 11, 0, 56, 0,255, 0, 1,255, 0,255,255, 0,255,255,255,255,255,255,255, 0,255,255, 0,255,255,255, 0, 0,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0, 0, 0, 0,255, 0, 0,255, // ý, + 20,111, 1, 33, 0, 31, 0, 0, 0, 31, 0, 0, 5, 1, 78, 5, 2, 0, 10, 1, 0, 25, 2, 0, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255, 0, 0, 25,255,255,255,255,255, 0,255,255,255, 0, 4,255,255,255,255, 0,255,255,255,255,255, 0,255,255,255, 0, // ţ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, ś, ť, ž, ź, ł, ą, ş, ľ, ż, ŕ, á, â, ă, ä, ĺ, ć, ç, č, é, ę, ë, ě, í, î, ď, đ, ń, ň, ó, ô, ő, ö, ř, ů, ú, ű, ü, ý, ţ, + ], + cyrillic: [ + 0, 0, 0, 0, 1, 0, 16, 38, 0, 2, 5, 10,121, 4, 20, 25, 26, 53, 9, 5, 61, 23, 20, 26, 15, 95, 60, 2, 26, 15, 25, 29, 0, 14, 6, 6, 25, 1, 0, 27, 25, 8, 5, 39, // , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, // ѓ, + 0, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 2, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ђ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 4, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // љ, + 0, 0, 0, 0, 0, 
0, 0, 0,255,255,255, 0, 0,255, 5, 0, 0, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // њ, + 0, 0,255, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // ћ, + 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255, // џ, + 7, 0, 0,255,255,255,255,255, 0, 1, 0,255,255,255, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 1, 0, 0, 0, 1, // ў, + 12, 0, 0,255,255, 0,255,255, 0, 2, 0, 0, 0, 0, 2, 3, 15, 5, 5, 0, 0, 4, 0, 0, 21, 15, 10, 17, 0, 6, 14, 4, 6, 0, 3, 1, 8, 1, 0, 0, 0, 2, 0, 0, 0, 0, // і, + 0, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ё, + 6, 0, 0,255,255,255,255,255, 0, 0,255, 5,255, 0, 1, 7, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // є, + 12, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 1, 0, 0, 0, 2, 0, 0, 20,255, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255, // ј, + 9, 0, 0,255,255,255,255,255,255, 5,255, 0, 0, 13, 3, 3, 0, 4, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 1, 3, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ї, + 32, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, 0, 28, 0, 23, 22, 26, 22, 19, 0, 3, 12, 5, 0, 44, 38, 18, 58, 1, 21, 44, 17, 54, 1, 2, 28, 5, 8, 3, 1, 9, 0, 12, 0, 0, 0, // а, + 40, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 7, 0, 0, 0, 1, 7, 0, 1, 1, 0, 0, 7, 4, 1, 9, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // б, + 31, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 3, 0, 0, 19, 0, 0, 1, 1, 6, 0, 2, 6, 0, 1, 0, 1, 0, 32, 0, 2, 2, 23, 9, 0, 0, 0, 1, 0, 0, 1, 1, 0, 3, 0, 2, // в, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 1, 20, 0, 0, 1, 0, 9, 0, 0, 9, 7, 0, 5, 2, 18, 11, 0, 8, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 13, 0, 3, 
// г, + 26, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 2, 0, 2, 19, 0, 1, 5, 0, 13, 2, 2, 3, 2, 0, 6, 1, 12, 30, 0, 4, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, // д, + 12, 0, 0, 1, 4, 5, 0, 0, 0, 0, 0, 0, 24, 1, 5, 7, 11, 3, 12, 1, 6, 6, 11, 0, 3, 15, 14, 14, 4, 8, 25, 14, 29, 0, 1, 1, 4, 8, 8, 2, 0, 3, 1, 0, 0, 0, // е, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 3, 2, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ж, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 6, 0, 0, 0, 11, 8, 0, 0, 8, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, // з, + 24, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 1, 0, 1, 10, 16, 21, 22, 0, 6, 5, 6, 1, 15, 15, 8, 38, 2, 4, 27, 9, 15, 0, 3, 8, 12, 7, 6, 1, 0, 0, 0, 0, 0, 0, // и, + 6, 0, 0, 0,255,255,255,255, 0, 7, 0, 0,255, 4, 21, 0, 0, 0, 0, 5, 0, 0, 39, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 3, 0, 0, // й, + 54, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 10, 0, 1, 0, 1, 11, 0, 0, 12, 0, 1, 2, 0, 4, 8, 0, 2, 23, 2, 4, 0, 2, 3, 3, 8, 0, 0, 3, 16, 1, 4, 3, // к, + 12, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 6, 0, 4, 29, 12, 4, 5, 2, 18, 0, 0, 17, 4, 5, 11, 0, 0, 21, 2, 3, 4, 1, 15, 1, 0, 0, 0, 0, 0, 4, 3, 2, 12, 0, 2, // л, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 17, 1, 0, 0, 0, 7, 0, 1, 13, 2, 0, 0, 0, 0, 13, 0, 2, 4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 4, 2, 4, 1, 1, // м, + 42, 0, 0, 0, 0, 0, 0, 0, 4, 12, 6, 7, 1, 7, 76, 0, 22, 1, 4, 27, 1, 3, 34, 30, 0, 7, 1, 13, 24, 1, 3, 5, 3, 4, 0, 1, 0, 4, 1, 0, 2, 18, 7, 16, 0, 4, // н, + 37, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 10, 27, 22, 15, 1, 2, 3, 7, 5, 32, 11, 7, 38, 8, 21, 24, 11, 23, 0, 2, 10, 2, 2, 3, 2, 0, 0, 1, 0, 0, 0, // о, + 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 2, 0, 1, 2, 4, 0, 0, 2, 0, 6, 0, 0, 5, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, // п, + 19, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 8, 0, 5, 47, 4, 6, 6, 5, 23, 0, 0, 5, 2, 6, 0, 0, 0, 23, 22, 0, 1, 14, 9, 1, 0, 1, 0, 0, 0, 7, 2, 8, 16, 0, 3, // р, + 53, 0, 
0, 0, 0, 0, 0, 0, 4, 9, 2, 0, 1, 2, 21, 1, 4, 1, 2, 11, 0, 0, 12, 2, 4, 7, 1, 13, 15, 1, 4, 6, 3, 6, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 0, 1, // с, + 28, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 1, 0, 1, 32, 0, 1, 3, 0, 12, 0, 1, 22, 1, 4, 7, 1, 6, 23, 0, 14, 41, 14, 3, 0, 1, 1, 1, 21, 0, 2, 2, 6, 2, 1, 4, // т, + 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 2, 4, 2, 4, 6, 3, 0, 2, 0, 0, 6, 5, 6, 3, 0, 3, 7, 4, 7, 18, 1, 6, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, // у, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ф, + 41, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 30, 0, 2, 0, 0, 11, 0, 0, 5, 1, 14, 3, 0, 3, 6, 0, 7, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 4, 3, 5, 0, 0, // х, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0, 0, 0, 4, 0, 0, 7, 1, 0, 1, 0, 2, 1, 0, 0, 9, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, // ц, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 1, 5, 0, 2, 0, 0, 6, 0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 1, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, // ч, + 12, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 17, 0, 0, 1, 0, 2, 0, 0, 26, 0, 0, 0, 0, 0, 22, 2, 6, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, // ш, + 2, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, // щ, + 0, 0,255,255,255,255, 0,255, 0, 0, 0,255,255,255, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 11, 0, 1, 0, 0, 2, 2, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ъ, + 1, 0, 0,255,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 3, 11, 0, 4, 0, 2, 1, 0, 0, 0, 3, 1, 16, 0, 0, 22, 2, 10, 0, 0, 0, 8, 6, 3, 0, 0, 0, 0, 0, 0, 0, // ы, + 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 7, 3, 0, 1, 13, 7, 7, 0, 35, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, // ь, + 10, 0, 0,255,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 1, 1, 10, 11, 0, 2, 2, 0, 0, 0, 9, 3, 9, 0, 0, 7, 6, 9, 0, 0, 8, 3, 2, 1, 0, 0, 0, 0, 17, 0, 0, // э, + 14, 0, 0, 0,255,255,255,255, 0, 0, 0, 
0,255, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ю, + 5, 0, 0,255,255,255,255,255, 0, 9, 0, 0,255, 0, 11, 0, 3, 0, 0, 0, 0, 2, 24, 0, 0, 5, 2, 14, 1, 0, 2, 3, 1, 0, 0, 1, 3, 0, 0, 0, 0, 16, 1, 0, 0, 0, // я, + // , a, ѓ, ђ, љ, њ, ћ, џ, ў, і, ё, є, ј, ї, а, б, в, г, д, е, ж, з, и, й, к, л, м, н, о, п, р, с, т, у, ф, х, ц, ч, ш, щ, ъ, ы, ь, э, ю, я, + ], + western: [ + 18, 3, 0,254, 74, 0, 5,254,254, 2, 25,254,149, 4,254, 66,148,254, 0,254,122,238, 8, 1, 20, 13,254, 35, 20, 3, 1, 0, // , + 0, 3, 0, 0, 0, 0, 0, 5, 2, 0, 86, 9, 76, 0, 0, 0,241, 0, 0, 49, 0, 0, 0, 0, 11, 2, 0, 34, 0, 1, 2, 0, // a, + 19, 0, 0, 5, 5, 0, 0, 8, 13, 5, 0, 34, 22, 0, 0, 0, 4, 0, 0, 0, 6, 1, 3, 3, 42, 37, 8, 8, 0, 67, 0, 0, // b, + 0, 0, 0, 9, 6, 1, 0, 22, 10, 1, 0, 19, 54, 1, 0, 1, 18, 3, 1, 2, 40, 7, 0, 0, 6, 0, 3, 5, 1, 34, 0, 0, // c, + 0, 0, 0, 5, 5, 0, 0, 12, 45, 16, 1, 6, 42, 0, 13, 3, 10, 0, 2, 0, 66, 11, 5, 8, 33,104, 3, 4, 0, 19, 0, 0, // d, + 63, 5, 0, 0, 0, 0, 2, 33, 15, 1, 3, 0, 87, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 49, 1, 11, 0, 3, 0, 9, 1, 0, // e, + 0, 0, 0, 8, 8, 0, 0, 10, 2, 7, 0,162, 23, 0, 13, 0, 4, 0, 0, 0, 1, 3, 0, 0, 15, 4, 0, 0, 0, 4, 0, 0, // f, + 1, 0, 0, 14, 16, 24, 0, 29, 11, 41, 0, 13, 86, 0, 14, 9, 3, 0, 0, 0, 20, 8, 7, 7, 13, 37, 14, 0, 0, 12, 0, 0, // g, + 1, 0, 0, 0, 0, 0, 0, 47, 2, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 29, 20, 0, 0, 0, 0, 45, 0, 0, // h, + 5, 4, 0,166,120, 0, 0,144, 0, 2, 3, 88,254, 0, 0, 0, 0, 0, 0, 3, 28,107, 0,112, 8, 2, 44, 32, 0, 3, 3, 0, // i, + 0, 0, 0, 0, 0, 0, 0, 39, 9, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 16, 18, 44, 0, 0, 0, 0, 0,255, // j, + 0, 2, 0, 0, 1, 0, 0, 48, 31, 32, 1, 60, 1, 0, 4, 0, 1, 0, 0, 0, 1, 3, 0, 2, 20, 47, 0, 0, 0, 20, 0, 0, // k, + 4, 0, 0, 12, 16, 0, 0, 54, 40, 48, 0, 64, 36, 0, 39, 6, 12, 3, 0, 0, 27, 9, 3, 24, 42, 33, 2, 9, 7, 77, 0, 0, // l, + 0, 0, 0, 14, 5, 4, 0, 60, 11, 4, 3, 48, 30, 7, 28, 1, 10, 1, 0, 0, 24, 41, 3, 3, 19, 24, 1, 8, 2, 36, 
0, 0, // m, + 1, 1, 0, 24, 91, 16, 0,132, 62, 73, 1, 56, 71, 33, 78, 7, 35, 2, 3, 0, 94,254, 10, 21, 33, 38, 24, 21, 1, 61, 0, 0, // n, + 0, 1, 0, 0, 0, 0,254, 6, 0, 1, 27, 0, 13, 0, 0, 84,127, 0, 0, 62, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, // o, + 0, 0, 0, 5, 2, 0, 0, 9, 15, 0, 0, 4, 34, 0, 6, 0, 6, 0, 0, 0, 20, 12, 9, 28, 10, 22, 0, 3, 0, 7, 0, 0, // p, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 33, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // q, + 0, 0, 0, 83, 62, 1, 0,198,139,125, 0,229, 94, 54,190, 38, 18, 1, 0, 0,176, 24, 16, 29,193,181, 13, 13, 2,131, 0, 0, // r, + 1, 0, 0, 41, 34, 0, 0, 41, 24, 42, 0, 68,113, 15,159, 6, 43, 19, 4, 58, 14, 18, 1, 4, 48, 42, 4, 12, 9, 20, 0, 0, // s, + 7, 1, 0, 14, 20, 8, 0, 56, 37, 31, 0,104, 67, 14,113, 3, 50, 9, 5, 0, 89, 7, 19, 22, 13, 14, 40, 12, 15, 18, 0, 0, // t, + 0, 1, 5, 1, 2, 0, 0, 30, 0, 0, 1, 15, 2, 0, 1, 0, 1, 0, 0, 2, 4, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, // u, + 0, 2, 0, 1, 6, 0, 0, 29, 33, 13, 0, 19, 46, 0, 15, 0, 7, 0, 1, 31, 2, 2, 3, 1, 32, 27, 0, 0, 1, 1, 0, 0, // v, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0,255, // w, + 0, 0, 0, 1, 16, 0, 0, 23, 0, 0, 0, 3, 14, 0, 0, 0, 2, 3, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, // x, + 0, 0, 0, 0, 0, 0, 0, 58, 8, 0, 0, 1, 1, 62, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 82, 0, 0, 0, 0, 0,255, // y, + 0, 0, 0, 0, 2, 0, 0, 0, 14, 0, 0, 7, 3, 0, 6, 0, 3, 5, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, // z, + 0, 29, 0, 0, 0, 15, 0, 0, 0, 11, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255,255, 4, 0, 0,255,255, 0,255, 0,255, 0, 0,255,255,255, 0, 0, 0, 8, 0,255, 0, 0, 2, 0, 0, // ß, + 6, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 1, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // š, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 
0,255,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255,255, 0,255,255, // œ, + 107, 0, 22, 16, 18, 14, 6, 24, 46, 15, 2, 0, 42, 18, 17, 0, 36, 0, 34, 4,254, 1, 2, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, // à, + 41, 0, 10, 8, 21, 34, 5, 5, 60, 18, 5, 1, 29, 42, 26, 2, 16, 0, 27, 9, 43, 28, 7, 0, 0, 1, 4, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0,255, // á, + 24, 0, 1, 2, 0, 0, 0, 0, 7, 0, 0, 0, 3, 1, 0, 0, 0, 0, 2, 0, 5, 0, 1, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 2, 0,255, 0,255, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255, 0,255, // â, + 0, 0, 0, 1, 2, 3, 0, 1, 2, 12, 0, 0, 1, 7, 29, 4, 1,255, 11, 66, 11, 0, 1, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255,127,255,255,255,255,255, 0, 0,255, 0, 0,255,255, 0,255,255,255,255,255,255,255,255, // ã, + 134, 1, 11, 0, 25, 6, 15, 11, 61, 24,123, 95,114, 68, 53, 1, 49, 0, 60, 98,198, 0, 88, 29, 0, 6, 12, 0, 0,255, 0,255, 0, 0,118, 0,255, 0,255, 0,255, 0,255, 0,255,255, 0,255,255, 0,255, 2,255,255,255, 0, 0, 0,255, // ä, + 156, 0, 12, 14, 19, 3, 12, 47, 17, 3, 12, 5, 30, 47, 22, 0,205, 0,184, 70, 19, 0, 22, 8, 0, 6, 1,255, 0,255,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // å, + 26, 0, 7, 0, 4, 0, 23, 8, 15, 0, 18, 19, 56, 23, 24, 0, 9, 0, 82, 37, 24, 0, 71, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255, 0,255, // æ, + 17,112, 0, 2, 0, 15, 0, 0, 0, 35, 0, 0, 2, 0, 59, 9, 1, 0, 36, 0, 0, 8, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // ç, + 254, 0, 9, 14, 20, 0, 15, 6, 70,144, 14, 45, 47, 92, 16, 3,123, 0, 38, 23,115, 52, 22, 42, 2, 80, 19,255, 0,255, 0, 0,255,255, 0,255,255, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0, 1,255,255, // è, + 152, 2, 19, 
24, 85, 0, 29, 23, 26, 25, 2, 9, 43, 60, 62, 1, 32, 0,122, 45,169, 15, 13, 30, 7, 4, 8, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, // é, + 5, 0, 0, 3, 7, 0, 0, 10, 2, 3, 0, 26, 6, 6, 20, 1, 2, 0, 20, 1, 11, 5, 5, 2, 0, 0, 1,255, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0,255, // ê, + 36, 2, 23, 15, 36,143, 5, 23, 52, 52, 66, 48, 92, 57,216, 10,125, 35, 89, 58,254, 9, 24, 14, 0, 0, 8,255, 0,255, 0,255,255,255, 0, 0,255, 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0,255, 0, 0, 0,255, // ë, + 12, 0, 1, 4, 6, 0, 3, 21, 10, 0, 0, 0, 18, 8, 4, 0, 1, 0, 65, 35, 8, 3, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255,255,255,255,255,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0,255, // ì, + 40, 72, 7, 10, 16, 2, 23, 10, 34, 0, 0, 1, 34, 15, 21, 1, 3, 0,203, 28, 58, 23, 11, 0, 10, 0, 2, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0,255,255, 1,255, 0,255,255, 0,255,255, 0,255, 2, 0,255, // í, + 6, 5, 1, 9, 5, 0, 0, 0, 22, 0, 9, 8, 8, 6, 9, 1, 10, 0, 20, 6,182, 0, 13, 0, 0, 24, 1,255, 0,255,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0,255,255,255, // î, + 0, 6, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, // ï, + 0,254, 0, 0, 0, 26, 0, 0, 0, 61, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255,255, // ñ, + 20, 0, 56, 43, 8,162, 14, 3, 23, 19, 2,118, 31, 26, 46, 0, 20, 0, 23, 6, 24, 19, 6, 21, 5, 27, 63,255, 0,255, 0, 0,255,255,255,255,255, 3, 0,255,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, // ò, + 67, 0, 12, 15, 9, 7, 8, 66, 13,254, 3, 23, 14, 16, 16, 0, 8, 0, 29, 11, 26, 0, 5, 5, 1, 10, 13,255, 0,255,255, 0,255, 0, 0,255,255, 
1,255, 0,255,255, 0, 0,255, 0, 1, 0, 0, 0, 0,255,255,255, 0,255,255, 0,255, // ó, + 18, 3, 3, 12, 1, 0, 2, 0, 7, 0, 1, 0, 2, 2, 8, 0, 6, 0, 6, 7, 4, 0, 2, 0, 0, 0, 1,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255,255,255, // ô, + 29, 2, 0, 0, 0, 0, 0, 0, 5, 2, 22, 30, 25, 38, 19, 0, 33,255, 4, 39, 24, 0, 88, 0, 0, 0, 0,255, 0,255,255, 0,255, 0,255,255,255, 36,255,255,255,255,255, 0,255,255, 0,255, 0, 0, 6, 0,255,255,255, 0, 0, 0,255, // õ, + 44, 0, 33, 0, 25, 0,142, 5, 46, 10, 25, 32, 26, 13, 6, 0, 3, 0, 30, 8, 35, 0, 25, 5, 0, 44, 7, 0, 0,255,255, 0,255,255, 73, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0,255, 0, 0, 0, 39, 0,255,255,255, 0, 0, 0, // ö, + 52, 0, 21, 0, 57, 0,119, 12, 47, 3, 59, 33, 45, 15, 12, 0, 3, 0, 52, 82, 49, 1, 11, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255, 0, 0, 0,255, 0,255,255,255, 0,255,255, 0,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ø, + 25, 0, 4, 3, 53, 0, 0, 2, 12, 72, 0, 0, 30, 0, 0,254, 0, 0, 6, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0,255,255,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, // ù, + 19, 2, 1, 7, 9, 1, 12, 5, 9, 41, 1, 0, 10, 7, 9, 0, 8, 0, 12, 28, 8, 0, 0, 0, 0, 1, 0,255, 0,255,255, 0,255,255,255,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255, // ú, + 0, 0, 0, 0, 1, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 45, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0,255, 0,255,255,255, 0, 0,255,255,255,255, 0,255,255,255, 0,255, 0, 0,255, 0, // û, + 95, 2, 19, 0, 6, 2,121, 9, 15, 1, 5, 44, 18, 26, 7, 0, 11, 2, 68, 49, 20, 0, 2, 17, 0, 0, 6, 0, 0,255, 0,255,255,255, 0,255,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 0,255, 0, 0, 0, 31, 0, 0, // ü, + 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // ž, + 0, 0, 0, 0, 0, 0,255, 0, 
0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255,255,255,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, 0,255, 0,255,255,255, 0, 0, 0, // ÿ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, š, œ, à, á, â, ã, ä, å, æ, ç, è, é, ê, ë, ì, í, î, ï, ñ, ò, ó, ô, õ, ö, ø, ù, ú, û, ü, ž, ÿ, + ], + icelandic: [ + 0, 68, 0, 0, 2,122,156, 5, 1, 1, 5, 1, 0, // , + 0, 2,255, 0, 0, 6, 51, 2, 0, 0, 5, 0, 19, // a, + 0, 1, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, // b, + 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, // c, + 0, 0, 0, 2, 0, 2, 0, 0, 0, 5, 0, 1,255, // d, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 22, // e, + 0, 4, 0, 1, 0, 5, 3, 3, 13, 0, 0, 0, 0, // f, + 0, 4, 0, 3, 1, 7, 2, 10, 12, 19, 7, 3, 0, // g, + 0, 2, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, // h, + 0, 1, 0, 0, 0, 0, 69, 0, 0, 0, 1, 2, 4, // i, + 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 8, // j, + 0, 3, 0, 5, 1, 21, 1, 10, 4, 10, 11, 0,255, // k, + 0, 30, 0, 8, 9, 4, 6, 78, 20, 18, 4, 1, 0, // l, + 0, 2, 0, 5, 0, 8, 2, 9, 1, 3, 1, 1, 0, // m, + 0, 9, 4, 4, 0, 11, 2, 18, 11, 6, 13, 3, 0, // n, + 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 3, // o, + 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0,255, // p, + 0,255,255,255, 0, 0,255,255,255,255,255,255,255, // q, + 0, 45, 1, 13, 7, 2, 7, 25, 17, 59, 9, 8, 7, // r, + 0, 8, 1, 2, 1, 37, 13, 5, 0, 1, 9, 9, 0, // s, + 0, 17, 0, 14, 7, 6, 1, 17, 3, 3, 14, 5, 0, // t, + 0, 0, 0, 0, 0, 7, 61, 0, 0, 0, 0, 3, 1, // u, + 0, 5, 0, 2, 0, 3, 4, 3, 0, 9, 0, 0, 6, // v, + 0, 0,255,255,255,255,255, 0, 0, 0,255,255,255, // w, + 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255,255, // x, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // y, + 0, 0,255, 0, 0, 0,255, 0, 0,255, 0, 0,255, // z, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0,255, 0,255, 0, // ß, + 83, 0, 2, 0, 4, 0, 2, 3, 6, 0, 16, 3, 9, 23, 5, 0, 2,255, 24, 3, 7, 0, 5, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255,255, 0, 0, 8, // á, + 
0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,255, 0,255, 0, 0, 0,255, 1, 0,255,255, 0,255,255, 0,255,255,255,255,255,255,255,255,255,255, // ä, + 4, 0, 7, 0, 3, 0, 8, 1, 4, 0, 0, 0, 5, 2, 3, 0, 4,255, 18, 5, 8, 0, 8, 0, 0, 0,255,255, 0,255, 0,255, 0, 0, 0,255,255, 0, 0, 2, // æ, + 1, 0, 0, 0, 0, 0, 8, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0,255, 6, 6, 0, 0, 1, 0, 0, 0, 0, 0,255,255,255, 0,255,255,255,255,255,255,255, 0, // é, + 126, 2, 2, 0, 1, 0, 1, 2, 0, 0, 0, 3, 16, 2, 5, 0, 1, 0, 25, 28, 19, 0, 29, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0,255,255, 0,255, 0, // í, + 0, 95, 0,255, 0, 24, 5, 5, 0,122, 0, 0, 0, 0, 0, 7, 0,255, 46, 0, 0, 20, 2,255,255, 6,255, 0, 8,255, 23, 0, 36, 0, 13, 7, 21, 1, 2,255, // ð, + 8, 0, 63, 0, 7, 0, 22, 2, 3, 4, 39, 9, 5, 4, 5, 0, 1,255, 13, 8, 21, 0, 5, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 5, // ó, + 9, 0, 1, 0, 0, 0, 1, 2, 8, 0, 18, 2, 10, 4, 3, 0, 0,255, 4, 6, 11, 0, 5, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, // ö, + 17, 0, 3,255, 3, 0, 40, 1, 11, 0, 14, 1, 13, 2, 2, 0, 0,255, 14, 9, 12, 0, 12,255,255, 0,255,255,255,255,255,255, 0, 0, 0,255,255, 0,255,255, // ø, + 20, 0, 9, 0, 0, 0, 0, 2, 7, 0, 6, 3, 1, 1, 10, 0, 0,255, 12, 3, 3, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0, 1, // ú, + 1, 0, 9, 0, 2,255, 0, 0, 0,255, 0, 1, 6, 0, 7,255, 0,255, 1, 4, 9,255, 0,255, 0, 0, 0,255,255,255,255,255,255, 0, 0,255,255, 0,255, 4, // ý, + 87, 1,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255, 0,255, 0, 0, 2, 0, 0, 0,255, 0, 0, 0, // þ, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, á, ä, æ, é, í, ð, ó, ö, ø, ú, ý, þ, + ], + greek: [ + 0, 12, 0, 16, 6, 69, 1, 0, 0, 24, 0, 33, 0, 41, 2, 2, 1, 50, 0, 44, 1, 2,105, 1, 2, 33, 0, 0, 0, 2, 0, 0, 15, 5, 1, // , + 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0,255, 0, 0,255,255, 0,255,255, 0, 0,255, 0,255, 0,255,255,255,255,255, 
0,255,255,255, 0,255, 0,255,255,255, 0,255,255,255,255,255, // ΐ, + 4, 0, 0, 0, 0,255, 0, 0, 1, 1, 0, 0, 0, 0, 0, 4, 7, 4, 3, 3, 0, 0, 3, 6,255, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // ά, + 9, 0,255,255, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 1, 3, 2, 6, 2, 0, 0, 2, 2,255, 0, 5, 0, 1, 1, 0, 0, 0, 0, 0,255,255, // έ, + 3, 0,255,255,255,255, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 10, 3, 1, 2, 0, 0, 0, 2,255, 0, 4, 0, 1, 1, 0, 0, 0,255,255,255,255, // ή, + 0, 0,255, 0, 0,255, 0, 5, 1, 2, 2, 15, 0, 0, 0, 0, 1, 4, 1, 5, 0, 5, 2, 14,255, 3, 4, 0, 1, 1, 0, 0, 0, 0, 0,255,255, // ί, + 46, 0, 0, 0, 1, 0, 22, 0, 4, 3, 4, 0, 1, 0, 2, 20, 34, 8, 15, 23, 1, 0, 13, 18,255, 5, 30, 1, 4, 3, 0, 0, 0, 0, 0, 0, 0, // α, + 10, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // β, + 15, 0, 0, 1, 1, 0, 0, 7, 0, 1, 0, 2, 0, 6, 0, 1, 0, 0, 0, 0, 0, 5, 0, 5,255, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, // γ, + 19, 0, 0, 3, 0, 0, 4, 2, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 3, 0, 4, 0, 0,255, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // δ, + 33, 0, 0, 0, 0, 0, 2, 0, 2, 4, 7, 0, 2, 0, 5, 2, 9, 9, 16, 8, 1, 0, 10, 7,255, 15, 15, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, // ε, + 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ζ, + 10, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 4, 0, 2, 8, 3, 6, 1, 0, 1, 3,255, 14, 41, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, // η, + 5, 0, 0, 1, 0, 1, 0, 5, 0, 0, 0, 2, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,255, 0, 0, 1, 0, 0, 0, 0, 0,255, 0, 0, 0, // θ, + 6, 0, 0, 0, 0, 0, 0, 28, 2, 8, 12, 19, 0, 0, 0, 0, 3, 9, 5, 10, 0, 12, 6, 19,255, 6, 19, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, // ι, + 44, 0, 0, 1, 1, 0, 2, 8, 0, 5, 0, 5, 0, 4, 0, 33, 0, 0, 0, 0, 0, 3, 0, 2,255, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // κ, + 7, 0, 0, 5, 3, 0, 1, 10, 1, 1, 0, 9, 0, 2, 1, 3, 3, 9, 0, 0, 0, 10, 4, 0,255, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 1, 0, // λ, + 20, 0, 0, 1, 1, 3, 0, 5, 0, 1, 0, 3, 0, 8, 1, 3, 0, 0, 2, 0, 0, 8, 0, 2,255, 
5, 0, 4, 0, 0, 0, 2, 0, 0, 3, 1, 0, // μ, + 12, 0, 0, 6, 8, 1, 7, 30, 0, 1, 0, 10, 0, 14, 1, 8, 0, 0, 0, 1, 0, 16, 0, 1,255, 0, 0, 8, 0, 1, 0, 13, 0, 0, 5, 3, 9, // ν, + 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // ξ, + 20, 0, 0, 0, 0, 0, 9, 0, 3, 9, 6, 0, 1, 0, 2, 14, 12, 13, 10, 14, 0, 0, 26, 21,255, 4, 54, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, // ο, + 45, 0, 0, 1, 0, 0, 0, 13, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 5, 0, 0, 6, 0, 0,255, 2, 0, 5, 0, 0, 0, 1, 0, 0, 2, 1, 0, // π, + 3, 0, 0, 5, 5, 1, 2, 17, 3, 4, 4, 19, 0, 3, 1, 4, 5, 0, 0, 0, 0, 14, 12, 0,255, 0, 9, 5, 1, 4, 0, 2, 0, 0, 3, 2, 1, // ρ, + 0, 0, 0, 1, 6, 6, 0, 14, 0, 0, 0, 9, 0, 21, 0, 6, 0, 0, 0, 0, 0, 17, 0, 0,255, 0, 0, 5, 0, 0, 0, 4, 0, 0, 7, 1, 0, // ς, + 43, 0, 0, 3, 2, 3, 4, 10, 0, 0, 0, 4, 0, 6, 0, 13, 0, 0, 0, 1, 0, 7, 0, 0,255, 2, 2, 5, 0, 0, 0, 3, 0, 0, 3, 2, 1, // σ, + 73, 0, 0, 4, 2, 3, 3, 21, 0, 0, 0, 10, 0, 5, 0, 4, 4, 0, 0, 16, 0, 3, 2, 2,255, 35, 0, 5, 0, 0, 0, 3, 0, 0, 7, 1, 1, // τ, + 4, 0,255, 0, 0, 0, 0, 4, 0, 0, 1, 5, 0, 0, 1, 0, 2, 1, 0, 0, 0, 53, 0, 1,255, 7, 1, 0, 1, 0, 0, 0,255,255, 0, 0, 0, // υ, + 6, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0,255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // φ, + 9, 0, 0, 0, 3, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 4,255, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // χ, + 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, // ψ, + 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 3, 0, 0, 1, 4,255, 1, 6, 0, 1, 1, 0, 0, 0,255, 0, 0, 0, // ω, + 0, 0,255, 0, 0, 0, 0, 1, 0,255,255, 0,255, 0,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, // ϊ, + 0, 0,255, 0, 0,255, 0, 0,255,255, 0, 0,255,255,255, 0,255, 0,255,255,255, 0,255, 0,255, 0, 0, 0,255,255,255, 0,255,255, 0,255,255, // ϋ, + 6, 0, 0,255, 0,255, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 11, 1, 4, 3, 0, 0, 9, 5,255, 1, 4, 
0, 0, 0, 0, 0, 0,255, 0,255,255, // ό, + 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 13, 0, 0,255, 2, 0, 0, 0, 0, 0, 0,255,255, 0,255,255, // ύ, + 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 3,255, 0, 1, 0, 0, 1, 0, 0, 0,255,255,255,255, // ώ, + // , a, ΐ, ά, έ, ή, ί, α, β, γ, δ, ε, ζ, η, θ, ι, κ, λ, μ, ν, ξ, ο, π, ρ, ς, σ, τ, υ, φ, χ, ψ, ω, ϊ, ϋ, ό, ύ, ώ, + ], + turkish: [ + 195,254, 0,140, 0, 12,220,165, 2, 1, 58, 25, 27, // , + 1, 23, 0, 2, 0, 19, 0, 0, 4, 0, 0, 1, 26, // a, + 2, 53, 0, 12, 0, 0, 3, 5, 0, 0, 1, 1, 0, // b, + 13, 31, 0, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, // c, + 7,161, 0, 22, 0, 0, 11, 4, 1, 3, 1, 2, 16, // d, + 0, 18, 0, 0, 0, 22, 0, 0, 6, 0, 1, 0, 14, // e, + 3, 19, 0, 12, 0, 0, 0, 1, 0, 0, 0, 8, 1, // f, + 0, 45, 0, 0, 0, 0, 1, 2, 0, 0, 2, 0, 0, // g, + 0, 27, 0, 21, 0, 0, 1, 2, 0, 0, 0, 2, 0, // h, + 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, // j, + 37,114, 0, 30, 0, 1, 8, 25, 0, 2, 4, 14, 6, // k, + 60,157, 0, 45, 0, 4, 3, 9, 13, 19, 1, 21, 18, // l, + 39,105, 0, 19, 0, 2, 6, 5, 0, 1, 2, 22, 10, // m, + 105,198, 0, 63, 0, 0, 89, 46, 0, 28, 13, 36, 0, // n, + 1, 54, 0, 0, 0, 7, 0, 0, 0, 0,255, 0, 1, // o, + 2, 22, 0, 0, 0, 0, 0, 3, 0, 0, 3, 6, 0, // p, + 8, 12, 0, 6, 0, 0, 0, 0, 0, 0, 0, 1, 1, // q, + 44,125, 0,124, 0, 0, 21, 23, 6, 22, 10, 42, 1, // r, + 18,123, 0, 48, 0, 0, 0, 17, 0, 2, 3, 7, 0, // s, + 5,117, 0, 35, 0, 1, 2, 7, 0, 1, 1, 4, 25, // t, + 0, 3, 0, 0, 0, 1, 0, 0, 23, 0, 0, 0, 7, // u, + 1, 30, 0, 4, 0, 0, 2, 2, 0, 11, 0, 1, 0, // v, + 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, // w, + 3, 9, 0, 3, 0, 0, 3, 0, 0, 0, 0, 1, 0, // x, + 8, 73, 0, 14, 0, 0, 1, 5, 0, 8, 5, 7, 0, // y, + 12, 34, 0, 10, 0, 0, 10, 5, 0, 10, 0, 26, 0, // z, + 25, 0, 2, 22, 54, 0, 8, 2, 4, 0, 38, 68, 26, 55, 0, 10, 7, 67, 64, 38, 0, 0, 0, 4, 44, 9, 3, 0, 0, 0, 0, 15,255,255, 28, 0,255, 0, 17, // ı, + 228, 15, 82, 26,125, 8, 28, 37, 54, 38,120,164, 71, 79, 6, 17, 7,254,151,193, 4, 68, 61, 9, 15, 20, 0, 2, 0, 0, 0, 26, 
0, 0, 22, 0, 0, 0, 36, // i, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, // ß, + 36, 0, 20, 12, 81, 0, 14, 9, 24, 0, 17,101, 55, 24, 0, 1, 12, 35, 31, 42, 0, 36, 0, 7, 21, 21, 0, 1, 0, 0,255, 4,255,255, 0, 0,255, 3, 24, // ä, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0,255, 0,255, 0, // â, + 31, 17, 0, 0, 0, 11, 0, 0, 2, 0, 10, 8, 1, 2, 1, 0, 0, 4, 0, 4, 3, 7, 0, 1, 0, 0, 1, 19, 0, 3, 0, 0, 0, 0, 0, 0, 0, 5, 1, // ç, + 23, 0, 9, 1, 20, 0, 1, 6, 13, 7, 17, 18, 15, 48, 0, 12, 0, 57, 7, 23, 0, 20, 23, 2, 57, 5,255, 0, 0,255,255, 3, 0, 0, 0,255, 0,255, 6, // ê, + 20, 0, 6, 5, 24, 0, 4, 2, 6, 16, 25, 21, 15, 33, 0, 4, 1, 52, 12, 20, 0, 13, 17, 0, 16, 10,255, 0, 0,255, 0, 2, 0, 0, 0,255, 0,255, 7, // î, + 0, 35, 0, 0, 0, 12, 0, 0, 0,255, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 13, 16, 0, 0, 0, 0,255, 0, 0, 2, 0, 2, 0, // ğ, + 30, 0, 11, 0, 6, 0, 0, 22, 1, 0, 10, 0, 2, 4, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 14, 1, 0, 0, 0, 0,255, 1,255,255, 0, 0,255, 0, 0, // ö, + 45, 0, 20, 0, 4, 0, 0, 0, 1, 0, 2, 1, 4, 5, 0, 1, 0, 10, 2, 5, 0, 0, 0, 0, 1, 0,255, 0, 0,255,255, 3, 0, 0, 0,255, 0, 0, 3, // û, + 20, 0, 11, 5, 26, 0, 0, 15, 3, 0, 11, 26, 21, 17, 0, 0, 1, 15, 8, 34, 0, 2, 0, 1, 17, 2, 0, 0, 0, 0, 0, 6,255, 0, 2, 0,255, 0, 3, // ü, + 33, 45, 1, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 14, 0, 0, 12, 0, 1, 5, 0, 0, 28, 33, 0, 11, 0, 0, 6, 5, 0, 0, 2, 6, 0, // ş, + // , a, b, c, d, e, f, g, h, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ı, i, ß, ä, â, ç, ê, î, ğ, ö, û, ü, ş, + ], + hebrew: [ + 0, 1, 0, 0, 0, 0, 0, 28, 29, 17, 24,144, 46, 26, 7, 88, 68, 22, 2, 71,106, 3,200, 0, 35, 69, 10, 2, 9, 2, 13,104, 19,138, // , + 0,255,255, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, // a, + 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 
0, 0, 0, 0, // ְ, + 0, 0, 0, 0,255,255, 0,255, 1, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, // ַ, + 0,255, 0, 0, 0,255, 0,255,255, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, // ָ, + 0,255, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255, 0, 3,255, 0, 0, 0, 0, 0, // ּ, + 1,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0,255,255,255,255, 0, 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, // װ, + 0,255,255,255,255, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0,255, 0,255,255,255, 0,255, 0,255, 0,255,255,255, 0,255, 0, 0, 0, 0,255, // ױ, + 0,255, 0, 0,255, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, // ײ, + 171, 0, 0, 0, 0, 0, 0,255, 0, 0, 20, 14, 15, 21, 22, 5, 0, 22, 15, 0, 2, 17, 0, 16, 0, 16, 6, 2, 0, 27, 0, 3, 25, 30, 5, 3, // א, + 101, 0, 0, 0, 0, 0,255, 0, 0, 11, 2, 2, 2, 7, 22, 0, 4, 1, 15, 0, 3, 6, 0, 5, 0, 2, 1, 7, 0, 0, 0, 3, 5, 16, 6, 4, // ב, + 44, 0, 0, 0, 0, 0,255, 0, 0, 7, 1, 0, 2, 5, 9, 0, 0, 6, 12, 0, 0, 3, 0, 2, 0, 17, 1, 15, 0, 1, 0, 0, 0, 7, 1, 1, // ג, + 70, 0, 0, 0, 0, 0,255, 0, 0, 9, 4, 4, 0, 4, 16, 0, 5, 0, 15, 0, 3, 5, 0, 8, 0, 20, 3, 7, 0, 2,255, 1, 2, 4, 0, 0, // ד, + 158, 0, 0, 0, 0, 0, 0,255,255, 5, 8, 2, 6, 3, 8, 5, 4, 2, 36, 0, 4, 17, 0, 11, 0, 16, 1, 4, 0, 4, 0, 2, 5, 16, 9, 5, // ה, + 42, 0, 0, 0, 0, 0, 0,255,255, 48, 21, 16, 18, 19, 73, 5, 13, 12, 38, 0, 11, 22, 0, 30, 0, 25, 10, 13, 0, 32, 0, 18, 22, 34, 19, 17, // ו, + 32, 0, 0, 0, 0, 0,255, 0, 0, 8, 0, 0, 2, 1, 6, 0, 2, 0, 25,255, 1, 0, 0, 2, 0, 1, 0, 4, 0, 0,255, 0, 0, 0, 0, 0, // ז, + 18, 0, 0, 0, 0, 0,255,255,255, 8, 4, 0, 0, 6, 7, 0, 0, 1, 10, 0, 1, 5, 0, 6, 0, 2, 0, 0, 0, 1, 0, 1, 0, 4, 4, 5, // ח, + 22, 0, 0, 7, 1, 0,255, 0, 0, 28, 1, 4, 0, 2, 6, 0, 0, 0, 30, 0, 5, 9, 0, 4, 0, 20, 21, 12, 0, 7, 0, 1, 10, 16, 20, 0, // ט, + 31, 0, 0, 0, 0, 0, 0,255,255, 65, 31, 9, 45, 19, 60, 22, 12, 19, 78, 0, 9, 37, 0, 25, 0, 
45, 15, 12, 0, 15, 0, 15, 17, 50, 25, 13, // י, + 0,255, 0, 0, 0, 0,255, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 0, 10, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,255, 0, 0, 4, 0, 0, // ך, + 29, 0, 0, 0, 0, 0,255, 0, 0, 5, 2, 0, 0, 4, 7, 2, 0, 0, 5, 0, 0, 4, 0, 4, 0, 1, 1, 3, 0, 0,255, 0, 0, 6, 2, 1, // כ, + 64, 0, 0, 1, 0, 0, 0, 0, 0, 35, 7, 8, 2, 5, 29, 1, 6, 4, 20, 0, 8, 4, 0, 7, 0, 1, 2, 18, 0, 4, 0, 2, 6, 2, 29, 2, // ל, + 0, 0, 0, 0, 0, 0,255, 0, 0, 2, 0, 3, 1, 3, 9, 0, 0, 0, 65, 0, 0, 2, 0, 0, 0, 0, 0, 11,255, 0,255, 0, 0, 0, 5, 1, // ם, + 89, 0, 0, 0, 0, 0,255, 0, 0, 12, 11, 1, 2, 21, 15, 2, 3, 1, 10, 0, 2, 8, 0, 5, 0, 1, 2, 7, 0, 0, 0, 2, 1, 5, 7, 10, // מ, + 1,255, 0, 0, 0, 0, 0, 0, 0, 13, 8, 3, 3, 1, 48, 1, 0, 12, 37, 0, 3, 3, 0, 3, 0, 0, 2, 31, 0, 3,255, 1, 1, 15, 5, 1, // ן, + 24, 0, 0, 2, 0, 0, 0, 0, 0, 35, 8, 2, 1, 5, 26, 0, 1, 1, 35, 0, 4, 4, 0, 8, 0, 0, 1, 30, 0, 5, 0, 0, 2, 3, 13, 2, // נ, + 22, 0, 0, 0, 0, 0, 0, 0, 0, 16, 4, 0, 0, 6, 14, 0, 2, 1, 17, 0, 1, 2, 0, 6, 0, 5, 0, 12, 0, 2,255, 0, 3, 5, 0, 0, // ס, + 33, 0, 0, 0, 0, 1, 0, 0, 0, 0, 16, 44, 38, 14, 25, 8, 0, 46, 36, 0, 5, 20, 0, 25, 0, 25, 5, 1, 0, 11, 0, 5, 15, 19, 18, 1, // ע, + 0,255, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 1, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,255, 0, 0, 0, 0, 0, // ף, + 76, 0, 0, 0, 0, 0,255, 0, 0, 13, 2, 0, 0, 8, 12, 0, 0, 0, 9, 0, 1, 4, 0, 5, 0, 2, 10, 8, 0, 0,255, 1, 1, 4, 5, 3, // פ, + 0, 0, 0, 0, 0, 0,255,255,255, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, // ץ, + 24, 0, 0, 0, 0, 0,255,255, 0, 4, 1, 0, 0, 5, 7, 0, 0, 0, 7, 0, 0, 1, 0, 5, 0, 5, 0, 2, 0, 0,255, 0, 2, 3, 0, 0, // צ, + 39, 0, 0, 0, 0, 0, 0, 0, 0, 7, 3, 0, 1, 6, 9, 0, 4, 0, 17, 0, 0, 5, 0, 7, 0, 5, 3, 9,255, 3,255, 0, 0, 5, 1, 4, // ק, + 27, 0, 0, 3, 7, 0,255, 0, 0, 60, 25, 8, 9, 10, 49, 3, 9, 9, 22, 0, 3, 2, 0, 11, 0, 2, 4,118, 0, 18, 0, 5, 10, 0, 11, 9, // ר, + 75, 0, 0, 0, 0, 0,255, 0, 0, 8, 7, 0, 2, 10, 9, 4, 2, 7, 32, 0, 1, 4, 0, 26, 0, 3, 0, 6, 0, 2, 0, 0, 2, 7, 0, 2, 
// ש, + 21, 0, 0, 0, 0, 0,255,255, 0, 17, 8, 1, 2, 9, 65, 0, 6, 1, 34, 0, 5, 5, 0, 8, 0, 8, 2, 4,255, 5, 0, 1, 1, 10, 11, 1, // ת, + // , a, ְ, ַ, ָ, ּ, װ, ױ, ײ, א, ב, ג, ד, ה, ו, ז, ח, ט, י, ך, כ, ל, ם, מ, ן, נ, ס, ע, ף, פ, ץ, צ, ק, ר, ש, ת, + ], + arabic: [ + 0, 8, 11, 2, 0, 9, 7, 40, 0, 34, 6, 74, 8, 1, 0, 0, 0, 10, 74, 21, 87, 40, 2, 3, 3, 5, 50, 0, 78, 23, 29, 9, 3, 2, 5, 5, 13, 7, 0, 22, 7, 4, 35, 42, 69, 85, 37, 11, 44, 0, 49, // , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a, + 12, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 4, 9, 0, 0, 0, 0, 0, // , + 20, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, // پ, + 8, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 2, 0,255,255,255, 0, 0, 0,255, 0,255, 0, 0, 0, 5, 0, 1,255, 0, 0, 0, // ٹ, + 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,255, 0, 0, 0, // چ, + 2, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255,255, 0,255, 0,255, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // ژ, + 8, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0,255, 0, 1, 0,255, 0,255, 0,255, 0, 0,255, 0, 0, 0, 0,255,255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, // ڈ, + 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,255, 1, 0, 0, 0, 0, 3, 0, 3, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, // گ, + 79, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 4, 7, 1, 0, 1, 0, 0, 0, 0, // ک, + 0, 
0,255, 0, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0,255, 0, 0, 0,255, 0,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0,255, 0,255, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, // ڑ, + 27,255,255,255, 0,255,255, 0,255, 0,255,255, 0, 0, 0, 0,255, 0,255, 0, 2,255,255, 0,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255,255,255,255,255, 0,255,255,255,255, 0, 0, 0, 4, 0, 0, 0, 0, // ں, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,255, 0, 0,255, 0, 0, 0,255, 0, 0, 3,255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255, // ھ, + 27, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 3, 2, 0, 3, 0, 0, 0, 0, 5, 0, 25, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 9, 1, 2, 0, 2,255, 0, 0, 0, // ہ, + 7, 0, 0, 0, 0,255,255,255, 0,255,255,255,255, 0, 0, 0, 0, 0,255, 0, 4, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, // ء, + 14, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // آ, + 24, 0, 0,255,255, 0,255, 0, 0, 0,255,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0,255, // أ, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, // ؤ, + 11, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,255, // إ, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, // ئ, + 124, 0, 5, 6, 5, 0, 0, 1, 7, 17, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 24, 0, 18, 2, 9, 6, 2, 13, 1, 23, 4, 11, 4, 2, 2, 2, 1, 7, 1, 0, 5, 7, 7, 19, 13, 14, 21, 18, 0, 15, 0, 0, // ا, + 50, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 2, 1, 0, 0, 0, 11, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 2, 4, 0, 0, 0, 1, 0, 3, 0, 0, 0, 1, 1, 6, 2, 1, 0, 7, 0, 3, 0, 0, // ب, + 0, 0, 0, 0,255, 0, 0,255, 0, 0,255,255, 0,255, 0, 0, 0, 0,255, 0, 1, 3, 0, 0, 0, 1, 1, 0, 4, 0, 10, 0, 1, 0, 0, 0, 1, 0, 3, 1, 0, 2, 3, 1, 8, 4, 4, 0, 0, 0, 22, 0,255, // ة, + 38, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 1, 0, 0, 0, 0, 1, 2, 0, 0, 3, 0, 24, 3, 0, 0, 0, 0, 1, 0, 0, 2, 1, 2, 9, 7, 5, 0, 5, 0, 4, 0, 0, // ت, + 1, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, // ث, + 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 1, 0, 3, 0, 1, 0, 0, // ج, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 3, 0, 0, 1, 0, 1, 0, 0, // ح, + 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, // خ, + 42, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 2, 0, 0, 0, 1, 4, 1, 2, 0, 27, 1, 1, 5, 1, 0, 0, 0, 4, 0, 0, 0, 4, 0, 5, 4, 15, 1, 8, 0, 6, 0, 0, // د, + 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, // ذ, + 61, 0, 0, 6, 2, 0, 0, 0, 5, 9, 0, 0, 0, 6, 0, 1, 2, 0, 0, 1, 32, 12, 0, 7, 0, 1, 1, 2, 18, 1, 4, 0, 2, 4, 1, 0, 1, 0, 3, 2, 0, 5, 2, 4, 3, 27, 1, 8, 15, 0, 8, 0, 0, // ر, + 30, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 1, 0, 0, // ز, + 31, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 17, 1, 0, 1, 0, 0, 1, 0, 1, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 6, 9, 3, 1, 5, 0, 4, 0, 0, // س, + 
22, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 2, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3, 0, 1, 0, 0, 0, 0, // ش, + 8, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 14, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, // ص, + 1, 0, 0, 0,255,255,255,255, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, // ض, + 2, 0, 0, 0, 0, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 0, // ط, + 0, 0, 0, 0,255, 0,255,255, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,255, 0, 0,255, // ظ, + 18, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 2, 0, 0, 0, 5, 2, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 9, 6, 0, 0, 5, 0, 1, 0, 0, // ع, + 2, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, // غ, + 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ـ, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1, 0, 3, 0, 16, 0, 0, // ف, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 3, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 2, 1, 0, 4, 0, 3, 0, 0, // ق, + 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 1, 0, 0, 1, 0, 4, 0, 0, // ك, + 17, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0,255, 0, 0, 0, 1, 1, 0, 3, 0,123, 6, 0, 1, 1, 1, 1, 2, 1, 0, 0, 0, 3, 0, 2, 1, 1, 0, 
10, 0, 0, 1, 3, 2, 5, 8, 0, 2, 12, 0, 10, 0, 0, // ل, + 76, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 16, 0, 0, 2, 0, 3, 2, 0, 4, 0, 8, 22, 6, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 27, 1, 1, 4, 6, 0, 5, 0, 0, // م, + 38, 0, 1, 1, 0, 0, 0, 0, 1, 4, 0, 0, 0, 1, 0, 2, 3, 4, 3, 1, 55, 2, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 3, 2, 14, 0, 1, 0, 3, 0, 0, 0, 0, 2, 4, 14, 1, 2, 10, 0, 11, 0, 0, // ن, + 16, 0, 11, 0, 0, 0, 0, 0, 0, 4, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 10, 13, 0, 4, 0, 1, 0, 0, 30, 0, 6, 2, 1, 5, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 4, 3, 4, 0, 1, 0, 1, 0, 0, // ه, + 36, 0, 0, 1, 0, 0, 0, 0, 2, 6, 0, 0, 1, 6, 0, 0, 4, 0, 0, 0, 10, 9, 0, 5, 0, 9, 4, 4, 5, 0, 13, 1, 3, 3, 3, 0, 1, 0, 1, 0, 0, 2, 1, 4, 9, 6, 10, 3, 1, 0, 8, 0, 0, // و, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,255, // ى, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 1, 0, 1, 2, 5, 8, 0, 5, 0, 2, 2, 1, 12, 1, 22, 2, 8, 2, 2, 1, 1, 0, 2, 1, 0, 19, 3, 4, 14, 7, 24, 2, 10, 0, 1, 0, 0, // ي, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ً, + 2, 0,255, 0, 0, 1, 0, 0, 0, 12, 0, 0, 0, 6, 0, 0,255, 0,255, 5, 0, 0,255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0, 0,255, 0, 0, 0, // ے, + // , a, , پ, ٹ, چ, ژ, ڈ, گ, ک, ڑ, ں, ھ, ہ, ء, آ, أ, ؤ, إ, ئ, ا, ب, ة, ت, ث, ج, ح, خ, د, ذ, ر, ز, س, ش, ص, ض, ط, ظ, ع, غ, ـ, ف, ق, ك, ل, م, ن, ه, و, ى, ي, ً, ے, + ], + baltic: [ + 0, 0, 57, 42,135, 14, 20, 3,119, 0, 0, 18, 1, 18, 0, 0,205, 1, 4, // , + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 16, 39, 14, 0, 0, 0, 16, // a, + 0,255, 0, 0, 3, 0, 0, 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0, 0, // b, + 0,255, 0, 0, 19, 0, 9, 0, 0,255, 0, 6, 0, 0, 0, 0, 0, 1, 0, // c, + 0,255, 0, 0, 17, 0, 6, 0, 6,255, 0, 14, 0, 0, 0, 0, 0, 6, 3, // d, + 0, 0, 0, 0, 0, 0, 0, 9, 0, 5, 3, 0, 1, 7, 3, 0, 
0, 0, 21, // e, + 0,255, 0, 0, 4, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, // f, + 0,255, 0, 1, 1, 0, 1, 0, 3, 0, 0, 27, 0, 0, 0, 0, 0, 1, 0, // g, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // h, + 0, 0, 0, 0, 0, 0, 0, 72, 0, 13, 9, 0, 5, 41, 7, 0, 0, 0, 56, // i, + 0,255, 6, 0, 30, 0, 32, 0, 41, 0,255, 12, 0, 0, 0, 0, 3, 0, 0, // j, + 0, 0, 0, 4, 32, 0, 8, 0, 2, 0, 0, 3, 0, 36, 0, 0, 0, 6, 1, // k, + 0,255, 0, 0, 29, 0, 36, 1, 24, 0, 0, 4, 0, 5, 0, 0, 0, 2, 0, // l, + 0, 0, 0, 1, 16, 0, 11, 0, 6, 0, 0, 15, 0, 2, 0, 0, 0, 1, 6, // m, + 0,255, 0, 0, 19, 0, 7, 0, 10, 0, 0, 12, 0, 8, 0, 0, 0, 16, 6, // n, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 10, 2,255, 0, 0, 6, // o, + 0,255, 0, 0, 3, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, // p, + 0,255, 0, 0, 0, 0,255, 0, 0,255,255, 0,255, 0,255,255, 0, 0, 0, // q, + 0,255, 2, 2, 59, 0, 23, 0, 2, 0, 0, 6, 0, 3, 0, 0, 0, 23, 0, // r, + 0,255, 2, 7, 50, 7, 9, 1, 88, 0, 0, 7, 0, 4, 0, 0, 0, 5, 1, // s, + 0,255, 0, 2, 33, 0, 31, 0, 10, 0, 0, 21, 0, 22, 0, 0, 0, 6, 1, // t, + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 13, 11, 12, 0, 0, 0, 7, // u, + 0,255, 0, 5, 10, 0, 2, 0, 3, 0, 0, 21, 0, 12, 0, 0, 0, 1, 3, // v, + 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0,255, 0, 0, 0, 0, // w, + 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0,255,255, 0, 0, 0, // x, + 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 5,255, 0, 0, 0, 7, // y, + 0,255, 0, 0, 4, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, // z, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0,255,255, 0, 0, // ß, + 0, 0, 0,255,255, 0,255, 0,255, 0,255, 0,255,255,255, 0, 0,255,255,255, 0, 0,255,255,255,255,255,255, 0,255,255, 0,255, 0,255,255,255,255,255,255,255,255,255,255, 0,255, // ŗ, + 1, 0, 1, 0, 1, 0, 0, 1, 0, 4, 9, 4, 3, 10, 5, 0, 0, 0, 5, 9, 9, 0, 1, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 0, // ą, + 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 5, 0, 0, 0, 
3, 1, 2, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 0, // į, + 9, 0, 10, 2, 28, 3, 0, 13, 1, 6, 45, 45, 27, 28, 50, 0, 30, 0, 40, 13, 73, 2, 28, 0, 0, 0, 5, 0, 0,255,255, 0,255,255, 0,255, 0, 0, 0, 2, 4, 1,255,255,255, 3, // ā, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 4, 0, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 0, // ę, + 4, 0, 3, 6, 12, 0, 0, 0, 0, 0, 2, 0, 20, 16, 8, 0, 35,255, 15, 19, 28, 0, 26, 0,255, 0, 5,255, 0,255,255, 0,255, 0, 0,255, 0, 1,255, 0, 1, 3,255,255, 0, 0, // ē, + 12, 9, 0, 0, 0, 9, 0, 0, 0, 15, 0, 0, 1, 0, 8, 2, 0, 0, 4, 7, 0, 3, 0, 0, 0, 6, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 7, 0, 0, 0, 1, 0, // č, + 1, 0, 20, 0, 27, 0, 0, 9, 0, 0, 3, 27, 33, 22, 68, 0, 12, 0, 25, 12, 29, 0, 20, 0, 0, 0, 1,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 2,255,255, 0, 0, 4, // ė, + 6, 1,255, 0, 0, 2,255, 0,255, 1,255, 0, 0, 0, 0, 3, 0,255, 1, 0, 0, 1, 0,255,255,255, 0,255,255,255,255, 0,255, 1,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0, // ģ, + 4, 1, 0, 0, 0, 1,255, 0,255, 3, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0,255,255, 0,255, 0, 0,255,255, 0, 0, 0, 5, 2,255,255, 0, 0, // ķ, + 4, 0, 5, 11, 17, 0, 0, 0, 0, 0, 0, 0, 32, 4, 17, 0, 2, 0, 44, 6, 35, 0, 7, 0, 0, 0, 25, 0, 0,255,255, 0,255, 0, 0,255, 0, 2, 0, 0, 3, 0,255,255, 0, 0, // ī, + 2, 11, 0,255, 0, 10, 0, 2, 0, 3,255, 11, 0, 0,255, 2, 0,255, 0, 0, 0, 2, 0,255,255,255, 0,255,255,255,255, 1,255, 2,255,255, 0, 0, 0, 0, 0,255, 0,255, 0, 0, // ļ, + 75, 31, 0, 0, 0, 15, 0, 1, 0, 71, 0, 18, 1, 1, 1, 13, 2, 0, 7, 0, 1, 10, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 4, 0, 0,255, 0, 3, 1, 0, 5, 0, 0, 3, 0, // š, + 1, 5, 0, 0, 0, 6,255, 0, 0, 24, 0, 0, 0, 0, 0, 2, 0,255, 1, 0, 0, 1, 0,255,255,255, 2,255,255,255,255, 3,255, 0, 0,255,255,255, 1, 0, 0, 0, 0,255, 0, 0, // ņ, + 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255, 
0,255,255,255,255,255, 0,255, 0,255,255, 0, // ō, + 0, 0, 3, 0, 4, 0, 0, 4, 0, 76, 18, 13, 8, 11, 19, 0, 0, 0, 9, 4, 28, 0, 3, 0, 0, 0, 0,255,255, 0, 0,255, 0,255, 0, 0,255,255,255,255, 0,255,255, 0, 0, 1, // ų, + 2, 0, 9, 3, 1, 0, 0, 1, 0, 12, 8, 8, 3, 5, 0, 0, 1, 0, 9, 1, 12, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0, 0, 0,255, 1, 0, 0,255, 0, 0, 0, // ū, + 46, 17, 0, 0, 30, 11, 0, 1, 0, 3, 0, 0, 1, 3, 1, 2, 0, 0, 8, 0, 0, 16, 0, 0, 0, 2, 0, 0,255, 1, 1, 0, 0, 0, 0, 3,255,255, 0, 0, 0, 0,255, 0, 0, 0, // ž, + // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, ß, ŗ, ą, į, ā, ę, ē, č, ė, ģ, ķ, ī, ļ, š, ņ, ō, ų, ū, ž, + ], + thai: [ + 6, 0, 2, 0, 14, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 2, 0, 1, 0, 21, 3, 1, 0, 0, 5, 0, 0, 9, 9, 4, 0, 4, 2, 6, 0, 2, 0, 0, 4, 0, 0, 4, 0, 11, 0, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 3, 0, 0, 16, 0, 0, 0, // , + 0, 0, 0,255, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0,255,255, // a, + 8, 0, 6, 0, 4, 0, 91, 3, 0, 13, 1, 0, 1, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 17, 6, 3, 0, 0, 0, 0, 0, 8, 13, 10, 0, 6, 5, 0, 0, 4, 1, 0, 28, 0, 0, 13, 40, 50, 4, 26, 13, 9, 0, 23, 33, 0, 80, 28, 16, 2, 3, 0, 17, 12, 13, 3, 0, 19, 0,255, 0, // ก, + 1, 0, 8, 0, 1, 0,119, 2, 0, 0, 0,255, 3, 0, 0, 0, 0, 0, 0, 7, 1, 0, 0, 0, 14, 3, 0, 0, 0, 1, 0, 0, 6, 4, 4,255, 5, 4, 0, 0, 0, 0, 0, 5, 0, 0, 3, 0, 17, 2, 5, 6, 0, 0, 16, 1, 0,137, 33, 4, 0, 12, 0, 0, 12, 9, 0, 0, 15, 0,255, 0, // ข, + 7, 0, 23, 0, 61, 0,254, 2, 0, 9, 3,255, 2, 0, 0, 0, 0,255, 1, 10, 1, 0, 16, 0, 51, 21, 1, 0, 0, 1, 0, 1, 14, 13, 32, 0, 5, 5, 0, 0, 4, 0, 0, 16, 0, 0, 31, 11,120, 11, 16, 21, 0, 0, 61, 5, 0,173, 54,110, 7, 6, 0, 14, 29, 37, 2, 0, 42, 0,255, 0, // ค, + 0,255, 0,255, 2, 0, 8, 0,255, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 
0, 0, 0,255, 0,255,255, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 5,255, 0,255, 0, 0, 0,255,255, 0, 0,255,255, // ฆ, + 0, 0, 6, 1, 20, 0, 24, 6, 0, 1, 3, 0, 0, 0, 0, 0,255,255, 0, 22, 0, 0, 0, 5, 4, 2, 0, 0, 0, 6, 0, 0, 7, 39, 53, 0, 41, 56, 0, 0, 20, 4, 0,254, 0,255, 1,167,254, 4,100, 0, 46, 0, 55, 34, 0, 26, 1, 2,255, 1, 0, 20,254,254, 2, 1, 1, 0,255, 0, // ง, + 4, 0, 9, 1, 3, 0, 74, 23, 0, 1, 0,255, 9, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 14, 2, 1, 0, 0, 2, 0, 0, 5, 3, 7, 0, 4, 5, 4, 0, 1, 0, 0, 4, 0, 0, 14, 9, 32, 1, 17, 3, 0, 0, 5, 4, 0,137, 10, 15, 31, 0, 0, 44, 24, 23, 0, 0, 15, 0,255, 0, // จ, + 0,255, 1, 0, 0, 0, 5, 4, 0, 0, 0,255, 0,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0, 0, 1, 0, 0, 0,255,255, 0, 0, 0, 47, 1, 3,255, 0, 0, 0, 1, 0, 0, 0, 1,255,255,255, // ฉ, + 3, 0, 8, 0, 11, 0,139, 1, 0, 17, 1, 0, 32, 0, 0, 0, 0, 0, 0, 8, 3, 0, 1, 0, 31, 5, 1, 0, 0, 7, 0, 0, 18, 22, 8, 0, 7, 7, 0, 0, 1, 0, 0, 10, 0, 0, 31, 16,141, 2, 33, 15, 0, 3, 11, 13, 0,208, 45, 22,170, 11, 0, 2, 21, 32, 0, 0, 29, 0,255, 0, // ช, + 2, 0, 16, 0, 6, 0, 47, 0, 0, 3, 5, 0, 0, 0, 0, 0,255, 0, 0, 2, 2, 0, 1, 0, 13, 2, 2,255,255, 0, 2,255, 3, 2, 4,255, 3, 2, 0, 0, 6, 0,255, 5, 0, 0, 4, 0, 16, 10, 7, 9, 0, 0, 4, 9, 0,178, 19, 44, 0, 40, 0, 0, 14, 4, 0, 0, 19,255,255,255, // ซ, + 0,255, 0,255, 0,255, 0, 0,255, 2,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 1, 0, 0,255, 0,255,255, 0, 0,255,255, 0,255,255,255, // ฌ, + 0,255, 0, 0, 0,255, 3, 1,255, 13, 0, 0, 83,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,255, 1, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 37, 0, 1,255,255, 0, 48, 7, 0, 16, 0, 0, 0, 44, 11, 0, 0, 0, 1, 0, 0,255, 3, 1, 0, 0, 0, 2, 0,255,255, // ญ, + 0,255, 11,255, 0, 0, 0,255,255, 0,255,255,255, 0,255, 0,255,255, 0,255, 0,255, 0, 0, 0, 0, 0,255,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0,255, 0, 0,255, 0, 0, 
0, 0, 0, 0,255,255, 7, 0, 0, 0,255, 0,255,255,255,255, 0, 0,255,255, 0, 0,255,255, // ฎ, + 0,255, 4,255, 0, 0, 0, 0,255, 0,255,255, 0,255, 0, 0,255,255, 0,255, 0,255, 0,255, 0, 0, 9, 0,255, 0,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255,255, 0, 2, 1,255, 0, 0,255,255, 2, 0, 0, 0,255, 0,255,255,255,255, 0,255,255,255, 0,255,255,255, // ฏ, + 0,255, 0, 0, 0,255, 0, 0,255, 0,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,255, 0,255,255, 0, 11, 0, 0, 0, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255, 0, 0, 0,255,255, 0, 0,255,255, // ฐ, + 0,255, 0,255, 0,255, 0, 0,255, 0,255,255, 0,255,255, 0, 0, 0, 2, 0, 0,255, 0,255, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255, 0, 0,255, 0,255,255,255, 0,255,255, 0, 0, 0,255, 0, 0,255,255, 0, 0, 0, 0,255, 0,255,255,255,255,255,255,255,255, 0,255,255,255, // ฑ, + 0,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255, 0, 0,255,255,255,255,255, 0,255,255,255,255, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255, 4, 0,255, 0, 0,255,255, 2, 0, 0, 0,255, 0,255,255,255,255,255,255,255,255, 0, 0,255,255, // ฒ, + 0,255, 0, 2, 16, 0, 0, 0,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 1, 0, 10, 0, 0, 1, 0, 3, 0, 0,255, 0,255,255, 0, 2, 6, 0, 0, 0,255,255, 11, 0, 0, 0,255, 0,255,255,255,255, 0, 0,255,255, 0, 0,255,255, // ณ, + 2, 0, 3, 0, 4, 0, 15, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 12, 2, 1, 0, 0, 0, 0, 0, 3, 2, 5, 3, 3, 17, 0, 0, 7, 0, 0, 11, 0, 0, 5, 26, 11, 1, 21, 2, 0, 0, 32, 7, 0, 75, 9, 54, 2, 75, 0, 8, 4, 5, 0, 0, 10, 0,255, 0, // ด, + 4, 0, 6, 7, 8, 0, 32, 1, 0, 2, 7, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 0, 19, 1, 1, 0, 0, 0, 0, 0, 4, 2, 4, 0, 2, 1, 1, 0, 14, 4, 0, 7, 0, 0, 4, 16, 17, 1, 18, 3, 0, 0, 27, 6, 0, 36, 34, 12, 9, 5, 0, 5, 11, 6, 0, 0, 14, 0,255, 0, // ต, + 1, 0, 0, 0, 0,255, 7, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 1, 0, 0,255,255, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 1, 
0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 1, 3, 0,255, 0, 0, 0, 3, 3, 0, 0, 0,255,255,255, // ถ, + 4, 0, 4, 1, 3, 0, 50, 1, 0, 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 32, 8, 2, 0, 0, 2, 0, 1, 5, 4, 6, 0, 3, 2, 0, 1, 1, 0, 0, 3, 0, 0, 8, 3, 13, 1, 16, 3, 0, 0, 18, 2, 0, 83, 12, 15, 0, 37, 0, 1, 13, 9, 0, 0, 23, 0,255, 0, // ท, + 1,255, 0, 0, 0, 0, 2, 0,255, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 6, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 3, 0,255, 2, 0, 5, 0, 3, 0, 0, 0, 4, 0, 0, 11, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 2, 0,255,255, // ธ, + 4, 0, 4, 7, 41, 0, 24, 11, 0, 43, 9, 0, 0, 0, 0, 0, 0, 3, 0, 6, 3, 1, 3, 1, 10, 3, 1, 1, 0, 1, 0, 0, 9, 30, 3, 0, 5, 14, 2, 0, 10, 32, 0, 41, 0, 0, 5, 69, 60, 8, 35, 9, 0, 2, 20, 10, 0, 38, 13, 14,150, 5, 0,124, 80, 92, 0, 0, 10, 0,255, 0, // น, + 3, 0, 1, 1, 2, 0, 15, 2, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1, 0, 5, 10, 0, 0, 0, 2, 0, 0, 2, 2, 4, 0, 4, 1, 1, 0, 1, 0, 0, 15, 0, 0, 7, 48, 8, 5, 7, 2, 0, 1, 12, 1, 0, 30, 33, 8, 3, 1, 0, 10, 5, 4, 0, 0, 9, 0,255, 0, // บ, + 5, 0, 1, 0, 3, 0, 29, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 2, 0, 0,255, 0, 0, 0, 5, 1, 5, 0, 2, 0, 0, 0, 1, 0, 0, 3, 0, 0, 3, 1, 4, 1, 3, 2, 0, 0, 6, 11, 0,163, 8, 11, 0, 45, 0, 0, 15, 6, 0, 0, 6, 0,255,255, // ป, + 2, 0, 0, 0, 0,255, 7, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 1, 0, 1, 0, 0, 1, 0,255, 2, 0, 0, 4, 8, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0,255,255,255, // ผ, + 0, 0, 0, 0, 0, 0, 2, 0,255, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0,255, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,255,255,255, // ฝ, + 9, 0, 1, 0, 1, 0, 23, 7, 5, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 6, 1, 0,255,255, 0, 0, 0, 6, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 4, 4, 20, 1, 5, 4, 0, 0, 3, 1, 0, 63, 11, 4, 0, 3, 0, 0, 6, 4, 0, 0, 4, 0,255, 0, // พ, + 1, 0, 0, 0, 0, 0, 
2, 0, 0, 0, 1, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 1, 0, 0,255,255, 0, 1,255, 0, 0, 2,255, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 7, 4, 1, 0, 29, 0, 0, 0, 0, 0, 0, 3,255,255, 0, // ฟ, + 2, 0, 0, 0, 0, 0, 5, 0,255, 1, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,255,255, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 3, 0, 0, 1,255, 0, 0, 0, 3, 0, 1, 0,255, 0, 2, 0, 0, 15, 0, 0, 0, 0, 0, 0, 1, 2,255,255, 1,255,255, 0, // ภ, + 5, 0, 5, 1, 63, 0, 29, 3, 0, 17, 3, 0, 1, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 7, 1, 0, 0, 0, 1, 0, 0, 4, 7, 17, 0, 2, 7, 0, 0, 18, 59, 0, 21, 0, 0, 8, 6, 52, 1, 12, 6, 0, 0, 22, 9, 0, 82, 27, 21, 0, 43, 0, 8, 59, 21, 0, 0, 10, 0, 0, 0, // ม, + 1, 0, 1, 1, 6, 0, 10, 0, 0, 6, 1, 0, 1, 0, 0, 0, 0, 0, 0, 13, 2, 0, 18, 1, 5, 0, 1, 0, 0, 5, 0, 0, 1, 1, 3,255, 4, 13, 0, 1, 1, 2, 0, 27, 0, 0, 4, 16, 74, 0, 7, 61, 0, 0, 6, 0, 0, 16, 5, 8, 0, 0, 0, 1, 50, 8, 0, 0, 3, 0, 0, 0, // ย, + 6, 0, 35, 0,148, 0, 29, 9, 0, 7, 1, 0, 1, 5, 0, 0, 0, 0, 1, 4, 27, 0, 14, 5, 8, 14, 42, 0, 1, 32, 2, 1, 8, 5, 23, 0, 1, 8, 4, 1, 16, 14, 0, 33, 0, 0, 8, 0,121, 3, 8, 5, 0, 0, 20, 14, 0, 85, 17, 40, 0, 8, 0, 0, 8, 18, 0, 0, 7, 0, 0, 0, // ร, + 0,255, 3, 0, 0, 0, 0, 0,255, 0,255,255, 0,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 3, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255,255,255,255,255,255, 0, 0,255,255, 0,255,255,255, // ฤ, + 2, 0, 19, 1, 40, 0, 15, 1, 5, 5, 6, 0, 3, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 3, 6, 14, 5, 0, 10, 6, 0, 2, 2, 1,255, 3, 5, 0, 0, 2, 26, 0, 19, 0, 0, 2, 9, 20, 2, 12, 2, 0, 0, 17, 25, 0, 73,127, 25, 0, 5, 0, 0, 4, 2, 0, 0, 7, 0, 0, 0, // ล, + 4, 0, 8, 4, 80, 0, 23, 1, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 12, 2, 0, 0, 0, 1, 0, 0, 18, 8, 9, 0, 4, 0, 1, 0, 5, 12, 0, 5, 0, 0, 8, 16, 32, 1, 8, 7, 0, 0, 4, 1, 0, 48, 3, 1, 0, 8, 0, 2, 60, 50, 0, 1, 4, 0, 0, 0, // ว, + 6, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 
1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 9, 0, 5, 0, 0, 0, 1, 0, 0, 12,255, 0,255, 0, 0, 0, 0, 0, 0,255, 3, 0,255,255, // ศ, + 0,255, 16, 0, 0, 1, 4, 0,255, 0,255,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255, 0, 1, 0, 3, 5, 0, 0, 1, 0, 0, 0,255, 0,255,255, 0, 0, 9,255, 4, 0, 0,255, 6, 0, 0, 1,255, 0,255, 0,255,255, 0, 0,255,255, 0, 0,255, 0, // ษ, + 9, 0, 8, 4, 4, 0, 75, 3, 0, 12, 1, 0, 1, 0, 0, 0, 0, 0, 1, 5, 2, 0, 1, 0, 19, 5, 1, 0, 0, 0, 0, 0, 12, 6, 9, 0, 3, 2, 2, 0, 1, 0, 0, 13, 0, 0, 11, 9, 26, 1, 16, 7, 0, 0, 8, 2, 0, 55, 26, 2, 3, 0, 0, 1, 29, 16, 0, 0, 19, 0,255, 0, // ส, + 7, 0, 2, 0, 1, 0, 77, 2, 0, 0, 0,255, 4, 4, 0, 0, 0, 0, 0, 21, 0, 0, 2, 0, 9, 2, 0, 0,255, 0, 0, 0, 13, 3, 2, 0, 1, 2, 0, 0, 5, 0, 0, 4, 0, 0, 8, 0, 12, 7, 3, 1, 0, 0, 2, 4, 0, 62, 60, 4, 82, 6, 0, 0, 11, 10, 0, 0, 3, 0, 0, 0, // ห, + 0,255, 0, 0,255,255,255, 0, 0, 0,255,255,255,255,255,255,255,255, 0,255,255,255,255,255, 0,255,255,255,255,255,255,255, 0, 0, 0,255,255, 0,255,255, 0, 0,255, 0,255,255,255, 0, 0,255, 0, 5,255,255, 2, 0,255, 0,255, 0,255,255,255,255, 0,255, 0,255,255,255,255,255, // ฬ, + 8, 0, 13, 98, 32, 0, 68, 12, 0, 23, 18, 0, 1, 0, 0, 1, 0, 0, 0, 10, 10, 0, 6, 4, 20, 17, 3, 0, 0, 2, 2, 4, 9, 9, 21,255, 11, 8, 1, 0, 7, 1, 0, 21, 2, 0, 9, 0, 10, 4, 2, 5, 0, 57, 0, 2, 0, 84, 22, 36, 0, 8, 0, 19,254,144, 1, 1, 11, 0,255, 0, // อ, + 0, 0, 0, 0, 0,255, 2, 0,255, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0, 1, 0, 0, 0, 0,255, 0, 0, 0, 8, 3, 4,255, 3, 0, 0, 0, 0, 0, 0, 1,255,255,255, // ฮ, + 0, 0, 0, 0, 0,255, 0, 0,255, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255, 0, 0,255, 0, 0, 0, 0,255,255, 0, 0, 0,255,255, 0,255,255,255, 0, 0, 0,255,255, 0, 0,255,255, // ฯ, + 0,255, 3, 0, 8, 0, 2, 33, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 7, 0, 5, 0, 2, 0, 7, 0, 1, 0,255, 0, 0, 0, 3, 3, 93,255, 44, 1, 0, 0, 3, 0, 0, 5, 0, 0, 
0, 0, 8, 0, 0, 0,255,255, 0,255,255, 0,255, 0,255,255,255, 0, 0, 0, 1, 0, 0,255,255,255, // ะ, + 0,255, 33, 13, 29, 0, 0, 54, 5, 48, 13, 0, 3, 0, 0, 0, 0, 0, 0, 12, 23, 1, 18, 2, 29, 17, 7, 0, 1, 11, 1, 3, 12, 8, 50,255, 26, 39, 4, 4, 21, 6, 0, 22, 2,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255,255,255,255,255, 0, 0,255,255,255,255,255, 0, // ั, + 0, 0, 97, 28, 43, 1, 47, 62, 8,139, 31, 0, 33, 9, 0, 3, 0, 0, 3, 11, 15, 8, 23, 5, 41, 17, 5, 0, 0, 6, 1, 34, 49, 29, 63, 0, 28, 30, 10, 18, 31, 30, 6, 33, 3,255, 0, 0, 0, 0, 0, 0,255,255, 0,255,255, 0,255,255,255,255,255, 0,224,254, 1, 0, 0, 0,255, 0, // า, + 0,255, 5, 0, 35,255, 0, 21, 0, 1, 0,255,255,255,255,255,255,255, 0, 2, 8, 0, 10, 0, 4, 0, 0, 0,255, 0, 0,255, 0, 0, 0,255, 3, 0,255,255, 10, 0,255, 7, 0,255, 0, 0, 0, 0,255,255,255,255,255, 0,255, 0,255,255,255, 0,255,255, 3, 33, 0, 0, 0,255,255, 0, // ำ, + 0,255, 20, 0, 22, 0, 8, 20, 2, 65, 34, 0, 17, 0, 12, 0, 1, 0, 1, 15, 28, 1, 5, 14, 18, 6, 7, 1, 0, 14, 2, 1, 14, 3, 35, 0, 18, 32, 4, 0, 20, 2, 0, 23, 2,255, 0, 0, 0,255, 0, 0,255, 0,255, 0, 0, 0,255,255,255,255,255, 0, 0, 0,255,255, 0,255,255, 0, // ิ, + 0,255, 10, 7, 9, 0, 0, 12, 4, 56, 51, 0, 15, 3, 0, 0, 0, 0, 1, 19, 4, 0, 86, 2, 23, 5, 18, 0, 0, 6, 1, 0, 34, 3, 37,255, 23, 7, 0, 0, 16, 0, 0, 9, 0,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255, 0, 0,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0, // ี, + 0,255, 0, 17, 0,255, 0, 9, 0, 0, 44, 0, 0, 0, 0,255,255,255, 0, 0, 0, 6, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 5,255, 1, 0,255, 0, 0,255,255,255,255,255, 0, 0, 0,255, 0,255,255,255,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255, // ึ, + 0,255, 0, 0, 29,255, 0, 0, 0, 66, 3, 0, 0,255, 0,255,255, 0, 0, 2, 0, 1, 0, 0, 6, 0, 0, 0, 0, 9, 0,255, 18, 1, 19,255, 7, 0, 0, 0, 6, 0, 0, 6, 0,255,255,255, 0,255, 0,255,255, 0,255,255,255, 0,255,255,255,255,255,255, 0, 0,255,255, 0,255,255, 0, // ื, + 0,255, 20, 9, 88, 0, 1, 45, 1, 55, 17, 0, 0, 0, 0, 0, 0,255, 1, 4, 25, 6, 11, 7, 22, 29, 10, 0, 0, 8, 19, 
0, 12, 28, 29,255, 17, 4, 1, 0, 53, 3, 0, 22, 1,255,255, 0, 0,255,255, 0, 0,255, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255, 0, // ุ, + 0,255, 6, 0, 37, 0, 3, 9, 0, 15, 28, 1, 1, 0, 0, 0, 0,255, 0, 26, 8, 15, 3, 0, 4, 7, 2, 48, 0, 7, 2, 10, 77, 38, 26, 0, 13, 1, 3, 0, 26, 1,255, 4, 0,255,255,255,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 0, // ู, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255, // ฺ, + 20, 0, 41, 1, 31, 0,254, 12, 0, 16, 23, 0, 12, 1, 1, 1, 0, 0, 1, 30, 10, 2, 7, 1,110, 23, 9, 0, 0, 4, 4, 0, 52, 35, 41, 0, 29, 20, 5, 0, 30, 0, 0, 67, 2, 0,130, 0, 84, 35, 40, 50, 0, 0, 20, 56, 0, 0, 0, 0, 0, 0, 0, 8,138,119, 0, 0,142, 0,255, 0, // เ, + 10, 0, 19, 0, 9, 0,170, 5, 0, 3, 2, 0, 4, 0, 0, 0, 0, 0, 0, 9, 3, 0, 2, 0, 36, 8, 6, 0, 0, 1, 0, 0, 12, 13, 20,255, 6, 5, 1, 0, 6, 0, 0, 9, 0, 0, 16, 0, 26, 16, 4, 13, 0, 0, 2, 11, 0, 0, 0, 0,255, 0, 0, 1, 52, 49, 0, 0, 68, 0,255, 0, // แ, + 6, 0, 7, 0, 29, 0, 72, 3, 0, 9, 11, 0, 2, 0, 0, 0, 0, 0, 0, 4, 3, 0, 2, 0, 27, 5, 2, 0,255, 1, 0, 0, 9, 8, 8, 0, 8, 6, 1, 0, 20, 0, 0, 8, 1, 0, 12, 0, 22, 2, 21, 13, 0, 0, 26, 14, 1, 0, 0, 0,255, 0, 0, 0, 11, 10, 0, 0, 33, 0,255, 0, // โ, + 3, 0, 9, 0, 4, 0,114, 4, 0, 1, 0, 0, 6, 0, 2, 0, 0, 0, 0, 10, 3, 1, 0, 0, 27, 5, 2, 0,255, 1, 0, 0, 7, 16, 10,255, 6, 5, 2, 0, 1, 0, 0, 9, 0, 0, 8, 0, 22, 14, 3, 7, 0, 0, 2, 3, 0, 0,255,255, 0, 0, 0, 1, 55, 53, 0, 0, 24,255,255, 0, // ใ, + 5, 0, 8, 0, 4, 0,116, 3, 0, 3, 1, 0, 1, 1, 0, 0, 0, 0, 0, 5, 1, 4, 0, 0, 28, 10, 3, 0,255, 0, 0, 0, 6, 9, 5,255, 3, 9, 8, 0, 3, 0, 0, 10, 0, 0, 15, 0, 23, 5, 5, 4, 0, 0, 9, 15, 0, 0,255,255,255, 0, 0, 12, 51, 30, 0, 0, 34, 0,255, 0, // ไ, + 0,255, 0,255, 0,255, 12,255,255,255,255,255, 0,255,255,255,255,255, 0, 0, 0, 0, 0, 0, 2, 0, 0,255,255, 
0,255,255, 0, 0, 0,255, 0, 0,255, 0, 0,255,255, 0,255,255, 0,255, 0, 0, 0, 0,255,255, 0, 0,255,255,255,255,255,255, 0,255, 0, 0, 0, 0, 0,255,255,255, // ๆ, + 0,255, 25, 6, 4, 0, 0, 7, 0, 7, 21, 0, 0, 0, 0,255,255,255, 0, 16, 5, 0, 2, 0, 2, 1,112, 0, 0, 1, 0, 0, 3, 2, 11,255, 12, 6, 0,255, 0, 7,255, 13, 0,255,255,255,255,255, 0,255,255,255,255, 0, 0, 0,255,255,255,255,255, 0,255,255,255,255,255,255,255,255, // ็, + 0, 0, 28, 36, 44, 5, 7, 4, 0,166, 14,255, 53,255,255,255,255, 0, 0, 3, 64, 4, 15, 0, 19, 8, 5, 12, 4, 4, 1, 0, 69, 28, 23,255, 45, 56, 0,255, 35, 40, 0, 6, 1,255, 0, 35, 0, 0, 43,254, 46,156,119,254, 0, 0, 0,255, 0, 0,255, 0, 0, 0, 0,255, 0, 0,255, 0, // ่, + 0, 0, 17,126, 42, 0, 0, 91, 0,177, 69, 0, 2,255,255,255,255,255,255,105, 38, 4, 11, 0, 56, 12, 6, 1, 0, 3, 8, 0, 10, 7, 40,255, 32, 11, 0, 0, 7, 61,255, 26, 0,255, 0,125, 0, 0, 14, 49, 25, 28, 16,211,255, 0, 0, 0, 0, 0,255, 0, 0, 0,255,255,255, 0,255, 0, // ้, + 0,255, 2, 0, 0,255, 0, 2,255, 0, 0,255, 0, 0,255,255,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0,255,255, 0, 0,255, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0, 0,255,255, 0,255,255, 1, 0, 0, 0, 6, 1,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255, // ๊, + 0,255, 0, 0, 0, 0,255, 1,255, 0, 0,255, 0,255,255,255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0,255,255,255,255, 0, 0, 0,255, 0, 0,255,255, 0, 0,255, 0, 0,255,255, 0,255,255, 0, 1, 0, 0, 0, 1, 0,255,255,255,255,255,255, 0, 0,255,255,255,255,255,255,255, // ๋, + 0, 0, 6, 1,105, 5, 0, 2, 0, 9, 44, 0, 3, 0, 0, 1, 2, 0, 24, 19, 28, 1, 11, 12, 24, 0, 8,255,255, 4, 1, 0, 2, 46,168,255, 28, 8, 13, 10, 39, 14,255, 0, 1,255, 0,255,255,255, 21, 0,255, 0, 14, 0, 0,255,255,255,255,255,255,255,255,255,255,255, 0,255,255,255, // ์, + 0,255, 0, 0, 0,255,255, 0,255, 0, 0,255, 0, 0,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255, 0,255,255, 0, 0, 0,255, 0, 0, 0, 0, 0, 0,255, 0,255,255,255,255,255,255, 0,255,255,255, 0,255,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255, // ํ, + 0,255, 
0,255,255,255,255,255,255,255,255,255, 0,255,255,255,255,255,255,255, 0,255,255,255,255,255, 0,255,255, 0,255,255,255,255,255,255, 0,255,255,255, 0, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // ๎, + 0, 0, 0, 0, 0,255, 0,255,255,255,255,255,255,255,255,255,255,255,255, 0,255,255, 0,255, 0, 0,255,255,255, 0,255,255, 0, 0, 0, 0, 0, 0, 0,255, 0, 0,255, 0,255,255, 0, 0, 0, 0, 0, 0,255,255,255, 0,255, 0, 0, 0, 0,255,255,255, 0, 0,255,255, 0,255,255, 0, // ๅ, + // , a, ก, ข, ค, ฆ, ง, จ, ฉ, ช, ซ, ฌ, ญ, ฎ, ฏ, ฐ, ฑ, ฒ, ณ, ด, ต, ถ, ท, ธ, น, บ, ป, ผ, ฝ, พ, ฟ, ภ, ม, ย, ร, ฤ, ล, ว, ศ, ษ, ส, ห, ฬ, อ, ฮ, ฯ, ะ, ั, า, ำ, ิ, ี, ึ, ื, ุ, ู, ฺ, เ, แ, โ, ใ, ไ, ๆ, ็, ่, ้, ๊, ๋, ์, ํ, ๎, ๅ, + ], +}; + +const VIETNAMESE_ASCII: usize = 27; +const VIETNAMESE_NON_ASCII: usize = 25; +const CENTRAL_ASCII: usize = 27; +const CENTRAL_NON_ASCII: usize = 41; +const CYRILLIC_ASCII: usize = 2; +const CYRILLIC_NON_ASCII: usize = 44; +const WESTERN_ASCII: usize = 27; +const WESTERN_NON_ASCII: usize = 32; +const ICELANDIC_ASCII: usize = 27; +const ICELANDIC_NON_ASCII: usize = 13; +const GREEK_ASCII: usize = 2; +const GREEK_NON_ASCII: usize = 35; +const TURKISH_ASCII: usize = 26; +const TURKISH_NON_ASCII: usize = 13; +const HEBREW_ASCII: usize = 2; +const HEBREW_NON_ASCII: usize = 34; +const ARABIC_ASCII: usize = 2; +const ARABIC_NON_ASCII: usize = 51; +const BALTIC_ASCII: usize = 27; +const BALTIC_NON_ASCII: usize = 19; +const THAI_ASCII: usize = 2; +const THAI_NON_ASCII: usize = 70; +#[inline(always)] +fn compute_index( + x: usize, + y: usize, + ascii_classes: usize, + non_ascii_classes: usize, +) -> Option<usize> { + if x == 0 && y == 0 { + return None; + } + if x < ascii_classes && y < ascii_classes { + return None; + } + if y >= ascii_classes { + return Some( + (ascii_classes * non_ascii_classes) + + (ascii_classes + non_ascii_classes) * (y - ascii_classes) + + x, + ); + } + Some(y * non_ascii_classes + x - 
ascii_classes) +} + +pub struct SingleByteData { + pub encoding: &'static Encoding, + lower: &'static [u8; 128], + upper: &'static [u8; 128], + probabilities: &'static [u8], + ascii: usize, + non_ascii: usize, +} + +impl SingleByteData { + #[inline(always)] + pub fn classify(&'static self, byte: u8) -> u8 { + let high = byte >> 7; + let low = byte & 0x7F; + if high == 0u8 { + self.lower[usize::from(low)] + } else { + self.upper[usize::from(low)] + } + } + + #[inline(always)] + pub fn is_latin_alphabetic(&'static self, caseless_class: u8) -> bool { + let caseless_class_usize = usize::from(caseless_class); + caseless_class_usize > 0 && caseless_class_usize < (self.ascii + self.non_ascii) + } + + #[inline(always)] + pub fn is_non_latin_alphabetic( + &'static self, + caseless_class: u8, + is_windows_1256: bool, + ) -> bool { + let caseless_class_usize = usize::from(caseless_class); + let lower_bound = if is_windows_1256 { + WINDOWS_1256_ZWNJ + } else { + 1 + }; + caseless_class_usize > lower_bound && caseless_class_usize < (self.ascii + self.non_ascii) + } + + #[inline(always)] + pub fn score( + &'static self, + current_class: u8, + previous_class: u8, + is_windows_1256: bool, + ) -> i64 { + let current_usize = usize::from(current_class); + let previous_usize = usize::from(previous_class); + let stored_boundary = self.ascii + self.non_ascii; + if current_usize < stored_boundary { + if previous_usize < stored_boundary { + // Both below + if let Some(index) = + compute_index(previous_usize, current_usize, self.ascii, self.non_ascii) + { + let b = self.probabilities[index]; + if b == 255 { + IMPLAUSIBILITY_PENALTY + } else { + i64::from(b) + } + } else { + 0 + } + } else { + // Current below stored, prev above + if current_usize == 0 + || current_usize == ASCII_DIGIT + || (is_windows_1256 && current_usize == WINDOWS_1256_ZWNJ) + { + // Current is space-like + 0 + } else { + // Current is alphabetic + let previous_unstored = previous_usize - stored_boundary; + match 
previous_unstored { + PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => 0, + IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => IMPLAUSIBILITY_PENALTY, + IMPLAUSIBLE_BEFORE_ALPHABETIC => IMPLAUSIBILITY_PENALTY, + IMPLAUSIBLE_AFTER_ALPHABETIC => 0, + PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if current_usize < self.ascii { + IMPLAUSIBILITY_PENALTY + } else { + 0 + } + } + PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if current_usize < self.ascii { + 0 + } else { + IMPLAUSIBILITY_PENALTY + } + } + _ => { + debug_assert_eq!(previous_usize, ASCII_DIGIT); + 0 + } + } + } + } + } else { + if previous_usize < stored_boundary { + // Current above, prev below + if previous_usize == 0 + || previous_usize == ASCII_DIGIT + || (is_windows_1256 && previous_usize == WINDOWS_1256_ZWNJ) + { + // Previous is space-like + 0 + } else { + // Current is alphabetic + let current_unstored = current_usize - stored_boundary; + match current_unstored { + PLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => 0, + IMPLAUSIBLE_NEXT_TO_ALPHABETIC_ON_EITHER_SIDE => IMPLAUSIBILITY_PENALTY, + IMPLAUSIBLE_BEFORE_ALPHABETIC => 0, + IMPLAUSIBLE_AFTER_ALPHABETIC => IMPLAUSIBILITY_PENALTY, + PLAUSIBLE_NEXT_TO_NON_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if previous_usize < self.ascii { + IMPLAUSIBILITY_PENALTY + } else { + 0 + } + } + PLAUSIBLE_NEXT_TO_ASCII_ALPHABETIC_ON_EITHER_SIDE => { + if previous_usize < self.ascii { + 0 + } else { + IMPLAUSIBILITY_PENALTY + } + } + _ => { + debug_assert_eq!(current_usize, ASCII_DIGIT); + 0 + } + } + } + } else if current_usize == ASCII_DIGIT || previous_usize == ASCII_DIGIT { + 0 + } else { + // Both above + IMPLAUSIBILITY_PENALTY + } + } + } +} + +impl PartialEq for SingleByteData { + #[inline] + fn eq(&self, other: &SingleByteData) -> bool { + (self as *const SingleByteData) == (other as *const SingleByteData) + } +} + +pub static SINGLE_BYTE_DATA: [SingleByteData; 20] = [ + SingleByteData { + encoding: &WINDOWS_1258_INIT, + lower: 
&DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1258, + probabilities: &DETECTOR_DATA.vietnamese, + ascii: VIETNAMESE_ASCII, + non_ascii: VIETNAMESE_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1250_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1250, + probabilities: &DETECTOR_DATA.central, + ascii: CENTRAL_ASCII, + non_ascii: CENTRAL_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_2_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.iso_8859_2, + probabilities: &DETECTOR_DATA.central, + ascii: CENTRAL_ASCII, + non_ascii: CENTRAL_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1251_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1251, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &KOI8_U_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.koi8_u, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_5_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_5, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &IBM866_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.ibm866, + probabilities: &DETECTOR_DATA.cyrillic, + ascii: CYRILLIC_ASCII, + non_ascii: CYRILLIC_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1252_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1252, + probabilities: &DETECTOR_DATA.western, + ascii: WESTERN_ASCII, + non_ascii: WESTERN_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1252_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1252_icelandic, + probabilities: &DETECTOR_DATA.icelandic, + ascii: ICELANDIC_ASCII, + non_ascii: 
ICELANDIC_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1253_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1253, + probabilities: &DETECTOR_DATA.greek, + ascii: GREEK_ASCII, + non_ascii: GREEK_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_7_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_7, + probabilities: &DETECTOR_DATA.greek, + ascii: GREEK_ASCII, + non_ascii: GREEK_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1254_INIT, + lower: &DETECTOR_DATA.turkish_ascii, + upper: &DETECTOR_DATA.windows_1254, + probabilities: &DETECTOR_DATA.turkish, + ascii: TURKISH_ASCII, + non_ascii: TURKISH_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1255_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1255, + probabilities: &DETECTOR_DATA.hebrew, + ascii: HEBREW_ASCII, + non_ascii: HEBREW_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_8_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_8, + probabilities: &DETECTOR_DATA.hebrew, + ascii: HEBREW_ASCII, + non_ascii: HEBREW_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1256_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_1256, + probabilities: &DETECTOR_DATA.arabic, + ascii: ARABIC_ASCII, + non_ascii: ARABIC_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_6_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.iso_8859_6, + probabilities: &DETECTOR_DATA.arabic, + ascii: ARABIC_ASCII, + non_ascii: ARABIC_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_1257_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.windows_1257, + probabilities: &DETECTOR_DATA.baltic, + ascii: BALTIC_ASCII, + non_ascii: BALTIC_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_13_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.iso_8859_13, + probabilities: 
&DETECTOR_DATA.baltic, + ascii: BALTIC_ASCII, + non_ascii: BALTIC_NON_ASCII, + }, + SingleByteData { + encoding: &ISO_8859_4_INIT, + lower: &DETECTOR_DATA.latin_ascii, + upper: &DETECTOR_DATA.iso_8859_4, + probabilities: &DETECTOR_DATA.baltic, + ascii: BALTIC_ASCII, + non_ascii: BALTIC_NON_ASCII, + }, + SingleByteData { + encoding: &WINDOWS_874_INIT, + lower: &DETECTOR_DATA.non_latin_ascii, + upper: &DETECTOR_DATA.windows_874, + probabilities: &DETECTOR_DATA.thai, + ascii: THAI_ASCII, + non_ascii: THAI_NON_ASCII, + }, +]; + +pub const WINDOWS_1258_INDEX: usize = 0; +pub const WINDOWS_1250_INDEX: usize = 1; +pub const ISO_8859_2_INDEX: usize = 2; +pub const WINDOWS_1251_INDEX: usize = 3; +pub const KOI8_U_INDEX: usize = 4; +pub const ISO_8859_5_INDEX: usize = 5; +pub const IBM866_INDEX: usize = 6; +pub const WINDOWS_1252_INDEX: usize = 7; +pub const WINDOWS_1252_ICELANDIC_INDEX: usize = 8; +pub const WINDOWS_1253_INDEX: usize = 9; +pub const ISO_8859_7_INDEX: usize = 10; +pub const WINDOWS_1254_INDEX: usize = 11; +pub const WINDOWS_1255_INDEX: usize = 12; +pub const ISO_8859_8_INDEX: usize = 13; +pub const WINDOWS_1256_INDEX: usize = 14; +pub const ISO_8859_6_INDEX: usize = 15; +pub const WINDOWS_1257_INDEX: usize = 16; +pub const ISO_8859_13_INDEX: usize = 17; +pub const ISO_8859_4_INDEX: usize = 18; +pub const WINDOWS_874_INDEX: usize = 19; diff --git a/third_party/rust/chardetng/src/lib.rs b/third_party/rust/chardetng/src/lib.rs new file mode 100644 index 0000000000..c19d81d943 --- /dev/null +++ b/third_party/rust/chardetng/src/lib.rs @@ -0,0 +1,3775 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +//! `chardetng` is a character encoding detector for legacy Web content. +//! +//! It is optimized for binary size in applications that already depend +//! on `encoding_rs` for other reasons. + +use encoding_rs::Decoder; +use encoding_rs::DecoderResult; +use encoding_rs::Encoding; +use encoding_rs::BIG5; +use encoding_rs::EUC_JP; +use encoding_rs::EUC_KR; +use encoding_rs::GBK; +use encoding_rs::ISO_2022_JP; +use encoding_rs::ISO_8859_8; +use encoding_rs::SHIFT_JIS; +use encoding_rs::UTF_8; +use encoding_rs::WINDOWS_1255; + +mod data; +mod tld; +use data::*; +use tld::classify_tld; +use tld::Tld; + +const LATIN_ADJACENCY_PENALTY: i64 = -50; + +const IMPLAUSIBILITY_PENALTY: i64 = -220; + +const ORDINAL_BONUS: i64 = 300; + +/// Must match the ISO-8859-2 score for " Š ". Note: There +/// are four Slovenian Wikipedia list page titles where the +/// list is split by letter so that Š stands alone for the +/// list part for Š. Let's assume that's a special case not +/// worth detecting even though the copyright sign detection +/// makes Slovenian title detection round to one percentage +/// point worse. 
const COPYRIGHT_BONUS: i64 = 222;

const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;

const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;

const NON_LATIN_ALL_CAPS_PENALTY: i64 = -40;

const NON_LATIN_MIXED_CASE_PENALTY: i64 = -20;

// Manually calibrated relative to windows-1256 Arabic
const CJK_BASE_SCORE: i64 = 41;

const CJK_SECONDARY_BASE_SCORE: i64 = 20; // Was 20

const SHIFT_JIS_SCORE_PER_KANA: i64 = 20;

const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

// Manually calibrated relative to windows-1256 Persian and Urdu
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;

const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;

// Unclear if this is a good idea; seems not harmful, but can't be sure.
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;

const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger?

const SHIFT_JIS_EXTENSION_PENALTY: i64 = SHIFT_JIS_PUA_PENALTY * 2;

const SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY: i64 = SHIFT_JIS_EXTENSION_PENALTY;

const EUC_JP_SCORE_PER_KANA: i64 = CJK_BASE_SCORE + (CJK_BASE_SCORE / 3); // Relative to Big5

const EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA: i64 = CJK_BASE_SCORE - 1;

const EUC_JP_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

const EUC_JP_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

const EUC_JP_SCORE_PER_OTHER_KANJI: i64 = CJK_SECONDARY_BASE_SCORE / 4;

const EUC_JP_INITIAL_KANA_PENALTY: i64 = -((CJK_BASE_SCORE / 3) + 1);

const EUC_JP_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 50); // Needs to be more severe than for Shift_JIS to avoid misdetecting EUC-KR!

const BIG5_SCORE_PER_LEVEL_1_HANZI: i64 = CJK_BASE_SCORE;

const BIG5_SCORE_PER_OTHER_HANZI: i64 = CJK_SECONDARY_BASE_SCORE;

const BIG5_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 30); // More severe than other PUA penalties to avoid misdetecting EUC-KR! (25 as the multiplier is too little)

const BIG5_SINGLE_BYTE_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 40);

const EUC_KR_SCORE_PER_EUC_HANGUL: i64 = CJK_BASE_SCORE + 1;

const EUC_KR_SCORE_PER_NON_EUC_HANGUL: i64 = CJK_SECONDARY_BASE_SCORE / 5;

const EUC_KR_SCORE_PER_HANJA: i64 = CJK_SECONDARY_BASE_SCORE / 2;

const EUC_KR_HANJA_AFTER_HANGUL_PENALTY: i64 = -(CJK_BASE_SCORE * 10);

const EUC_KR_LONG_WORD_PENALTY: i64 = -6;

// Defined in terms of `GBK_PUA_PENALTY` (declared further down) so that the
// two stay one point apart.
const EUC_KR_PUA_PENALTY: i64 = GBK_PUA_PENALTY - 1; // Break tie in favor of GBK

const EUC_KR_MAC_KOREAN_PENALTY: i64 = EUC_KR_PUA_PENALTY * 2;

const EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY: i64 = EUC_KR_MAC_KOREAN_PENALTY;

const GBK_SCORE_PER_LEVEL_1: i64 = CJK_BASE_SCORE;

const GBK_SCORE_PER_LEVEL_2: i64 = CJK_SECONDARY_BASE_SCORE;

const GBK_SCORE_PER_NON_EUC: i64 = CJK_SECONDARY_BASE_SCORE / 4;

const GBK_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Factor should be at least 2, but should it be larger?

const GBK_SINGLE_BYTE_EXTENSION_PENALTY: i64 = GBK_PUA_PENALTY * 4;

const CJK_LATIN_ADJACENCY_PENALTY: i64 = -CJK_BASE_SCORE; // smaller penalty than LATIN_ADJACENCY_PENALTY

const CJ_PUNCTUATION: i64 = CJK_BASE_SCORE / 2;

const CJK_OTHER: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Latin letter caseless class
const LATIN_LETTER: u8 = 1;

/// Reports whether `label` contains at least one byte that is
///
/// * non-ASCII (`0x80` or above),
/// * an ASCII period (`.`), or
/// * an ASCII upper-case letter (`A` through `Z`).
///
/// Returns `false` for an empty slice and for labels made up solely of
/// other ASCII bytes (lower-case letters, digits, hyphens, ...).
fn contains_upper_case_period_or_non_ascii(label: &[u8]) -> bool {
    label
        .iter()
        .any(|&b| b >= 0x80 || b == b'.' || b.is_ascii_uppercase())
}

// For Latin, we only penalize pairwise bad transitions
// if one participant is non-ASCII. This avoids violating
// the principle that ASCII pairs never contribute to the
// score. (Maybe that's a bad principle, though!)
#[derive(PartialEq)]
enum LatinCaseState {
    /// The previous byte was not an alphabetic character.
    Space,
    /// The previous byte was a single upper-case letter.
    Upper,
    /// The previous byte was a lower-case letter.
    Lower,
    /// The previous bytes were two or more consecutive upper-case letters.
    AllCaps,
}

// For non-Latin, we calculate case-related penalty
// or bonus on a per-non-Latin-word basis.
+#[derive(PartialEq)] +enum NonLatinCaseState { + Space, + Upper, + Lower, + UpperLower, + AllCaps, + Mix, +} + +struct NonLatinCasedCandidate { + data: &'static SingleByteData, + prev: u8, + case_state: NonLatinCaseState, + prev_ascii: bool, + current_word_len: u64, + longest_word: u64, + ibm866: bool, + prev_was_a0: bool, // Only used with IBM866 +} + +impl NonLatinCasedCandidate { + fn new(data: &'static SingleByteData) -> Self { + NonLatinCasedCandidate { + data: data, + prev: 0, + case_state: NonLatinCaseState::Space, + prev_ascii: true, + current_word_len: 0, + longest_word: 0, + ibm866: data == &SINGLE_BYTE_DATA[IBM866_INDEX], + prev_was_a0: false, + } + } + + fn feed(&mut self, buffer: &[u8]) -> Option<i64> { + let mut score = 0i64; + for &b in buffer { + let class = self.data.classify(b); + if class == 255 { + return None; + } + let caseless_class = class & 0x7F; + + let ascii = b < 0x80; + let ascii_pair = self.prev_ascii && ascii; + + let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false); + + // The purpose of this state machine is to avoid misdetecting Greek as + // Cyrillic by: + // + // * Giving a small bonus to words that start with an upper-case letter + // and are lower-case for the rest. + // * Giving a large penalty to start with one lower-case letter followed + // by all upper-case (obviously upper and lower case inverted, which + // unfortunately is possible due to KOI8-U). + // * Giving a small per-word penalty to all-uppercase KOI8-U (to favor + // all-lowercase Greek over all-caps KOI8-U). + // * Giving large penalties for mixed-case other than initial upper-case. + // This also helps relative to non-cased encodings. + + // ASCII doesn't participate in non-Latin casing. + if caseless_class == LATIN_LETTER { + // Latin + // Mark this word as a mess. 
If there end up being non-Latin + // letters in this word, the ASCII-adjacency penalty gets + // applied to Latin/non-Latin pairs and the mix penalty + // to non-Latin/non-Latin pairs. + // XXX Apply penalty here + self.case_state = NonLatinCaseState::Mix; + } else if !non_ascii_alphabetic { + // Space + match self.case_state { + NonLatinCaseState::Space + | NonLatinCaseState::Upper + | NonLatinCaseState::Lower => {} + NonLatinCaseState::UpperLower => { + // Intentionally applied only once per word. + score += NON_LATIN_CAPITALIZATION_BONUS; + } + NonLatinCaseState::AllCaps => { + // Intentionally applied only once per word. + if self.data == &SINGLE_BYTE_DATA[KOI8_U_INDEX] { + // Apply only to KOI8-U. + score += NON_LATIN_ALL_CAPS_PENALTY; + } + } + NonLatinCaseState::Mix => { + // Per letter + score += NON_LATIN_MIXED_CASE_PENALTY * (self.current_word_len as i64); + } + } + self.case_state = NonLatinCaseState::Space; + } else if (class >> 7) == 0 { + // Lower case + match self.case_state { + NonLatinCaseState::Space => { + self.case_state = NonLatinCaseState::Lower; + } + NonLatinCaseState::Upper => { + self.case_state = NonLatinCaseState::UpperLower; + } + NonLatinCaseState::Lower + | NonLatinCaseState::UpperLower + | NonLatinCaseState::Mix => {} + NonLatinCaseState::AllCaps => { + self.case_state = NonLatinCaseState::Mix; + } + } + } else { + // Upper case + match self.case_state { + NonLatinCaseState::Space => { + self.case_state = NonLatinCaseState::Upper; + } + NonLatinCaseState::Upper => { + self.case_state = NonLatinCaseState::AllCaps; + } + NonLatinCaseState::Lower | NonLatinCaseState::UpperLower => { + self.case_state = NonLatinCaseState::Mix; + } + NonLatinCaseState::AllCaps | NonLatinCaseState::Mix => {} + } + } + + // XXX Apply penalty if > 16 + if non_ascii_alphabetic { + self.current_word_len += 1; + } else { + if self.current_word_len > self.longest_word { + self.longest_word = self.current_word_len; + } + self.current_word_len = 0; + } + + let 
is_a0 = b == 0xA0; + if !ascii_pair { + // 0xA0 is no-break space in many other encodings, so avoid + // assigning score to IBM866 when 0xA0 occurs next to itself + // or a space-like byte. + if !(self.ibm866 + && ((is_a0 && (self.prev_was_a0 || self.prev == 0)) + || caseless_class == 0 && self.prev_was_a0)) + { + score += self.data.score(caseless_class, self.prev, false); + } + + if self.prev == LATIN_LETTER && non_ascii_alphabetic { + score += LATIN_ADJACENCY_PENALTY; + } else if caseless_class == LATIN_LETTER + && self.data.is_non_latin_alphabetic(self.prev, false) + { + score += LATIN_ADJACENCY_PENALTY; + } + } + + self.prev_ascii = ascii; + self.prev = caseless_class; + self.prev_was_a0 = is_a0; + } + Some(score) + } +} + +enum OrdinalState { + Other, + Space, + PeriodAfterN, + OrdinalExpectingSpace, + OrdinalExpectingSpaceUndoImplausibility, + OrdinalExpectingSpaceOrDigit, + OrdinalExpectingSpaceOrDigitUndoImplausibily, + UpperN, + LowerN, + FeminineAbbreviationStartLetter, + Digit, + Roman, + Copyright, +} + +struct LatinCandidate { + data: &'static SingleByteData, + prev: u8, + case_state: LatinCaseState, + prev_non_ascii: u32, + ordinal_state: OrdinalState, // Used only when `windows1252 == true` + windows1252: bool, +} + +impl LatinCandidate { + fn new(data: &'static SingleByteData) -> Self { + LatinCandidate { + data: data, + prev: 0, + case_state: LatinCaseState::Space, + prev_non_ascii: 0, + ordinal_state: OrdinalState::Space, + windows1252: data == &SINGLE_BYTE_DATA[WINDOWS_1252_INDEX], + } + } + + fn feed(&mut self, buffer: &[u8]) -> Option<i64> { + let mut score = 0i64; + for &b in buffer { + let class = self.data.classify(b); + if class == 255 { + return None; + } + let caseless_class = class & 0x7F; + + let ascii = b < 0x80; + let ascii_pair = self.prev_non_ascii == 0 && ascii; + + let non_ascii_penalty = match self.prev_non_ascii { + 0 | 1 | 2 => 0, + 3 => -5, + 4 => -20, + _ => -200, + }; + score += non_ascii_penalty; + // XXX if has 
Vietnamese-only characters and word length > 7, + // apply penalty + + if !self.data.is_latin_alphabetic(caseless_class) { + self.case_state = LatinCaseState::Space; + } else if (class >> 7) == 0 { + // Penalizing lower case after two upper case + // is important for avoiding misdetecting + // windows-1250 as windows-1252 (byte 0x9F). + if self.case_state == LatinCaseState::AllCaps && !ascii_pair { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + self.case_state = LatinCaseState::Lower; + } else { + match self.case_state { + LatinCaseState::Space => { + self.case_state = LatinCaseState::Upper; + } + LatinCaseState::Upper | LatinCaseState::AllCaps => { + self.case_state = LatinCaseState::AllCaps; + } + LatinCaseState::Lower => { + if !ascii_pair { + // XXX How bad is this for Irish Gaelic? + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + self.case_state = LatinCaseState::Upper; + } + } + } + + // Treat pairing space-like, which can be non-ASCII, with ASCII as + // ASCIIish enough not to get a score in order to avoid giving + // ASCII i and I in windows-1254 next to windows-125x apostrophe/quote + // a score. This avoids detecting English I’ as Turkish. 
+ let ascii_ish_pair = ascii_pair + || (ascii && self.prev == 0) + || (caseless_class == 0 && self.prev_non_ascii == 0); + + if !ascii_ish_pair { + score += self.data.score(caseless_class, self.prev, false); + } + + if self.windows1252 { + // This state machine assigns score to the sequences + // * " º " (Spanish) + // * " ª " (Spanish) + // * ".ª " (Spanish) + // * ".º " (Spanish) + // * "n.º1" (Spanish) + // * " Mª " (Spanish) + // * " Dª " (Spanish) + // * " Nª " (Spanish) + // * " Sª " (Spanish) + // * " 3º " (Italian, where 3 is an ASCII digit) + // * " 3ª " (Italian, where 3 is an ASCII digit) + // * " Xº " (Italian, where X is a small Roman numeral) + // * " Xª " (Italian, where X is a small Roman numeral) + // * " Nº1" (Italian, where 1 is an ASCII digit) + // * " Nº " (Italian) + // * " © " (otherwise ASCII-only) + // which are problematic to deal with by pairwise scoring + // without messing up Romanian detection. + // Initial sc + match self.ordinal_state { + OrdinalState::Other => { + if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } + } + OrdinalState::Space => { + if caseless_class == 0 { + // pass + } else if b == 0xAA || b == 0xBA { + self.ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if b == b'M' || b == b'D' || b == b'S' { + self.ordinal_state = OrdinalState::FeminineAbbreviationStartLetter; + } else if b == b'N' { + // numero or Nuestra + self.ordinal_state = OrdinalState::UpperN; + } else if b == b'n' { + // numero + self.ordinal_state = OrdinalState::LowerN; + } else if caseless_class == (ASCII_DIGIT as u8) { + self.ordinal_state = OrdinalState::Digit; + } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 + /* X */ + { + self.ordinal_state = OrdinalState::Roman; + } else if b == 0xA9 { + self.ordinal_state = OrdinalState::Copyright; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpace => { + if caseless_class == 0 { + 
score += ORDINAL_BONUS; + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpaceUndoImplausibility => { + if caseless_class == 0 { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpaceOrDigit => { + if caseless_class == 0 { + score += ORDINAL_BONUS; + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == (ASCII_DIGIT as u8) { + score += ORDINAL_BONUS; + // Deliberately set to `Other` + self.ordinal_state = OrdinalState::Other; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily => { + if caseless_class == 0 { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == (ASCII_DIGIT as u8) { + score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; + // Deliberately set to `Other` + self.ordinal_state = OrdinalState::Other; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::UpperN => { + if b == 0xAA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if b == 0xBA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if b == b'.' { + self.ordinal_state = OrdinalState::PeriodAfterN; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::LowerN => { + if b == 0xBA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; + } else if b == b'.' 
{ + self.ordinal_state = OrdinalState::PeriodAfterN; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::FeminineAbbreviationStartLetter => { + if b == 0xAA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::Digit => { + if b == 0xAA || b == 0xBA { + self.ordinal_state = OrdinalState::OrdinalExpectingSpace; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == (ASCII_DIGIT as u8) { + // pass + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::Roman => { + if b == 0xAA || b == 0xBA { + self.ordinal_state = + OrdinalState::OrdinalExpectingSpaceUndoImplausibility; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 + /* X */ + { + // pass + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::PeriodAfterN => { + if b == 0xBA { + self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit; + } else if caseless_class == 0 { + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + OrdinalState::Copyright => { + if caseless_class == 0 { + score += COPYRIGHT_BONUS; + self.ordinal_state = OrdinalState::Space; + } else { + self.ordinal_state = OrdinalState::Other; + } + } + } + } + + if ascii { + self.prev_non_ascii = 0; + } else { + self.prev_non_ascii += 1; + } + self.prev = caseless_class; + } + Some(score) + } +} + +struct ArabicFrenchCandidate { + data: &'static SingleByteData, + prev: u8, + case_state: LatinCaseState, + prev_ascii: bool, + current_word_len: u64, + longest_word: u64, +} + +impl 
ArabicFrenchCandidate { + fn new(data: &'static SingleByteData) -> Self { + ArabicFrenchCandidate { + data: data, + prev: 0, + case_state: LatinCaseState::Space, + prev_ascii: true, + current_word_len: 0, + longest_word: 0, + } + } + + fn feed(&mut self, buffer: &[u8]) -> Option<i64> { + let mut score = 0i64; + for &b in buffer { + let class = self.data.classify(b); + if class == 255 { + return None; + } + let caseless_class = class & 0x7F; + + let ascii = b < 0x80; + let ascii_pair = self.prev_ascii && ascii; + + if caseless_class != LATIN_LETTER { + // We compute case penalties for French only + self.case_state = LatinCaseState::Space; + } else if (class >> 7) == 0 { + if self.case_state == LatinCaseState::AllCaps && !ascii_pair { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + self.case_state = LatinCaseState::Lower; + } else { + match self.case_state { + LatinCaseState::Space => { + self.case_state = LatinCaseState::Upper; + } + LatinCaseState::Upper | LatinCaseState::AllCaps => { + self.case_state = LatinCaseState::AllCaps; + } + LatinCaseState::Lower => { + if !ascii_pair { + score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; + } + self.case_state = LatinCaseState::Upper; + } + } + } + + // Count only Arabic word length and ignore French + let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, true); + // XXX apply penalty if > 23 + if non_ascii_alphabetic { + self.current_word_len += 1; + } else { + if self.current_word_len > self.longest_word { + self.longest_word = self.current_word_len; + } + self.current_word_len = 0; + } + + if !ascii_pair { + score += self.data.score(caseless_class, self.prev, true); + + if self.prev == LATIN_LETTER && non_ascii_alphabetic { + score += LATIN_ADJACENCY_PENALTY; + } else if caseless_class == LATIN_LETTER + && self.data.is_non_latin_alphabetic(self.prev, true) + { + score += LATIN_ADJACENCY_PENALTY; + } + } + + self.prev_ascii = ascii; + self.prev = caseless_class; + } + Some(score) + } 
+} + +struct CaselessCandidate { + data: &'static SingleByteData, + prev: u8, + prev_ascii: bool, + current_word_len: u64, + longest_word: u64, +} + +impl CaselessCandidate { + fn new(data: &'static SingleByteData) -> Self { + CaselessCandidate { + data: data, + prev: 0, + prev_ascii: true, + current_word_len: 0, + longest_word: 0, + } + } + + fn feed(&mut self, buffer: &[u8]) -> Option<i64> { + let mut score = 0i64; + for &b in buffer { + let class = self.data.classify(b); + if class == 255 { + return None; + } + let caseless_class = class & 0x7F; + + let ascii = b < 0x80; + let ascii_pair = self.prev_ascii && ascii; + + let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false); + // Apply penalty if > 23 and not Thai + if non_ascii_alphabetic { + self.current_word_len += 1; + } else { + if self.current_word_len > self.longest_word { + self.longest_word = self.current_word_len; + } + self.current_word_len = 0; + } + + if !ascii_pair { + score += self.data.score(caseless_class, self.prev, false); + + if self.prev == LATIN_LETTER && non_ascii_alphabetic { + score += LATIN_ADJACENCY_PENALTY; + } else if caseless_class == LATIN_LETTER + && self.data.is_non_latin_alphabetic(self.prev, false) + { + score += LATIN_ADJACENCY_PENALTY; + } + } + + self.prev_ascii = ascii; + self.prev = caseless_class; + } + Some(score) + } +} + +fn is_ascii_punctuation(byte: u8) -> bool { + match byte { + b'.' | b',' | b':' | b';' | b'?' | b'!' 
=> true, + _ => false, + } +} + +struct LogicalCandidate { + data: &'static SingleByteData, + prev: u8, + prev_ascii: bool, + plausible_punctuation: u64, + current_word_len: u64, + longest_word: u64, +} + +impl LogicalCandidate { + fn new(data: &'static SingleByteData) -> Self { + LogicalCandidate { + data: data, + prev: 0, + prev_ascii: true, + plausible_punctuation: 0, + current_word_len: 0, + longest_word: 0, + } + } + + fn feed(&mut self, buffer: &[u8]) -> Option<i64> { + let mut score = 0i64; + for &b in buffer { + let class = self.data.classify(b); + if class == 255 { + return None; + } + let caseless_class = class & 0x7F; + + let ascii = b < 0x80; + let ascii_pair = self.prev_ascii && ascii; + + let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false); + // XXX apply penalty if > 22 + if non_ascii_alphabetic { + self.current_word_len += 1; + } else { + if self.current_word_len > self.longest_word { + self.longest_word = self.current_word_len; + } + self.current_word_len = 0; + } + + if !ascii_pair { + score += self.data.score(caseless_class, self.prev, false); + + let prev_non_ascii_alphabetic = self.data.is_non_latin_alphabetic(self.prev, false); + if caseless_class == 0 && prev_non_ascii_alphabetic && is_ascii_punctuation(b) { + self.plausible_punctuation += 1; + } + + if self.prev == LATIN_LETTER && non_ascii_alphabetic { + score += LATIN_ADJACENCY_PENALTY; + } else if caseless_class == LATIN_LETTER && prev_non_ascii_alphabetic { + score += LATIN_ADJACENCY_PENALTY; + } + } + + self.prev_ascii = ascii; + self.prev = caseless_class; + } + Some(score) + } +} + +struct VisualCandidate { + data: &'static SingleByteData, + prev: u8, + prev_ascii: bool, + prev_punctuation: bool, + plausible_punctuation: u64, + current_word_len: u64, + longest_word: u64, +} + +impl VisualCandidate { + fn new(data: &'static SingleByteData) -> Self { + VisualCandidate { + data: data, + prev: 0, + prev_ascii: true, + prev_punctuation: false, + 
plausible_punctuation: 0, + current_word_len: 0, + longest_word: 0, + } + } + + fn feed(&mut self, buffer: &[u8]) -> Option<i64> { + let mut score = 0i64; + for &b in buffer { + let class = self.data.classify(b); + if class == 255 { + return None; + } + let caseless_class = class & 0x7F; + + let ascii = b < 0x80; + let ascii_pair = self.prev_ascii && ascii; + + let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false); + // XXX apply penalty if > 22 + if non_ascii_alphabetic { + self.current_word_len += 1; + } else { + if self.current_word_len > self.longest_word { + self.longest_word = self.current_word_len; + } + self.current_word_len = 0; + } + + if !ascii_pair { + score += self.data.score(caseless_class, self.prev, false); + + if non_ascii_alphabetic && self.prev_punctuation { + self.plausible_punctuation += 1; + } + + if self.prev == LATIN_LETTER && non_ascii_alphabetic { + score += LATIN_ADJACENCY_PENALTY; + } else if caseless_class == LATIN_LETTER + && self.data.is_non_latin_alphabetic(self.prev, false) + { + score += LATIN_ADJACENCY_PENALTY; + } + } + + self.prev_ascii = ascii; + self.prev = caseless_class; + self.prev_punctuation = caseless_class == 0 && is_ascii_punctuation(b); + } + Some(score) + } +} + +struct Utf8Candidate { + decoder: Decoder, +} + +impl Utf8Candidate { + fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> { + let mut dst = [0u8; 1024]; + let mut total_read = 0; + loop { + let (result, read, _) = self.decoder.decode_to_utf8_without_replacement( + &buffer[total_read..], + &mut dst, + last, + ); + total_read += read; + match result { + DecoderResult::InputEmpty => { + return Some(0); + } + DecoderResult::Malformed(_, _) => { + return None; + } + DecoderResult::OutputFull => { + continue; + } + } + } + } +} + +struct Iso2022Candidate { + decoder: Decoder, +} + +impl Iso2022Candidate { + fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> { + let mut dst = [0u16; 1024]; + let mut total_read = 0; 
/// Bonus score for a code unit that occurs in a 128-entry table.
///
/// The index of the first entry equal to `u` selects the bonus
/// `(128 - index) / 16` using integer division, so earlier entries earn
/// more: index 0 yields 8, indices 1 through 16 yield 7, and indices 113
/// through 127 yield 0. A code unit that is absent from `table` earns 0.
fn cjk_extra_score(u: u16, table: &'static [u16; 128]) -> i64 {
    match table.iter().position(|&entry| entry == u) {
        Some(index) => ((128 - index) / 16) as i64,
        None => 0,
    }
}
+ self.prev = LatinCj::Other; + } else if u >= 0x4E00 && u <= 0x9FA5 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + if b >= 0xA1 && b <= 0xFE { + match self.prev_byte { + 0xA1..=0xD7 => { + score += GBK_SCORE_PER_LEVEL_1; + score += + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_simplified); + } + 0xD8..=0xFE => score += GBK_SCORE_PER_LEVEL_2, + _ => { + score += GBK_SCORE_PER_NON_EUC; + } + } + } else { + score += self.maybe_set_as_pending(GBK_SCORE_PER_NON_EUC); + } + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + // XXX score? + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } else if u >= 0xE000 && u < 0xF900 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + // Treat the GB18030-required PUA mappings as non-EUC ideographs. 
+ match u { + 0xE78D..=0xE796 + | 0xE816..=0xE818 + | 0xE81E + | 0xE826 + | 0xE82B + | 0xE82C + | 0xE831 + | 0xE832 + | 0xE83B + | 0xE843 + | 0xE854 + | 0xE855 + | 0xE864 => { + score += GBK_SCORE_PER_NON_EUC; + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } + _ => { + score += GBK_PUA_PENALTY; + self.prev = LatinCj::Other; + } + } + } else { + match u { + 0x3000 // Distinct from Korean, space + | 0x3001 // Distinct from Korean, enumeration comma + | 0x3002 // Distinct from Korean, full stop + | 0xFF08 // Distinct from Korean, parenthesis + | 0xFF09 // Distinct from Korean, parenthesis + | 0xFF01 // Distinct from Japanese, exclamation + | 0xFF0C // Distinct from Japanese, comma + | 0xFF1B // Distinct from Japanese, semicolon + | 0xFF1F // Distinct from Japanese, question + => { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += CJ_PUNCTUATION; + } + 0..=0x7F => { + self.pending_score = None; // Discard pending score + } + _ => { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += CJK_OTHER; + } + } + self.prev = LatinCj::Other; + } + } else if written == 2 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + let u = dst[0]; + if u >= 0xDB80 && u <= 0xDBFF { + score += GBK_PUA_PENALTY; + self.prev = LatinCj::Other; + } else if u >= 0xD480 && u < 0xD880 { + score += GBK_SCORE_PER_NON_EUC; + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } else { + score += CJK_OTHER; + self.prev = LatinCj::Other; + } + } + match result { + DecoderResult::InputEmpty => { + assert_eq!(read, 1); + } + DecoderResult::Malformed(malformed_len, _) => { + if (self.prev_byte == 0xA0 || self.prev_byte == 0xFE || self.prev_byte == 0xFD) + && (b < 0x80 || b == 0xFF) + { + // Mac OS Chinese Simplified 
single-byte that conflicts with code page GBK lead byte + // followed by ASCII or a non-conflicting single-byte extension. + self.pending_score = None; // Just in case + score += GBK_SINGLE_BYTE_EXTENSION_PENALTY; + if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') { + self.prev = LatinCj::AsciiLetter; + } else if b == 0xFF { + score += GBK_SINGLE_BYTE_EXTENSION_PENALTY; + self.prev = LatinCj::Other; + } else { + self.prev = LatinCj::Other; + } + // The GBK decoder has the pending ASCII concept, which is + // a problem with this trickery, so let's reset the state. + self.decoder = GBK.new_decoder_without_bom_handling(); + } else if malformed_len == 1 && b == 0xFF { + // Mac OS Chinese Simplified single-byte extension that doesn't conflict with lead bytes + self.pending_score = None; // Just in case + score += GBK_SINGLE_BYTE_EXTENSION_PENALTY; + self.prev = LatinCj::Other; + // The GBK decoder has the pending ASCII concept, which is + // a problem with this trickery, so let's reset the state. 
// Shift_JIS and Big5
/// Reports whether `b` is one of the bytes 0x8A, 0x8B, 0x8E, 0x91..=0x97,
/// 0x9A, 0x9B, 0x9E or 0xB0.
///
/// The Shift_JIS and Big5 candidates consult this in their
/// `maybe_set_as_pending` helpers to decide whether a score is withheld.
fn problematic_lead(b: u8) -> bool {
    matches!(
        b,
        0x8A | 0x8B | 0x8E | 0x91..=0x97 | 0x9A | 0x9B | 0x9E | 0xB0
    )
}

// GBK and EUC-KR
/// Superset of [`problematic_lead`] that also accepts 0x82, 0x84, 0x85 and
/// 0xA0.
///
/// The GBK and EUC-KR candidates consult this in their
/// `maybe_set_as_pending` helpers.
fn more_problematic_lead(b: u8) -> bool {
    match b {
        0x82 | 0x84 | 0x85 | 0xA0 => true,
        other => problematic_lead(other),
    }
}
&& u <= 0xFF9F { + if !self.half_width_katakana_seen { + self.half_width_katakana_seen = true; + // To avoid misdetecting title-length inputs + score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY; + } + self.pending_score = None; // Discard pending score + score += HALF_WIDTH_KATAKANA_SCORE; + + if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 { + self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed; + } else if u >= 0xFF8A && u <= 0xFF8E { + self.half_width_katakana_state = + HalfWidthKatakana::DakutenOrHandakutenAllowed; + } else if u == 0xFF9E { + if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden { + score += IMPLAUSIBILITY_PENALTY; + } else { + score += HALF_WIDTH_KATAKANA_VOICING_SCORE; + } + } else if u == 0xFF9F { + if half_width_katakana_state + != HalfWidthKatakana::DakutenOrHandakutenAllowed + { + score += IMPLAUSIBILITY_PENALTY; + } else { + score += HALF_WIDTH_KATAKANA_VOICING_SCORE; + } + } + + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } else if u >= 0x3040 && u < 0x3100 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += SHIFT_JIS_SCORE_PER_KANA; + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + if self.prev_byte < 0x98 || (self.prev_byte == 0x98 && b < 0x73) { + score += self.maybe_set_as_pending( + SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI + + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji), + ); + } else { + score += self.maybe_set_as_pending(SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI); + } + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } else if u >= 0xE000 && u < 0xF900 { + if let Some(pending) = 
self.pending_score { + score += pending; + self.pending_score = None; + } + score += SHIFT_JIS_PUA_PENALTY; + self.prev = LatinCj::Other; + } else { + match u { + 0x3000 // Distinct from Korean, space + | 0x3001 // Distinct from Korean, enumeration comma + | 0x3002 // Distinct from Korean, full stop + | 0xFF08 // Distinct from Korean, parenthesis + | 0xFF09 // Distinct from Korean, parenthesis + => { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + // Not really needed for CJK distinction + // but let's give non-zero score for these + // common byte pairs anyway. + score += CJ_PUNCTUATION; + } + 0..=0x7F => { + self.pending_score = None; // Discard pending score + } + 0x80 => { + // This is a control character that overlaps euro + // in windows-1252 and happens to be a non-error + // is Shift_JIS. + self.pending_score = None; // Discard pending score + score += IMPLAUSIBILITY_PENALTY; + } + _ => { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += CJK_OTHER; + } + } + self.prev = LatinCj::Other; + } + } + match result { + DecoderResult::InputEmpty => { + assert_eq!(read, 1); + } + DecoderResult::Malformed(malformed_len, _) => { + if (((self.prev_byte >= 0x81 && self.prev_byte <= 0x9F) + || (self.prev_byte >= 0xE0 && self.prev_byte <= 0xFC)) + && ((b >= 0x40 && b <= 0x7E) || (b >= 0x80 && b <= 0xFC))) + && !((self.prev_byte == 0x82 && b >= 0xFA) + || (self.prev_byte == 0x84 && ((b >= 0xDD && b <= 0xE4) || b >= 0xFB)) + || (self.prev_byte == 0x86 && b >= 0xF2 && b <= 0xFA) + || (self.prev_byte == 0x87 && b >= 0x77 && b <= 0x7D) + || (self.prev_byte == 0xFC && b >= 0xF5)) + { + // Shift_JIS2004 or MacJapanese + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += SHIFT_JIS_EXTENSION_PENALTY; + // Approximate boundary + if self.prev_byte < 0x87 { + self.prev = LatinCj::Other; + } else { + if self.prev == 
LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } + } else if malformed_len == 1 && (b == 0xA0 || b >= 0xFD) { + self.pending_score = None; // Just in case + score += SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY; + self.prev = LatinCj::Other; + } else { + return None; + } + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + self.prev_byte = b; + } + if last { + let (result, _, _) = self + .decoder + .decode_to_utf16_without_replacement(b"", &mut dst, true); + match result { + DecoderResult::InputEmpty => {} + DecoderResult::Malformed(_, _) => { + return None; + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + } + Some(score) + } +} + +struct EucJpCandidate { + decoder: Decoder, + non_ascii_seen: bool, + half_width_katakana_state: HalfWidthKatakana, + prev: LatinCj, + prev_byte: u8, + prev_prev_byte: u8, +} + +impl EucJpCandidate { + fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> { + let mut score = 0i64; + let mut src = [0u8]; + let mut dst = [0u16; 2]; + for &b in buffer { + src[0] = b; + let (result, read, written) = self + .decoder + .decode_to_utf16_without_replacement(&src, &mut dst, false); + if written > 0 { + let half_width_katakana_state = self.half_width_katakana_state; + self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden; + let u = dst[0]; + if !self.non_ascii_seen && u >= 0x80 { + self.non_ascii_seen = true; + if u >= 0x3040 && u < 0x3100 { + // Remove the kana advantage over initial Big5 + // hanzi. 
+ score += EUC_JP_INITIAL_KANA_PENALTY; + } + } + if (u >= u16::from(b'a') && u <= u16::from(b'z')) + || (u >= u16::from(b'A') && u <= u16::from(b'Z')) + { + if self.prev == LatinCj::Cj { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::AsciiLetter; + } else if u >= 0xFF61 && u <= 0xFF9F { + score += HALF_WIDTH_KATAKANA_SCORE; + + if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 { + self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed; + } else if u >= 0xFF8A && u <= 0xFF8E { + self.half_width_katakana_state = + HalfWidthKatakana::DakutenOrHandakutenAllowed; + } else if u == 0xFF9E { + if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden { + score += IMPLAUSIBILITY_PENALTY; + } else { + score += HALF_WIDTH_KATAKANA_VOICING_SCORE; + } + } else if u == 0xFF9F { + if half_width_katakana_state + != HalfWidthKatakana::DakutenOrHandakutenAllowed + { + score += IMPLAUSIBILITY_PENALTY; + } else { + score += HALF_WIDTH_KATAKANA_VOICING_SCORE; + } + } + + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Other; + } else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) { + match u { + 0x3090 // hiragana wi + | 0x3091 // hiragana we + | 0x30F0 // katakana wi + | 0x30F1 // katakana we + => { + // Remove advantage over Big5 Hanzi + score += EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA; + } + _ => { + score += EUC_JP_SCORE_PER_KANA; + } + } + if self.prev == LatinCj::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinCj::Cj; + } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) { + if self.prev_prev_byte == 0x8F { + score += EUC_JP_SCORE_PER_OTHER_KANJI; + } else if self.prev_byte < 0xD0 { + score += EUC_JP_SCORE_PER_LEVEL_1_KANJI; + score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji); + } else { + score += EUC_JP_SCORE_PER_LEVEL_2_KANJI; + } + if self.prev == LatinCj::AsciiLetter { + score += 
/// Scoring state for the Big5 candidate: a decoder that is fed one byte at a
/// time, plus the context needed to score each decoded code unit.
struct Big5Candidate {
    decoder: Decoder,
    /// Coarse category (ASCII letter, CJ, or other) of the previously scored unit.
    prev: LatinCj,
    /// The byte that was fed immediately before the current one.
    prev_byte: u8,
    /// A hanzi score withheld by `maybe_set_as_pending`; the next decoded
    /// unit decides whether it is added to the total or discarded.
    pending_score: Option<i64>,
}

impl Big5Candidate {
    /// Returns `s` for immediate use, unless the previous unit was not CJ and
    /// the previous byte is a `problematic_lead`; in that case `s` is parked
    /// in `pending_score` and 0 is returned instead.
    ///
    /// Panics if a pending score is already parked, so callers must flush or
    /// discard it first.
    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
        assert!(self.pending_score.is_none());
        if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
            s
        } else {
            self.pending_score = Some(s);
            0
        }
    }

    /// Feeds `buffer` to the decoder one byte at a time and scores whatever
    /// each byte completes.
    ///
    /// Returns `None` when the decoder reports a malformed sequence that none
    /// of the tolerated cases below covers, or when `last` is set and the
    /// final flush is malformed. Otherwise returns the score accumulated for
    /// this buffer; `prev`, `prev_byte` and `pending_score` carry over to the
    /// next call.
    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
        let mut score = 0i64;
        let mut src = [0u8];
        let mut dst = [0u16; 2];
        for &b in buffer {
            src[0] = b;
            // `last` is always false here; end of stream is handled after the loop.
            let (result, read, written) = self
                .decoder
                .decode_to_utf16_without_replacement(&src, &mut dst, false);
            if written == 1 {
                // Exactly one UTF-16 code unit was produced by this byte.
                let u = dst[0];
                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
                {
                    self.pending_score = None; // Discard pending score
                    if self.prev == LatinCj::Cj {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::AsciiLetter;
                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
                    // Ideograph ranges. The parked score is released first,
                    // which also satisfies the assert in `maybe_set_as_pending`.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    // `prev_byte` is the byte before `b`, i.e. the lead of this pair.
                    match self.prev_byte {
                        0xA4..=0xC6 => {
                            score += self.maybe_set_as_pending(BIG5_SCORE_PER_LEVEL_1_HANZI);
                            // score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_traditional);
                        }
                        _ => {
                            score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
                        }
                    }
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else {
                    match u {
                        0x3000 // Distinct from Korean, space
                        | 0x3001 // Distinct from Korean, enumeration comma
                        | 0x3002 // Distinct from Korean, full stop
                        | 0xFF08 // Distinct from Korean, parenthesis
                        | 0xFF09 // Distinct from Korean, parenthesis
                        | 0xFF01 // Distinct from Japanese, exclamation
                        | 0xFF0C // Distinct from Japanese, comma
                        | 0xFF1B // Distinct from Japanese, semicolon
                        | 0xFF1F // Distinct from Japanese, question
                        => {
                            if let Some(pending) = self.pending_score {
                                score += pending;
                                self.pending_score = None;
                            }
                            // Not really needed for CJK distinction
                            // but let's give non-zero score for these
                            // common byte pairs anyway.
                            score += CJ_PUNCTUATION;
                        }
                        0..=0x7F => {
                            // Remaining ASCII (letters were handled above).
                            self.pending_score = None; // Discard pending score
                        }
                        _ => {
                            if let Some(pending) = self.pending_score {
                                score += pending;
                                self.pending_score = None;
                            }
                            score += CJK_OTHER;
                        }
                    }
                    self.prev = LatinCj::Other;
                }
            } else if written == 2 {
                // Two UTF-16 code units were produced by this byte.
                if let Some(pending) = self.pending_score {
                    score += pending;
                    self.pending_score = None;
                }
                if dst[0] == 0xCA || dst[0] == 0xEA {
                    // First unit is U+00CA or U+00EA; presumably the second
                    // unit is a combining mark -- TODO confirm against the decoder.
                    score += CJK_OTHER;
                    self.prev = LatinCj::Other;
                } else {
                    // NOTE(review): 0xD480 is not a surrogate value (high
                    // surrogates start at 0xD800, plane-2 leads at 0xD840).
                    // This lower bound may be a transposed 0xD840 -- confirm
                    // before relying on the assertion.
                    debug_assert!(dst[0] >= 0xD480 && dst[0] < 0xD880);
                    score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                }
            }
            match result {
                DecoderResult::InputEmpty => {
                    assert_eq!(read, 1);
                }
                DecoderResult::Malformed(malformed_len, _) => {
                    if self.prev_byte >= 0x81
                        && self.prev_byte <= 0xFE
                        && ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE))
                    {
                        // The byte pair is in the Big5 range but unmapped.
                        // Treat as PUA to avoid rejecting Big5-UAO, etc.
                        // We don't reprocess `b` even if ASCII, since it's
                        // logically part of the pair.
                        if let Some(pending) = self.pending_score {
                            score += pending;
                            self.pending_score = None;
                        }
                        score += BIG5_PUA_PENALTY;
                        // Assume Hanzi semantics
                        if self.prev == LatinCj::AsciiLetter {
                            score += CJK_LATIN_ADJACENCY_PENALTY;
                        }
                        self.prev = LatinCj::Cj;
                    } else if (self.prev_byte == 0xA0
                        || self.prev_byte == 0xFD
                        || self.prev_byte == 0xFE)
                        && (b < 0x80 || b == 0xFF)
                    {
                        // Mac OS Chinese Traditional single-byte that conflicts with code page Big5 lead byte
                        // followed by ASCII or a non-conflicting single-byte extension.
                        self.pending_score = None; // Just in case
                        score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
                            self.prev = LatinCj::AsciiLetter;
                        } else if b == 0xFF {
                            // 0xFF is itself a single-byte extension, so the
                            // penalty is charged a second time.
                            score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
                            self.prev = LatinCj::Other;
                        } else {
                            self.prev = LatinCj::Other;
                        }
                    } else if malformed_len == 1 && b == 0xFF {
                        // Mac OS Chinese Traditional single-byte extension that doesn't conflict with lead bytes
                        self.pending_score = None; // Just in case
                        score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
                        self.prev = LatinCj::Other;
                    } else {
                        return None;
                    }
                }
                DecoderResult::OutputFull => {
                    // `dst` holds two units, and both `written` branches
                    // above assume that is enough for one input byte.
                    unreachable!();
                }
            }
            self.prev_byte = b;
        }
        if last {
            // Flush with empty input so that a sequence left incomplete at
            // the end of the stream is reported as malformed.
            let (result, _, _) = self
                .decoder
                .decode_to_utf16_without_replacement(b"", &mut dst, true);
            match result {
                DecoderResult::InputEmpty => {}
                DecoderResult::Malformed(_, _) => {
                    return None;
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
        }
        Some(score)
    }
}
+ LatinKorean::Hangul | LatinKorean::Hanja => { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + _ => {} + } + self.prev = LatinKorean::AsciiLetter; + self.current_word_len = 0; + } else if u >= 0xAC00 && u <= 0xD7A3 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + if self.prev_was_euc_range && in_euc_range { + score += EUC_KR_SCORE_PER_EUC_HANGUL; + score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_hangul); + } else { + score += self.maybe_set_as_pending(EUC_KR_SCORE_PER_NON_EUC_HANGUL); + } + if self.prev == LatinKorean::AsciiLetter { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + self.prev = LatinKorean::Hangul; + self.current_word_len += 1; + if self.current_word_len > 5 { + score += EUC_KR_LONG_WORD_PENALTY; + } + } else if (u >= 0x4E00 && u < 0xAC00) || (u >= 0xF900 && u <= 0xFA0B) { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += EUC_KR_SCORE_PER_HANJA; + match self.prev { + LatinKorean::AsciiLetter => { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + LatinKorean::Hangul => { + score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY; + } + _ => {} + } + self.prev = LatinKorean::Hanja; + self.current_word_len += 1; + if self.current_word_len > 5 { + score += EUC_KR_LONG_WORD_PENALTY; + } + } else { + if u >= 0x80 { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + score += CJK_OTHER; + } else { + self.pending_score = None; // Discard pending score + } + self.prev = LatinKorean::Other; + self.current_word_len = 0; + } + } + match result { + DecoderResult::InputEmpty => { + assert_eq!(read, 1); + } + DecoderResult::Malformed(malformed_len, _) => { + if (self.prev_byte == 0xC9 || self.prev_byte == 0xFE) && b >= 0xA1 && b <= 0xFE + { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + // The byte pair is in code page 949 EUDC range + score += EUC_KR_PUA_PENALTY; 
+ // Assume Hanja semantics + match self.prev { + LatinKorean::AsciiLetter => { + score += CJK_LATIN_ADJACENCY_PENALTY; + } + LatinKorean::Hangul => { + score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY; + } + _ => {} + } + self.prev = LatinKorean::Hanja; + self.current_word_len += 1; + if self.current_word_len > 5 { + score += EUC_KR_LONG_WORD_PENALTY; + } + } else if (self.prev_byte == 0xA1 + || (self.prev_byte >= 0xA3 && self.prev_byte <= 0xA8) + || (self.prev_byte >= 0xAA && self.prev_byte <= 0xAD)) + && (b >= 0x7B && b <= 0x7D) + { + if let Some(pending) = self.pending_score { + score += pending; + self.pending_score = None; + } + // MacKorean symbols in range not part of code page 949 + score += EUC_KR_MAC_KOREAN_PENALTY; + self.prev = LatinKorean::Other; + self.current_word_len = 0; + } else if (self.prev_byte >= 0x81 && self.prev_byte <= 0x84) + && (b <= 0x80 || b == 0xFF) + { + // MacKorean single-byte that conflicts with code page 949 lead byte + // followed by ASCII or a non-conflicting single-byte extension. 
+ self.pending_score = None; // Just in case + score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; + if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') { + self.prev = LatinKorean::AsciiLetter; + } else if b == 0x80 || b == 0xFF { + score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; + self.prev = LatinKorean::Other; + } else { + self.prev = LatinKorean::Other; + } + self.current_word_len = 0; + } else if malformed_len == 1 && (b == 0x80 || b == 0xFF) { + // MacKorean single-byte extensions that don't conflict with lead bytes + self.pending_score = None; // Just in case + score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; + self.prev = LatinKorean::Other; + self.current_word_len = 0; + } else { + return None; + } + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + self.prev_was_euc_range = in_euc_range; + self.prev_byte = b; + } + if last { + let (result, _, _) = self + .decoder + .decode_to_utf16_without_replacement(b"", &mut dst, true); + match result { + DecoderResult::InputEmpty => {} + DecoderResult::Malformed(_, _) => { + return None; + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + } + Some(score) + } +} + +enum InnerCandidate { + Latin(LatinCandidate), + NonLatinCased(NonLatinCasedCandidate), + Caseless(CaselessCandidate), + ArabicFrench(ArabicFrenchCandidate), + Logical(LogicalCandidate), + Visual(VisualCandidate), + Utf8(Utf8Candidate), + Iso2022(Iso2022Candidate), + Shift(ShiftJisCandidate), + EucJp(EucJpCandidate), + EucKr(EucKrCandidate), + Big5(Big5Candidate), + Gbk(GbkCandidate), +} + +impl InnerCandidate { + fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> { + match self { + InnerCandidate::Latin(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::NonLatinCased(c) => { + if let Some(new_score) = 
c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Caseless(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::ArabicFrench(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Logical(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Visual(c) => { + if let Some(new_score) = c.feed(buffer) { + if last { + // Treat EOF as space-like + if let Some(additional_score) = c.feed(b" ") { + Some(new_score + additional_score) + } else { + None + } + } else { + Some(new_score) + } + } else { + None + } + } + InnerCandidate::Utf8(c) => c.feed(buffer, last), + InnerCandidate::Iso2022(c) => c.feed(buffer, last), + InnerCandidate::Shift(c) => c.feed(buffer, last), + InnerCandidate::EucJp(c) => c.feed(buffer, last), + InnerCandidate::EucKr(c) => c.feed(buffer, last), + InnerCandidate::Big5(c) => c.feed(buffer, last), + InnerCandidate::Gbk(c) => c.feed(buffer, last), + } + } +} + +fn encoding_for_tld(tld: Tld) -> usize { + match tld { + Tld::CentralWindows | Tld::CentralCyrillic => EncodingDetector::CENTRAL_WINDOWS_INDEX, + Tld::Cyrillic => EncodingDetector::CYRILLIC_WINDOWS_INDEX, + Tld::Generic | Tld::Western | 
Tld::WesternCyrillic | Tld::WesternArabic | Tld::Eu => { + EncodingDetector::WESTERN_INDEX + } + Tld::IcelandicFaroese => EncodingDetector::ICELANDIC_INDEX, + Tld::Greek => EncodingDetector::GREEK_ISO_INDEX, + Tld::TurkishAzeri => EncodingDetector::TURKISH_INDEX, + Tld::Hebrew => EncodingDetector::LOGICAL_INDEX, + Tld::Arabic => EncodingDetector::ARABIC_WINDOWS_INDEX, + Tld::Baltic => EncodingDetector::BALTIC_WINDOWS_INDEX, + Tld::Vietnamese => EncodingDetector::VIETNAMESE_INDEX, + Tld::Thai => EncodingDetector::THAI_INDEX, + Tld::Simplified | Tld::SimplifiedTraditional => EncodingDetector::GBK_INDEX, + Tld::Traditional | Tld::TraditionalSimplified => EncodingDetector::BIG5_INDEX, + Tld::Japanese => EncodingDetector::SHIFT_JIS_INDEX, + Tld::Korean => EncodingDetector::EUC_KR_INDEX, + Tld::CentralIso => EncodingDetector::CENTRAL_ISO_INDEX, + } +} + +fn encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool { + match tld { + Tld::CentralWindows => encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX, + Tld::Cyrillic => { + encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + } + Tld::Western => encoding == EncodingDetector::WESTERN_INDEX, + Tld::Greek => { + encoding == EncodingDetector::GREEK_WINDOWS_INDEX + || encoding == EncodingDetector::GREEK_ISO_INDEX + } + Tld::TurkishAzeri => encoding == EncodingDetector::TURKISH_INDEX, + Tld::Hebrew => encoding == EncodingDetector::LOGICAL_INDEX, + Tld::Arabic => { + encoding == EncodingDetector::ARABIC_WINDOWS_INDEX + || encoding == EncodingDetector::ARABIC_ISO_INDEX + } + Tld::Baltic => { + encoding == EncodingDetector::BALTIC_WINDOWS_INDEX + || encoding == EncodingDetector::BALTIC_ISO13_INDEX + || encoding == EncodingDetector::BALTIC_ISO4_INDEX + } + Tld::Vietnamese => encoding == EncodingDetector::VIETNAMESE_INDEX, + Tld::Thai => encoding == 
EncodingDetector::THAI_INDEX, + Tld::Simplified => encoding == EncodingDetector::GBK_INDEX, + Tld::Traditional => encoding == EncodingDetector::BIG5_INDEX, + Tld::Japanese => { + encoding == EncodingDetector::SHIFT_JIS_INDEX + || encoding == EncodingDetector::EUC_JP_INDEX + } + Tld::Korean => encoding == EncodingDetector::EUC_KR_INDEX, + Tld::SimplifiedTraditional | Tld::TraditionalSimplified => { + encoding == EncodingDetector::GBK_INDEX || encoding == EncodingDetector::BIG5_INDEX + } + Tld::CentralIso => encoding == EncodingDetector::CENTRAL_ISO_INDEX, + Tld::IcelandicFaroese => encoding == EncodingDetector::ICELANDIC_INDEX, + Tld::WesternCyrillic => { + encoding == EncodingDetector::WESTERN_INDEX + || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + } + Tld::CentralCyrillic => { + encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX + || encoding == EncodingDetector::CENTRAL_ISO_INDEX + || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + } + Tld::WesternArabic => { + encoding == EncodingDetector::WESTERN_INDEX + || encoding == EncodingDetector::ARABIC_WINDOWS_INDEX + || encoding == EncodingDetector::ARABIC_ISO_INDEX + } + Tld::Eu => { + encoding == EncodingDetector::WESTERN_INDEX + || encoding == EncodingDetector::ICELANDIC_INDEX + || encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX + || encoding == EncodingDetector::CENTRAL_ISO_INDEX + || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX + || encoding == EncodingDetector::CYRILLIC_KOI_INDEX + || encoding == EncodingDetector::CYRILLIC_IBM_INDEX + || encoding == EncodingDetector::CYRILLIC_ISO_INDEX + || encoding == EncodingDetector::GREEK_WINDOWS_INDEX + || encoding == 
EncodingDetector::GREEK_ISO_INDEX + || encoding == EncodingDetector::BALTIC_WINDOWS_INDEX + || encoding == EncodingDetector::BALTIC_ISO13_INDEX + || encoding == EncodingDetector::BALTIC_ISO4_INDEX + } + Tld::Generic => false, + } +} + +fn score_adjustment(score: i64, encoding: usize, tld: Tld) -> i64 { + if score < 1 { + return 0; + } + // This is the most ad hoc part of this library. + let (divisor, constant) = match tld { + Tld::Generic => { + unreachable!(); + } + Tld::CentralWindows | Tld::CentralIso => { + match encoding { + EncodingDetector::WESTERN_INDEX + | EncodingDetector::ICELANDIC_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Cyrillic => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::Greek => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::TurkishAzeri => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::ICELANDIC_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Hebrew => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::Arabic => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::EUC_KR_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::VIETNAMESE_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Baltic => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::ICELANDIC_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Vietnamese => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::ICELANDIC_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::Thai => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::EUC_KR_INDEX + | EncodingDetector::SHIFT_JIS_INDEX + | EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::CYRILLIC_WINDOWS_INDEX + | EncodingDetector::CYRILLIC_ISO_INDEX + | EncodingDetector::CYRILLIC_KOI_INDEX + | EncodingDetector::CYRILLIC_IBM_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::ARABIC_WINDOWS_INDEX + | EncodingDetector::ARABIC_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Simplified + | Tld::Traditional + | Tld::TraditionalSimplified + | Tld::SimplifiedTraditional + | Tld::Japanese + | Tld::Korean => { + // If TLD default is valid, everything else scores zero + return score; + } + Tld::IcelandicFaroese => { + match encoding { + EncodingDetector::CENTRAL_WINDOWS_INDEX + | EncodingDetector::CENTRAL_ISO_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. 
+ return score; + } + _ => (50, 60), + } + } + Tld::CentralCyrillic => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::GREEK_WINDOWS_INDEX + | EncodingDetector::GREEK_ISO_INDEX + | EncodingDetector::VISUAL_INDEX + | EncodingDetector::LOGICAL_INDEX + | EncodingDetector::BALTIC_WINDOWS_INDEX + | EncodingDetector::BALTIC_ISO4_INDEX + | EncodingDetector::BALTIC_ISO13_INDEX + | EncodingDetector::TURKISH_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + Tld::Eu => { + match encoding { + EncodingDetector::BIG5_INDEX + | EncodingDetector::GBK_INDEX + | EncodingDetector::EUC_JP_INDEX + | EncodingDetector::TURKISH_INDEX + | EncodingDetector::VIETNAMESE_INDEX => { + // XXX Tune this better instead of this kind of absolute. + return score; + } + _ => (50, 60), + } + } + }; + (score / divisor) + constant +} + +struct Candidate { + inner: InnerCandidate, + score: Option<i64>, +} + +impl Candidate { + fn feed(&mut self, buffer: &[u8], last: bool) { + if let Some(old_score) = self.score { + if let Some(new_score) = self.inner.feed(buffer, last) { + self.score = Some(old_score + new_score); + } else { + self.score = None; + } + } + } + + fn new_latin(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Latin(LatinCandidate::new(data)), + score: Some(0), + } + } + + fn new_non_latin_cased(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::NonLatinCased(NonLatinCasedCandidate::new(data)), + score: Some(0), + } + } + + fn new_caseless(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Caseless(CaselessCandidate::new(data)), + score: Some(0), + } + } + + fn new_arabic_french(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::ArabicFrench(ArabicFrenchCandidate::new(data)), + score: Some(0), + } + } + + fn 
new_logical(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Logical(LogicalCandidate::new(data)), + score: Some(0), + } + } + + fn new_visual(data: &'static SingleByteData) -> Self { + Candidate { + inner: InnerCandidate::Visual(VisualCandidate::new(data)), + score: Some(0), + } + } + + fn new_utf_8() -> Self { + Candidate { + inner: InnerCandidate::Utf8(Utf8Candidate { + decoder: UTF_8.new_decoder_without_bom_handling(), + }), + score: Some(0), + } + } + + fn new_iso_2022_jp() -> Self { + Candidate { + inner: InnerCandidate::Iso2022(Iso2022Candidate { + decoder: ISO_2022_JP.new_decoder_without_bom_handling(), + }), + score: Some(0), + } + } + + fn new_shift_jis() -> Self { + Candidate { + inner: InnerCandidate::Shift(ShiftJisCandidate { + decoder: SHIFT_JIS.new_decoder_without_bom_handling(), + half_width_katakana_seen: false, + half_width_katakana_state: HalfWidthKatakana::DakutenForbidden, + prev: LatinCj::Other, + prev_byte: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn new_euc_jp() -> Self { + Candidate { + inner: InnerCandidate::EucJp(EucJpCandidate { + decoder: EUC_JP.new_decoder_without_bom_handling(), + non_ascii_seen: false, + half_width_katakana_state: HalfWidthKatakana::DakutenForbidden, + prev: LatinCj::Other, + prev_byte: 0, + prev_prev_byte: 0, + }), + score: Some(0), + } + } + + fn new_euc_kr() -> Self { + Candidate { + inner: InnerCandidate::EucKr(EucKrCandidate { + decoder: EUC_KR.new_decoder_without_bom_handling(), + prev_byte: 0, + prev_was_euc_range: false, + prev: LatinKorean::Other, + current_word_len: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn new_big5() -> Self { + Candidate { + inner: InnerCandidate::Big5(Big5Candidate { + decoder: BIG5.new_decoder_without_bom_handling(), + prev: LatinCj::Other, + prev_byte: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn new_gbk() -> Self { + Candidate { + inner: InnerCandidate::Gbk(GbkCandidate { + decoder: 
GBK.new_decoder_without_bom_handling(), + prev: LatinCj::Other, + prev_byte: 0, + pending_score: None, + }), + score: Some(0), + } + } + + fn score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64> { + match &self.inner { + InnerCandidate::NonLatinCased(c) => { + if c.longest_word < 2 { + return None; + } + } + InnerCandidate::Caseless(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + InnerCandidate::ArabicFrench(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + InnerCandidate::Logical(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + InnerCandidate::Visual(c) => { + if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { + return None; + } + } + _ => {} + } + if tld == Tld::Generic { + return self.score; + } + if let Some(score) = self.score { + if encoding == encoding_for_tld(tld) { + return Some(score + 1); + } + if encoding_is_native_to_tld(tld, encoding) { + return Some(score); + } + if expectation_is_valid { + return Some(score - score_adjustment(score, encoding, tld)); + } + // If expectation is no longer valid, fall back to + // generic behavior. 
/// A rolling record of at most the last two bytes pushed into it.
#[derive(Clone, Copy)]
enum BeforeNonAscii {
    /// Nothing has been recorded.
    None,
    /// Exactly one byte has been recorded.
    One([u8; 1]),
    /// The two most recent bytes, oldest first.
    Two([u8; 2]),
}

impl BeforeNonAscii {
    /// Returns the recorded bytes, oldest first (zero, one or two bytes).
    fn as_slice(&self) -> &[u8] {
        match *self {
            BeforeNonAscii::None => &[],
            BeforeNonAscii::One(ref bytes) => &bytes[..],
            BeforeNonAscii::Two(ref bytes) => &bytes[..],
        }
    }

    /// Records the tail of `buffer`.
    ///
    /// An empty `buffer` leaves the record untouched. A one-byte `buffer`
    /// is appended, dropping the oldest byte if two were already held. A
    /// longer `buffer` replaces the record with its last two bytes.
    fn push(&mut self, buffer: &[u8]) {
        let updated = match buffer.len() {
            0 => return,
            1 => {
                let newest = buffer[0];
                match *self {
                    BeforeNonAscii::None => BeforeNonAscii::One([newest]),
                    BeforeNonAscii::One(held) => BeforeNonAscii::Two([held[0], newest]),
                    BeforeNonAscii::Two(held) => BeforeNonAscii::Two([held[1], newest]),
                }
            }
            len => BeforeNonAscii::Two([buffer[len - 2], buffer[len - 1]]),
        };
        *self = updated;
    }
}
/// Guess the encoding given the bytes pushed to the detector so far
/// (via `feed()`), the top-level domain name from which the bytes were
/// loaded, and an indication of whether to consider UTF-8 as a permissible
/// guess.
///
/// The `tld` argument takes the rightmost DNS label of the hostname of the
/// host the stream was loaded from in lower-case ASCII form. That is, if
/// the label is an internationalized top-level domain name, it must be
/// provided in its Punycode form. If the TLD that the stream was loaded
/// from is unavailable, `None` may be passed instead, which is equivalent
/// to passing `Some(b"com")`.
///
/// If the `allow_utf8` argument is set to `false`, the return value of
/// this method won't be `encoding_rs::UTF_8`. When performing detection
/// on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
/// unless the user has taken a specific contextual action to request an
/// override. This way, Web developers cannot start depending on UTF-8
/// detection. Such reliance would make the Web Platform more brittle.
///
/// Returns the guessed encoding.
///
/// # Panics
///
/// If `tld` contains non-ASCII, period, or upper-case letters. (The panic
/// condition is intentionally limited to signs of failing to extract the
/// label correctly, failing to provide it in its Punycode form, and failure
/// to lower-case it. Full DNS label validation is intentionally not performed
/// to avoid panics when the reality doesn't match the specs.)
pub fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding {
    // A missing TLD is handled as the generic class.
    let mut tld_type = tld.map_or(Tld::Generic, |tld| {
        assert!(!contains_upper_case_period_or_non_ascii(tld));
        classify_tld(tld)
    });

    // Only ASCII has been seen, including at least one ESC, and the
    // ISO-2022-JP candidate still has a score.
    if self.non_ascii_seen == 0
        && self.esc_seen
        && self.candidates[Self::ISO_2022_JP_INDEX].score.is_some()
    {
        return ISO_2022_JP;
    }

    // The UTF-8 candidate still has a score.
    if self.candidates[Self::UTF_8_INDEX].score.is_some() {
        if allow_utf8 {
            return UTF_8;
        }
        // Various test cases that prohibit UTF-8 detection want to
        // see windows-1252 specifically. These tests run on generic
        // domains. However, if we returned windows-1252 on
        // some non-generic domains, we'd cause reloads.
        return self.candidates[encoding_for_tld(tld_type)].encoding();
    }

    // Start from the default encoding for the TLD. Since `max` starts at
    // zero, only a candidate whose adjusted score is strictly positive
    // can replace that default below.
    let mut encoding = self.candidates[encoding_for_tld(tld_type)].encoding();
    let mut max = 0i64;
    // The expectation is valid when at least one candidate that is native
    // to the TLD still has a score.
    let mut expectation_is_valid = false;
    if tld_type != Tld::Generic {
        for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
            if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
                expectation_is_valid = true;
                break;
            }
        }
    }
    if !expectation_is_valid {
        // Flip Chinese and Central around
        match tld_type {
            Tld::Simplified => {
                if self.candidates[Self::BIG5_INDEX].score.is_some() {
                    tld_type = Tld::Traditional;
                    expectation_is_valid = true;
                }
            }
            Tld::Traditional => {
                if self.candidates[Self::GBK_INDEX].score.is_some() {
                    tld_type = Tld::Simplified;
                    expectation_is_valid = true;
                }
            }
            Tld::CentralWindows => {
                if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
                    tld_type = Tld::CentralIso;
                    expectation_is_valid = true;
                }
            }
            Tld::CentralIso => {
                if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
                    tld_type = Tld::CentralWindows;
                    expectation_is_valid = true;
                }
            }
            _ => {}
        }
    }
    // Pick the highest-scoring candidate among those at or after
    // `FIRST_NORMAL`; on a tie the earlier candidate is kept.
    for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
        if let Some(score) = candidate.score(i, tld_type, expectation_is_valid) {
            if score > max {
                max = score;
                encoding = candidate.encoding();
            }
        }
    }
    // The visual candidate sits before `FIRST_NORMAL`, so the loop above
    // skipped it. It overrides the result only when it has more plausible
    // punctuation than the logical candidate and either outscores the
    // current maximum or the current pick is windows-1255.
    let visual = &self.candidates[Self::VISUAL_INDEX];
    if let Some(visual_score) = visual.score(Self::VISUAL_INDEX, tld_type, expectation_is_valid)
    {
        if (visual_score > max || encoding == WINDOWS_1255)
            && visual.plausible_punctuation()
                > self.candidates[Self::LOGICAL_INDEX].plausible_punctuation()
        {
            // max = visual_score;
            encoding = ISO_8859_8;
        }
    }

    encoding
}
Tld::Generic; + let mut expectation_is_valid = false; + if tld_type != Tld::Generic { + for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) { + if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() { + expectation_is_valid = true; + break; + } + } + } + if !expectation_is_valid { + // Flip Chinese and Central around + match tld_type { + Tld::Simplified => { + if self.candidates[Self::BIG5_INDEX].score.is_some() { + tld_type = Tld::Traditional; + expectation_is_valid = true; + } + } + Tld::Traditional => { + if self.candidates[Self::GBK_INDEX].score.is_some() { + tld_type = Tld::Simplified; + expectation_is_valid = true; + } + } + Tld::CentralWindows => { + if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() { + tld_type = Tld::CentralIso; + expectation_is_valid = true; + } + } + Tld::CentralIso => { + if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() { + tld_type = Tld::CentralWindows; + expectation_is_valid = true; + } + } + _ => {} + } + } + for (i, candidate) in self.candidates.iter().enumerate() { + if encoding == candidate.encoding() { + return candidate.score(i, tld_type, expectation_is_valid); + } + } + Some(0) + } + + const FIRST_NORMAL: usize = 3; + + const UTF_8_INDEX: usize = 0; + + const ISO_2022_JP_INDEX: usize = 1; + + const VISUAL_INDEX: usize = 2; + + const GBK_INDEX: usize = 3; + + const EUC_JP_INDEX: usize = 4; + + const EUC_KR_INDEX: usize = 5; + + const SHIFT_JIS_INDEX: usize = 6; + + const BIG5_INDEX: usize = 7; + + const WESTERN_INDEX: usize = 8; + + const CYRILLIC_WINDOWS_INDEX: usize = 9; + + const CENTRAL_WINDOWS_INDEX: usize = 10; + + const CENTRAL_ISO_INDEX: usize = 11; + + const ARABIC_WINDOWS_INDEX: usize = 12; + + const ICELANDIC_INDEX: usize = 13; + + const TURKISH_INDEX: usize = 14; + + const THAI_INDEX: usize = 15; + + const LOGICAL_INDEX: usize = 16; + + const GREEK_WINDOWS_INDEX: usize = 17; + + const GREEK_ISO_INDEX: usize = 18; + + const BALTIC_WINDOWS_INDEX: 
usize = 19; + + const BALTIC_ISO13_INDEX: usize = 20; + + const CYRILLIC_KOI_INDEX: usize = 21; + + const CYRILLIC_IBM_INDEX: usize = 22; + + const ARABIC_ISO_INDEX: usize = 23; + + const VIETNAMESE_INDEX: usize = 24; + + const BALTIC_ISO4_INDEX: usize = 25; + + const CYRILLIC_ISO_INDEX: usize = 26; + + /// Creates a new instance of the detector. + pub fn new() -> Self { + EncodingDetector { + candidates: [ + Candidate::new_utf_8(), // 0 + Candidate::new_iso_2022_jp(), // 1 + Candidate::new_visual(&SINGLE_BYTE_DATA[ISO_8859_8_INDEX]), // 2 + Candidate::new_gbk(), // 3 + Candidate::new_euc_jp(), // 4 + Candidate::new_euc_kr(), // 5 + Candidate::new_shift_jis(), // 6 + Candidate::new_big5(), // 7 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_INDEX]), // 8 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1251_INDEX]), // 9 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1250_INDEX]), // 10 + Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_2_INDEX]), // 11 + Candidate::new_arabic_french(&SINGLE_BYTE_DATA[WINDOWS_1256_INDEX]), // 12 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_ICELANDIC_INDEX]), // 13 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1254_INDEX]), // 14 + Candidate::new_caseless(&SINGLE_BYTE_DATA[WINDOWS_874_INDEX]), // 15 + Candidate::new_logical(&SINGLE_BYTE_DATA[WINDOWS_1255_INDEX]), // 16 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1253_INDEX]), // 17 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_7_INDEX]), // 18 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1257_INDEX]), // 19 + Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_13_INDEX]), // 20 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[KOI8_U_INDEX]), // 21 + Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[IBM866_INDEX]), // 22 + Candidate::new_caseless(&SINGLE_BYTE_DATA[ISO_8859_6_INDEX]), // 23 + Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1258_INDEX]), // 24 + Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_4_INDEX]), // 25 + 
Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_5_INDEX]), // 26 + ], + non_ascii_seen: 0, + last_before_non_ascii: BeforeNonAscii::None, + esc_seen: false, + closed: false, + } + } + + /// Queries whether the TLD is considered non-generic and could affect the guess. + pub fn tld_may_affect_guess(tld: Option<&[u8]>) -> bool { + if let Some(tld) = tld { + classify_tld(tld) != Tld::Generic + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use detone::IterDecomposeVietnamese; + use encoding_rs::IBM866; + use encoding_rs::ISO_8859_2; + use encoding_rs::ISO_8859_4; + use encoding_rs::ISO_8859_5; + use encoding_rs::ISO_8859_6; + use encoding_rs::ISO_8859_7; + use encoding_rs::KOI8_U; + use encoding_rs::WINDOWS_1250; + use encoding_rs::WINDOWS_1251; + use encoding_rs::WINDOWS_1252; + use encoding_rs::WINDOWS_1253; + use encoding_rs::WINDOWS_1254; + use encoding_rs::WINDOWS_1256; + use encoding_rs::WINDOWS_1257; + use encoding_rs::WINDOWS_1258; + use encoding_rs::WINDOWS_874; + + fn check_bytes(bytes: &[u8], encoding: &'static Encoding) { + let mut det = EncodingDetector::new(); + det.feed(bytes, true); + let enc = det.guess(None, false); + let (decoded, _) = enc.decode_without_bom_handling(bytes); + println!("{:?}", decoded); + assert_eq!(enc, encoding); + } + + fn check(input: &str, encoding: &'static Encoding) { + let orthographic; + let (bytes, _, _) = if encoding == WINDOWS_1258 { + orthographic = input + .chars() + .decompose_vietnamese_tones(true) + .collect::<String>(); + encoding.encode(&orthographic) + } else { + encoding.encode(input) + }; + check_bytes(&bytes, encoding); + } + + #[test] + fn test_i_apostrophe() { + let mut det = EncodingDetector::new(); + det.feed(b"I\x92", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_one_by_one() { + let mut det = EncodingDetector::new(); + det.feed(b"n", false); + det.feed(b".", false); + det.feed(b"\xBA", false); + 
det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_two_together() { + let mut det = EncodingDetector::new(); + det.feed(b"n.", false); + det.feed(b"\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_one_by_one_extra_before() { + let mut det = EncodingDetector::new(); + det.feed(b" n", false); + det.feed(b".", false); + det.feed(b"\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_one_before() { + let mut det = EncodingDetector::new(); + det.feed(b"n", false); + det.feed(b".\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_streaming_numero_longer_first_buffer() { + let mut det = EncodingDetector::new(); + det.feed(b"rrn.", false); + det.feed(b"\xBA", false); + det.feed(b"1", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + } + + #[test] + fn test_empty() { + let mut det = EncodingDetector::new(); + let seen_non_ascii = det.feed(b"", true); + let enc = det.guess(None, false); + assert_eq!(enc, WINDOWS_1252); + assert!(!seen_non_ascii); + } + + #[test] + fn test_fi() { + check("Ääni", WINDOWS_1252); + } + + #[test] + fn test_fi_bis() { + check("Tämä", WINDOWS_1252); + } + + #[test] + fn test_pt() { + check( + "Este é um teste de codificação de caracteres.", + WINDOWS_1252, + ); + } + + #[test] + fn test_is() { + check("Þetta er kóðunarpróf á staf. 
Fyrir sum tungumál sem nota latneska stafi þurfum við meira inntak til að taka ákvörðunina.", WINDOWS_1252); + } + + #[test] + fn test_ru_short() { + check("Русский", WINDOWS_1251); + } + + #[test] + fn test_ru() { + check("Это тест кодировки символов.", WINDOWS_1251); + } + + #[test] + fn test_ru_iso() { + check("Это тест кодировки символов.", ISO_8859_5); + } + + #[test] + fn test_ru_ibm() { + check("Это тест кодировки символов.", IBM866); + } + + #[test] + fn test_ru_koi() { + check("Это тест кодировки символов.", KOI8_U); + } + + #[test] + fn test_uk() { + check("Це тест на кодування символів.", WINDOWS_1251); + } + + #[test] + fn test_uk_koi() { + check("Це тест на кодування символів.", KOI8_U); + } + + #[test] + fn test_el_short() { + check("Ελληνικά", WINDOWS_1253); + } + + #[test] + fn test_el() { + check( + "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης", + WINDOWS_1253, + ); + } + + #[test] + fn test_el_iso() { + check( + "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης", + ISO_8859_7, + ); + } + + #[test] + fn test_de() { + check("Straße", WINDOWS_1252); + } + + #[test] + fn test_he() { + check("\u{5E2}\u{5D1}\u{5E8}\u{5D9}\u{5EA}", WINDOWS_1255); + } + + #[test] + fn test_2022() { + check("日本語", ISO_2022_JP); + } + + #[test] + fn test_th() { + check("นี่คือการทดสอบการเข้ารหัสอักขระ", WINDOWS_874); + } + + #[test] + fn test_vi() { + check("Đây là một thử nghiệm mã hóa ký tự.", WINDOWS_1258); + } + + #[test] + fn test_tr() { + check("Bu bir karakter kodlama testidir. 
Latince karakterleri kullanan bazı dillerde karar vermek için daha fazla girdiye ihtiyacımız var.", WINDOWS_1254); + } + + #[test] + fn test_simplified() { + check("这是一个字符编码测试。", GBK); + } + + #[test] + fn test_traditional() { + check("這是一個字符編碼測試。", BIG5); + } + + #[test] + fn test_ko() { + check("이것은 문자 인코딩 테스트입니다.", EUC_KR); + } + + #[test] + fn test_shift() { + check("これは文字実験です。", SHIFT_JIS); + } + + #[test] + fn test_euc() { + check("これは文字実験です。", EUC_JP); + } + + #[test] + fn test_ar() { + check("هذا هو اختبار ترميز الأحرف.", WINDOWS_1256); + } + + #[test] + fn test_ar_iso() { + check("هذا هو اختبار ترميز الأحرف.", ISO_8859_6); + } + + #[test] + fn test_fa() { + check("این یک تست رمزگذاری کاراکتر است.", WINDOWS_1256); + } + + #[test] + fn test_visual() { + check(".םיוות דודיק ןחבמ והז", ISO_8859_8); + } + + #[test] + fn test_yi() { + check("דאָס איז אַ טעסט פֿאַר קאָדירונג פון כאַראַקטער.", WINDOWS_1255); + } + + #[test] + fn test_it() { + check("è", WINDOWS_1252); + } + + #[test] + fn test_en() { + check("isn’t", WINDOWS_1252); + } + + #[test] + fn test_en_bis() { + check("Rock ’n Roll", WINDOWS_1252); + } + + #[test] + fn test_ca() { + check("Codificació de caràcters", WINDOWS_1252); + } + + #[test] + fn test_et() { + check("või", WINDOWS_1252); + } + + #[test] + fn test_pl_iso() { + check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", ISO_8859_2); + } + + #[test] + fn test_pl() { + check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", WINDOWS_1250); + } + + #[test] + fn test_lt() { + check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", WINDOWS_1257); + } + + // TODO: Detected as ISO-8859-2. 
+ // #[test] + // fn test_lt_windows_iso_8859_4() { + // check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", ISO_8859_4); + // } + + #[test] + fn test_lv() { + check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", WINDOWS_1257); + } + + #[test] + fn test_lv_iso_8859_4() { + check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", ISO_8859_4); + } + + #[test] + fn test_a0() { + // Test that this isn't IBM866. TODO: What about GBK with fully paired 0xA0? + check("\u{A0}\u{A0} \u{A0}", WINDOWS_1252); + } + + #[test] + fn test_a0a0() { + // Test that this isn't GBK or EUC-KR. + check("\u{A0}\u{A0}", WINDOWS_1252); + } + + #[test] + fn test_space_copyright_space() { + check(" © ", WINDOWS_1252); + } + + #[test] + fn test_space_masculine_space() { + check(" º ", WINDOWS_1252); + } + + #[test] + fn test_space_feminine_space() { + check(" ª ", WINDOWS_1252); + } + + #[test] + fn test_period_masculine_space() { + check(".º ", WINDOWS_1252); + } + + #[test] + fn test_period_feminine_space() { + check(".ª ", WINDOWS_1252); + } + + #[test] + fn test_maria() { + check(" Mª ", WINDOWS_1252); + } + + #[test] + fn test_dona() { + check(" Dª ", WINDOWS_1252); + } + + #[test] + fn test_nuestra() { + check(" Nª ", WINDOWS_1252); + } + + #[test] + fn test_senora() { + check(" Sª ", WINDOWS_1252); + } + + #[test] + fn test_digit_feminine() { + check(" 42ª ", WINDOWS_1252); + } + + #[test] + fn test_digit_masculine() { + check(" 42º ", WINDOWS_1252); + } + + #[test] + fn test_roman_feminine() { + check(" XIVª ", WINDOWS_1252); + } + + #[test] + fn test_roman_masculine() { + check(" XIVº ", WINDOWS_1252); + } + + #[test] + fn test_numero_uno() { + check("Nº1", 
WINDOWS_1252); + } + + #[test] + fn test_numero() { + check("Nº", WINDOWS_1252); + } + + #[test] + fn test_euro() { + check(" €9", WINDOWS_1252); + } + + #[test] + fn test_shift_jis_half_width_katakana() { + check("ハードウェアハードウェアハードウェアハードウェアハードウェア", SHIFT_JIS); + } + + #[test] + fn test_big5_pua() { + let mut v = Vec::new(); + for _ in 0..40 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xA4\x40"); + check_bytes(&v, BIG5); + } + + #[test] + fn test_big5_single_byte_a0() { + let mut v = Vec::new(); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xA0 "); + check_bytes(&v, BIG5); + } + + #[test] + fn test_big5_single_byte_ff() { + let mut v = Vec::new(); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xFF "); + check_bytes(&v, BIG5); + } + + #[test] + fn test_not_big5() { + let mut v = Vec::new(); + for _ in 0..40 { + v.extend_from_slice(b"\xA4\x40"); + } + v.extend_from_slice(b"\x81\x40\xA0\xA0"); + check_bytes(&v, IBM866); + } + + #[test] + fn test_euc_kr_pua() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xC9\xA1\xB0\xA1 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_pua_bis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFE\xA1\xB0\xA1 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_single_byte_ff() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFF "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_single_byte_81() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x81 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. 
"); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_euc_kr_single_byte_84() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x84 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, EUC_KR); + } + + #[test] + fn test_not_euc_kr() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xC9\xA0\xB0\xA1 "); + for _ in 0..40 { + v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_shift_jis_x0213() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x87\xE5"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, SHIFT_JIS); + } + + #[test] + fn test_shift_jis_single_byte_fd() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFD"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, SHIFT_JIS); + } + + #[test] + fn test_not_shift_jis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x84\xE0"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_not_shift_jis_bis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x87\x7D"); + for _ in 0..40 { + v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_euc_jp_x0213() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xAD\xBF"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, EUC_JP); + } + + #[test] + fn test_euc_jp_x0213_other_plane() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x8F\xFE\xF6"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, EUC_JP); + } + + #[test] + fn test_not_euc_jp() { + let mut v = Vec::new(); + v.extend_from_slice(b"\x8F\xFE\xF7"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, 
WINDOWS_1252); + } + + #[test] + fn test_not_euc_jp_bis() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xA8\xDF"); + for _ in 0..80 { + v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); + } + check_bytes(&v, BIG5); + } + + #[test] + fn test_gbk_single_byte_ff() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFF"); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_gbk_single_byte_a0() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xA0 "); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_gbk_single_byte_fe() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFE "); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, GBK); + } + + #[test] + fn test_not_gbk_single_byte_fc() { + let mut v = Vec::new(); + v.extend_from_slice(b"\xFC "); + for _ in 0..80 { + v.extend_from_slice(b"\xB5\xC4"); + } + check_bytes(&v, ISO_8859_5); + } +} diff --git a/third_party/rust/chardetng/src/tld.rs b/third_party/rust/chardetng/src/tld.rs new file mode 100644 index 0000000000..9f43af92d9 --- /dev/null +++ b/third_party/rust/chardetng/src/tld.rs @@ -0,0 +1,340 @@ +/* Any copyright is dedicated to the Public Domain. 
+ * https://creativecommons.org/publicdomain/zero/1.0/ */ + +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Tld { + CentralWindows, + Cyrillic, + Western, + Greek, + TurkishAzeri, + Hebrew, + Arabic, + Baltic, + Vietnamese, + Thai, + Simplified, + Traditional, + Japanese, + Korean, + SimplifiedTraditional, + TraditionalSimplified, + CentralIso, + IcelandicFaroese, + WesternCyrillic, + CentralCyrillic, + WesternArabic, + Generic, + Eu, +} + +pub fn classify_tld(tld: &[u8]) -> Tld { + if tld.len() == 2 { + let key = [tld[0], tld[1]]; + if let Ok(i) = TWO_LETTER_KEYS.binary_search(&key) { + TWO_LETTER_VALUES[i] + } else { + Tld::Western + } + } else if tld.len() == 3 { + match tld { + b"edu" | b"gov" | b"mil" => Tld::Western, + _ => Tld::Generic, + } + } else if tld.starts_with(b"xn--") && tld.len() >= 8 { + // It's unclear if including the IDNs here is a good idea. + // Clearly, they are an anachronism relative to the era + // of legacy encodings. The idea, consistent with the previous + // approach in Firefox, is to address the case where one + // of these TLDs is configured as an alternative name for + // a server that also serves the same content from a + // two-ASCII-letter TLD. This makes the detection result + // the same either way even though otherwise this thing + // does not make much sense. 
+ if let Ok(i) = PUNYCODE_KEYS.binary_search(&&tld[4..]) { + PUNYCODE_VALUES[i] + } else { + Tld::Generic + } + } else { + Tld::Generic + } +} + +static TWO_LETTER_VALUES: [Tld; 87] = [ + Tld::Generic, // ac + Tld::Arabic, // ae + Tld::Arabic, // af + Tld::Generic, // ai + Tld::WesternCyrillic, // am + Tld::TurkishAzeri, // az + Tld::CentralCyrillic, // ba + Tld::Cyrillic, // bg + Tld::Arabic, // bh + Tld::Cyrillic, // by + Tld::Generic, // bz + Tld::Generic, // cb + Tld::Generic, // cc + Tld::Generic, // cd + Tld::Simplified, // cn + Tld::Generic, // cx + Tld::Greek, // cy + Tld::CentralWindows, // cz + Tld::Generic, // dj + Tld::Arabic, // dz + Tld::Arabic, // eg + Tld::Eu, // eu + Tld::Generic, // fm + Tld::IcelandicFaroese, // fo + Tld::WesternCyrillic, // ge + Tld::Greek, // gr + Tld::TraditionalSimplified, // hk + Tld::CentralWindows, // hr + Tld::CentralIso, // hu + Tld::Hebrew, // il + Tld::Generic, // in + Tld::Arabic, // iq + Tld::Arabic, // ir + Tld::IcelandicFaroese, // is + Tld::Arabic, // jo + Tld::Japanese, // jp + Tld::Cyrillic, // kg + Tld::Korean, // kp + Tld::Korean, // kr + Tld::Arabic, // kw + Tld::Cyrillic, // kz + Tld::Generic, // la + Tld::Arabic, // lb + Tld::Baltic, // lt + Tld::Baltic, // lv + Tld::Arabic, // ly + Tld::Arabic, // ma + Tld::Cyrillic, // md + Tld::Generic, // me + Tld::Cyrillic, // mk + Tld::Cyrillic, // mn + Tld::TraditionalSimplified, // mo + Tld::Arabic, // mr + Tld::Generic, // ms + Tld::WesternArabic, // my + Tld::Generic, // nu + Tld::Arabic, // om + Tld::Arabic, // pk + Tld::CentralIso, // pl + Tld::Arabic, // ps + Tld::Arabic, // qa + Tld::CentralWindows, // ro + Tld::Cyrillic, // rs + Tld::Cyrillic, // ru + Tld::Arabic, // sa + Tld::Arabic, // sd + Tld::SimplifiedTraditional, // sg + Tld::CentralIso, // si + Tld::CentralWindows, // sk + Tld::Generic, // st + Tld::Cyrillic, // su + Tld::Arabic, // sy + Tld::Thai, // th + Tld::Cyrillic, // tj + Tld::Generic, // tk + Tld::Cyrillic, // tm + Tld::Arabic, // tn + 
Tld::Generic, // to + Tld::TurkishAzeri, // tr + Tld::Generic, // tv + Tld::Traditional, // tw + Tld::Cyrillic, // ua + Tld::Cyrillic, // uz + Tld::Generic, // vc + Tld::Vietnamese, // vn + Tld::Generic, // vu + Tld::Arabic, // ye +]; + +static TWO_LETTER_KEYS: [[u8; 2]; 87] = [ + [b'a', b'c'], // Generic + [b'a', b'e'], // Arabic + [b'a', b'f'], // Arabic + [b'a', b'i'], // Generic + [b'a', b'm'], // WesternCyrillic + [b'a', b'z'], // TurkishAzeri + [b'b', b'a'], // CentralCyrillic + [b'b', b'g'], // Cyrillic + [b'b', b'h'], // Arabic + [b'b', b'y'], // Cyrillic + [b'b', b'z'], // Generic + [b'c', b'b'], // Generic + [b'c', b'c'], // Generic + [b'c', b'd'], // Generic + [b'c', b'n'], // Simplified + [b'c', b'x'], // Generic + [b'c', b'y'], // Greek + [b'c', b'z'], // CentralWindows + [b'd', b'j'], // Generic + [b'd', b'z'], // Arabic + [b'e', b'g'], // Arabic + [b'e', b'u'], // Eu + [b'f', b'm'], // Generic + [b'f', b'o'], // IcelandicFaroese + [b'g', b'e'], // WesternCyrillic + [b'g', b'r'], // Greek + [b'h', b'k'], // TraditionalSimplified + [b'h', b'r'], // CentralWindows + [b'h', b'u'], // CentralIso + [b'i', b'l'], // Hebrew + [b'i', b'n'], // Generic + [b'i', b'q'], // Arabic + [b'i', b'r'], // Arabic + [b'i', b's'], // IcelandicFaroese + [b'j', b'o'], // Arabic + [b'j', b'p'], // Japanese + [b'k', b'g'], // Cyrillic + [b'k', b'p'], // Korean + [b'k', b'r'], // Korean + [b'k', b'w'], // Arabic + [b'k', b'z'], // Cyrillic + [b'l', b'a'], // Generic + [b'l', b'b'], // Arabic + [b'l', b't'], // Baltic + [b'l', b'v'], // Baltic + [b'l', b'y'], // Arabic + [b'm', b'a'], // Arabic + [b'm', b'd'], // Cyrillic + [b'm', b'e'], // Generic + [b'm', b'k'], // Cyrillic + [b'm', b'n'], // Cyrillic + [b'm', b'o'], // TraditionalSimplified + [b'm', b'r'], // Arabic + [b'm', b's'], // Generic + [b'm', b'y'], // WesternArabic + [b'n', b'u'], // Generic + [b'o', b'm'], // Arabic + [b'p', b'k'], // Arabic + [b'p', b'l'], // CentralIso + [b'p', b's'], // Arabic + [b'q', b'a'], 
// Arabic + [b'r', b'o'], // CentralWindows + [b'r', b's'], // Cyrillic + [b'r', b'u'], // Cyrillic + [b's', b'a'], // Arabic + [b's', b'd'], // Arabic + [b's', b'g'], // SimplifiedTraditional + [b's', b'i'], // CentralIso + [b's', b'k'], // CentralWindows + [b's', b't'], // Generic + [b's', b'u'], // Cyrillic + [b's', b'y'], // Arabic + [b't', b'h'], // Thai + [b't', b'j'], // Cyrillic + [b't', b'k'], // Generic + [b't', b'm'], // Cyrillic + [b't', b'n'], // Arabic + [b't', b'o'], // Generic + [b't', b'r'], // TurkishAzeri + [b't', b'v'], // Generic + [b't', b'w'], // Traditional + [b'u', b'a'], // Cyrillic + [b'u', b'z'], // Cyrillic + [b'v', b'c'], // Generic + [b'v', b'n'], // Vietnamese + [b'v', b'u'], // Generic + [b'y', b'e'], // Arabic +]; + +static PUNYCODE_KEYS: [&'static [u8]; 46] = [ + b"3e0b707e", // Korean + b"54b7fta0cc", // Western + b"80ao21a", // Cyrillic + b"90a3ac", // Cyrillic + b"90ae", // Cyrillic + b"90ais", // Cyrillic + b"clchc0ea0b2g2a9gcd", // SimplifiedTraditional + b"d1alf", // Cyrillic + b"e1a4c", // Eu + b"fiqs8S", // Simplified + b"fiqz9S", // Simplified + b"fzc2c9e2c", // Western + b"j1amh", // Cyrillic + b"j6w193g", // TraditionalSimplified + b"kprw13d", // Traditional + b"kpry57d", // Traditional + b"l1acc", // Cyrillic + b"lgbbat1ad8j", // Arabic + b"mgb2ddes", // Arabic + b"mgb9awbf", // Arabic + b"mgba3a4f16a", // Arabic + b"mgbaam7a8h", // Arabic + b"mgbah1a3hjkrd", // Arabic + b"mgbai9azgqp6j", // Arabic + b"mgbayh7gpa", // Arabic + b"mgbc0a9azcg", // Arabic + b"mgbcpq6gpa1a", // Arabic + b"mgberp4a5d4ar", // Arabic + b"mgbpl2fh", // Arabic + b"mgbtx2b", // Arabic + b"mgbx4cd0ab", // WesternArabic + b"mix891f", // TraditionalSimplified + b"node", // WesternCyrillic + b"o3cw4h", // Thai + b"ogbpf8fl", // Arabic + b"p1ai", // Cyrillic + b"pgbs0dh", // Arabic + b"q7ce6a", // Arabic + b"qxa6a", // Eu + b"qxam", // Greek + b"wgbh1c", // Arabic + b"wgbl6a", // Arabic + b"xkc2al3hye2a", // Western + b"y9a3aq", // WesternCyrillic + 
b"yfro4i67o", // SimplifiedTraditional + b"ygbi2ammx", // Arabic +]; + +static PUNYCODE_VALUES: [Tld; 46] = [ + Tld::Korean, // 3e0b707e + Tld::Western, // 54b7fta0cc + Tld::Cyrillic, // 80ao21a + Tld::Cyrillic, // 90a3ac + Tld::Cyrillic, // 90ae + Tld::Cyrillic, // 90ais + Tld::SimplifiedTraditional, // clchc0ea0b2g2a9gcd + Tld::Cyrillic, // d1alf + Tld::Eu, // e1a4c + Tld::Simplified, // fiqs8S + Tld::Simplified, // fiqz9S + Tld::Western, // fzc2c9e2c + Tld::Cyrillic, // j1amh + Tld::TraditionalSimplified, // j6w193g + Tld::Traditional, // kprw13d + Tld::Traditional, // kpry57d + Tld::Cyrillic, // l1acc + Tld::Arabic, // lgbbat1ad8j + Tld::Arabic, // mgb2ddes + Tld::Arabic, // mgb9awbf + Tld::Arabic, // mgba3a4f16a + Tld::Arabic, // mgbaam7a8h + Tld::Arabic, // mgbah1a3hjkrd + Tld::Arabic, // mgbai9azgqp6j + Tld::Arabic, // mgbayh7gpa + Tld::Arabic, // mgbc0a9azcg + Tld::Arabic, // mgbcpq6gpa1a + Tld::Arabic, // mgberp4a5d4ar + Tld::Arabic, // mgbpl2fh + Tld::Arabic, // mgbtx2b + Tld::WesternArabic, // mgbx4cd0ab + Tld::TraditionalSimplified, // mix891f + Tld::WesternCyrillic, // node + Tld::Thai, // o3cw4h + Tld::Arabic, // ogbpf8fl + Tld::Cyrillic, // p1ai + Tld::Arabic, // pgbs0dh + Tld::Arabic, // q7ce6a + Tld::Eu, // qxa6a + Tld::Greek, // qxam + Tld::Arabic, // wgbh1c + Tld::Arabic, // wgbl6a + Tld::Western, // xkc2al3hye2a + Tld::WesternCyrillic, // y9a3aq + Tld::SimplifiedTraditional, // yfro4i67o + Tld::Arabic, // ygbi2ammx +]; |