diff options
Diffstat (limited to 'contrib/google-ced/compact_enc_det.h')
-rw-r--r-- | contrib/google-ced/compact_enc_det.h | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/contrib/google-ced/compact_enc_det.h b/contrib/google-ced/compact_enc_det.h new file mode 100644 index 0000000..01adf45 --- /dev/null +++ b/contrib/google-ced/compact_enc_det.h @@ -0,0 +1,83 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_ +#define COMPACT_ENC_DET_COMPACT_ENC_DET_H_ + +#include "util/encodings/encodings.h" // for Encoding +#include "util/languages/languages.h" // for Language + +#include <string.h> + +namespace CompactEncDet { + // We may want different statistics, depending on whether the text being + // identfied is from the web, from email, etc. This is currently ignored, + // except WEB_CORPUS enables ignoring chars inside tags. + enum TextCorpusType { + WEB_CORPUS, + XML_CORPUS, + QUERY_CORPUS, // Use this for vanilla plaintext + EMAIL_CORPUS, + NUM_CORPA, // always last + }; + + // Scan raw bytes and detect most likely encoding + // Design goals: + // Skip over big initial stretches of seven-bit ASCII bytes very quickly + // Thread safe + // Works equally well on + // 50-byte queries, + // 5000-byte email and + // 50000-byte web pages + // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1) + // + // Inputs: text and text_length + // web page's url (preferred) or just + // top-level domain name (e.g. "com") or NULL as a hint + // web page's HTTPheader charset= string (e.g. "Latin1") or NULL as a hint + // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint + // an Encoding or UNKNOWN_ENCODING as a hint + // a Language or UNKNOWN_LANGUAGE as a hint + // corpus type from the list above. Currently ignored; may select + // different probability tables in the future + // ignore_7bit if true says to NOT return the pure seven-bit encodings + // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7. + // This may save a little scoring time on pure printable ASCII input text + // Outputs: bytes_consumed says how much of text_length was actually examined + // is_reliable set true if the returned encoding is at least 2**10 time more + // probable then the second-best encoding + // Return value: the most likely encoding for the input text + // + // Setting ignore_7bit_mail_encodings effectively turns off detection of + // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true + // when corpus_type is QUERY_CORPUS. + Encoding DetectEncoding( + const char* text, int text_length, const char* url_hint, + const char* http_charset_hint, const char* meta_charset_hint, + const int encoding_hint, + const Language language_hint, // User interface lang + const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, + int* bytes_consumed, bool* is_reliable); + + // Support functions for unit test program + int BackmapEncodingToRankedEncoding(Encoding enc); + Encoding TopEncodingOfLangHint(const char* name); + Encoding TopEncodingOfTLDHint(const char* name); + Encoding TopEncodingOfCharsetHint(const char* name); + const char* Version(void); +} // End namespace CompactEncDet + +#endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_ |