diff options
Diffstat (limited to 'toolkit/components/translation/cld2/internal/compact_lang_det.cc')
-rw-r--r-- | toolkit/components/translation/cld2/internal/compact_lang_det.cc | 322 |
1 files changed, 322 insertions, 0 deletions
diff --git a/toolkit/components/translation/cld2/internal/compact_lang_det.cc b/toolkit/components/translation/cld2/internal/compact_lang_det.cc new file mode 100644 index 0000000000..ccc6588fcb --- /dev/null +++ b/toolkit/components/translation/cld2/internal/compact_lang_det.cc @@ -0,0 +1,322 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + +#include <stdio.h> +#include <stdlib.h> + +#include "../public/compact_lang_det.h" +#include "../public/encodings.h" +#include "compact_lang_det_impl.h" +#include "integral_types.h" +#include "lang_script.h" + +namespace CLD2 { + +// String is "code_version - data_scrape_date" +//static const char* kDetectLanguageVersion = "V2.0 - 20130715"; + + +// Large-table version for all ~160 languages +// Small-table version for all ~60 languages + +// Scan interchange-valid UTF-8 bytes and detect most likely language +Language DetectLanguage( + const char* buffer, + int buffer_length, + bool is_plain_text, + bool* is_reliable) { + bool allow_extended_lang = false; + Language language3[3]; + int percent3[3]; + double normalized_score3[3]; + int text_bytes; + int flags = 0; + Language plus_one = UNKNOWN_LANGUAGE; + const char* tld_hint = ""; + int encoding_hint = UNKNOWN_ENCODING; + Language language_hint = UNKNOWN_LANGUAGE; + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + &cldhints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + NULL, + &text_bytes, + is_reliable); + // Default to English + if (lang == UNKNOWN_LANGUAGE) { + lang = ENGLISH; + } + return lang; +} + +// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. +Language DetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable) { + double normalized_score3[3]; + bool allow_extended_lang = false; + int flags = 0; + Language plus_one = UNKNOWN_LANGUAGE; + const char* tld_hint = ""; + int encoding_hint = UNKNOWN_ENCODING; + Language language_hint = UNKNOWN_LANGUAGE; + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + &cldhints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + NULL, + text_bytes, + is_reliable); + // Default to English + if (lang == UNKNOWN_LANGUAGE) { + lang = ENGLISH; + } + return lang; +} + +// Same as above, with hints supplied +// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. +Language DetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const char* tld_hint, // "id" boosts Indonesian + int encoding_hint, // SJS boosts Japanese + Language language_hint, // ITALIAN boosts it + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable) { + double normalized_score3[3]; + bool allow_extended_lang = false; + int flags = 0; + Language plus_one = UNKNOWN_LANGUAGE; + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + &cldhints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + NULL, + text_bytes, + is_reliable); + // Default to English + if (lang == UNKNOWN_LANGUAGE) { + lang = ENGLISH; + } + return lang; +} + + +// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended +// languages. +// Extended languages are additional Google interface languages and Unicode +// single-language scripts, from ext_lang_enc.h +Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable) { + double normalized_score3[3]; + bool allow_extended_lang = true; + int flags = 0; + Language plus_one = UNKNOWN_LANGUAGE; + const char* tld_hint = ""; + int encoding_hint = UNKNOWN_ENCODING; + Language language_hint = UNKNOWN_LANGUAGE; + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + &cldhints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + NULL, + text_bytes, + is_reliable); + // Do not default to English + return lang; +} + +// Same as above, with hints supplied +// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended +// languages. +// Extended languages are additional Google interface languages and Unicode +// single-language scripts, from ext_lang_enc.h +Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const char* tld_hint, // "id" boosts Indonesian + int encoding_hint, // SJS boosts Japanese + Language language_hint, // ITALIAN boosts it + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable) { + double normalized_score3[3]; + bool allow_extended_lang = true; + int flags = 0; + Language plus_one = UNKNOWN_LANGUAGE; + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + &cldhints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + NULL, + text_bytes, + is_reliable); + // Do not default to English + return lang; +} + +// Same as above, and also returns internal language scores as a ratio to +// normal score for real text in that language. Scores close to 1.0 indicate +// normal text, while scores far away from 1.0 indicate badly-skewed text or +// gibberish +// +Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const char* tld_hint, // "id" boosts Indonesian + int encoding_hint, // SJS boosts Japanese + Language language_hint, // ITALIAN boosts it + Language* language3, + int* percent3, + double* normalized_score3, + int* text_bytes, + bool* is_reliable) { + bool allow_extended_lang = true; + int flags = 0; + Language plus_one = UNKNOWN_LANGUAGE; + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + &cldhints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + NULL, + text_bytes, + is_reliable); + // Do not default to English + return lang; +} + +// Use this one. +// Hints are collected into a struct. +// Flags are passed in (normally zero). +// +// Also returns 3 internal language scores as a ratio to +// normal score for real text in that language. Scores close to 1.0 indicate +// normal text, while scores far away from 1.0 indicate badly-skewed text or +// gibberish +// +// Returns a vector of chunks in different languages, so that caller may +// spell-check, translate, or otherwaise process different parts of the input +// buffer in language-dependant ways. +// +Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const CLDHints* cld_hints, + int flags, + Language* language3, + int* percent3, + double* normalized_score3, + ResultChunkVector* resultchunkvector, + int* text_bytes, + bool* is_reliable) { + bool allow_extended_lang = true; + Language plus_one = UNKNOWN_LANGUAGE; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + cld_hints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + resultchunkvector, + text_bytes, + is_reliable); + // Do not default to English + return lang; +} + +} // End namespace CLD2 + |