diff options
Diffstat (limited to 'intl/icu/source/i18n/inputext.cpp')
-rw-r--r-- | intl/icu/source/i18n/inputext.cpp | 164 |
1 files changed, 164 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/inputext.cpp b/intl/icu/source/i18n/inputext.cpp new file mode 100644 index 0000000000..2d4f8a388a --- /dev/null +++ b/intl/icu/source/i18n/inputext.cpp @@ -0,0 +1,164 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ********************************************************************** + * Copyright (C) 2005-2016, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "inputext.h" + +#include "cmemory.h" +#include "cstring.h" + +#include <string.h> + +U_NAMESPACE_BEGIN + +#define BUFFER_SIZE 8192 + +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + +InputText::InputText(UErrorCode &status) + : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been + // removed if appropriate. + fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. + // Value is percent, not absolute. + fDeclaredEncoding(0), + fRawInput(0), + fRawLength(0) +{ + if (fInputBytes == NULL || fByteStats == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } +} + +InputText::~InputText() +{ + DELETE_ARRAY(fDeclaredEncoding); + DELETE_ARRAY(fByteStats); + DELETE_ARRAY(fInputBytes); +} + +void InputText::setText(const char *in, int32_t len) +{ + fInputLen = 0; + fC1Bytes = FALSE; + fRawInput = (const uint8_t *) in; + fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; +} + +void InputText::setDeclaredEncoding(const char* encoding, int32_t len) +{ + if(encoding) { + if (len == -1) { + len = (int32_t)uprv_strlen(encoding); + } + + len += 1; // to make place for the \0 at the end. + uprv_free(fDeclaredEncoding); + fDeclaredEncoding = NEW_ARRAY(char, len); + uprv_strncpy(fDeclaredEncoding, encoding, len); + } +} + +UBool InputText::isSet() const +{ + return fRawInput != NULL; +} + +/** +* MungeInput - after getting a set of raw input data to be analyzed, preprocess +* it by removing what appears to be html markup. +* +* @internal +*/ +void InputText::MungeInput(UBool fStripTags) { + int srci = 0; + int dsti = 0; + uint8_t b; + bool inMarkup = FALSE; + int32_t openTags = 0; + int32_t badTags = 0; + + // + // html / xml markup stripping. + // quick and dirty, not 100% accurate, but hopefully good enough, statistically. + // discard everything within < brackets > + // Count how many total '<' and illegal (nested) '<' occur, so we can make some + // guess as to whether the input was actually marked up at all. + // TODO: Think about how this interacts with EBCDIC charsets that are detected. + if (fStripTags) { + for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { + b = fRawInput[srci]; + + if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ + if (inMarkup) { + badTags += 1; + } + + inMarkup = TRUE; + openTags += 1; + } + + if (! inMarkup) { + fInputBytes[dsti++] = b; + } + + if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ + inMarkup = FALSE; + } + } + + fInputLen = dsti; + } + + // + // If it looks like this input wasn't marked up, or if it looks like it's + // essentially nothing but markup abandon the markup stripping. + // Detection will have to work on the unstripped input. + // + if (openTags<5 || openTags/5 < badTags || + (fInputLen < 100 && fRawLength>600)) + { + int32_t limit = fRawLength; + + if (limit > BUFFER_SIZE) { + limit = BUFFER_SIZE; + } + + for (srci=0; srci<limit; srci++) { + fInputBytes[srci] = fRawInput[srci]; + } + + fInputLen = srci; + } + + // + // Tally up the byte occurence statistics. + // These are available for use by the various detectors. + // + + uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); + + for (srci = 0; srci < fInputLen; srci += 1) { + fByteStats[fInputBytes[srci]] += 1; + } + + for (int32_t i = 0x80; i <= 0x9F; i += 1) { + if (fByteStats[i] != 0) { + fC1Bytes = TRUE; + break; + } + } +} + +U_NAMESPACE_END +#endif + |