diff options
Diffstat (limited to 'toolkit/components/translation/cld2/internal/utf8statetable.h')
-rw-r--r-- | toolkit/components/translation/cld2/internal/utf8statetable.h | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/toolkit/components/translation/cld2/internal/utf8statetable.h b/toolkit/components/translation/cld2/internal/utf8statetable.h new file mode 100644 index 0000000000..55c00f45e1 --- /dev/null +++ b/toolkit/components/translation/cld2/internal/utf8statetable.h @@ -0,0 +1,283 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// State Table follower for scanning UTF-8 strings without converting to +// 32- or 16-bit Unicode values. +// +// Author: dsites@google.com (Dick Sites) +// + +#ifndef UTIL_UTF8_UTF8STATETABLE_H_ +#define UTIL_UTF8_UTF8STATETABLE_H_ + +#include <string> +#include "integral_types.h" // for uint8, uint32, uint16 +#include "stringpiece.h" + + +namespace CLD2 { + +class OffsetMap; + + +// These four-byte entries compactly encode how many bytes 0..255 to delete +// in making a string replacement, how many bytes to add 0..255, and the offset +// 0..64k-1 of the replacement string in remap_string. +struct RemapEntry { + uint8 delete_bytes; + uint8 add_bytes; + uint16 bytes_offset; +}; + +// Exit type codes for state tables. All but the first get stuffed into +// signed one-byte entries. The first is only generated by executable code. +// To distinguish from next-state entries, these must be contiguous and +// all <= kExitNone +typedef enum { + kExitDstSpaceFull = 239, + kExitIllegalStructure, // 240 + kExitOK, // 241 + kExitReject, // ... + kExitReplace1, + kExitReplace2, + kExitReplace3, + kExitReplace21, + kExitReplace31, + kExitReplace32, + kExitReplaceOffset1, + kExitReplaceOffset2, + kExitReplace1S0, + kExitSpecial, + kExitDoAgain, + kExitRejectAlt, + kExitNone // 255 +} ExitReason; + +typedef enum { + kExitDstSpaceFull_2 = 32767, // 0x7fff + kExitIllegalStructure_2, // 32768 0x8000 + kExitOK_2, // 32769 0x8001 + kExitReject_2, // ... + kExitReplace1_2, + kExitReplace2_2, + kExitReplace3_2, + kExitReplace21_2, + kExitReplace31_2, + kExitReplace32_2, + kExitReplaceOffset1_2, + kExitReplaceOffset2_2, + kExitReplace1S0_2, + kExitSpecial_2, + kExitDoAgain_2, + kExitRejectAlt_2, + kExitNone_2 // 32783 0x800f +} ExitReason_2; + + +// This struct represents one entire state table. The three initialized byte +// areas are state_table, remap_base, and remap_string. state0 and state0_size +// give the byte offset and length within state_table of the initial state -- +// table lookups are expected to start and end in this state, but for +// truncated UTF-8 strings, may end in a different state. These allow a quick +// test for that condition. entry_shift is 8 for tables subscripted by a full +// byte value and 6 for space-optimized tables subscripted by only six +// significant bits in UTF-8 continuation bytes. +typedef struct { + const uint32 state0; + const uint32 state0_size; + const uint32 total_size; + const int max_expand; + const int entry_shift; + const int bytes_per_entry; + const uint32 losub; + const uint32 hiadd; + const uint8* state_table; + const RemapEntry* remap_base; + const uint8* remap_string; + const uint8* fast_state; +} UTF8StateMachineObj; + +// Near-duplicate declaration for tables with two-byte entries +typedef struct { + const uint32 state0; + const uint32 state0_size; + const uint32 total_size; + const int max_expand; + const int entry_shift; + const int bytes_per_entry; + const uint32 losub; + const uint32 hiadd; + const unsigned short* state_table; + const RemapEntry* remap_base; + const uint8* remap_string; + const uint8* fast_state; +} UTF8StateMachineObj_2; + + +typedef UTF8StateMachineObj UTF8PropObj; +typedef UTF8StateMachineObj UTF8ScanObj; +typedef UTF8StateMachineObj UTF8ReplaceObj; +typedef UTF8StateMachineObj_2 UTF8PropObj_2; +typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; +// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2; + + +// Look up property of one UTF-8 character and advance over it +// Return 0 if input length is zero +// Return 0 and advance one byte if input is ill-formed +uint8 UTF8GenericProperty(const UTF8PropObj* st, + const uint8** src, + int* srclen); + +// Look up property of one UTF-8 character (assumed to be valid). +// (This is a faster version of UTF8GenericProperty.) +bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); + + +// BigOneByte versions are needed for tables > 240 states, but most +// won't need the TwoByte versions. + +// Look up property of one UTF-8 character and advance over it +// Return 0 if input length is zero +// Return 0 and advance one byte if input is ill-formed +uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, + const uint8** src, + int* srclen); + + +// TwoByte versions are needed for tables > 240 states that don't fit onto +// BigOneByte -- rare ultimate fallback + +// Look up property of one UTF-8 character (assumed to be valid). +// (This is a faster version of UTF8GenericProperty.) +bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); + +// Look up property of one UTF-8 character and advance over it +// Return 0 if input length is zero +// Return 0 and advance one byte if input is ill-formed +uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, + const uint8** src, + int* srclen); + +// Look up property of one UTF-8 character (assumed to be valid). +// (This is a faster version of UTF8GenericProperty.) +bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); + +// Scan a UTF-8 stringpiece based on a state table. +// Always scan complete UTF-8 characters +// Set number of bytes scanned. Return reason for exiting +int UTF8GenericScan(const UTF8ScanObj* st, + const StringPiece& str, + int* bytes_consumed); + + + +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece +// and doing text replacements. +// Always scan complete UTF-8 characters +// Set number of bytes consumed from input, number filled to output. +// Return reason for exiting +// Also writes an optional OffsetMap. Pass NULL to skip writing one. +int UTF8GenericReplace(const UTF8ReplaceObj* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed, + OffsetMap* offsetmap); + +// Older version without offsetmap +int UTF8GenericReplace(const UTF8ReplaceObj* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed); + +// Older version without is_plain_text or offsetmap +int UTF8GenericReplace(const UTF8ReplaceObj* st, + const StringPiece& istr, + StringPiece& ostr, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed); + + +// TwoByte version is needed for tables > about 256 states, such +// as the table for full Unicode 4.1 canonical + compatibility mapping + +// Scan a UTF-8 stringpiece based on state table with two-byte entries, +// copying to output stringpiece +// and doing text replacements. +// Always scan complete UTF-8 characters +// Set number of bytes consumed from input, number filled to output. +// Return reason for exiting +// Also writes an optional OffsetMap. Pass NULL to skip writing one. +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed, + OffsetMap* offsetmap); + +// Older version without offsetmap +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed); + +// Older version without is_plain_text or offsetmap +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, + const StringPiece& istr, + StringPiece& ostr, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed); + + +static const unsigned char kUTF8LenTbl[256] = { + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 +}; + +inline int UTF8OneCharLen(const char* in) { + return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)]; +} + +// Adjust a stringpiece to encompass complete UTF-8 characters. +// The data pointer will be increased by 0..3 bytes to get to a character +// boundary, and the length will then be decreased by 0..3 bytes +// to encompass the last complete character. +// This is useful especially when a UTF-8 string must be put into a fixed- +// maximum-size buffer cleanly, such as a MySQL buffer. +void UTF8TrimToChars(StringPiece* istr); + +} // End namespace CLD2 + +#endif // UTIL_UTF8_UTF8STATETABLE_H_ |