From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- intl/icu/source/tools/toolutil/ppucd.cpp | 622 +++++++++++++++++++++++++++++++ 1 file changed, 622 insertions(+) create mode 100644 intl/icu/source/tools/toolutil/ppucd.cpp (limited to 'intl/icu/source/tools/toolutil/ppucd.cpp') diff --git a/intl/icu/source/tools/toolutil/ppucd.cpp b/intl/icu/source/tools/toolutil/ppucd.cpp new file mode 100644 index 0000000000..0d59b28ce4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ppucd.cpp @@ -0,0 +1,622 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2011-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: ppucd.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011dec11 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "charstr.h" +#include "cstring.h" +#include "ppucd.h" +#include "uassert.h" +#include "uparse.h" + +#include +#include + +U_NAMESPACE_BEGIN + +PropertyNames::~PropertyNames() {} + +// TODO: Create a concrete subclass for the default PropertyNames implementation +// using the ICU library built-in property names API & data. +// Currently only the genprops tool uses PreparsedUCD, and provides its own +// PropertyNames implementation using its just-build property names data and its own code. +// At some point, we should use PreparsedUCD in tests, and then we will need the +// default implementation somewhere. +#if 0 +int32_t +PropertyNames::getPropertyEnum(const char *name) const { + return u_getPropertyEnum(name); +} + +int32_t +PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { + return u_getPropertyValueEnum((UProperty)property, name); +} +#endif + +UniProps::UniProps() + : start(U_SENTINEL), end(U_SENTINEL), + bmg(U_SENTINEL), bpb(U_SENTINEL), + scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), + digitValue(-1), numericValue(nullptr), + name(nullptr), nameAlias(nullptr) { + memset(binProps, 0, sizeof(binProps)); + memset(intProps, 0, sizeof(intProps)); + memset(age, 0, 4); +} + +UniProps::~UniProps() {} + +const int32_t PreparsedUCD::kNumLineBuffers; + +PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) + : pnames(nullptr), + file(nullptr), + defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), + lineNumber(0), + lineType(NO_LINE), + fieldLimit(nullptr), lineLimit(nullptr) { + if(U_FAILURE(errorCode)) { return; } + + if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) { + filename=nullptr; + file=stdin; + } else { + file=fopen(filename, "r"); + } + if(file==nullptr) { + perror("error opening preparsed UCD"); + fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\""); + errorCode=U_FILE_ACCESS_ERROR; + return; + } + + memset(ucdVersion, 0, 4); + lines[0][0]=0; +} + +PreparsedUCD::~PreparsedUCD() { + if(file!=stdin) { + fclose(file); + } +} + +// Same order as the LineType values. +static const char *lineTypeStrings[]={ + nullptr, + nullptr, + "ucd", + "property", + "binary", + "value", + "defaults", + "block", + "cp", + "unassigned", + "algnamesrange" +}; + +PreparsedUCD::LineType +PreparsedUCD::readLine(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return NO_LINE; } + // Select the next available line buffer. + while(!isLineBufferAvailable(lineIndex)) { + ++lineIndex; + if (lineIndex == kNumLineBuffers) { + lineIndex = 0; + } + } + char *line=lines[lineIndex]; + *line=0; + lineLimit=fieldLimit=line; + lineType=NO_LINE; + char *result=fgets(line, sizeof(lines[0]), file); + if(result==nullptr) { + if(ferror(file)) { + perror("error reading preparsed UCD"); + fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); + errorCode=U_FILE_ACCESS_ERROR; + } + return NO_LINE; + } + ++lineNumber; + if(*line=='#') { + fieldLimit=strchr(line, 0); + return lineType=EMPTY_LINE; + } + // Remove trailing /r/n. + char c; + char *limit=strchr(line, 0); + while(line=0) { + fprintf(stderr, + "error in preparsed UCD: default line %ld after one or more block lines\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return nullptr; + } + if(defaultLineIndex>=0) { + fprintf(stderr, + "error in preparsed UCD: second line with default properties on line %ld\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return nullptr; + } + if(start!=0 || end!=0x10ffff) { + fprintf(stderr, + "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return nullptr; + } + props=&defaultProps; + defaultLineIndex=lineIndex; + break; + case BLOCK_LINE: + blockProps=defaultProps; // Block inherits default properties. + props=&blockProps; + blockLineIndex=lineIndex; + break; + case CP_LINE: + case UNASSIGNED_LINE: + if(blockProps.start<=start && end<=blockProps.end) { + insideBlock=true; + if(lineType==CP_LINE) { + // Code point range fully inside the last block inherits the block properties. + cpProps=blockProps; + } else { + // Unassigned line inside the block is based on default properties + // which override block properties. + cpProps=defaultProps; + newValues=blockValues; + // Except, it inherits the one blk=Block property. + int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START; + cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex]; + newValues.remove((UChar32)UCHAR_BLOCK); + } + } else if(start>blockProps.end || endstart=start; + props->end=end; + while((field=nextField())!=nullptr) { + if(!parseProperty(*props, field, newValues, errorCode)) { return nullptr; } + } + if(lineType==BLOCK_LINE) { + blockValues=newValues; + } else if(lineType==UNASSIGNED_LINE && insideBlock) { + // Unset newValues for values that are the same as the block values. + for(int32_t prop=0; propgetPropertyEnum(p); + if(prop<0) { + for(int32_t i=0;; ++i) { + if(i==UPRV_LENGTHOF(ppucdProperties)) { + // Ignore unknown property names. + return true; + } + if(0==uprv_stricmp(p, ppucdProperties[i].name)) { + prop=ppucdProperties[i].prop; + U_ASSERT(prop>=0); + break; + } + } + } + if(prop=0) { + props.binProps[prop]=(UBool)binaryValue; + } else { + // No binary value for a binary property. + fprintf(stderr, + "error in preparsed UCD: enum-property syntax '%s' " + "for binary property on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } + } else if(binaryValue>=0) { + // Binary value for a non-binary property. + fprintf(stderr, + "error in preparsed UCD: binary-property syntax '%s' " + "for non-binary property on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else if (prop < UCHAR_INT_START) { + fprintf(stderr, + "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", + prop, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else if(propgetPropertyValueEnum(prop, v); + if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { + // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. + char *end; + unsigned long ccc=uprv_strtoul(v, &end, 10); + if(v, just set null values. + switch(prop) { + case UCHAR_BIDI_MIRRORING_GLYPH: + props.bmg=U_SENTINEL; + break; + case UCHAR_BIDI_PAIRED_BRACKET: + props.bpb=U_SENTINEL; + break; + case UCHAR_SIMPLE_CASE_FOLDING: + props.scf=U_SENTINEL; + break; + case UCHAR_SIMPLE_LOWERCASE_MAPPING: + props.slc=U_SENTINEL; + break; + case UCHAR_SIMPLE_TITLECASE_MAPPING: + props.stc=U_SENTINEL; + break; + case UCHAR_SIMPLE_UPPERCASE_MAPPING: + props.suc=U_SENTINEL; + break; + case UCHAR_CASE_FOLDING: + props.cf.remove(); + break; + case UCHAR_LOWERCASE_MAPPING: + props.lc.remove(); + break; + case UCHAR_TITLECASE_MAPPING: + props.tc.remove(); + break; + case UCHAR_UPPERCASE_MAPPING: + props.uc.remove(); + break; + case UCHAR_SCRIPT_EXTENSIONS: + props.scx.clear(); + break; + default: + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } + } else { + char c; + switch(prop) { + case UCHAR_NUMERIC_VALUE: + props.numericValue=v; + c=*v; + if('0'<=c && c<='9' && v[1]==0) { + props.digitValue=c-'0'; + } else { + props.digitValue=-1; + } + break; + case UCHAR_NAME: + props.name=v; + break; + case UCHAR_AGE: + u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. + break; + case UCHAR_BIDI_MIRRORING_GLYPH: + props.bmg=parseCodePoint(v, errorCode); + break; + case UCHAR_BIDI_PAIRED_BRACKET: + props.bpb=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_CASE_FOLDING: + props.scf=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_LOWERCASE_MAPPING: + props.slc=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_TITLECASE_MAPPING: + props.stc=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_UPPERCASE_MAPPING: + props.suc=parseCodePoint(v, errorCode); + break; + case UCHAR_CASE_FOLDING: + parseString(v, props.cf, errorCode); + break; + case UCHAR_LOWERCASE_MAPPING: + parseString(v, props.lc, errorCode); + break; + case UCHAR_TITLECASE_MAPPING: + parseString(v, props.tc, errorCode); + break; + case UCHAR_UPPERCASE_MAPPING: + parseString(v, props.uc, errorCode); + break; + case PPUCD_NAME_ALIAS: + props.nameAlias=v; + break; + case PPUCD_CONDITIONAL_CASE_MAPPINGS: + case PPUCD_TURKIC_CASE_FOLDING: + // No need to parse their values: They are hardcoded in the runtime library. + break; + case UCHAR_SCRIPT_EXTENSIONS: + parseScriptExtensions(v, props.scx, errorCode); + break; + default: + // Ignore unhandled properties. + return true; + } + } + if(U_SUCCESS(errorCode)) { + newValues.add((UChar32)prop); + return true; + } else { + return false; + } +} + +UBool +PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return false; } + if(lineType!=ALG_NAMES_RANGE_LINE) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return false; + } + firstField(); + const char *field=nextField(); + if(field==nullptr) { + // No range field after the type. + fprintf(stderr, + "error in preparsed UCD: missing algnamesrange range field " + "(no second field) on line %ld\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return false; + } + return parseCodePointRange(field, start, end, errorCode); +} + +UChar32 +PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { + char *end; + uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || *end!=0 || value>=0x110000) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", + s, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return U_SENTINEL; + } + return (UChar32)value; +} + +UBool +PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { + uint32_t st, e; + u_parseCodePointRange(s, &st, &e, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", + s, (long)lineNumber); + return false; + } + start=(UChar32)st; + end=(UChar32)e; + return true; +} + +void +PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { + char16_t *buffer=toUCharPtr(uni.getBuffer(-1)); + int32_t length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { + errorCode=U_ZERO_ERROR; + uni.releaseBuffer(0); + buffer=toUCharPtr(uni.getBuffer(length)); + length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); + } + uni.releaseBuffer(length); + if(U_FAILURE(errorCode)) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", + s, (long)lineNumber); + } +} + +void +PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + scx.clear(); + CharString scString; + for(;;) { + const char *scs; + const char *scLimit=strchr(s, ' '); + if(scLimit!=nullptr) { + scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data(); + if(U_FAILURE(errorCode)) { return; } + } else { + scs=s; + } + int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); + if(script==UCHAR_INVALID_CODE) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", + scs, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return; + } else if(scx.contains(script)) { + fprintf(stderr, + "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", + scs, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return; + } else { + scx.add(script); + } + if(scLimit!=nullptr) { + s=scLimit+1; + } else { + break; + } + } + if(scx.isEmpty()) { + fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); + errorCode=U_PARSE_ERROR; + } +} + +U_NAMESPACE_END -- cgit v1.2.3