summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/toolutil/ppucd.cpp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--intl/icu/source/tools/toolutil/ppucd.cpp622
1 files changed, 622 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/ppucd.cpp b/intl/icu/source/tools/toolutil/ppucd.cpp
new file mode 100644
index 0000000000..0d59b28ce4
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ppucd.cpp
@@ -0,0 +1,622 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2011-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* file name: ppucd.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2011dec11
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uchar.h"
+#include "charstr.h"
+#include "cstring.h"
+#include "ppucd.h"
+#include "uassert.h"
+#include "uparse.h"
+
+#include <stdio.h>
+#include <string.h>
+
+U_NAMESPACE_BEGIN
+
+PropertyNames::~PropertyNames() {}
+
+// TODO: Create a concrete subclass for the default PropertyNames implementation
+// using the ICU library built-in property names API & data.
+// Currently only the genprops tool uses PreparsedUCD, and provides its own
+// PropertyNames implementation using its just-build property names data and its own code.
+// At some point, we should use PreparsedUCD in tests, and then we will need the
+// default implementation somewhere.
+#if 0
+int32_t
+PropertyNames::getPropertyEnum(const char *name) const {
+ return u_getPropertyEnum(name);
+}
+
+int32_t
+PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
+ return u_getPropertyValueEnum((UProperty)property, name);
+}
+#endif
+
+UniProps::UniProps()
+ : start(U_SENTINEL), end(U_SENTINEL),
+ bmg(U_SENTINEL), bpb(U_SENTINEL),
+ scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
+ digitValue(-1), numericValue(nullptr),
+ name(nullptr), nameAlias(nullptr) {
+ memset(binProps, 0, sizeof(binProps));
+ memset(intProps, 0, sizeof(intProps));
+ memset(age, 0, 4);
+}
+
+UniProps::~UniProps() {}
+
+const int32_t PreparsedUCD::kNumLineBuffers;
+
+PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
+ : pnames(nullptr),
+ file(nullptr),
+ defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
+ lineNumber(0),
+ lineType(NO_LINE),
+ fieldLimit(nullptr), lineLimit(nullptr) {
+ if(U_FAILURE(errorCode)) { return; }
+
+ if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) {
+ filename=nullptr;
+ file=stdin;
+ } else {
+ file=fopen(filename, "r");
+ }
+ if(file==nullptr) {
+ perror("error opening preparsed UCD");
+ fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
+ errorCode=U_FILE_ACCESS_ERROR;
+ return;
+ }
+
+ memset(ucdVersion, 0, 4);
+ lines[0][0]=0;
+}
+
+PreparsedUCD::~PreparsedUCD() {
+ if(file!=stdin) {
+ fclose(file);
+ }
+}
+
+// Same order as the LineType values.
+static const char *lineTypeStrings[]={
+ nullptr,
+ nullptr,
+ "ucd",
+ "property",
+ "binary",
+ "value",
+ "defaults",
+ "block",
+ "cp",
+ "unassigned",
+ "algnamesrange"
+};
+
+PreparsedUCD::LineType
+PreparsedUCD::readLine(UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return NO_LINE; }
+ // Select the next available line buffer.
+ while(!isLineBufferAvailable(lineIndex)) {
+ ++lineIndex;
+ if (lineIndex == kNumLineBuffers) {
+ lineIndex = 0;
+ }
+ }
+ char *line=lines[lineIndex];
+ *line=0;
+ lineLimit=fieldLimit=line;
+ lineType=NO_LINE;
+ char *result=fgets(line, sizeof(lines[0]), file);
+ if(result==nullptr) {
+ if(ferror(file)) {
+ perror("error reading preparsed UCD");
+ fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
+ errorCode=U_FILE_ACCESS_ERROR;
+ }
+ return NO_LINE;
+ }
+ ++lineNumber;
+ if(*line=='#') {
+ fieldLimit=strchr(line, 0);
+ return lineType=EMPTY_LINE;
+ }
+ // Remove trailing /r/n.
+ char c;
+ char *limit=strchr(line, 0);
+ while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
+ // Remove trailing white space.
+ while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
+ *limit=0;
+ lineLimit=limit;
+ if(line==limit) {
+ fieldLimit=limit;
+ return lineType=EMPTY_LINE;
+ }
+ // Split by ';'.
+ char *semi=line;
+ while((semi=strchr(semi, ';'))!=nullptr) { *semi++=0; }
+ fieldLimit=strchr(line, 0);
+ // Determine the line type.
+ int32_t type;
+ for(type=EMPTY_LINE+1;; ++type) {
+ if(type==LINE_TYPE_COUNT) {
+ fprintf(stderr,
+ "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
+ line, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return NO_LINE;
+ }
+ if(0==strcmp(line, lineTypeStrings[type])) {
+ break;
+ }
+ }
+ lineType=(LineType)type;
+ if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
+ u_versionFromString(ucdVersion, fieldLimit+1);
+ }
+ return lineType;
+}
+
+const char *
+PreparsedUCD::firstField() {
+ char *field=lines[lineIndex];
+ fieldLimit=strchr(field, 0);
+ return field;
+}
+
+const char *
+PreparsedUCD::nextField() {
+ if(fieldLimit==lineLimit) { return nullptr; }
+ char *field=fieldLimit+1;
+ fieldLimit=strchr(field, 0);
+ return field;
+}
+
+const UniProps *
+PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return nullptr; }
+ newValues.clear();
+ if(!lineHasPropertyValues()) {
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ firstField();
+ const char *field=nextField();
+ if(field==nullptr) {
+ // No range field after the type.
+ fprintf(stderr,
+ "error in preparsed UCD: missing default/block/cp range field "
+ "(no second field) on line %ld\n",
+ (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ UChar32 start, end;
+ if(!parseCodePointRange(field, start, end, errorCode)) { return nullptr; }
+ UniProps *props;
+ UBool insideBlock=false; // true if cp or unassigned range inside the block range.
+ switch(lineType) {
+ case DEFAULTS_LINE:
+ // Should occur before any block/cp/unassigned line.
+ if(blockLineIndex>=0) {
+ fprintf(stderr,
+ "error in preparsed UCD: default line %ld after one or more block lines\n",
+ (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ if(defaultLineIndex>=0) {
+ fprintf(stderr,
+ "error in preparsed UCD: second line with default properties on line %ld\n",
+ (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ if(start!=0 || end!=0x10ffff) {
+ fprintf(stderr,
+ "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ props=&defaultProps;
+ defaultLineIndex=lineIndex;
+ break;
+ case BLOCK_LINE:
+ blockProps=defaultProps; // Block inherits default properties.
+ props=&blockProps;
+ blockLineIndex=lineIndex;
+ break;
+ case CP_LINE:
+ case UNASSIGNED_LINE:
+ if(blockProps.start<=start && end<=blockProps.end) {
+ insideBlock=true;
+ if(lineType==CP_LINE) {
+ // Code point range fully inside the last block inherits the block properties.
+ cpProps=blockProps;
+ } else {
+ // Unassigned line inside the block is based on default properties
+ // which override block properties.
+ cpProps=defaultProps;
+ newValues=blockValues;
+ // Except, it inherits the one blk=Block property.
+ int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
+ cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
+ newValues.remove((UChar32)UCHAR_BLOCK);
+ }
+ } else if(start>blockProps.end || end<blockProps.start) {
+ // Code point range fully outside the last block inherits the default properties.
+ cpProps=defaultProps;
+ } else {
+ // Code point range partially overlapping with the last block is illegal.
+ fprintf(stderr,
+ "error in preparsed UCD: cp range %s on line %ld only "
+ "partially overlaps with block range %04lX..%04lX\n",
+ field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ props=&cpProps;
+ break;
+ default:
+ // Will not occur because of the range check above.
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ props->start=start;
+ props->end=end;
+ while((field=nextField())!=nullptr) {
+ if(!parseProperty(*props, field, newValues, errorCode)) { return nullptr; }
+ }
+ if(lineType==BLOCK_LINE) {
+ blockValues=newValues;
+ } else if(lineType==UNASSIGNED_LINE && insideBlock) {
+ // Unset newValues for values that are the same as the block values.
+ for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
+ if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
+ newValues.remove(prop);
+ }
+ }
+ for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
+ int32_t index=prop-UCHAR_INT_START;
+ if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
+ newValues.remove(prop);
+ }
+ }
+ }
+ return props;
+}
+
+static const struct {
+ const char *name;
+ int32_t prop;
+} ppucdProperties[]={
+ { "Name_Alias", PPUCD_NAME_ALIAS },
+ { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
+ { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
+};
+
+// Returns true for "ok to continue parsing fields".
+UBool
+PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
+ UErrorCode &errorCode) {
+ CharString pBuffer;
+ const char *p=field;
+ const char *v=strchr(p, '=');
+ int binaryValue;
+ if(*p=='-') {
+ if(v!=nullptr) {
+ fprintf(stderr,
+ "error in preparsed UCD: mix of binary-property-no and "
+ "enum-property syntax '%s' on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return false;
+ }
+ binaryValue=0;
+ ++p;
+ } else if(v==nullptr) {
+ binaryValue=1;
+ } else {
+ binaryValue=-1;
+ // Copy out the property name rather than modifying the field (writing a NUL).
+ pBuffer.append(p, (int32_t)(v-p), errorCode);
+ p=pBuffer.data();
+ ++v;
+ }
+ int32_t prop=pnames->getPropertyEnum(p);
+ if(prop<0) {
+ for(int32_t i=0;; ++i) {
+ if(i==UPRV_LENGTHOF(ppucdProperties)) {
+ // Ignore unknown property names.
+ return true;
+ }
+ if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
+ prop=ppucdProperties[i].prop;
+ U_ASSERT(prop>=0);
+ break;
+ }
+ }
+ }
+ if(prop<UCHAR_BINARY_LIMIT) {
+ if(binaryValue>=0) {
+ props.binProps[prop]=(UBool)binaryValue;
+ } else {
+ // No binary value for a binary property.
+ fprintf(stderr,
+ "error in preparsed UCD: enum-property syntax '%s' "
+ "for binary property on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ }
+ } else if(binaryValue>=0) {
+ // Binary value for a non-binary property.
+ fprintf(stderr,
+ "error in preparsed UCD: binary-property syntax '%s' "
+ "for non-binary property on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ } else if (prop < UCHAR_INT_START) {
+ fprintf(stderr,
+ "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
+ prop, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ } else if(prop<UCHAR_INT_LIMIT) {
+ int32_t value=pnames->getPropertyValueEnum(prop, v);
+ if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
+ // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
+ char *end;
+ unsigned long ccc=uprv_strtoul(v, &end, 10);
+ if(v<end && *end==0 && ccc<=254) {
+ value=(int32_t)ccc;
+ }
+ }
+ if(value==UCHAR_INVALID_CODE) {
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ } else {
+ props.intProps[prop-UCHAR_INT_START]=value;
+ }
+ } else if(*v=='<') {
+ // Do not parse default values like <code point>, just set null values.
+ switch(prop) {
+ case UCHAR_BIDI_MIRRORING_GLYPH:
+ props.bmg=U_SENTINEL;
+ break;
+ case UCHAR_BIDI_PAIRED_BRACKET:
+ props.bpb=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_CASE_FOLDING:
+ props.scf=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_LOWERCASE_MAPPING:
+ props.slc=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_TITLECASE_MAPPING:
+ props.stc=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_UPPERCASE_MAPPING:
+ props.suc=U_SENTINEL;
+ break;
+ case UCHAR_CASE_FOLDING:
+ props.cf.remove();
+ break;
+ case UCHAR_LOWERCASE_MAPPING:
+ props.lc.remove();
+ break;
+ case UCHAR_TITLECASE_MAPPING:
+ props.tc.remove();
+ break;
+ case UCHAR_UPPERCASE_MAPPING:
+ props.uc.remove();
+ break;
+ case UCHAR_SCRIPT_EXTENSIONS:
+ props.scx.clear();
+ break;
+ default:
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ }
+ } else {
+ char c;
+ switch(prop) {
+ case UCHAR_NUMERIC_VALUE:
+ props.numericValue=v;
+ c=*v;
+ if('0'<=c && c<='9' && v[1]==0) {
+ props.digitValue=c-'0';
+ } else {
+ props.digitValue=-1;
+ }
+ break;
+ case UCHAR_NAME:
+ props.name=v;
+ break;
+ case UCHAR_AGE:
+ u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
+ break;
+ case UCHAR_BIDI_MIRRORING_GLYPH:
+ props.bmg=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_BIDI_PAIRED_BRACKET:
+ props.bpb=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_CASE_FOLDING:
+ props.scf=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_LOWERCASE_MAPPING:
+ props.slc=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_TITLECASE_MAPPING:
+ props.stc=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_UPPERCASE_MAPPING:
+ props.suc=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_CASE_FOLDING:
+ parseString(v, props.cf, errorCode);
+ break;
+ case UCHAR_LOWERCASE_MAPPING:
+ parseString(v, props.lc, errorCode);
+ break;
+ case UCHAR_TITLECASE_MAPPING:
+ parseString(v, props.tc, errorCode);
+ break;
+ case UCHAR_UPPERCASE_MAPPING:
+ parseString(v, props.uc, errorCode);
+ break;
+ case PPUCD_NAME_ALIAS:
+ props.nameAlias=v;
+ break;
+ case PPUCD_CONDITIONAL_CASE_MAPPINGS:
+ case PPUCD_TURKIC_CASE_FOLDING:
+ // No need to parse their values: They are hardcoded in the runtime library.
+ break;
+ case UCHAR_SCRIPT_EXTENSIONS:
+ parseScriptExtensions(v, props.scx, errorCode);
+ break;
+ default:
+ // Ignore unhandled properties.
+ return true;
+ }
+ }
+ if(U_SUCCESS(errorCode)) {
+ newValues.add((UChar32)prop);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+UBool
+PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return false; }
+ if(lineType!=ALG_NAMES_RANGE_LINE) {
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return false;
+ }
+ firstField();
+ const char *field=nextField();
+ if(field==nullptr) {
+ // No range field after the type.
+ fprintf(stderr,
+ "error in preparsed UCD: missing algnamesrange range field "
+ "(no second field) on line %ld\n",
+ (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return false;
+ }
+ return parseCodePointRange(field, start, end, errorCode);
+}
+
+UChar32
+PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
+ char *end;
+ uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
+ if(end<=s || *end!=0 || value>=0x110000) {
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
+ s, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return U_SENTINEL;
+ }
+ return (UChar32)value;
+}
+
+UBool
+PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
+ uint32_t st, e;
+ u_parseCodePointRange(s, &st, &e, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
+ s, (long)lineNumber);
+ return false;
+ }
+ start=(UChar32)st;
+ end=(UChar32)e;
+ return true;
+}
+
+void
+PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
+ char16_t *buffer=toUCharPtr(uni.getBuffer(-1));
+ int32_t length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode);
+ if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+ errorCode=U_ZERO_ERROR;
+ uni.releaseBuffer(0);
+ buffer=toUCharPtr(uni.getBuffer(length));
+ length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode);
+ }
+ uni.releaseBuffer(length);
+ if(U_FAILURE(errorCode)) {
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
+ s, (long)lineNumber);
+ }
+}
+
+void
+PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ scx.clear();
+ CharString scString;
+ for(;;) {
+ const char *scs;
+ const char *scLimit=strchr(s, ' ');
+ if(scLimit!=nullptr) {
+ scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
+ if(U_FAILURE(errorCode)) { return; }
+ } else {
+ scs=s;
+ }
+ int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
+ if(script==UCHAR_INVALID_CODE) {
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
+ scs, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return;
+ } else if(scx.contains(script)) {
+ fprintf(stderr,
+ "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
+ scs, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return;
+ } else {
+ scx.add(script);
+ }
+ if(scLimit!=nullptr) {
+ s=scLimit+1;
+ } else {
+ break;
+ }
+ }
+ if(scx.isEmpty()) {
+ fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ }
+}
+
+U_NAMESPACE_END