1 files changed, 622 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/ppucd.cpp b/intl/icu/source/tools/toolutil/ppucd.cpp
new file mode 100644
index 0000000000..0d59b28ce4
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ppucd.cpp
@@ -0,0 +1,622 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*   Copyright (C) 2011-2014, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*******************************************************************************
+*   file name:  ppucd.cpp
+*   encoding:   UTF-8
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2011dec11
+*   created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uchar.h"
+#include "charstr.h"
+#include "cstring.h"
+#include "ppucd.h"
+#include "uassert.h"
+#include "uparse.h"
+
+#include <stdio.h>
+#include <string.h>
+
+U_NAMESPACE_BEGIN
+
+PropertyNames::~PropertyNames() = default;  // Defined out of line, in this file.
+
+// TODO: Create a concrete subclass for the default PropertyNames implementation
+// using the ICU library built-in property names API & data.
+// Currently only the genprops tool uses PreparsedUCD, and provides its own
+// PropertyNames implementation using its just-built property names data and its own code.
+// At some point, we should use PreparsedUCD in tests, and then we will need the
+// default implementation somewhere.
+#if 0  // Disabled: placeholder default implementation, see the TODO above.
+int32_t
+PropertyNames::getPropertyEnum(const char *name) const {
+    return u_getPropertyEnum(name);  // Forwards to the ICU library API.
+}
+
+int32_t
+PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
+    return u_getPropertyValueEnum((UProperty)property, name);  // Forwards to the ICU library API.
+}
+#endif  // 0
+
+// Starts with no range and no code point mappings (U_SENTINEL), no digit value,
+// null string pointers, and all-zero binary/integer properties and age.
+UniProps::UniProps()
+        : start(U_SENTINEL), end(U_SENTINEL), bmg(U_SENTINEL), bpb(U_SENTINEL),
+          scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
+          digitValue(-1), numericValue(nullptr), name(nullptr), nameAlias(nullptr) {
+    memset(age, 0, 4);
+    memset(intProps, 0, sizeof(intProps));
+    memset(binProps, 0, sizeof(binProps));
+}
+
+UniProps::~UniProps() = default;  // Defined out of line, in this file.
+
+const int32_t PreparsedUCD::kNumLineBuffers;  // Namespace-scope definition; no initializer is given here.
+
+// Opens the preparsed UCD file for reading; a null, empty or "-" filename selects stdin.
+// Sets errorCode to U_FILE_ACCESS_ERROR if the file cannot be opened.
+PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
+        : pnames(nullptr),
+          file(nullptr),
+          defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
+          lineNumber(0),
+          lineType(NO_LINE),
+          fieldLimit(nullptr), lineLimit(nullptr) {
+    // Initialize the buffers first so that they are defined on every exit path.
+    memset(ucdVersion, 0, 4);
+    lines[0][0]=0;
+    if(U_FAILURE(errorCode)) { return; }
+    if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) {
+        filename=nullptr;
+        file=stdin;
+    } else {
+        file=fopen(filename, "r");
+    }
+    if(file==nullptr) {
+        perror("error opening preparsed UCD");
+        fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
+        errorCode=U_FILE_ACCESS_ERROR;
+    }
+}
+
+// Closes the input file unless it is stdin or was never successfully opened.
+PreparsedUCD::~PreparsedUCD() {
+    // fclose(nullptr) has undefined behavior, so skip it after a failed fopen().
+    if(file!=nullptr && file!=stdin) { fclose(file); }
+}
+
+// Line type keywords, in the same order as the LineType values; the table is never modified.
+static const char *const lineTypeStrings[]={
+    nullptr,  // Entries up to EMPTY_LINE have no keyword;
+    nullptr,  // readLine() starts matching after them.
+    "ucd",
+    "property",
+    "binary",
+    "value",
+    "defaults",
+    "block",
+    "cp",
+    "unassigned",
+    "algnamesrange"
+};
+
+PreparsedUCD::LineType  // Reads the next line, splits it into NUL-terminated fields, returns its type.
+PreparsedUCD::readLine(UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return NO_LINE; }
+    // Select the next available line buffer, wrapping around after the last one.
+    while(!isLineBufferAvailable(lineIndex)) {
+        ++lineIndex;
+        if (lineIndex == kNumLineBuffers) {
+            lineIndex = 0;
+        }
+    }
+    char *line=lines[lineIndex];
+    *line=0;
+    lineLimit=fieldLimit=line;
+    lineType=NO_LINE;
+    char *result=fgets(line, sizeof(lines[0]), file);  // Reads at most one buffer's worth.
+    if(result==nullptr) {
+        if(ferror(file)) {
+            perror("error reading preparsed UCD");
+            fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
+            errorCode=U_FILE_ACCESS_ERROR;
+        }
+        return NO_LINE;  // Nothing was read: end of input, or the read error reported above.
+    }
+    ++lineNumber;
+    if(*line=='#') {  // A line starting with '#' is reported as an empty line.
+        fieldLimit=strchr(line, 0);
+        return lineType=EMPTY_LINE;
+    }
+    // Remove trailing \r\n.
+    char c;
+    char *limit=strchr(line, 0);
+    while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
+    // Remove trailing white space.
+    while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
+    *limit=0;
+    lineLimit=limit;
+    if(line==limit) {
+        fieldLimit=limit;
+        return lineType=EMPTY_LINE;
+    }
+    // Split by ';': each separator is overwritten with a NUL field terminator.
+    char *semi=line;
+    while((semi=strchr(semi, ';'))!=nullptr) { *semi++=0; }
+    fieldLimit=strchr(line, 0);  // End of the first field.
+    // Determine the line type by comparing the first field with the known keywords.
+    int32_t type;
+    for(type=EMPTY_LINE+1;; ++type) {
+        if(type==LINE_TYPE_COUNT) {
+            fprintf(stderr,
+                    "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
+                    line, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return NO_LINE;
+        }
+        if(0==strcmp(line, lineTypeStrings[type])) {
+            break;
+        }
+    }
+    lineType=(LineType)type;
+    if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {  // There is a field after the type.
+        u_versionFromString(ucdVersion, fieldLimit+1);
+    }
+    return lineType;
+}
+
+// Returns the first field of the current line and resets fieldLimit to its end.
+const char *PreparsedUCD::firstField() {
+    char *const lineStart=lines[lineIndex];
+    fieldLimit=lineStart+strlen(lineStart);  // Points at the field's terminating NUL.
+    return lineStart;
+}
+
+// Advances to the field after the current one; returns nullptr when none is left.
+const char *PreparsedUCD::nextField() {
+    if(fieldLimit==lineLimit) { return nullptr; }
+    char *const next=fieldLimit+1;  // Skip the NUL that terminates the current field.
+    fieldLimit=next+strlen(next);
+    return next;
+}
+
+const UniProps *  // Parses the current defaults/block/cp/unassigned line; returns nullptr on error.
+PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return nullptr; }
+    newValues.clear();
+    if(!lineHasPropertyValues()) {
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    firstField();  // The line type; the range is expected in the next field.
+    const char *field=nextField();
+    if(field==nullptr) {
+        // No range field after the type.
+        fprintf(stderr,
+                "error in preparsed UCD: missing default/block/cp range field "
+                "(no second field) on line %ld\n",
+                (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+        return nullptr;
+    }
+    UChar32 start, end;
+    if(!parseCodePointRange(field, start, end, errorCode)) { return nullptr; }
+    UniProps *props;  // The properties object that this line's fields are parsed into.
+    UBool insideBlock=false;  // true if cp or unassigned range inside the block range.
+    switch(lineType) {
+    case DEFAULTS_LINE:
+        // Should occur before any block/cp/unassigned line.
+        if(blockLineIndex>=0) {
+            fprintf(stderr,
+                    "error in preparsed UCD: default line %ld after one or more block lines\n",
+                    (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return nullptr;
+        }
+        if(defaultLineIndex>=0) {
+            fprintf(stderr,
+                    "error in preparsed UCD: second line with default properties on line %ld\n",
+                    (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return nullptr;
+        }
+        if(start!=0 || end!=0x10ffff) {
+            fprintf(stderr,
+                    "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
+                    field, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return nullptr;
+        }
+        props=&defaultProps;
+        defaultLineIndex=lineIndex;
+        break;
+    case BLOCK_LINE:
+        blockProps=defaultProps;  // Block inherits default properties.
+        props=&blockProps;
+        blockLineIndex=lineIndex;
+        break;
+    case CP_LINE:
+    case UNASSIGNED_LINE:
+        if(blockProps.start<=start && end<=blockProps.end) {
+            insideBlock=true;
+            if(lineType==CP_LINE) {
+                // Code point range fully inside the last block inherits the block properties.
+                cpProps=blockProps;
+            } else {
+                // Unassigned line inside the block is based on default properties
+                // which override block properties.
+                cpProps=defaultProps;
+                newValues=blockValues;
+                // Except, it inherits the one blk=Block property.
+                int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
+                cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
+                newValues.remove((UChar32)UCHAR_BLOCK);
+            }
+        } else if(start>blockProps.end || end<blockProps.start) {
+            // Code point range fully outside the last block inherits the default properties.
+            cpProps=defaultProps;
+        } else {
+            // Code point range partially overlapping with the last block is illegal.
+            fprintf(stderr,
+                    "error in preparsed UCD: cp range %s on line %ld only "
+                    "partially overlaps with block range %04lX..%04lX\n",
+                    field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
+            errorCode=U_PARSE_ERROR;
+            return nullptr;
+        }
+        props=&cpProps;
+        break;
+    default:
+        // Will not occur because of the range check above (presumably lineHasPropertyValues()).
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    props->start=start;
+    props->end=end;
+    while((field=nextField())!=nullptr) {  // Each remaining field is parsed as one property.
+        if(!parseProperty(*props, field, newValues, errorCode)) { return nullptr; }
+    }
+    if(lineType==BLOCK_LINE) {
+        blockValues=newValues;  // Kept for later unassigned lines inside this block.
+    } else if(lineType==UNASSIGNED_LINE && insideBlock) {
+        // Unset newValues for values that are the same as the block values.
+        for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
+            if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
+                newValues.remove(prop);
+            }
+        }
+        for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
+            int32_t index=prop-UCHAR_INT_START;
+            if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
+                newValues.remove(prop);
+            }
+        }
+    }
+    return props;
+}
+
+static const struct {  // Names that parseProperty() tries when pnames does not know a property name.
+    const char *name;
+    int32_t prop;
+} ppucdProperties[]={
+    { "Name_Alias", PPUCD_NAME_ALIAS },
+    { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
+    { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
+};
+
+// Parses one property field into props and adds the property to newValues. Returns true for "ok to continue parsing fields".
+UBool
+PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
+                            UErrorCode &errorCode) {
+    CharString pBuffer;
+    const char *p=field;  // Property name.
+    const char *v=strchr(p, '=');  // Value after '=', or nullptr if there is no '='.
+    int binaryValue;  // 0 for -Name, 1 for Name, -1 for Name=Value syntax.
+    if(*p=='-') {
+        if(v!=nullptr) {
+            fprintf(stderr,
+                    "error in preparsed UCD: mix of binary-property-no and "
+                    "enum-property syntax '%s' on line %ld\n",
+                    field, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return false;
+        }
+        binaryValue=0;
+        ++p;
+    } else if(v==nullptr) {
+        binaryValue=1;
+    } else {
+        binaryValue=-1;
+        // Copy out the property name rather than modifying the field (writing a NUL).
+        pBuffer.append(p, (int32_t)(v-p), errorCode);
+        p=pBuffer.data();
+        ++v;
+    }
+    int32_t prop=pnames->getPropertyEnum(p);
+    if(prop<0) {  // Not known to pnames: try the names in ppucdProperties.
+        for(int32_t i=0;; ++i) {
+            if(i==UPRV_LENGTHOF(ppucdProperties)) {
+                // Ignore unknown property names.
+                return true;
+            }
+            if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
+                prop=ppucdProperties[i].prop;
+                U_ASSERT(prop>=0);
+                break;
+            }
+        }
+    }
+    if(prop<UCHAR_BINARY_LIMIT) {
+        if(binaryValue>=0) {
+            props.binProps[prop]=(UBool)binaryValue;
+        } else {
+            // No binary value for a binary property.
+            fprintf(stderr,
+                    "error in preparsed UCD: enum-property syntax '%s' "
+                    "for binary property on line %ld\n",
+                    field, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+        }
+    } else if(binaryValue>=0) {
+        // Binary value for a non-binary property.
+        fprintf(stderr,
+                "error in preparsed UCD: binary-property syntax '%s' "
+                "for non-binary property on line %ld\n",
+                field, (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+    } else if (prop < UCHAR_INT_START) {  // Between the binary and the integer property ranges.
+        fprintf(stderr,
+                "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
+                prop, (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+    } else if(prop<UCHAR_INT_LIMIT) {
+        int32_t value=pnames->getPropertyValueEnum(prop, v);
+        if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
+            // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
+            char *end;
+            unsigned long ccc=uprv_strtoul(v, &end, 10);
+            if(v<end && *end==0 && ccc<=254) {  // All of v is a decimal number of at most 254.
+                value=(int32_t)ccc;
+            }
+        }
+        if(value==UCHAR_INVALID_CODE) {
+            fprintf(stderr,
+                    "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
+                    field, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+        } else {
+            props.intProps[prop-UCHAR_INT_START]=value;
+        }
+    } else if(*v=='<') {
+        // Do not parse default values like <code point>, just set null values.
+        switch(prop) {
+        case UCHAR_BIDI_MIRRORING_GLYPH:
+            props.bmg=U_SENTINEL;
+            break;
+        case UCHAR_BIDI_PAIRED_BRACKET:
+            props.bpb=U_SENTINEL;
+            break;
+        case UCHAR_SIMPLE_CASE_FOLDING:
+            props.scf=U_SENTINEL;
+            break;
+        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
+            props.slc=U_SENTINEL;
+            break;
+        case UCHAR_SIMPLE_TITLECASE_MAPPING:
+            props.stc=U_SENTINEL;
+            break;
+        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
+            props.suc=U_SENTINEL;
+            break;
+        case UCHAR_CASE_FOLDING:
+            props.cf.remove();
+            break;
+        case UCHAR_LOWERCASE_MAPPING:
+            props.lc.remove();
+            break;
+        case UCHAR_TITLECASE_MAPPING:
+            props.tc.remove();
+            break;
+        case UCHAR_UPPERCASE_MAPPING:
+            props.uc.remove();
+            break;
+        case UCHAR_SCRIPT_EXTENSIONS:
+            props.scx.clear();
+            break;
+        default:
+            fprintf(stderr,
+                    "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
+                    field, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+        }
+    } else {
+        char c;
+        switch(prop) {
+        case UCHAR_NUMERIC_VALUE:
+            props.numericValue=v;  // Stores the pointer into the field text; the string is not copied.
+            c=*v;
+            if('0'<=c && c<='9' && v[1]==0) {  // A single decimal digit.
+                props.digitValue=c-'0';
+            } else {
+                props.digitValue=-1;
+            }
+            break;
+        case UCHAR_NAME:
+            props.name=v;
+            break;
+        case UCHAR_AGE:
+            u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
+            break;
+        case UCHAR_BIDI_MIRRORING_GLYPH:
+            props.bmg=parseCodePoint(v, errorCode);
+            break;
+        case UCHAR_BIDI_PAIRED_BRACKET:
+            props.bpb=parseCodePoint(v, errorCode);
+            break;
+        case UCHAR_SIMPLE_CASE_FOLDING:
+            props.scf=parseCodePoint(v, errorCode);
+            break;
+        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
+            props.slc=parseCodePoint(v, errorCode);
+            break;
+        case UCHAR_SIMPLE_TITLECASE_MAPPING:
+            props.stc=parseCodePoint(v, errorCode);
+            break;
+        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
+            props.suc=parseCodePoint(v, errorCode);
+            break;
+        case UCHAR_CASE_FOLDING:
+            parseString(v, props.cf, errorCode);
+            break;
+        case UCHAR_LOWERCASE_MAPPING:
+            parseString(v, props.lc, errorCode);
+            break;
+        case UCHAR_TITLECASE_MAPPING:
+            parseString(v, props.tc, errorCode);
+            break;
+        case UCHAR_UPPERCASE_MAPPING:
+            parseString(v, props.uc, errorCode);
+            break;
+        case PPUCD_NAME_ALIAS:
+            props.nameAlias=v;
+            break;
+        case PPUCD_CONDITIONAL_CASE_MAPPINGS:
+        case PPUCD_TURKIC_CASE_FOLDING:
+            // No need to parse their values: They are hardcoded in the runtime library.
+            break;
+        case UCHAR_SCRIPT_EXTENSIONS:
+            parseScriptExtensions(v, props.scx, errorCode);
+            break;
+        default:
+            // Ignore unhandled properties; they are not added to newValues.
+            return true;
+        }
+    }
+    if(U_SUCCESS(errorCode)) {
+        newValues.add((UChar32)prop);
+        return true;
+    } else {
+        return false;
+    }
+}
+
+// Parses the code point range of the current algnamesrange line into start and end.
+UBool PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return false; }
+    if(lineType!=ALG_NAMES_RANGE_LINE) {
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return false;
+    }
+    firstField();
+    const char *const range=nextField();
+    if(range!=nullptr) {
+        return parseCodePointRange(range, start, end, errorCode);
+    }
+    // The line ends right after the type field.
+    fprintf(stderr,
+            "error in preparsed UCD: missing algnamesrange range field "
+            "(no second field) on line %ld\n",
+            (long)lineNumber);
+    errorCode=U_PARSE_ERROR;
+    return false;
+}
+
+// Parses s as a hexadecimal code point; on failure sets U_PARSE_ERROR and returns U_SENTINEL.
+UChar32 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
+    char *end;
+    // Range-check the full unsigned long: narrowing to 32 bits first would accept e.g. "100000041".
+    unsigned long value=uprv_strtoul(s, &end, 16);
+    if(end<=s || *end!=0 || value>=0x110000) {
+        fprintf(stderr, "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
+                s, (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+        return U_SENTINEL;
+    }
+    return (UChar32)value;
+}
+
+// Parses s via u_parseCodePointRange(); returns false if errorCode is a failure afterwards.
+UBool PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
+    uint32_t first, last;
+    u_parseCodePointRange(s, &first, &last, &errorCode);
+    if(U_SUCCESS(errorCode)) {
+        start=(UChar32)first;
+        end=(UChar32)last;
+        return true;
+    }
+    fprintf(stderr,
+            "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
+            s, (long)lineNumber);
+    return false;
+}
+
+// Parses s with u_parseString() into uni, retrying once with a larger buffer on overflow.
+void PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
+    char16_t *dest=toUCharPtr(uni.getBuffer(-1));
+    int32_t parsedLength=u_parseString(s, dest, uni.getCapacity(), nullptr, &errorCode);
+    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+        // Retry with the length returned by the first pass as the requested capacity.
+        errorCode=U_ZERO_ERROR;
+        uni.releaseBuffer(0);
+        dest=toUCharPtr(uni.getBuffer(parsedLength));
+        parsedLength=u_parseString(s, dest, uni.getCapacity(), nullptr, &errorCode);
+    }
+    uni.releaseBuffer(parsedLength);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
+                s, (long)lineNumber);
+    }
+}
+
+// Parses a space-separated list of script codes into scx.
+// Sets U_PARSE_ERROR for an unknown or repeated script code, or an empty result.
+void
+PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return; }
+    scx.clear();
+    CharString codeBuffer;
+    const char *separator;
+    do {
+        separator=strchr(s, ' ');
+        const char *code=s;
+        if(separator!=nullptr) {
+            // Copy the code out because it is not NUL-terminated inside s.
+            code=codeBuffer.clear().append(s, (int32_t)(separator-s), errorCode).data();
+            if(U_FAILURE(errorCode)) { return; }
+            s=separator+1;
+        }
+        int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, code);
+        if(script==UCHAR_INVALID_CODE) {
+            fprintf(stderr,
+                    "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
+                    code, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return;
+        }
+        // Each script code may be listed only once.
+        if(scx.contains(script)) {
+            fprintf(stderr,
+                    "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
+                    code, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return;
+        }
+        scx.add(script);
+    } while(separator!=nullptr);
+    // Every completed iteration called scx.add(), so this is only a safeguard.
+    if(scx.isEmpty()) {
+        fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+    }
+}
+
+U_NAMESPACE_END