// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2011-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ppucd.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2011dec11 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #include "unicode/uchar.h" #include "charstr.h" #include "cstring.h" #include "ppucd.h" #include "uassert.h" #include "uparse.h" #include #include U_NAMESPACE_BEGIN PropertyNames::~PropertyNames() {} // TODO: Create a concrete subclass for the default PropertyNames implementation // using the ICU library built-in property names API & data. // Currently only the genprops tool uses PreparsedUCD, and provides its own // PropertyNames implementation using its just-built property names data and its own code. // At some point, we should use PreparsedUCD in tests, and then we will need the // default implementation somewhere. 
// NOTE(review): the line breaks of the line above appear to have been lost, so everything after
// its leading comment marker (the includes, U_NAMESPACE_BEGIN and the empty out-of-line
// PropertyNames destructor) is currently comment text rather than code.
// NOTE(review): the last two #include directives on that line carry no header name. The rest of
// this file calls fopen, fgets, perror, fprintf, memset and strchr, so the lost names were
// presumably the C stdio and string headers -- confirm against a pristine copy of ppucd.cpp.
// NOTE(review): the line breaks of this file appear to have been collapsed, so each line comment
// now also comments out whatever code follows it on the same physical line. In addition, text
// between an opening angle bracket and the next closing angle bracket seems to be missing in
// several places (see the notes further down). The code is left exactly as found; it should be
// restored from a pristine copy of ppucd.cpp before it is built.
//
// The next physical line holds, in order:
// - the disabled (#if 0) PropertyNames::getPropertyEnum and getPropertyValueEnum, which forward
//   to u_getPropertyEnum and u_getPropertyValueEnum;
// - the UniProps constructor: start, end, bmg, bpb, scf, slc, stc and suc begin as U_SENTINEL,
//   digitValue as -1, numericValue, name and nameAlias as nullptr, and binProps, intProps and
//   age are zero-filled; the UniProps destructor is empty;
// - the PreparsedUCD constructor: a null filename, an empty one, or a single hyphen selects
//   stdin; anything else is opened with fopen in read mode; a null FILE pointer is reported on
//   stderr and sets U_FILE_ACCESS_ERROR;
// - the PreparsedUCD destructor, which closes the file unless it is stdin.
//   NOTE(review): when fopen failed in the constructor, file is nullptr and is still passed
//   to fclose here -- confirm whether a null check is wanted;
// - lineTypeStrings, commented as being in LineType order (its first two entries are nullptr);
// - the start of PreparsedUCD::readLine, which returns NO_LINE if errorCode already failed.
#if 0 int32_t PropertyNames::getPropertyEnum(const char *name) const { return u_getPropertyEnum(name); } int32_t PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { return u_getPropertyValueEnum((UProperty)property, name); } #endif UniProps::UniProps() : start(U_SENTINEL), end(U_SENTINEL), bmg(U_SENTINEL), bpb(U_SENTINEL), scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), digitValue(-1), numericValue(nullptr), name(nullptr), nameAlias(nullptr) { memset(binProps, 0, sizeof(binProps)); memset(intProps, 0, sizeof(intProps)); memset(age, 0, 4); } UniProps::~UniProps() {} const int32_t PreparsedUCD::kNumLineBuffers; PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) : pnames(nullptr), file(nullptr), defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), lineNumber(0), lineType(NO_LINE), fieldLimit(nullptr), lineLimit(nullptr) { if(U_FAILURE(errorCode)) { return; } if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) { filename=nullptr; file=stdin; } else { file=fopen(filename, "r"); } if(file==nullptr) { perror("error opening preparsed UCD"); fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\""); errorCode=U_FILE_ACCESS_ERROR; return; } memset(ucdVersion, 0, 4); lines[0][0]=0; } PreparsedUCD::~PreparsedUCD() { if(file!=stdin) { fclose(file); } } // Same order as the LineType values. static const char *lineTypeStrings[]={ nullptr, nullptr, "ucd", "property", "binary", "value", "defaults", "block", "cp", "unassigned", "algnamesrange" }; PreparsedUCD::LineType PreparsedUCD::readLine(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return NO_LINE; } // Select the next available line buffer. 
// Remainder of readLine as far as it survives: lineIndex is advanced, wrapping to 0 at
// kNumLineBuffers, until isLineBufferAvailable accepts it; one line is read with fgets into
// that buffer; a null result returns NO_LINE and sets U_FILE_ACCESS_ERROR only if ferror
// reports an error; otherwise lineNumber is incremented and a line starting with a hash sign
// is returned as EMPTY_LINE.
// NOTE(review): the text following the loop header that starts with while(line is lost. What
// comes next belongs to a different function whose header is missing -- presumably the one
// that fills a UniProps from the current line; confirm against the pristine file.
// The surviving part of that function rejects, with U_PARSE_ERROR, a default line after block
// lines, a second default line, and a default range other than 0..10FFFF, then selects
// defaultProps and records defaultLineIndex. A block line copies defaultProps into blockProps
// and records blockLineIndex. A cp or unassigned line whose range lies inside the block range
// starts from blockProps (cp) or from defaultProps with newValues=blockValues (unassigned).
while(!isLineBufferAvailable(lineIndex)) { ++lineIndex; if (lineIndex == kNumLineBuffers) { lineIndex = 0; } } char *line=lines[lineIndex]; *line=0; lineLimit=fieldLimit=line; lineType=NO_LINE; char *result=fgets(line, sizeof(lines[0]), file); if(result==nullptr) { if(ferror(file)) { perror("error reading preparsed UCD"); fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); errorCode=U_FILE_ACCESS_ERROR; } return NO_LINE; } ++lineNumber; if(*line=='#') { fieldLimit=strchr(line, 0); return lineType=EMPTY_LINE; } // Remove trailing /r/n. char c; char *limit=strchr(line, 0); while(line=0) { fprintf(stderr, "error in preparsed UCD: default line %ld after one or more block lines\n", (long)lineNumber); errorCode=U_PARSE_ERROR; return nullptr; } if(defaultLineIndex>=0) { fprintf(stderr, "error in preparsed UCD: second line with default properties on line %ld\n", (long)lineNumber); errorCode=U_PARSE_ERROR; return nullptr; } if(start!=0 || end!=0x10ffff) { fprintf(stderr, "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", field, (long)lineNumber); errorCode=U_PARSE_ERROR; return nullptr; } props=&defaultProps; defaultLineIndex=lineIndex; break; case BLOCK_LINE: blockProps=defaultProps; // Block inherits default properties. props=&blockProps; blockLineIndex=lineIndex; break; case CP_LINE: case UNASSIGNED_LINE: if(blockProps.start<=start && end<=blockProps.end) { insideBlock=true; if(lineType==CP_LINE) { // Code point range fully inside the last block inherits the block properties. cpProps=blockProps; } else { // Unassigned line inside the block is based on default properties // which override block properties. cpProps=defaultProps; newValues=blockValues; // Except, it inherits the one blk=Block property. 
// Continuation of the unassigned-line case: the UCHAR_BLOCK entry of intProps is copied from
// blockProps and UCHAR_BLOCK is removed from newValues.
// NOTE(review): text is lost inside the following else-if condition and again inside the
// header of the for loop over prop, so the end of this function and the header of the next
// one are missing. Further text is lost in the later conditions that start with if(prop,
// else if(prop and if(v.
// The surviving middle part stores start and end in props, passes every remaining field to
// parseProperty (returning nullptr when it fails), and saves newValues as blockValues for a
// block line.
// The later fragments resolve a property name through pnames, fall back to a case-insensitive
// search of ppucdProperties, return true for unknown names, and set U_PARSE_ERROR when binary
// and non-binary value syntax do not match the property or when prop is below UCHAR_INT_START.
// For UCHAR_CANONICAL_COMBINING_CLASS a failed value lookup is retried as a decimal number
// via uprv_strtoul.
int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START; cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex]; newValues.remove((UChar32)UCHAR_BLOCK); } } else if(start>blockProps.end || endstart=start; props->end=end; while((field=nextField())!=nullptr) { if(!parseProperty(*props, field, newValues, errorCode)) { return nullptr; } } if(lineType==BLOCK_LINE) { blockValues=newValues; } else if(lineType==UNASSIGNED_LINE && insideBlock) { // Unset newValues for values that are the same as the block values. for(int32_t prop=0; propgetPropertyEnum(p); if(prop<0) { for(int32_t i=0;; ++i) { if(i==UPRV_LENGTHOF(ppucdProperties)) { // Ignore unknown property names. return true; } if(0==uprv_stricmp(p, ppucdProperties[i].name)) { prop=ppucdProperties[i].prop; U_ASSERT(prop>=0); break; } } } if(prop=0) { props.binProps[prop]=(UBool)binaryValue; } else { // No binary value for a binary property. fprintf(stderr, "error in preparsed UCD: enum-property syntax '%s' " "for binary property on line %ld\n", field, (long)lineNumber); errorCode=U_PARSE_ERROR; } } else if(binaryValue>=0) { // Binary value for a non-binary property. fprintf(stderr, "error in preparsed UCD: binary-property syntax '%s' " "for non-binary property on line %ld\n", field, (long)lineNumber); errorCode=U_PARSE_ERROR; } else if (prop < UCHAR_INT_START) { fprintf(stderr, "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", prop, (long)lineNumber); errorCode=U_PARSE_ERROR; } else if(propgetPropertyValueEnum(prop, v); if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. char *end; unsigned long ccc=uprv_strtoul(v, &end, 10); if(v, just set null values. 
// First switch below: resets the named property to its null value (U_SENTINEL for bmg, bpb,
// scf, slc, stc and suc; remove() for cf, lc, tc and uc; clear() for scx); any other property
// is reported and sets U_PARSE_ERROR.
// Second switch (continued on the next physical line): parses an explicit value.
// UCHAR_NUMERIC_VALUE keeps the string pointer and sets digitValue only when the value is a
// single ASCII digit, otherwise -1. UCHAR_NAME keeps the string pointer. UCHAR_AGE goes
// through u_versionFromString.
switch(prop) { case UCHAR_BIDI_MIRRORING_GLYPH: props.bmg=U_SENTINEL; break; case UCHAR_BIDI_PAIRED_BRACKET: props.bpb=U_SENTINEL; break; case UCHAR_SIMPLE_CASE_FOLDING: props.scf=U_SENTINEL; break; case UCHAR_SIMPLE_LOWERCASE_MAPPING: props.slc=U_SENTINEL; break; case UCHAR_SIMPLE_TITLECASE_MAPPING: props.stc=U_SENTINEL; break; case UCHAR_SIMPLE_UPPERCASE_MAPPING: props.suc=U_SENTINEL; break; case UCHAR_CASE_FOLDING: props.cf.remove(); break; case UCHAR_LOWERCASE_MAPPING: props.lc.remove(); break; case UCHAR_TITLECASE_MAPPING: props.tc.remove(); break; case UCHAR_UPPERCASE_MAPPING: props.uc.remove(); break; case UCHAR_SCRIPT_EXTENSIONS: props.scx.clear(); break; default: fprintf(stderr, "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", field, (long)lineNumber); errorCode=U_PARSE_ERROR; } } else { char c; switch(prop) { case UCHAR_NUMERIC_VALUE: props.numericValue=v; c=*v; if('0'<=c && c<='9' && v[1]==0) { props.digitValue=c-'0'; } else { props.digitValue=-1; } break; case UCHAR_NAME: props.name=v; break; case UCHAR_AGE: u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. 
// Rest of the value switch: single code point properties go through parseCodePoint, the full
// case mappings through parseString, scx through parseScriptExtensions; PPUCD_NAME_ALIAS keeps
// the string pointer; the two PPUCD_ case-mapping pseudo-properties are accepted without
// parsing; unhandled properties return true without being added to newValues.
// When errorCode is still a success the property is added to newValues and true is returned,
// otherwise false.
// Then the start of getRangeForAlgNames: it returns false on a prior failure, requires
// lineType==ALG_NAMES_RANGE_LINE (else U_ILLEGAL_ARGUMENT_ERROR), and takes the field after
// the first one as the range.
break; case UCHAR_BIDI_MIRRORING_GLYPH: props.bmg=parseCodePoint(v, errorCode); break; case UCHAR_BIDI_PAIRED_BRACKET: props.bpb=parseCodePoint(v, errorCode); break; case UCHAR_SIMPLE_CASE_FOLDING: props.scf=parseCodePoint(v, errorCode); break; case UCHAR_SIMPLE_LOWERCASE_MAPPING: props.slc=parseCodePoint(v, errorCode); break; case UCHAR_SIMPLE_TITLECASE_MAPPING: props.stc=parseCodePoint(v, errorCode); break; case UCHAR_SIMPLE_UPPERCASE_MAPPING: props.suc=parseCodePoint(v, errorCode); break; case UCHAR_CASE_FOLDING: parseString(v, props.cf, errorCode); break; case UCHAR_LOWERCASE_MAPPING: parseString(v, props.lc, errorCode); break; case UCHAR_TITLECASE_MAPPING: parseString(v, props.tc, errorCode); break; case UCHAR_UPPERCASE_MAPPING: parseString(v, props.uc, errorCode); break; case PPUCD_NAME_ALIAS: props.nameAlias=v; break; case PPUCD_CONDITIONAL_CASE_MAPPINGS: case PPUCD_TURKIC_CASE_FOLDING: // No need to parse their values: They are hardcoded in the runtime library. break; case UCHAR_SCRIPT_EXTENSIONS: parseScriptExtensions(v, props.scx, errorCode); break; default: // Ignore unhandled properties. return true; } } if(U_SUCCESS(errorCode)) { newValues.add((UChar32)prop); return true; } else { return false; } } UBool PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return false; } if(lineType!=ALG_NAMES_RANGE_LINE) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; return false; } firstField(); const char *field=nextField(); if(field==nullptr) { // No range field after the type. 
// getRangeForAlgNames ends by reporting a missing range field with U_PARSE_ERROR, or by
// returning the result of parseCodePointRange.
// parseCodePoint: parses s as hexadecimal with uprv_strtoul; input with no digits, trailing
// characters, or a value of 0x110000 or more sets U_PARSE_ERROR and returns U_SENTINEL.
// parseCodePointRange: wraps u_parseCodePointRange, prints a message and returns false on
// failure, otherwise writes start and end and returns true.
// parseString: parses into the buffer of the UnicodeString with u_parseString and retries once
// with a buffer of the reported length after U_BUFFER_OVERFLOW_ERROR; a failure is reported on
// stderr.
// parseScriptExtensions (continued on the next physical line): clears scx, then walks s, taking
// the text up to each space character as one script code; pieces that are followed by a space
// are copied into a CharString so that they are NUL-terminated.
fprintf(stderr, "error in preparsed UCD: missing algnamesrange range field " "(no second field) on line %ld\n", (long)lineNumber); errorCode=U_PARSE_ERROR; return false; } return parseCodePointRange(field, start, end, errorCode); } UChar32 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { char *end; uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); if(end<=s || *end!=0 || value>=0x110000) { fprintf(stderr, "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", s, (long)lineNumber); errorCode=U_PARSE_ERROR; return U_SENTINEL; } return (UChar32)value; } UBool PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { uint32_t st, e; u_parseCodePointRange(s, &st, &e, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", s, (long)lineNumber); return false; } start=(UChar32)st; end=(UChar32)e; return true; } void PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { char16_t *buffer=toUCharPtr(uni.getBuffer(-1)); int32_t length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); if(errorCode==U_BUFFER_OVERFLOW_ERROR) { errorCode=U_ZERO_ERROR; uni.releaseBuffer(0); buffer=toUCharPtr(uni.getBuffer(length)); length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); } uni.releaseBuffer(length); if(U_FAILURE(errorCode)) { fprintf(stderr, "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", s, (long)lineNumber); } } void PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } scx.clear(); CharString scString; for(;;) { const char *scs; const char *scLimit=strchr(s, ' '); if(scLimit!=nullptr) { scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data(); if(U_FAILURE(errorCode)) { return; } } else { scs=s; } int32_t 
// Each script code is looked up with pnames->getPropertyValueEnum for UCHAR_SCRIPT. An unknown
// code, a code already contained in scx, or an scx that is still empty after the loop is
// reported on stderr and sets U_PARSE_ERROR; otherwise the script value is added to scx.
// NOTE(review): pnames is dereferenced without a null check here, and the constructor
// initializes it to nullptr -- presumably a setter elsewhere must be called first; verify.
script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); if(script==UCHAR_INVALID_CODE) { fprintf(stderr, "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", scs, (long)lineNumber); errorCode=U_PARSE_ERROR; return; } else if(scx.contains(script)) { fprintf(stderr, "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", scs, (long)lineNumber); errorCode=U_PARSE_ERROR; return; } else { scx.add(script); } if(scLimit!=nullptr) { s=scLimit+1; } else { break; } } if(scx.isEmpty()) { fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); errorCode=U_PARSE_ERROR; } } U_NAMESPACE_END