summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/toolutil/ppucd.h
blob: d5c63fab4974200c3c063e255e170c4f5e2c803e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2011-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ppucd.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2011dec11
*   created by: Markus W. Scherer
*/

#ifndef __PPUCD_H__
#define __PPUCD_H__

#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"

#include <stdio.h>

/** Additions to the uchar.h enum UProperty. */
enum {
    /** Name_Alias */
    PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
    PPUCD_CONDITIONAL_CASE_MAPPINGS,
    PPUCD_TURKIC_CASE_FOLDING
};

U_NAMESPACE_BEGIN

class U_TOOLUTIL_API PropertyNames {
public:
    virtual ~PropertyNames();
    virtual int32_t getPropertyEnum(const char *name) const = 0;
    virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const = 0;
};

struct U_TOOLUTIL_API UniProps {
    UniProps();
    ~UniProps();

    int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }

    UChar32 start, end;
    UBool binProps[UCHAR_BINARY_LIMIT];
    int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
    UVersionInfo age;
    UChar32 bmg, bpb;
    UChar32 scf, slc, stc, suc;
    int32_t digitValue;
    const char *numericValue;
    const char *name;
    const char *nameAlias;
    UnicodeString cf, lc, tc, uc;
    UnicodeSet scx;
};

class U_TOOLUTIL_API PreparsedUCD {
public:
    enum LineType {
        /** No line, end of file. */
        NO_LINE,
        /** Empty line. (Might contain a comment.) */
        EMPTY_LINE,

        /** ucd;6.1.0 */
        UNICODE_VERSION_LINE,

        /** property;Binary;Alpha;Alphabetic */
        PROPERTY_LINE,
        /** binary;N;No;F;False */
        BINARY_LINE,
        /** value;gc;Zs;Space_Separator */
        VALUE_LINE,

        /** defaults;0000..10FFFF;age=NA;bc=L;... */
        DEFAULTS_LINE,
        /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
        BLOCK_LINE,
        /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
        CP_LINE,
        /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
        UNASSIGNED_LINE,

        /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
        ALG_NAMES_RANGE_LINE,

        LINE_TYPE_COUNT
    };

    /**
     * Constructor.
     * Prepare this object for a new, empty package.
     */
    PreparsedUCD(const char *filename, UErrorCode &errorCode);

    /** Destructor. */
    ~PreparsedUCD();

    /** Sets (aliases) a PropertyNames implementation. Caller retains ownership. */
    void setPropertyNames(const PropertyNames *pn) { pnames=pn; }

    /**
     * Reads a line from the preparsed UCD file.
     * Splits the line by replacing each ';' with a NUL.
     */
    LineType readLine(UErrorCode &errorCode);

    /** Returns the number of the line read by readLine(). */
    int32_t getLineNumber() const { return lineNumber; }

    /** Returns the line's next field, or nullptr. */
    const char *nextField();

    /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
    const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }

    /** Returns true if the current line has property values. */
    UBool lineHasPropertyValues() const {
        return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
    }

    /**
     * Parses properties from the current line.
     * Clears newValues and sets UProperty codes for property values mentioned
     * on the current line (as opposed to being inherited).
     * Returns a pointer to the filled-in UniProps, or nullptr if something went wrong.
     * The returned UniProps are usable until the next line of the same type is read.
     */
    const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);

    /**
     * Returns the code point range for the current algnamesrange line.
     * Calls & parses nextField().
     * Further nextField() calls will yield the range's type & prefix string.
     * Returns U_SUCCESS(errorCode).
     */
    UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);

private:
    UBool isLineBufferAvailable(int32_t i) {
        return defaultLineIndex!=i && blockLineIndex!=i;
    }

    /** Resets the field iterator and returns the line's first field (the line type field). */
    const char *firstField();

    UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
                        UErrorCode &errorCode);
    UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
    UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
    void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
    void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);

    static const int32_t kNumLineBuffers=3;

    const PropertyNames *pnames;  // aliased
    FILE *file;
    int32_t defaultLineIndex, blockLineIndex, lineIndex;
    int32_t lineNumber;
    LineType lineType;
    char *fieldLimit;
    char *lineLimit;

    UVersionInfo ucdVersion;
    UniProps defaultProps, blockProps, cpProps;
    UnicodeSet blockValues;
    // Multiple lines so that default and block properties can maintain pointers
    // into their line buffers.
    char lines[kNumLineBuffers][4096];
};

U_NAMESPACE_END

#endif  // __PPUCD_H__