diff options
Diffstat (limited to '')
-rw-r--r-- | intl/icu/source/tools/toolutil/uparse.h | 153 |
1 files changed, 153 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/uparse.h b/intl/icu/source/tools/toolutil/uparse.h new file mode 100644 index 0000000000..df0e79a21f --- /dev/null +++ b/intl/icu/source/tools/toolutil/uparse.h @@ -0,0 +1,153 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uparse.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr18 +* created by: Markus W. Scherer +* +* This file provides a parser for files that are delimited by one single +* character like ';' or TAB. Example: the Unicode Character Properties files +* like UnicodeData.txt are semicolon-delimited. +*/ + +#ifndef __UPARSE_H__ +#define __UPARSE_H__ + +#include "unicode/utypes.h" + +/** + * Is c an invariant-character whitespace? + * @param c invariant character + */ +#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n') + +U_CDECL_BEGIN + +/** + * Skip space ' ' and TAB '\t' characters. + * + * @param s Pointer to characters. + * @return Pointer to first character at or after s that is not a space or TAB. + */ +U_CAPI const char * U_EXPORT2 +u_skipWhitespace(const char *s); + +/** + * Trim whitespace (including line endings) from the end of the string. + * + * @param s Pointer to the string. + * @return Pointer to the new end of the string. + */ +U_CAPI char * U_EXPORT2 +u_rtrim(char *s); + +/** Function type for u_parseDelimitedFile(). */ +typedef void U_CALLCONV +UParseLineFn(void *context, + char *fields[][2], + int32_t fieldCount, + UErrorCode *pErrorCode); + +/** + * Parser for files that are similar to UnicodeData.txt: + * This function opens the file and reads it line by line. 
It skips empty lines + * and comment lines that start with a '#'. + * All other lines are separated into fields with one delimiter character + * (semicolon for Unicode Properties files) between two fields. The last field in + * a line does not need to be terminated with a delimiter. + * + * For each line, after segmenting it, a line function is called. + * It gets passed the array of field start and limit pointers that is + * passed into this parser and filled by it for each line. + * For each field i of the line, the start pointer in fields[i][0] + * points to the beginning of the field, while the limit pointer in fields[i][1] + * points just past the field, i.e., to the delimiter or the line end. + * + * The context parameter of the line function is + * the same as the one for the parse function. + * + * The line function may modify the contents of the fields including the + * limit characters. + * + * If the file cannot be opened, there is a parsing error, or the line function + * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code. + */ +U_CAPI void U_EXPORT2 +u_parseDelimitedFile(const char *filename, char delimiter, + char *fields[][2], int32_t fieldCount, + UParseLineFn *lineFn, void *context, + UErrorCode *pErrorCode); + +/** + * Parse a string of code points like 0061 0308 0300. + * s must end with either ';' or NUL. + * + * @return Number of code points. + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePoints(const char *s, + uint32_t *dest, int32_t destCapacity, + UErrorCode *pErrorCode); + +/** + * Parse a list of code points like 0061 0308 0300 + * into a UChar * string. + * s must end with either ';' or NUL. + * + * Set the first code point in *pFirst. + * + * @param s Input char * string. + * @param dest Output string buffer. + * @param destCapacity Capacity of dest in numbers of UChars. + * @param pFirst If pFirst!=NULL then *pFirst will be set to the first + * code point in the string. + * @param pErrorCode ICU error code. 
+ * @return The length of the string in numbers of UChars. + */ +U_CAPI int32_t U_EXPORT2 +u_parseString(const char *s, + UChar *dest, int32_t destCapacity, + uint32_t *pFirst, + UErrorCode *pErrorCode); + +/** + * Parse a code point range like + * 0085 or + * 4E00..9FA5. + * + * s must contain such a range and end with either ';' or NUL. + * + * @return Length of code point range, end-start+1 + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRange(const char *s, + uint32_t *pStart, uint32_t *pEnd, + UErrorCode *pErrorCode); + +/** + * Same as u_parseCodePointRange() but the range may be terminated by + * any character. The position of the terminating character is returned via + * the *terminator output parameter. + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRangeAnyTerminator(const char *s, + uint32_t *pStart, uint32_t *pEnd, + const char **terminator, + UErrorCode *pErrorCode); + +U_CAPI int32_t U_EXPORT2 +u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status); + +U_CDECL_END + +#endif |