diff options
Diffstat (limited to 'intl/icu/source/tools/genrb/read.c')
-rw-r--r-- | intl/icu/source/tools/genrb/read.c | 479 |
1 files changed, 479 insertions, 0 deletions
diff --git a/intl/icu/source/tools/genrb/read.c b/intl/icu/source/tools/genrb/read.c new file mode 100644 index 0000000000..0d4a318a89 --- /dev/null +++ b/intl/icu/source/tools/genrb/read.c @@ -0,0 +1,479 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File read.c +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. +* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#include <stdbool.h> + +#include "read.h" +#include "errmsg.h" +#include "toolutil.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" + +#define OPENBRACE 0x007B +#define CLOSEBRACE 0x007D +#define COMMA 0x002C +#define QUOTE 0x0022 +#define ESCAPE 0x005C +#define SLASH 0x002F +#define ASTERISK 0x002A +#define SPACE 0x0020 +#define COLON 0x003A +#define BADBOM 0xFFFE +#define CR 0x000D +#define LF 0x000A + +static int32_t lineCount; + +/* Protos */ +static enum ETokenType getStringToken(UCHARBUF *buf, + UChar32 initialChar, + struct UString *token, + UErrorCode *status); + +static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); +static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); +static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); +static UBool isWhitespace (UChar32 c); +static UBool isNewline (UChar32 c); + +U_CFUNC void resetLineNumber() { + lineCount = 1; +} + +/* Read and return the next token from the stream. If the token is of + type eString, fill in the token parameter with the token. If the + token is eError, then the status parameter will contain the + specific error. This will be eItemNotFound at the end of file, + indicating that all tokens have been returned. This method will + never return eString twice in a row; instead, multiple adjacent + string tokens will be merged into one, with no intervening + space. */ +U_CFUNC enum ETokenType +getNextToken(UCHARBUF* buf, + struct UString *token, + uint32_t *linenumber, /* out: linenumber of token */ + struct UString *comment, + UErrorCode *status) { + enum ETokenType result; + UChar32 c; + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + + /* Skip whitespace */ + c = getNextChar(buf, true, comment, status); + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + + *linenumber = lineCount; + + switch(c) { + case BADBOM: + return TOK_ERROR; + case OPENBRACE: + return TOK_OPEN_BRACE; + case CLOSEBRACE: + return TOK_CLOSE_BRACE; + case COMMA: + return TOK_COMMA; + case U_EOF: + return TOK_EOF; + case COLON: + return TOK_COLON; + + default: + result = getStringToken(buf, c, token, status); + } + + *linenumber = lineCount; + return result; +} + +/* Copy a string token into the given UnicodeString. Upon entry, we + have already read the first character of the string token, which is + not a whitespace character (but may be a QUOTE or ESCAPE). This + function reads all subsequent characters that belong with this + string, and copy them into the token parameter. The other + important, and slightly convoluted purpose of this function is to + merge adjacent strings. It looks forward a bit, and if the next + non comment, non whitespace item is a string, it reads it in as + well. If two adjacent strings are quoted, they are merged without + intervening space. Otherwise a single SPACE character is + inserted. */ +static enum ETokenType getStringToken(UCHARBUF* buf, + UChar32 initialChar, + struct UString *token, + UErrorCode *status) { + UBool lastStringWasQuoted; + UChar32 c; + UChar target[3] = { '\0' }; + UChar *pTarget = target; + int len=0; + UBool isFollowingCharEscaped=false; + UBool isNLUnescaped = false; + UChar32 prevC=0; + + /* We are guaranteed on entry that initialChar is not a whitespace + character. If we are at the EOF, or have some other problem, it + doesn't matter; we still want to validly return the initialChar + (if nothing else) as a string token. */ + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + + /* setup */ + lastStringWasQuoted = false; + c = initialChar; + ustr_setlen(token, 0, status); + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + + for (;;) { + if (c == QUOTE) { + if (!lastStringWasQuoted && token->fLength > 0) { + ustr_ucat(token, SPACE, status); + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + } + + lastStringWasQuoted = true; + + for (;;) { + c = ucbuf_getc(buf,status); + + /* EOF reached */ + if (c == U_EOF) { + return TOK_EOF; + } + + /* Unterminated quoted strings */ + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + + if (c == QUOTE && !isFollowingCharEscaped) { + break; + } + + if (c == ESCAPE && !isFollowingCharEscaped) { + pTarget = target; + c = unescape(buf, status); + + if (c == U_ERR) { + return TOK_ERROR; + } + if(c == CR || c == LF){ + isNLUnescaped = true; + } + } + + if(c==ESCAPE && !isFollowingCharEscaped){ + isFollowingCharEscaped = true; + }else{ + U_APPEND_CHAR32(c, pTarget,len); + pTarget = target; + ustr_uscat(token, pTarget,len, status); + isFollowingCharEscaped = false; + len=0; + if(c == CR || c == LF){ + if(isNLUnescaped == false && prevC!=CR){ + lineCount++; + } + isNLUnescaped = false; + } + } + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + prevC = c; + } + } else { + if (token->fLength > 0) { + ustr_ucat(token, SPACE, status); + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + } + + if(lastStringWasQuoted){ + if(getShowWarning()){ + warning(lineCount, "Mixing quoted and unquoted strings"); + } + if(isStrict()){ + return TOK_ERROR; + } + + } + + lastStringWasQuoted = false; + + /* if we reach here we are mixing + * quoted and unquoted strings + * warn in normal mode and error in + * pedantic mode + */ + + if (c == ESCAPE) { + pTarget = target; + c = unescape(buf, status); + + /* EOF reached */ + if (c == U_EOF) { + return TOK_ERROR; + } + } + + U_APPEND_CHAR32(c, pTarget,len); + pTarget = target; + ustr_uscat(token, pTarget,len, status); + len=0; + + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + + for (;;) { + /* DON'T skip whitespace */ + c = getNextChar(buf, false, NULL, status); + + /* EOF reached */ + if (c == U_EOF) { + ucbuf_ungetc(c, buf); + return TOK_STRING; + } + + if (U_FAILURE(*status)) { + return TOK_STRING; + } + + if (c == QUOTE + || c == OPENBRACE + || c == CLOSEBRACE + || c == COMMA + || c == COLON) { + ucbuf_ungetc(c, buf); + break; + } + + if (isWhitespace(c)) { + break; + } + + if (c == ESCAPE) { + pTarget = target; + c = unescape(buf, status); + + if (c == U_ERR) { + return TOK_ERROR; + } + } + + U_APPEND_CHAR32(c, pTarget,len); + pTarget = target; + ustr_uscat(token, pTarget,len, status); + len=0; + if (U_FAILURE(*status)) { + return TOK_ERROR; + } + } + } + + /* DO skip whitespace */ + c = getNextChar(buf, true, NULL, status); + + if (U_FAILURE(*status)) { + return TOK_STRING; + } + + if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { + ucbuf_ungetc(c, buf); + return TOK_STRING; + } + } +} + +/* Retrieve the next character. If skipwhite is + true, whitespace is skipped as well. */ +static UChar32 getNextChar(UCHARBUF* buf, + UBool skipwhite, + struct UString *token, + UErrorCode *status) { + UChar32 c, c2; + + if (U_FAILURE(*status)) { + return U_EOF; + } + + for (;;) { + c = ucbuf_getc(buf,status); + + if (c == U_EOF) { + return U_EOF; + } + + if (skipwhite && isWhitespace(c)) { + continue; + } + + /* This also handles the get() failing case */ + if (c != SLASH) { + return c; + } + + c = ucbuf_getc(buf,status); /* "/c" */ + + if (c == U_EOF) { + return U_EOF; + } + + switch (c) { + case SLASH: /* "//" */ + seekUntilNewline(buf, NULL, status); + break; + + case ASTERISK: /* " / * " */ + c2 = ucbuf_getc(buf, status); /* "/ * c" */ + if(c2 == ASTERISK){ /* "/ * *" */ + /* parse multi-line comment and store it in token*/ + seekUntilEndOfComment(buf, token, status); + } else { + ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ + seekUntilEndOfComment(buf, NULL, status); + } + break; + + default: + ucbuf_ungetc(c, buf); /* "/c" - put back the c */ + /* If get() failed this is a NOP */ + return SLASH; + } + + } +} + +static void seekUntilNewline(UCHARBUF* buf, + struct UString *token, + UErrorCode *status) { + UChar32 c; + + if (U_FAILURE(*status)) { + return; + } + + do { + c = ucbuf_getc(buf,status); + /* add the char to token */ + if(token!=NULL){ + ustr_u32cat(token, c, status); + } + } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); +} + +static void seekUntilEndOfComment(UCHARBUF *buf, + struct UString *token, + UErrorCode *status) { + UChar32 c, d; + uint32_t line; + + if (U_FAILURE(*status)) { + return; + } + + line = lineCount; + + do { + c = ucbuf_getc(buf, status); + + if (c == ASTERISK) { + d = ucbuf_getc(buf, status); + + if (d != SLASH) { + ucbuf_ungetc(d, buf); + } else { + break; + } + } + /* add the char to token */ + if(token!=NULL){ + ustr_u32cat(token, c, status); + } + /* increment the lineCount */ + isNewline(c); + + } while (c != U_EOF && *status == U_ZERO_ERROR); + + if (c == U_EOF) { + *status = U_INVALID_FORMAT_ERROR; + error(line, "unterminated comment detected"); + } +} + +U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { + if (U_FAILURE(*status)) { + return U_EOF; + } + + /* We expect to be called after the ESCAPE has been seen, but + * u_fgetcx needs an ESCAPE to do its magic. */ + ucbuf_ungetc(ESCAPE, buf); + + return ucbuf_getcx32(buf, status); +} + +static UBool isWhitespace(UChar32 c) { + switch (c) { + /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ + case 0x000A: + case 0x2029: + lineCount++; + case 0x000D: + case 0x0020: + case 0x0009: + case 0xFEFF: + return true; + + default: + return false; + } +} + +static UBool isNewline(UChar32 c) { + switch (c) { + /* '\n', '\r', 0x2029 */ + case 0x000A: + case 0x2029: + lineCount++; + case 0x000D: + return true; + + default: + return false; + } +} |