diff options
Diffstat (limited to 'intl/lwbrk/rulebrk.c')
-rw-r--r-- | intl/lwbrk/rulebrk.c | 388 |
1 files changed, 388 insertions, 0 deletions
diff --git a/intl/lwbrk/rulebrk.c b/intl/lwbrk/rulebrk.c new file mode 100644 index 0000000000..d7574b929f --- /dev/null +++ b/intl/lwbrk/rulebrk.c @@ -0,0 +1,388 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#define TH_UNICODE + +#include <stdlib.h> +#include <stdint.h> +#include <assert.h> +#include "th_char.h" +#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) +#define th_isspace(c) ((c) == ' ' || (c) == '\t') + +/* +///////////////////////////////////////////////// +// Thai character type array +*/ + +typedef unsigned short twb_t; +extern const twb_t _TwbType[0x100 - 0xa0]; + +/* +// bit definition +*/ + +#define VRS 0x0001 +#define VRE 0x0002 +#define VRX 0x0004 + +#define VRA 0x0008 + +#define VLA 0x0010 +#define VLO 0x0020 +#define VLI 0x0040 + +#define VC 0x0080 + +#define CC 0x0100 +#define CS 0x0200 + +#define C2 0x0400 +#define CHB 0x0800 +#define CHE 0x1000 + +#define MT 0x2000 +/* +//_#define me 0x2000 +*/ +#define M 0x4000 + +#define T 0x8000 + +#define VL (VLA | VLO | VLI) +#define VR (VRS | VRE | VRX) +#define NE (VL | VRS) +#define NB (VR | M) +#define V (VL | VR) +#define CX (CC | CS) +#define C (CX | VC) +#define A (C | V | M) + +#define twbtype(c) (_TwbType[th_zcode(c)]) + +#ifndef TRUE +# define TRUE 1 +# define FALSE 0 +#endif +#define RETURN(b) return (b) + +/* +///////////////////////////////////////////////// +*/ + +int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr, + int right) +/* const ThBreakIterator *it, const th_char **p)*/ +{ + /* + //int left, right; + //const th_char *s = *p; + */ + const th_char* lstr = pstr + left; + th_char _c[6]; + twb_t _t[6]; +#define c(i) (_c[(i) + 3]) +#define t(i) (_t[(i) + 3]) + int i, j; + + /* + //left = s - it->begin; + */ + if (left < 0) return -1; + /* + //right = (it->end == NULL) ? 4 : it->begin - s; + */ + if (right < 1) return -1; + + /* + // get c(0), t(0) + */ + c(0) = rstr[0]; /* may be '\0' */ + if (!th_isthai(c(0))) return -1; + t(0) = twbtype(c(0)); + if (!(t(0) & A)) return -1; + + /* + // get c(-1), t(-1) + */ + if (left >= 1) { + c(-1) = lstr[-1]; + if (!th_isthai(c(-1))) return 0; + t(-1) = twbtype(c(-1)); + if (!(t(-1) & A)) return 0; /* handle punctuation marks here */ + } else { + c(-1) = 0; + t(-1) = 0; + } + + /* + // get c(1..2), t(1..2) + */ + for (i = 1; i <= 2; i++) { + if (i >= right) { + c(i) = 0; + t(i) = 0; + } else { + c(i) = rstr[i]; /* may be '\0'; */ + if (!th_isthai(c(i))) + right = i--; + else { + t(i) = twbtype(c(i)); + if (!(t(i) & A)) right = i--; + } + } + } + /* + // get c(-2..-3), t(-2..-3) + */ + for (i = -2, j = -2; i >= -3; j--) { + if (j < -left) { + c(i) = 0; + t(i) = 0; + i--; + } else { + c(i) = lstr[j]; + if (!th_isthai(c(i))) + left = 0; + else { + t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); + if (!(t(i) & A)) + left = 0; + else { + if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) { + c(i + 1) = c(i); + t(i + 1) = t(i); + } else + i--; + } + } + } + } + + /* + // prohibit the unlikely + */ + if ((t(-1) & C) && (t(0) & C)) { + if ((t(-1) & CHE) || (t(0) & CHB)) return -1; + } + /* + // special case : vlao, C/ sara_a|aa, !sara_a + */ + if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && + (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) + return 0; + + /* + // prohibit break + */ + if (t(0) & NB) return -1; + if (t(-1) & NE) return -1; + + /* + // apply 100% rules + */ + if (t(-1) & VRE) { + if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; + return -1; /* usually too short syllable, part of word */ + } + + if (t(-2) & VRE) return -1; + + if ((t(0) & C) && (t(1) & (VR | MT)) && + (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ + if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ + if (t(-1) & (V | M)) return 0; /* !C/ C, NB */ + if (t(-2) & VRS) return 0; /* VRS, C / C, NB */ + if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ + if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ + if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ + } + } + if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ + if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M))) + return 0; /* VRS, C/ !C */ + + if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { + if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ + if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ + } + /* + // apply 90% rules + */ + if (t(0) & VL) return 0; + if (t(1) & VL) return -1; + if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) + return 0; + + /* + //return -1; + // apply 80% rules + */ + if (t(0) & CHE) { + if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ + /*if(t(-1) & VRX) return 0; // VRX/ CHE */ + if (t(-1) & VC) return 0; /* VC/ CHE */ + } + if (t(-1) & CHB) { + if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ + if (t(0) & VC) return 0; /* CHB/ VC */ + } + + if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ + if (t(-2) & VLI) + return 0; /* VLI,C/C,VR .*/ + else { /* vlao, C ? C , VR */ + if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ + if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ + if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ + } + } + /* C,MT,C */ + if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; + + return -1; +} + +int TrbFollowing(const th_char* begin, int length, int offset) +/* +//(ThBreakIterator *this, int offset) +*/ +{ + const th_char* w = begin + offset; + const th_char* end = begin + length; + while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; + + if (w < end && *w && !th_isthai(*w)) { + int english = FALSE; + while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { + if (th_isalpha(*w)) english = TRUE; + w++; + } + if (english || w == end || (!th_isthai(*w) && th_isspace(*w))) + return w - begin; + } + if (w == end || *w == 0 || !th_isthai(*w)) return w - begin; + w++; + if (w < end && *w && th_isthai(*w)) { + int brk = TrbWordBreakPos(begin, w - begin, w, end - w); + while (brk < 0) { + w++; + if (w == end || *w == 0 || !th_isthai(*w)) break; + brk = TrbWordBreakPos(begin, w - begin, w, end - w); + } + if (brk > 0) w += brk; + } + if (w < end && *w && !th_isthai(*w)) { + while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) && + !th_isspace(*w)) + w++; + } + return w - begin; +} + +/* +///////////////////////////////////////////////// +*/ +const twb_t _TwbType[0x100 - 0xa0] = { +#if 0 +/* 80 */ T, +/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +/* 90 */ T, +/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +#endif + /* a0 */ 0, + /* a1 ¡ */ CS, + /* a2 ¢ */ CS | CHE, + /* a3 £ */ CC | CHE, + /* a4 € */ CS | CHE, + /* a5 ¥ */ CC | CHE, + /* a6 Š */ CS, + /* a7 § */ CS | CHB, + /* a8 š */ CS, + /* a9 © */ CC | CHE, + /* aa ª */ CS, + /* ab « */ CC | CHE, + /* ac ¬ */ CC | CHB | CHE, + /* ad */ CS | CHB, + /* ae ® */ CS | CHB, + /* af ¯ */ CS | CHB, + /* b0 ° */ CS, + /* b1 ± */ CS | CHB | CHE, + /* b2 ² */ CS | CHB | CHE, + /* b3 ³ */ CS | CHB, + /* b4 Ž */ CS, + /* b5 µ */ CS, + /* b6 ¶ */ CS, + /* b7 · */ CS, + /* b8 ž */ CS, + /* b9 ¹ */ CS, + /* ba º */ CS, + /* bb » */ CS, + /* bc Œ */ CC | CHE, + /* bd œ */ CC | CHE, + /* be Ÿ */ CS, + /* bf ¿ */ CS, + /* c0 À */ CS | CHE, + /* c1 Á */ CS, + /* c2  */ CS, + /* c3 à */ CS | C2 | CHE, /* ? add CHE */ + /* c4 Ä */ VC | CHE, + /* c5 Å */ CS | C2, + /* c6 Æ */ VC | CHE, + /* c7 Ç */ VC | C2, + /* c8 È */ CS, + /* c9 É */ CS | CHB, + /* ca Ê */ CS | CHE, + /* cb Ë */ CC | CHE, + /* CC Ì */ CS | CHB | CHE, + /* cd Í */ VC, + /* ce Î */ CC | CHE, + /* cf Ï */ T, + /* d0 Ð */ VRE | VRA, + /* d1 Ñ */ VRS, + /* d2 Ò */ VRX | VRA, + /* d3 Ó */ VRE, + /* d4 Ô */ VRX | VRA, + /* d5 Õ */ VRX | VRA, + /* d6 Ö */ VRS, + /* d7 × */ VRS | VRA, + /* d8 Ø */ VRX, + /* d9 Ù */ VRX, + /* da Ú */ T, + /* db Û */ 0, + /* dc Ü */ 0, + /* dd Ý */ 0, + /* de Þ */ 0, + /* df ß */ T, + /* e0 à */ VLA, + /* e1 á */ VLO, + /* e2 â */ VLO, + /* e3 ã */ VLI, + /* e4 ä */ VLI, + /* e5 å */ VRE, + /* e6 æ */ M, + /* e7 ç */ M, + /* e8 è */ M | MT, + /* e9 é */ M | MT, + /* ea ê */ M | MT, + /* eb ë */ M | MT, + /* ec ì */ M, + /* ed í */ T, + /* ee î */ T, + /* ef ï */ T, + /* f0 ð */ T, + /* f1 ñ */ T, + /* f2 ò */ T, + /* f3 ó */ T, + /* f4 ô */ T, + /* f5 õ */ T, + /* f6 ö */ T, + /* f7 ÷ */ T, + /* f8 ø */ T, + /* f9 ù */ T, + /* fa ú */ T, + /* fb û */ T, + /* fc ü */ 0, + /* fd ý */ 0, + /* fe þ */ 0, + /* ff */ 0}; |