/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #define TH_UNICODE #include #include #include #include "th_char.h" #define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) #define th_isspace(c) ((c) == ' ' || (c) == '\t') /* ///////////////////////////////////////////////// // Thai character type array */ typedef unsigned short twb_t; extern const twb_t _TwbType[0x100 - 0xa0]; /* // bit definition */ #define VRS 0x0001 #define VRE 0x0002 #define VRX 0x0004 #define VRA 0x0008 #define VLA 0x0010 #define VLO 0x0020 #define VLI 0x0040 #define VC 0x0080 #define CC 0x0100 #define CS 0x0200 #define C2 0x0400 #define CHB 0x0800 #define CHE 0x1000 #define MT 0x2000 /* //_#define me 0x2000 */ #define M 0x4000 #define T 0x8000 #define VL (VLA | VLO | VLI) #define VR (VRS | VRE | VRX) #define NE (VL | VRS) #define NB (VR | M) #define V (VL | VR) #define CX (CC | CS) #define C (CX | VC) #define A (C | V | M) #define twbtype(c) (_TwbType[th_zcode(c)]) #ifndef TRUE # define TRUE 1 # define FALSE 0 #endif #define RETURN(b) return (b) /* ///////////////////////////////////////////////// */ int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr, int right) /* const ThBreakIterator *it, const th_char **p)*/ { /* //int left, right; //const th_char *s = *p; */ const th_char* lstr = pstr + left; th_char _c[6]; twb_t _t[6]; #define c(i) (_c[(i) + 3]) #define t(i) (_t[(i) + 3]) int i, j; /* //left = s - it->begin; */ if (left < 0) return -1; /* //right = (it->end == NULL) ? 4 : it->begin - s; */ if (right < 1) return -1; /* // get c(0), t(0) */ c(0) = rstr[0]; /* may be '\0' */ if (!th_isthai(c(0))) return -1; t(0) = twbtype(c(0)); if (!(t(0) & A)) return -1; /* // get c(-1), t(-1) */ if (left >= 1) { c(-1) = lstr[-1]; if (!th_isthai(c(-1))) return 0; t(-1) = twbtype(c(-1)); if (!(t(-1) & A)) return 0; /* handle punctuation marks here */ } else { c(-1) = 0; t(-1) = 0; } /* // get c(1..2), t(1..2) */ for (i = 1; i <= 2; i++) { if (i >= right) { c(i) = 0; t(i) = 0; } else { c(i) = rstr[i]; /* may be '\0'; */ if (!th_isthai(c(i))) right = i--; else { t(i) = twbtype(c(i)); if (!(t(i) & A)) right = i--; } } } /* // get c(-2..-3), t(-2..-3) */ for (i = -2, j = -2; i >= -3; j--) { if (j < -left) { c(i) = 0; t(i) = 0; i--; } else { c(i) = lstr[j]; if (!th_isthai(c(i))) left = 0; else { t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); if (!(t(i) & A)) left = 0; else { if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) { c(i + 1) = c(i); t(i + 1) = t(i); } else i--; } } } } /* // prohibit the unlikely */ if ((t(-1) & C) && (t(0) & C)) { if ((t(-1) & CHE) || (t(0) & CHB)) return -1; } /* // special case : vlao, C/ sara_a|aa, !sara_a */ if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0; /* // prohibit break */ if (t(0) & NB) return -1; if (t(-1) & NE) return -1; /* // apply 100% rules */ if (t(-1) & VRE) { if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; return -1; /* usually too short syllable, part of word */ } if (t(-2) & VRE) return -1; if ((t(0) & C) && (t(1) & (VR | MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ if (t(-1) & (V | M)) return 0; /* !C/ C, NB */ if (t(-2) & VRS) return 0; /* VRS, C / C, NB */ if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ } } if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M))) return 0; /* VRS, C/ !C */ if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ } /* // apply 90% rules */ if (t(0) & VL) return 0; if (t(1) & VL) return -1; if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0; /* //return -1; // apply 80% rules */ if (t(0) & CHE) { if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ /*if(t(-1) & VRX) return 0; // VRX/ CHE */ if (t(-1) & VC) return 0; /* VC/ CHE */ } if (t(-1) & CHB) { if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ if (t(0) & VC) return 0; /* CHB/ VC */ } if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ if (t(-2) & VLI) return 0; /* VLI,C/C,VR .*/ else { /* vlao, C ? C , VR */ if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ } } /* C,MT,C */ if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; return -1; } int TrbFollowing(const th_char* begin, int length, int offset) /* //(ThBreakIterator *this, int offset) */ { const th_char* w = begin + offset; const th_char* end = begin + length; while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; if (w < end && *w && !th_isthai(*w)) { int english = FALSE; while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { if (th_isalpha(*w)) english = TRUE; w++; } if (english || w == end || (!th_isthai(*w) && th_isspace(*w))) return w - begin; } if (w == end || *w == 0 || !th_isthai(*w)) return w - begin; w++; if (w < end && *w && th_isthai(*w)) { int brk = TrbWordBreakPos(begin, w - begin, w, end - w); while (brk < 0) { w++; if (w == end || *w == 0 || !th_isthai(*w)) break; brk = TrbWordBreakPos(begin, w - begin, w, end - w); } if (brk > 0) w += brk; } if (w < end && *w && !th_isthai(*w)) { while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) && !th_isspace(*w)) w++; } return w - begin; } /* ///////////////////////////////////////////////// */ const twb_t _TwbType[0x100 - 0xa0] = { #if 0 /* 80 € */ T, /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 90  */ T, /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, #endif /* a0   */ 0, /* a1 ¡ */ CS, /* a2 ¢ */ CS | CHE, /* a3 £ */ CC | CHE, /* a4 € */ CS | CHE, /* a5 ¥ */ CC | CHE, /* a6 Š */ CS, /* a7 § */ CS | CHB, /* a8 š */ CS, /* a9 © */ CC | CHE, /* aa ª */ CS, /* ab « */ CC | CHE, /* ac ¬ */ CC | CHB | CHE, /* ad ­ */ CS | CHB, /* ae ® */ CS | CHB, /* af ¯ */ CS | CHB, /* b0 ° */ CS, /* b1 ± */ CS | CHB | CHE, /* b2 ² */ CS | CHB | CHE, /* b3 ³ */ CS | CHB, /* b4 Ž */ CS, /* b5 µ */ CS, /* b6 ¶ */ CS, /* b7 · */ CS, /* b8 ž */ CS, /* b9 ¹ */ CS, /* ba º */ CS, /* bb » */ CS, /* bc Œ */ CC | CHE, /* bd œ */ CC | CHE, /* be Ÿ */ CS, /* bf ¿ */ CS, /* c0 À */ CS | CHE, /* c1 Á */ CS, /* c2  */ CS, /* c3 à */ CS | C2 | CHE, /* ? add CHE */ /* c4 Ä */ VC | CHE, /* c5 Å */ CS | C2, /* c6 Æ */ VC | CHE, /* c7 Ç */ VC | C2, /* c8 È */ CS, /* c9 É */ CS | CHB, /* ca Ê */ CS | CHE, /* cb Ë */ CC | CHE, /* CC Ì */ CS | CHB | CHE, /* cd Í */ VC, /* ce Î */ CC | CHE, /* cf Ï */ T, /* d0 Ð */ VRE | VRA, /* d1 Ñ */ VRS, /* d2 Ò */ VRX | VRA, /* d3 Ó */ VRE, /* d4 Ô */ VRX | VRA, /* d5 Õ */ VRX | VRA, /* d6 Ö */ VRS, /* d7 × */ VRS | VRA, /* d8 Ø */ VRX, /* d9 Ù */ VRX, /* da Ú */ T, /* db Û */ 0, /* dc Ü */ 0, /* dd Ý */ 0, /* de Þ */ 0, /* df ß */ T, /* e0 à */ VLA, /* e1 á */ VLO, /* e2 â */ VLO, /* e3 ã */ VLI, /* e4 ä */ VLI, /* e5 å */ VRE, /* e6 æ */ M, /* e7 ç */ M, /* e8 è */ M | MT, /* e9 é */ M | MT, /* ea ê */ M | MT, /* eb ë */ M | MT, /* ec ì */ M, /* ed í */ T, /* ee î */ T, /* ef ï */ T, /* f0 ð */ T, /* f1 ñ */ T, /* f2 ò */ T, /* f3 ó */ T, /* f4 ô */ T, /* f5 õ */ T, /* f6 ö */ T, /* f7 ÷ */ T, /* f8 ø */ T, /* f9 ù */ T, /* fa ú */ T, /* fb û */ T, /* fc ü */ 0, /* fd ý */ 0, /* fe þ */ 0, /* ff ’ */ 0};