summary | refs | log | tree | commit | diff | stats
path: root/strings/ctype-uca-scanner_next.inl
diff options
context:
space:
mode:
Diffstat (limited to 'strings/ctype-uca-scanner_next.inl')
-rw-r--r-- strings/ctype-uca-scanner_next.inl 223
1 files changed, 223 insertions, 0 deletions
diff --git a/strings/ctype-uca-scanner_next.inl b/strings/ctype-uca-scanner_next.inl
new file mode 100644
index 00000000..e8489ddf
--- /dev/null
+++ b/strings/ctype-uca-scanner_next.inl
@@ -0,0 +1,223 @@
+/* Copyright (c) 2004, 2013, Oracle and/or its affiliates.
+ Copyright (c) 2009, 2021, MariaDB
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; version 2
+ of the License.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+ MA 02110-1335 USA */
+
+
+#ifdef SCANNER_NEXT_NCHARS
+/*
+ "With nchars" flavour: the scanner function returns a weight_and_nchars_t
+ initialized from a weight (_w) and a character count (_n).
+*/
+#define SCANNER_NEXT_RETURN(_w,_n) \
+ do { weight_and_nchars_t rc= {_w, _n}; return rc; } while(0)
+/*
+ Return the first weight of the contraction _cnt. The returned character
+ count is _ignorable_nchars plus my_contraction_char_length(_cnt).
+*/
+#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
+ do { \
+ weight_and_nchars_t rc= { _cnt->weight[0], \
+ _ignorable_nchars + \
+ my_contraction_char_length(_cnt) }; \
+ return rc; \
+ } while(0)
+
+#else
+/*
+ Plain flavour: only the weight is returned. The count arguments (_n,
+ _ignorable_nchars) are never expanded here, so call sites may pass names
+ (e.g. ignorable_nchars) that are declared only when SCANNER_NEXT_NCHARS
+ is defined.
+*/
+#define SCANNER_NEXT_RETURN(_w,_n) do { return _w; } while (0)
+
+#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
+ do { return _cnt->weight[0]; } while(0)
+
+#endif
+/*
+ Return the next weight from the scanner input.
+
+ This is a template body compiled in one of two flavours:
+ - SCANNER_NEXT_NCHARS defined:
+ MY_FUNCTION_NAME(scanner_next_with_nchars), which takes an extra
+ "nchars" argument and returns a weight_and_nchars_t (weight plus a
+ character count);
+ - otherwise: MY_FUNCTION_NAME(scanner_next), which returns the weight
+ as int.
+
+ scanner - scanner state. scanner->sbeg is advanced over the consumed
+ bytes; scanner->page and scanner->code are set for single
+ characters.
+ param - scanner parameters; param->level and param->cs are read.
+ nchars - (SCANNER_NEXT_NCHARS only) passed to
+ my_uca_context_weight_find() in place of
+ MY_UCA_MAX_CONTRACTION; presumably the maximum contraction
+ length to consider - TODO confirm against that function.
+
+ Returned weight:
+ - a pending weight of the current expansion, if there is one
+ (returned with nchars 0);
+ - -1 if no character could be read and scanner->sbeg >= scanner->send;
+ - 0xFFFF if no character could be read but bytes still remain
+ (bad or incomplete byte sequence);
+ - otherwise the first non-zero weight found for a character or a
+ contraction. Characters and contractions whose weight is 0 are skipped.
+
+ Returned nchars (SCANNER_NEXT_NCHARS only): ignorable_nchars + 1 for a
+ single character, ignorable_nchars + my_contraction_char_length() for a
+ contraction, and ignorable_nchars alone at the end of input.
+*/
+static inline
+#ifdef SCANNER_NEXT_NCHARS
+weight_and_nchars_t
+MY_FUNCTION_NAME(scanner_next_with_nchars)(my_uca_scanner *scanner,
+ const my_uca_scanner_param *param,
+ size_t nchars)
+#else
+int
+MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner,
+ const my_uca_scanner_param *param)
+#endif
+{
+#ifdef SCANNER_NEXT_NCHARS
+ uint ignorable_nchars; /* Number of loop iterations skipped via "continue" */
+#define LOCAL_MAX_CONTRACTION_LENGTH nchars
+#else
+#define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION
+#endif
+ uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
+ if (weight)
+ {
+ /*
+ More weights left from the previous step.
+ Return the next weight from the current expansion.
+ Return "0" as "nchars". The real nchars was set on a previous
+ iteration.
+ */
+ SCANNER_NEXT_RETURN(weight, 0);
+ }
+ /*
+ Main loop: read characters until one of them produces a non-zero weight
+ or the input ends. Every "continue" below skips an ignorable character
+ or contraction; in the nchars flavour the loop increment counts them.
+ */
+#ifdef SCANNER_NEXT_NCHARS
+ for (ignorable_nchars= 0 ; ; ignorable_nchars++)
+#else
+ for ( ; ; )
+#endif
+ {
+ const uint16 *wpage;
+ int mblen;
+ my_wc_t currwc= 0;
+ const uint16 *cweight;
+ /*
+ Fast path (plain flavour only): look up the next two bytes at once
+ in the level booster and use its weight string if it is applicable.
+ */
+#if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS)
+ if (scanner->sbeg + 1 < scanner->send)
+ {
+ const MY_UCA_2BYTES_ITEM *ww;
+ ww= my_uca_level_booster_2bytes_item_addr_const(param->level->booster,
+ scanner->sbeg[0],
+ scanner->sbeg[1]);
+ if (my_uca_2bytes_item_is_applicable(ww))
+ {
+ /*
+ Byte pairs that make 2-byte head characters in previous
+ context pairs are marked as not applicable for optimization
+ during the collation initialization. So when we come here
+ sbeg[0] and sbeg[1] are:
+ - either two ASCII characters
+ - or one 2-byte character which IS NOT a previous context head
+ Just remember sbeg[1] as the previous character for simplicity.
+ This may erroneously interpret bytes 0x80..0x9F as previous context
+ head characters U+0080..U+009F. However, CLDR does not have any real
+ collations that use these characters as previous context heads.
+ */
+ scanner->page= 0;
+ scanner->code= (int) scanner->sbeg[1];
+ scanner->sbeg+= 2;
+ if ((weight= my_uca_scanner_set_weight(scanner, ww->weight)))
+ {
+ /*
+ TODO: add support for scanner_next_with_nchars and do this:
+ SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+ */
+ return weight;
+ }
+ continue; /* Ignorable character */
+ }
+ /* 2 byte optimization is not applicable, go the slow path */
+ }
+#endif
+
+
+ /* Get next character */
+#if MY_UCA_ASCII_OPTIMIZE
+ /* Get next ASCII character */
+ if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
+ {
+ currwc= scanner->sbeg[0];
+ scanner->sbeg+= 1;
+
+#if MY_UCA_COMPILE_CONTRACTIONS
+ if (my_uca_needs_context_handling(param->level, currwc))
+ {
+ const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param,
+ currwc,
+ LOCAL_MAX_CONTRACTION_LENGTH);
+ if (cnt)
+ {
+ if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
+ SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+ continue; /* Ignorable contraction */
+ }
+ }
+#endif
+ /* Single ASCII character: its weights are taken from page 0 */
+ scanner->page= 0;
+ scanner->code= (int) currwc;
+ cweight= param->level->weights[0] + scanner->code * param->level->lengths[0];
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+ continue; /* Ignorable character */
+ }
+ else /* Pairs with the "if" that follows the #endif */
+#endif
+ /* Get next MB character */
+ if (((mblen= MY_MB_WC(scanner, param, &currwc, scanner->sbeg,
+ scanner->send)) <= 0))
+ {
+ if (scanner->sbeg >= scanner->send)
+ {
+ /* No more bytes, end of line reached */
+ SCANNER_NEXT_RETURN(-1, ignorable_nchars);
+ }
+ /*
+ There are some more bytes left. Non-positive mblen means that
+ we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
+ */
+ if ((scanner->sbeg+= param->cs->mbminlen) > scanner->send)
+ {
+ /*
+ For safety purposes don't go beyond the string range.
+ NOTE(review): sbeg is advanced before this check, so it can briefly
+ point past send; confirm that this pointer arithmetic is acceptable.
+ */
+ scanner->sbeg= scanner->send;
+ }
+ /*
+ Treat every complete or incomplete mbminlen unit as a weight which is
+ greater than weight for any possible normal character.
+ 0xFFFF is greater than any possible weight in the UCA weight table.
+ */
+ SCANNER_NEXT_RETURN(0xFFFF, ignorable_nchars + 1);
+ }
+ /*
+ A character was read: consume its bytes. Characters above the level's
+ maxchar get their weight from a dedicated helper.
+ */
+ scanner->sbeg+= mblen;
+ if (currwc > param->level->maxchar)
+ {
+ SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner),
+ ignorable_nchars + 1);
+ }
+
+#if MY_UCA_COMPILE_CONTRACTIONS
+ if (my_uca_needs_context_handling(param->level, currwc))
+ {
+ const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param, currwc,
+ LOCAL_MAX_CONTRACTION_LENGTH);
+ if (cnt)
+ {
+ if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
+ SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+ continue; /* Ignorable contraction */
+ }
+ }
+#endif
+
+ /* Process single character: high bits select the page, low 8 bits the offset */
+ scanner->page= currwc >> 8;
+ scanner->code= currwc & 0xFF;
+
+ /* If the weight page for the character does not exist, calculate algorithmically */
+ if (!(wpage= param->level->weights[scanner->page]))
+ SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner, param),
+ ignorable_nchars + 1);
+
+ /* Calculate pointer to the character's weight string, using page and offset */
+ cweight= wpage + scanner->code * param->level->lengths[scanner->page];
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+ continue; /* Ignorable character */
+ }
+
+ SCANNER_NEXT_RETURN(0, 0); /* Not reachable */
+}
+/*
+ Remove the flavour selector and the helper macros defined in this file,
+ presumably so that the file can be included again with a different
+ configuration.
+*/
+#undef SCANNER_NEXT_NCHARS
+#undef SCANNER_NEXT_RETURN
+#undef SCANNER_NEXT_RETURN_CONTRACTION
+#undef LOCAL_MAX_CONTRACTION_LENGTH