1 files changed, 223 insertions, 0 deletions
diff --git a/strings/ctype-uca-scanner_next.inl b/strings/ctype-uca-scanner_next.inl
new file mode 100644
index 00000000..e8489ddf
--- /dev/null
+++ b/strings/ctype-uca-scanner_next.inl
@@ -0,0 +1,223 @@
+/* Copyright (c) 2004, 2013, Oracle and/or its affiliates.
+   Copyright (c) 2009, 2021, MariaDB
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public
+   License as published by the Free Software Foundation; version 2
+   of the License.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with this library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+   MA 02110-1335  USA */
+
+
+/*
+  Return helpers used by the scanner function body below.
+
+  SCANNER_NEXT_RETURN(_w,_n)
+    Return the weight _w from the enclosing function. When
+    SCANNER_NEXT_NCHARS is defined, the result is a weight_and_nchars_t
+    initialized as {_w, _n}; otherwise only _w is returned.
+
+  SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars)
+    Return the first weight of the contraction _cnt. When
+    SCANNER_NEXT_NCHARS is defined, the nchars member is
+    _ignorable_nchars plus my_contraction_char_length(_cnt).
+
+  The variants without SCANNER_NEXT_NCHARS must not evaluate their second
+  argument: callers pass "ignorable_nchars", which is declared only when
+  SCANNER_NEXT_NCHARS is defined.
+
+  Macro parameters are parenthesized on use, so that expression
+  arguments expand with the intended precedence.
+*/
+#ifdef SCANNER_NEXT_NCHARS
+
+#define SCANNER_NEXT_RETURN(_w,_n) \
+  do { weight_and_nchars_t rc= {(_w), (_n)}; return rc; } while(0)
+
+#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
+  do { \
+    weight_and_nchars_t rc= { (_cnt)->weight[0], \
+                              (_ignorable_nchars) + \
+                              my_contraction_char_length(_cnt) }; \
+    return rc; \
+  } while(0)
+
+#else
+
+#define SCANNER_NEXT_RETURN(_w,_n) do { return (_w); } while (0)
+
+#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
+  do { return (_cnt)->weight[0]; } while(0)
+
+#endif
+
+/*
+  Get the next weight from the scanner.
+
+  The function is compiled in one of two flavours, depending on whether
+  SCANNER_NEXT_NCHARS is defined when this file is included:
+  - defined: scanner_next_with_nchars(), returning a weight_and_nchars_t
+    pair. The "nchars" parameter is passed to my_uca_context_weight_find()
+    as the maximum contraction length.
+  - not defined: scanner_next(), returning the weight alone as int.
+    MY_UCA_MAX_CONTRACTION is used as the maximum contraction length.
+
+  Processing order:
+  1. If my_uca_scanner_next_expansion_weight() yields a non-zero weight,
+     return it (with nchars 0).
+  2. Otherwise read characters in a loop. A character or contraction for
+     which my_uca_scanner_set_weight() returns 0 is treated as ignorable
+     and the loop moves on to the next character.
+
+  @param scanner  Scanner state. sbeg is advanced; page and code are set.
+  @param param    Scanner parameters; "level" and "cs" are read here.
+  @param nchars   (SCANNER_NEXT_NCHARS only) Maximum contraction length.
+
+  @return  The next weight. Special values produced directly here:
+           -1 when no bytes are left, 0xFFFF for a bad or incomplete
+           byte sequence. With SCANNER_NEXT_NCHARS the nchars member is:
+           - 0 for a weight continuing a previous expansion
+           - ignorable_nchars at the end of input
+           - ignorable_nchars + 1 for a single character or a bad sequence
+           - ignorable_nchars + my_contraction_char_length(cnt) for
+             a contraction
+*/
+static inline
+#ifdef SCANNER_NEXT_NCHARS
+weight_and_nchars_t
+MY_FUNCTION_NAME(scanner_next_with_nchars)(my_uca_scanner *scanner,
+                                           const my_uca_scanner_param *param,
+                                           size_t nchars)
+#else
+int
+MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner,
+                               const my_uca_scanner_param *param)
+#endif
+{
+#ifdef SCANNER_NEXT_NCHARS
+  /* Counts loop iterations that ended with "continue" (ignorable items) */
+  uint ignorable_nchars;
+#define LOCAL_MAX_CONTRACTION_LENGTH nchars
+#else
+  /*
+    "ignorable_nchars" is not declared in this flavour. This works because
+    the SCANNER_NEXT_RETURN* macros drop their nchars argument here.
+  */
+#define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION
+#endif
+  uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
+  if (weight)
+  {
+    /*
+      More weights left from the previous step.
+      Return the next weight from the current expansion.
+      Return "0" as "nchars". The real nchars was set on a previous
+      iteration.
+    */
+    SCANNER_NEXT_RETURN(weight, 0);
+  }
+
+#ifdef SCANNER_NEXT_NCHARS
+  /*
+    Every "continue" below increments ignorable_nchars by one.
+    NOTE(review): this also applies to the "Ignorable contraction" paths,
+    regardless of my_contraction_char_length(cnt) - confirm this is
+    intended.
+  */
+  for (ignorable_nchars= 0 ; ; ignorable_nchars++)
+#else
+  for ( ; ; )
+#endif
+  {
+    const uint16 *wpage;
+    int mblen;
+    my_wc_t currwc= 0;
+    const uint16 *cweight;
+
+    /* Fast path: look up two bytes at once. Not compiled for NCHARS. */
+#if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS)
+    if (scanner->sbeg + 1 < scanner->send)
+    {
+      const MY_UCA_2BYTES_ITEM *ww;
+      ww= my_uca_level_booster_2bytes_item_addr_const(param->level->booster,
+                                                      scanner->sbeg[0],
+                                                      scanner->sbeg[1]);
+      if (my_uca_2bytes_item_is_applicable(ww))
+      {
+        /*
+          Byte pairs that make 2-byte head characters in previous
+          context pairs are marked as not applicable for optimization
+          during the collation initialization. So when we come here
+          sbeg[0] and sbeg[1] are:
+          - either two ASCII characters
+          - or one 2-byte character which IS NOT a previous context head
+          Just remember sbeg[1] as the previous character for simplicity.
+          This may erroneously interpret bytes 0x80..0x9F as previous context
+          head characters U+0080..U+009F. However, CLDR does not have any real
+          collations that use these characters as previous context heads.
+        */
+        scanner->page= 0;
+        scanner->code= (int) scanner->sbeg[1];
+        scanner->sbeg+= 2;
+        if ((weight= my_uca_scanner_set_weight(scanner, ww->weight)))
+        {
+          /*
+            TODO: add support for scanner_next_with_nchars and do this:
+            SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+          */
+          return weight;
+        }
+        continue; /* Ignorable character */
+      }
+      /* 2 byte optimization is not applicable, go the slow path */
+    }
+#endif
+
+
+    /* Get next character */
+#if MY_UCA_ASCII_OPTIMIZE
+    /* Get next ASCII character */
+    if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
+    {
+      /* A byte below 0x80 is taken as the character code, without MY_MB_WC */
+      currwc= scanner->sbeg[0];
+      scanner->sbeg+= 1;
+
+#if MY_UCA_COMPILE_CONTRACTIONS
+      if (my_uca_needs_context_handling(param->level, currwc))
+      {
+        const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param,
+                                                              currwc,
+                                                  LOCAL_MAX_CONTRACTION_LENGTH);
+        if (cnt)
+        {
+          if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
+            SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+          continue;  /* Ignorable contraction */
+        }
+      }
+#endif
+
+      /* No contraction found: use the weights of page 0 for this code */
+      scanner->page= 0;
+      scanner->code= (int) currwc;
+      cweight= param->level->weights[0] + scanner->code * param->level->lengths[0];
+      if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+        SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+      continue; /* Ignorable character */
+    }
+    else
+#endif
+    /* Get next MB character */
+    if (((mblen= MY_MB_WC(scanner, param, &currwc, scanner->sbeg,
+                                            scanner->send)) <= 0))
+    {
+      if (scanner->sbeg >= scanner->send)
+      {
+        /* No more bytes, end of line reached */
+        SCANNER_NEXT_RETURN(-1, ignorable_nchars);
+      }
+      /*
+        There are some more bytes left. Non-positive mb_len means that
+        we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
+      */
+      if ((scanner->sbeg+= param->cs->mbminlen) > scanner->send)
+      {
+        /* For safety purposes don't go beyond the string range. */
+        scanner->sbeg= scanner->send;
+      }
+      /*
+        Treat every complete or incomplete mbminlen unit as a weight which is
+        greater than weight for any possible normal character.
+        0xFFFF is greater than any possible weight in the UCA weight table.
+      */
+      SCANNER_NEXT_RETURN(0xFFFF, ignorable_nchars + 1);
+    }
+
+    /* A valid character was decoded: step over its bytes */
+    scanner->sbeg+= mblen;
+    if (currwc > param->level->maxchar)
+    {
+      /* The character is above the maximum character of this level */
+      SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner),
+                          ignorable_nchars + 1);
+    }
+
+#if MY_UCA_COMPILE_CONTRACTIONS
+    if (my_uca_needs_context_handling(param->level, currwc))
+    {
+      const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param, currwc,
+                                                LOCAL_MAX_CONTRACTION_LENGTH);
+      if (cnt)
+      {
+        if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
+          SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+        continue;  /* Ignorable contraction */
+      }
+    }
+#endif
+
+    /* Process single character */
+    scanner->page= currwc >> 8;
+    scanner->code= currwc & 0xFF;
+
+    /* If weight page for w[0] does not exist, then calculate algorithmically */
+    if (!(wpage= param->level->weights[scanner->page]))
+      SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner, param),
+                          ignorable_nchars + 1);
+
+    /* Calculate pointer to w[0]'s weight, using page and offset */
+    cweight= wpage + scanner->code * param->level->lengths[scanner->page];
+    if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+      SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+    continue; /* Ignorable character */
+  }
+
+  SCANNER_NEXT_RETURN(0, 0); /* Not reachable */
+}
+
+/*
+  Remove the macros used by this file, including SCANNER_NEXT_NCHARS,
+  which is not defined here. Presumably this lets the file be included
+  again to generate the other flavour of the function - verify against
+  the including file.
+*/
+#undef SCANNER_NEXT_NCHARS
+#undef SCANNER_NEXT_RETURN
+#undef SCANNER_NEXT_RETURN_CONTRACTION
+#undef LOCAL_MAX_CONTRACTION_LENGTH