1 files changed, 348 insertions, 0 deletions
diff --git a/src/VBox/Runtime/common/string/utf-8-case.cpp b/src/VBox/Runtime/common/string/utf-8-case.cpp
new file mode 100644
index 00000000..dbd0e026
--- /dev/null
+++ b/src/VBox/Runtime/common/string/utf-8-case.cpp
@@ -0,0 +1,348 @@
+/* $Id: utf-8-case.cpp $ */
+/** @file
+ * IPRT - UTF-8 Case Sensitivity and Folding, Part 1.
+ */
+
+/*
+ * Copyright (C) 2006-2020 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
+ * VirtualBox OSE distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ */
+
+
+/*********************************************************************************************************************************
+*   Header Files                                                                                                                 *
+*********************************************************************************************************************************/
+#include <iprt/string.h>
+#include "internal/iprt.h"
+
+#include <iprt/uni.h>
+#include <iprt/alloc.h>
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+#include "internal/string.h"
+
+
+
+/**
+ * Performs a case insensitive string compare between two UTF-8 strings.
+ *
+ * This is a simplified compare, as only the simplified lower/upper case folding
+ * specified by the unicode specs are used. It does not consider character pairs
+ * as they are used in some languages, just simple upper & lower case compares.
+ *
+ * The result is the difference between the mismatching codepoints after they
+ * both have been lower cased.
+ *
+ * If the string encoding is invalid the function will assert (strict builds)
+ * and use RTStrCmp for the remainder of the string.
+ *
+ * @returns < 0 if the first string less than the second string.
+ * @returns 0 if the first string identical to the second string.
+ * @returns > 0 if the first string greater than the second string.
+ * @param   psz1        First UTF-8 string. Null is allowed.
+ * @param   psz2        Second UTF-8 string. Null is allowed.
+ */
+RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
+{
+    if (psz1 == psz2)
+        return 0;
+    if (!psz1)
+        return -1;
+    if (!psz2)
+        return 1;
+
+    const char *pszStart1 = psz1;
+    for (;;)
+    {
+        /* Get the codepoints */
+        RTUNICP uc1;
+        int rc = RTStrGetCpEx(&psz1, &uc1);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            psz1--;
+            break;
+        }
+
+        RTUNICP uc2;
+        rc = RTStrGetCpEx(&psz2, &uc2);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            psz2--;
+            psz1 = RTStrPrevCp(pszStart1, psz1);
+            break;
+        }
+
+        /* compare */
+        int iDiff = uc1 - uc2;
+        if (iDiff)
+        {
+            iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
+            if (iDiff)
+            {
+                iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
+                if (iDiff)
+                    return iDiff;
+            }
+        }
+
+        /* hit the terminator? */
+        if (!uc1)
+            return 0;
+    }
+
+    /* Hit some bad encoding, continue in case sensitive mode. */
+    return RTStrCmp(psz1, psz2);
+}
+RT_EXPORT_SYMBOL(RTStrICmp);
+
+
+/**
+ * Performs a case insensitive string compare between two UTF-8 strings, given a
+ * maximum string length.
+ *
+ * This is a simplified compare, as only the simplified lower/upper case folding
+ * specified by the unicode specs are used. It does not consider character pairs
+ * as they are used in some languages, just simple upper & lower case compares.
+ *
+ * The result is the difference between the mismatching codepoints after they
+ * both have been lower cased.
+ *
+ * If the string encoding is invalid the function will assert (strict builds)
+ * and use RTStrCmp for the remainder of the string.
+ *
+ * @returns < 0 if the first string less than the second string.
+ * @returns 0 if the first string identical to the second string.
+ * @returns > 0 if the first string greater than the second string.
+ * @param   psz1        First UTF-8 string. Null is allowed.
+ * @param   psz2        Second UTF-8 string. Null is allowed.
+ * @param   cchMax      Maximum string length
+ */
+RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
+{
+    if (cchMax == 0)
+        return 0;
+    if (psz1 == psz2)
+        return 0;
+    if (!psz1)
+        return -1;
+    if (!psz2)
+        return 1;
+
+    for (;;)
+    {
+        /* Get the codepoints */
+        RTUNICP uc1;
+        size_t cchMax2 = cchMax;
+        int rc = RTStrGetCpNEx(&psz1, &cchMax, &uc1);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            psz1--;
+            cchMax++;
+            break;
+        }
+
+        RTUNICP uc2;
+        rc = RTStrGetCpNEx(&psz2, &cchMax2, &uc2);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            psz2--;
+            psz1 -= (cchMax - cchMax2 + 1);  /* This can't overflow, can it? */
+            cchMax = cchMax2 + 1;
+            break;
+        }
+
+        /* compare */
+        int iDiff = uc1 - uc2;
+        if (iDiff)
+        {
+            iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
+            if (iDiff)
+            {
+                iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
+                if (iDiff)
+                    return iDiff;
+            }
+        }
+
+        /* hit the terminator? */
+        if (!uc1 || cchMax == 0)
+            return 0;
+    }
+
+    /* Hit some bad encoding, continue in case insensitive mode. */
+    return RTStrNCmp(psz1, psz2, cchMax);
+}
+RT_EXPORT_SYMBOL(RTStrNICmp);
+
+
+RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
+{
+    /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
+    if (!pszHaystack)
+        return NULL;
+    if (!pszNeedle)
+        return NULL;
+
+    /* The empty string matches everything. */
+    if (!*pszNeedle)
+        return (char *)pszHaystack;
+
+    /*
+     * The search strategy is to pick out the first char of the needle, fold it,
+     * and match it against the haystack code point by code point. When encountering
+     * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
+     */
+    const char * const pszNeedleStart = pszNeedle;
+    RTUNICP Cp0;
+    RTStrGetCpEx(&pszNeedle, &Cp0);     /* pszNeedle is advanced one code point. */
+    size_t const    cchNeedle   = strlen(pszNeedle);
+    size_t const    cchNeedleCp0= pszNeedle - pszNeedleStart;
+    RTUNICP const   Cp0Lower    = RTUniCpToLower(Cp0);
+    RTUNICP const   Cp0Upper    = RTUniCpToUpper(Cp0);
+    if (    Cp0Lower == Cp0Upper
+        &&  Cp0Lower == Cp0)
+    {
+        /* Cp0 is not a case sensitive char. */
+        for (;;)
+        {
+            RTUNICP Cp;
+            RTStrGetCpEx(&pszHaystack, &Cp);
+            if (!Cp)
+                break;
+            if (    Cp == Cp0
+                &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
+                return (char *)pszHaystack - cchNeedleCp0;
+        }
+    }
+    else if (   Cp0Lower == Cp0
+             || Cp0Upper != Cp0)
+    {
+        /* Cp0 is case sensitive */
+        for (;;)
+        {
+            RTUNICP Cp;
+            RTStrGetCpEx(&pszHaystack, &Cp);
+            if (!Cp)
+                break;
+            if (    (   Cp == Cp0Upper
+                     || Cp == Cp0Lower)
+                &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
+                return (char *)pszHaystack - cchNeedleCp0;
+        }
+    }
+    else
+    {
+        /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
+        for (;;)
+        {
+            RTUNICP Cp;
+            RTStrGetCpEx(&pszHaystack, &Cp);
+            if (!Cp)
+                break;
+            if (    (   Cp == Cp0
+                     || Cp == Cp0Upper
+                     || Cp == Cp0Lower)
+                &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
+                return (char *)pszHaystack - cchNeedleCp0;
+        }
+    }
+
+
+    return NULL;
+}
+RT_EXPORT_SYMBOL(RTStrIStr);
+
+
+RTDECL(char *) RTStrToLower(char *psz)
+{
+    /*
+     * Loop the code points in the string, converting them one by one.
+     *
+     * ASSUMES that the folded code points have an encoding that is equal or
+     *         shorter than the original (this is presently correct).
+     */
+    const char *pszSrc = psz;
+    char       *pszDst = psz;
+    RTUNICP     uc;
+    do
+    {
+        int rc = RTStrGetCpEx(&pszSrc, &uc);
+        if (RT_SUCCESS(rc))
+        {
+            RTUNICP uc2 = RTUniCpToLower(uc);
+            if (RT_LIKELY(   uc2 == uc
+                          || RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
+                pszDst = RTStrPutCp(pszDst, uc2);
+            else
+                pszDst = RTStrPutCp(pszDst, uc);
+        }
+        else
+        {
+            /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
+            AssertRC(rc);
+            *pszDst++ = pszSrc[-1];
+        }
+        Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
+    } while (uc != 0);
+
+    return psz;
+}
+RT_EXPORT_SYMBOL(RTStrToLower);
+
+
+RTDECL(char *) RTStrToUpper(char *psz)
+{
+    /*
+     * Loop the code points in the string, converting them one by one.
+     *
+     * ASSUMES that the folded code points have an encoding that is equal or
+     *         shorter than the original (this is presently correct).
+     */
+    const char *pszSrc = psz;
+    char       *pszDst = psz;
+    RTUNICP     uc;
+    do
+    {
+        int rc = RTStrGetCpEx(&pszSrc, &uc);
+        if (RT_SUCCESS(rc))
+        {
+            RTUNICP uc2 = RTUniCpToUpper(uc);
+            if (RT_LIKELY(   uc2 == uc
+                          || RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
+                pszDst = RTStrPutCp(pszDst, uc2);
+            else
+                pszDst = RTStrPutCp(pszDst, uc);
+        }
+        else
+        {
+            /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
+            AssertRC(rc);
+            *pszDst++ = pszSrc[-1];
+        }
+        Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
+    } while (uc != 0);
+
+    return psz;
+}
+RT_EXPORT_SYMBOL(RTStrToUpper);
+