summaryrefslogtreecommitdiffstats
path: root/src/VBox/Runtime/common/string/utf-8-case.cpp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/VBox/Runtime/common/string/utf-8-case.cpp348
1 files changed, 348 insertions, 0 deletions
diff --git a/src/VBox/Runtime/common/string/utf-8-case.cpp b/src/VBox/Runtime/common/string/utf-8-case.cpp
new file mode 100644
index 00000000..274d92ae
--- /dev/null
+++ b/src/VBox/Runtime/common/string/utf-8-case.cpp
@@ -0,0 +1,348 @@
+/* $Id: utf-8-case.cpp $ */
+/** @file
+ * IPRT - UTF-8 Case Sensitivity and Folding, Part 1.
+ */
+
+/*
+ * Copyright (C) 2006-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
+ * VirtualBox OSE distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/string.h>
+#include "internal/iprt.h"
+
+#include <iprt/uni.h>
+#include <iprt/alloc.h>
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+#include "internal/string.h"
+
+
+
+/**
+ * Compares two UTF-8 strings without regard to letter case.
+ *
+ * Only the simple (one code point to one code point) upper/lower case folding
+ * is applied; multi character sequences that some languages treat as a single
+ * letter are not taken into account.
+ *
+ * When two code points differ even after folding, the value returned is the
+ * difference between their lower cased forms.
+ *
+ * Should either string turn out to be badly encoded, the function asserts (in
+ * strict builds) and hands the not yet compared tails to RTStrCmp.
+ *
+ * @returns < 0 if the first string less than the second string.
+ * @returns 0 if the first string identical to the second string.
+ * @returns > 0 if the first string greater than the second string.
+ * @param   psz1        First UTF-8 string. Null is allowed.
+ * @param   psz2        Second UTF-8 string. Null is allowed.
+ */
+RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
+{
+    /* Same pointer (this covers two NULLs) means equal; a lone NULL sorts first. */
+    if (psz1 == psz2)
+        return 0;
+    if (psz1 == NULL)
+        return -1;
+    if (psz2 == NULL)
+        return 1;
+
+    const char * const pszBegin1 = psz1;
+    for (;;)
+    {
+        RTUNICP ucLeft;
+        RTUNICP ucRight;
+
+        /* Decode the next code point of the first string. */
+        int rc = RTStrGetCpEx(&psz1, &ucLeft);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            psz1 -= 1;                  /* step back onto the byte that failed to decode */
+            break;
+        }
+
+        /* Decode the next code point of the second string. */
+        rc = RTStrGetCpEx(&psz2, &ucRight);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            psz2 -= 1;
+            /* The first string already moved past a good code point, undo that. */
+            psz1 = RTStrPrevCp(pszBegin1, psz1);
+            break;
+        }
+
+        /* Cheapest test first, then the upper case forms, and the lower case diff last! */
+        if (   ucLeft != ucRight
+            && RTUniCpToUpper(ucLeft) != RTUniCpToUpper(ucRight))
+        {
+            int const iDiff = RTUniCpToLower(ucLeft) - RTUniCpToLower(ucRight);
+            if (iDiff != 0)
+                return iDiff;
+        }
+
+        /* Both sides are equal here, so one terminator means both ended. */
+        if (ucLeft == 0)
+            return 0;
+    }
+
+    /* Hit some bad encoding, continue in case sensitive mode. */
+    return RTStrCmp(psz1, psz2);
+}
+RT_EXPORT_SYMBOL(RTStrICmp);
+
+
+/**
+ * Performs a case insensitive string compare between two UTF-8 strings, given a
+ * maximum string length.
+ *
+ * This is a simplified compare, as only the simplified lower/upper case folding
+ * specified by the unicode specs are used. It does not consider character pairs
+ * as they are used in some languages, just simple upper & lower case compares.
+ *
+ * The result is the difference between the mismatching codepoints after they
+ * both have been lower cased.
+ *
+ * If the string encoding is invalid the function will assert (strict builds)
+ * and use RTStrNCmp (case sensitive) for the remainder of the string, limited
+ * to whatever is left of @a cchMax.
+ *
+ * @returns < 0 if the first string less than the second string.
+ * @returns 0 if the first string identical to the second string, or if
+ *          @a cchMax is zero.
+ * @returns > 0 if the first string greater than the second string.
+ * @param   psz1        First UTF-8 string. Null is allowed.
+ * @param   psz2        Second UTF-8 string. Null is allowed.
+ * @param   cchMax      Maximum string length
+ */
+RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
+{
+    if (cchMax == 0)
+        return 0;
+    if (psz1 == psz2)
+        return 0;
+    if (!psz1)
+        return -1;
+    if (!psz2)
+        return 1;
+
+    for (;;)
+    {
+        /* Both strings enter each round with the same remaining budget. */
+        size_t const cchBefore = cchMax;
+
+        /* Get the codepoints */
+        RTUNICP uc1;
+        size_t  cchMax2 = cchBefore;
+        int rc = RTStrGetCpNEx(&psz1, &cchMax, &uc1);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            /* Undo the single step taken on the failed decode. */
+            psz1--;
+            cchMax++;
+            break;
+        }
+
+        RTUNICP uc2;
+        rc = RTStrGetCpNEx(&psz2, &cchMax2, &uc2);
+        if (RT_FAILURE(rc))
+        {
+            AssertRC(rc);
+            psz2--;
+            /*
+             * Rewind psz1 over the code point it just decoded successfully.
+             * What uc1 took out of the budget is cchBefore - cchMax, and the
+             * pointer is taken to have advanced by that same amount.  (The
+             * previous expression, cchMax - cchMax2 + 1, is only right when
+             * uc1 is a single byte; for longer encodings it rewinds too
+             * little or wraps around and moves psz1 forward.)
+             */
+            psz1 -= cchBefore - cchMax;
+            /* Same value as cchMax2 + 1 if the failed decode took one unit. */
+            cchMax = cchBefore;
+            break;
+        }
+
+        /* compare */
+        int iDiff = uc1 - uc2;
+        if (iDiff)
+        {
+            iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
+            if (iDiff)
+            {
+                iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
+                if (iDiff)
+                    return iDiff;
+            }
+        }
+
+        /* hit the terminator or used up the budget? */
+        if (!uc1 || cchMax == 0)
+            return 0;
+    }
+
+    /* Hit some bad encoding, continue in case sensitive mode. */
+    return RTStrNCmp(psz1, psz2, cchMax);
+}
+RT_EXPORT_SYMBOL(RTStrNICmp);
+
+
+/**
+ * Locates the first case insensitive occurrence of a needle in a haystack.
+ *
+ * The first code point of the needle is matched against the haystack code
+ * point by code point (in its original, upper and lower case forms as
+ * applicable); on a hit the remainder of the needle is compared with
+ * RTStrNICmp.
+ *
+ * @returns Pointer to the start of the first match inside @a pszHaystack.
+ * @returns @a pszHaystack itself if the needle is the empty string.
+ * @returns NULL if either argument is NULL or if there is no match.
+ * @param   pszHaystack     The UTF-8 string to search.
+ * @param   pszNeedle       The UTF-8 string to search for.
+ */
+RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
+{
+    /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
+    if (!pszHaystack)
+        return NULL;
+    if (!pszNeedle)
+        return NULL;
+
+    /* The empty string matches everything. */
+    if (!*pszNeedle)
+        return (char *)pszHaystack;
+
+    /*
+     * The search strategy is to pick out the first char of the needle, fold it,
+     * and match it against the haystack code point by code point. When encountering
+     * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
+     *
+     * The start of each haystack code point is remembered so that it can be
+     * returned on a match.  It cannot be derived from the encoded length of
+     * the needle's first code point, because the haystack may hold a case
+     * variant whose UTF-8 encoding has a different length.
+     */
+    RTUNICP Cp0;
+    RTStrGetCpEx(&pszNeedle, &Cp0);     /* pszNeedle is advanced one code point. */
+    size_t const    cchNeedle = strlen(pszNeedle);
+    RTUNICP const   Cp0Lower  = RTUniCpToLower(Cp0);
+    RTUNICP const   Cp0Upper  = RTUniCpToUpper(Cp0);
+    if (   Cp0Lower == Cp0Upper
+        && Cp0Lower == Cp0)
+    {
+        /* Cp0 is not a case sensitive char. */
+        for (;;)
+        {
+            const char * const pszCur = pszHaystack;
+            RTUNICP Cp;
+            RTStrGetCpEx(&pszHaystack, &Cp);
+            if (!Cp)
+                break;
+            if (   Cp == Cp0
+                && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
+                return (char *)pszCur;
+        }
+    }
+    else if (   Cp0Lower == Cp0
+             || Cp0Upper == Cp0)
+    {
+        /* Cp0 is case sensitive and is itself one of its two case forms. */
+        for (;;)
+        {
+            const char * const pszCur = pszHaystack;
+            RTUNICP Cp;
+            RTStrGetCpEx(&pszHaystack, &Cp);
+            if (!Cp)
+                break;
+            if (   (   Cp == Cp0Upper
+                    || Cp == Cp0Lower)
+                && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
+                return (char *)pszCur;
+        }
+    }
+    else
+    {
+        /* Cp0 is case sensitive and folds to two different chars, neither of
+           which is Cp0 itself, so all three forms must be tested. (paranoia) */
+        for (;;)
+        {
+            const char * const pszCur = pszHaystack;
+            RTUNICP Cp;
+            RTStrGetCpEx(&pszHaystack, &Cp);
+            if (!Cp)
+                break;
+            if (   (   Cp == Cp0
+                    || Cp == Cp0Upper
+                    || Cp == Cp0Lower)
+                && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
+                return (char *)pszCur;
+        }
+    }
+
+
+    return NULL;
+}
+RT_EXPORT_SYMBOL(RTStrIStr);
+
+
+/**
+ * Lower cases a UTF-8 string in place.
+ *
+ * @returns The @a psz pointer that was passed in.
+ * @param   psz     The zero terminated string to convert; it is rewritten in
+ *                  place.
+ */
+RTDECL(char *) RTStrToLower(char *psz)
+{
+    /*
+     * Walk the string one code point at a time with a read and a write
+     * cursor over the same buffer.
+     *
+     * ASSUMES that the folded code points have an encoding that is equal or
+     * shorter than the original (this is presently correct).
+     */
+    const char *pszRead  = psz;
+    char       *pszWrite = psz;
+    for (;;)
+    {
+        RTUNICP ucOrg;
+        int rc = RTStrGetCpEx(&pszRead, &ucOrg);
+        if (RT_FAILURE(rc))
+        {
+            /* bad encoding, just copy it quietly (ucOrg == RTUNICP_INVALID (!= 0)). */
+            AssertRC(rc);
+            *pszWrite = pszRead[-1];
+            pszWrite++;
+        }
+        else
+        {
+            /* Use the folded form only if it takes up the same number of bytes. */
+            RTUNICP const ucFolded = RTUniCpToLower(ucOrg);
+            RTUNICP const ucOut    = RT_LIKELY(   ucFolded == ucOrg
+                                               || RTUniCpCalcUtf8Len(ucFolded) == RTUniCpCalcUtf8Len(ucOrg))
+                                   ? ucFolded : ucOrg;
+            pszWrite = RTStrPutCp(pszWrite, ucOut);
+        }
+        Assert((uintptr_t)pszWrite <= (uintptr_t)pszRead);
+
+        /* The terminator has been written out by now, so stop after it. */
+        if (ucOrg == 0)
+            break;
+    }
+
+    return psz;
+}
+RT_EXPORT_SYMBOL(RTStrToLower);
+
+
+/**
+ * Upper cases a UTF-8 string in place.
+ *
+ * @returns The @a psz pointer that was passed in.
+ * @param   psz     The zero terminated string to convert; it is rewritten in
+ *                  place.
+ */
+RTDECL(char *) RTStrToUpper(char *psz)
+{
+    /*
+     * Walk the string one code point at a time with a read and a write
+     * cursor over the same buffer.
+     *
+     * ASSUMES that the folded code points have an encoding that is equal or
+     * shorter than the original (this is presently correct).
+     */
+    const char *pszRead  = psz;
+    char       *pszWrite = psz;
+    for (;;)
+    {
+        RTUNICP ucOrg;
+        int rc = RTStrGetCpEx(&pszRead, &ucOrg);
+        if (RT_FAILURE(rc))
+        {
+            /* bad encoding, just copy it quietly (ucOrg == RTUNICP_INVALID (!= 0)). */
+            AssertRC(rc);
+            *pszWrite = pszRead[-1];
+            pszWrite++;
+        }
+        else
+        {
+            /* Use the folded form only if it takes up the same number of bytes. */
+            RTUNICP const ucFolded = RTUniCpToUpper(ucOrg);
+            RTUNICP const ucOut    = RT_LIKELY(   ucFolded == ucOrg
+                                               || RTUniCpCalcUtf8Len(ucFolded) == RTUniCpCalcUtf8Len(ucOrg))
+                                   ? ucFolded : ucOrg;
+            pszWrite = RTStrPutCp(pszWrite, ucOut);
+        }
+        Assert((uintptr_t)pszWrite <= (uintptr_t)pszRead);
+
+        /* The terminator has been written out by now, so stop after it. */
+        if (ucOrg == 0)
+            break;
+    }
+
+    return psz;
+}
+RT_EXPORT_SYMBOL(RTStrToUpper);
+