summaryrefslogtreecommitdiffstats
path: root/include/iprt/uni.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/iprt/uni.h')
-rw-r--r--include/iprt/uni.h491
1 files changed, 491 insertions, 0 deletions
diff --git a/include/iprt/uni.h b/include/iprt/uni.h
new file mode 100644
index 00000000..ad804dcb
--- /dev/null
+++ b/include/iprt/uni.h
@@ -0,0 +1,491 @@
+/** @file
+ * IPRT - Unicode Code Points.
+ */
+
+/*
+ * Copyright (C) 2006-2022 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+#ifndef IPRT_INCLUDED_uni_h
+#define IPRT_INCLUDED_uni_h
+#ifndef RT_WITHOUT_PRAGMA_ONCE
+# pragma once
+#endif
+
+/** @defgroup grp_rt_uni RTUniCp - Unicode Code Points
+ * @ingroup grp_rt
+ * @{
+ */
+
+/** @def RTUNI_USE_WCTYPE
+ * Define RTUNI_USE_WCTYPE to not use the IPRT unicode data but the
+ * data which the C runtime library provides. */
+#ifdef DOXYGEN_RUNNING
+# define RTUNI_USE_WCTYPE
+#endif
+
+#include <iprt/types.h>
+#ifdef RTUNI_USE_WCTYPE
+# include <wctype.h>
+#endif
+
+RT_C_DECLS_BEGIN
+
+
+#ifndef RTUNI_USE_WCTYPE
+
+/**
+ * A unicode flags range.
+ * @internal
+ */
+typedef struct RTUNIFLAGSRANGE
+{
+ /** The first code point of the range. */
+ RTUNICP BeginCP;
+ /** The last + 1 code point of the range. */
+ RTUNICP EndCP;
+ /** Pointer to the array of case folded code points. */
+ const uint8_t *pafFlags;
+} RTUNIFLAGSRANGE;
+/** Pointer to a flags range.
+ * @internal */
+typedef RTUNIFLAGSRANGE *PRTUNIFLAGSRANGE;
+/** Pointer to a const flags range.
+ * @internal */
+typedef const RTUNIFLAGSRANGE *PCRTUNIFLAGSRANGE;
+
+/**
+ * A unicode case folded range.
+ * @internal
+ */
+typedef struct RTUNICASERANGE
+{
+ /** The first code point of the range. */
+ RTUNICP BeginCP;
+ /** The last + 1 code point of the range. */
+ RTUNICP EndCP;
+ /** Pointer to the array of case folded code points. */
+ PCRTUNICP paFoldedCPs;
+} RTUNICASERANGE;
+/** Pointer to a case folded range.
+ * @internal */
+typedef RTUNICASERANGE *PRTUNICASERANGE;
+/** Pointer to a const case folded range.
+ * @internal */
+typedef const RTUNICASERANGE *PCRTUNICASERANGE;
+
+/** @name Unicode Code Point Flags.
+ * @internal
+ * @{ */
+#define RTUNI_UPPER RT_BIT(0)
+#define RTUNI_LOWER RT_BIT(1)
+#define RTUNI_ALPHA RT_BIT(2)
+#define RTUNI_XDIGIT RT_BIT(3)
+#define RTUNI_DDIGIT RT_BIT(4)
+#define RTUNI_WSPACE RT_BIT(5)
+/*#define RTUNI_BSPACE RT_BIT(6) - later */
+/** When set, the codepoint requires further checking wrt NFC and NFD
+ * normalization. I.e. set when either of QC_NFD and QC_NFC are not Y. */
+#define RTUNI_QC_NFX RT_BIT(7)
+/** @} */
+
+
+/**
+ * Array of flags ranges.
+ * @internal
+ */
+extern RTDATADECL(const RTUNIFLAGSRANGE) g_aRTUniFlagsRanges[];
+
+/**
+ * Gets the flags for a unicode code point.
+ *
+ * @returns The flag mask. (RTUNI_*)
+ * @param CodePoint The unicode code point.
+ * @internal
+ */
+DECLINLINE(RTUNICP) rtUniCpFlags(RTUNICP CodePoint)
+{
+ PCRTUNIFLAGSRANGE pCur = &g_aRTUniFlagsRanges[0];
+ do
+ {
+ if (pCur->EndCP > CodePoint)
+ {
+ if (pCur->BeginCP <= CodePoint)
+ return pCur->pafFlags[CodePoint - pCur->BeginCP];
+ break;
+ }
+ pCur++;
+ } while (pCur->EndCP != RTUNICP_MAX);
+ return 0;
+}
+
+
+/**
+ * Checks if a unicode code point is upper case.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
+{
+ return (rtUniCpFlags(CodePoint) & RTUNI_UPPER) != 0;
+}
+
+
+/**
+ * Checks if a unicode code point is lower case.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
+{
+ return (rtUniCpFlags(CodePoint) & RTUNI_LOWER) != 0;
+}
+
+
+/**
+ * Checks if a unicode code point is case foldable.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
+{
+ /* Right enough. */
+ return (rtUniCpFlags(CodePoint) & (RTUNI_LOWER | RTUNI_UPPER)) != 0;
+}
+
+
+/**
+ * Checks if a unicode code point is alphabetic.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
+{
+ return (rtUniCpFlags(CodePoint) & RTUNI_ALPHA) != 0;
+}
+
+
+/**
+ * Checks if a unicode code point is a decimal digit.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
+{
+ return (rtUniCpFlags(CodePoint) & RTUNI_DDIGIT) != 0;
+}
+
+
+/**
+ * Checks if a unicode code point is a hexadecimal digit.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
+{
+ return (rtUniCpFlags(CodePoint) & RTUNI_XDIGIT) != 0;
+}
+
+
+/**
+ * Checks if a unicode code point is white space.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
+{
+ return (rtUniCpFlags(CodePoint) & RTUNI_WSPACE) != 0;
+}
+
+
+
+/**
+ * Array of uppercase ranges.
+ * @internal
+ */
+extern RTDATADECL(const RTUNICASERANGE) g_aRTUniUpperRanges[];
+
+/**
+ * Array of lowercase ranges.
+ * @internal
+ */
+extern RTDATADECL(const RTUNICASERANGE) g_aRTUniLowerRanges[];
+
+
+/**
+ * Folds a unicode code point using the specified range array.
+ *
+ * @returns FOlded code point.
+ * @param CodePoint The unicode code point to fold.
+ * @param pCur The case folding range to use.
+ */
+DECLINLINE(RTUNICP) rtUniCpFold(RTUNICP CodePoint, PCRTUNICASERANGE pCur)
+{
+ do
+ {
+ if (pCur->EndCP > CodePoint)
+ {
+ if (pCur->BeginCP <= CodePoint)
+ CodePoint = pCur->paFoldedCPs[CodePoint - pCur->BeginCP];
+ break;
+ }
+ pCur++;
+ } while (pCur->EndCP != RTUNICP_MAX);
+ return CodePoint;
+}
+
+
+/**
+ * Folds a unicode code point to upper case.
+ *
+ * @returns Folded code point.
+ * @param CodePoint The unicode code point to fold.
+ */
+DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
+{
+ return rtUniCpFold(CodePoint, &g_aRTUniUpperRanges[0]);
+}
+
+
+/**
+ * Folds a unicode code point to lower case.
+ *
+ * @returns Folded code point.
+ * @param CodePoint The unicode code point to fold.
+ */
+DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
+{
+ return rtUniCpFold(CodePoint, &g_aRTUniLowerRanges[0]);
+}
+
+
+#else /* RTUNI_USE_WCTYPE */
+
+
+/**
+ * Checks if a unicode code point is upper case.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
+{
+ return !!iswupper(CodePoint);
+}
+
+
+/**
+ * Checks if a unicode code point is lower case.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
+{
+ return !!iswlower(CodePoint);
+}
+
+
+/**
+ * Checks if a unicode code point is case foldable.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
+{
+ /* Right enough. */
+ return iswupper(CodePoint) || iswlower(CodePoint);
+}
+
+
+/**
+ * Checks if a unicode code point is alphabetic.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
+{
+ return !!iswalpha(CodePoint);
+}
+
+
+/**
+ * Checks if a unicode code point is a decimal digit.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
+{
+ return !!iswdigit(CodePoint);
+}
+
+
+/**
+ * Checks if a unicode code point is a hexadecimal digit.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
+{
+ return !!iswxdigit(CodePoint);
+}
+
+
+/**
+ * Checks if a unicode code point is white space.
+ *
+ * @returns true if it is.
+ * @returns false if it isn't.
+ * @param CodePoint The code point.
+ */
+DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
+{
+ return !!iswspace(CodePoint);
+}
+
+
+/**
+ * Folds a unicode code point to upper case.
+ *
+ * @returns Folded code point.
+ * @param CodePoint The unicode code point to fold.
+ */
+DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
+{
+ return towupper(CodePoint);
+}
+
+
+/**
+ * Folds a unicode code point to lower case.
+ *
+ * @returns Folded code point.
+ * @param CodePoint The unicode code point to fold.
+ */
+DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
+{
+ return towlower(CodePoint);
+}
+
+
+#endif /* RTUNI_USE_WCTYPE */
+
+
+/**
+ * Frees a unicode string.
+ *
+ * @param pusz The string to free.
+ */
+RTDECL(void) RTUniFree(PRTUNICP pusz);
+
+
+/**
+ * Checks if a code point valid.
+ *
+ * Any code point (defined or not) within the 17 unicode planes (0 thru 16),
+ * except surrogates will be considered valid code points by this function.
+ *
+ * @returns true if in range, false if not.
+ * @param CodePoint The unicode code point to validate.
+ */
+DECLINLINE(bool) RTUniCpIsValid(RTUNICP CodePoint)
+{
+ return CodePoint <= 0x00d7ff
+ || ( CodePoint <= 0x10ffff
+ && CodePoint >= 0x00e000);
+}
+
+
+/**
+ * Checks if the given code point is in the BMP range.
+ *
+ * Surrogates are not considered in the BMP range by this function.
+ *
+ * @returns true if in BMP, false if not.
+ * @param CodePoint The unicode code point to consider.
+ */
+DECLINLINE(bool) RTUniCpIsBMP(RTUNICP CodePoint)
+{
+ return CodePoint <= 0xd7ff
+ || ( CodePoint <= 0xffff
+ && CodePoint >= 0xe000);
+}
+
+
+/**
+ * Folds a unicode code point to lower case.
+ *
+ * @returns Folded code point.
+ * @param CodePoint The unicode code point to fold.
+ */
+DECLINLINE(size_t) RTUniCpCalcUtf8Len(RTUNICP CodePoint)
+{
+ if (CodePoint < 0x80)
+ return 1;
+ return 2
+ + (CodePoint >= 0x00000800)
+ + (CodePoint >= 0x00010000)
+ + (CodePoint >= 0x00200000)
+ + (CodePoint >= 0x04000000)
+ + (CodePoint >= 0x80000000) /* illegal */;
+}
+
+
+
+RT_C_DECLS_END
+/** @} */
+
+
+#endif /* !IPRT_INCLUDED_uni_h */
+