491 lines
11 KiB
C
491 lines
11 KiB
C
/** @file
|
|
* IPRT - Unicode Code Points.
|
|
*/
|
|
|
|
/*
|
|
* Copyright (C) 2006-2023 Oracle and/or its affiliates.
|
|
*
|
|
* This file is part of VirtualBox base platform packages, as
|
|
* available from https://www.virtualbox.org.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation, in version 3 of the
|
|
* License.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, see <https://www.gnu.org/licenses>.
|
|
*
|
|
* The contents of this file may alternatively be used under the terms
|
|
* of the Common Development and Distribution License Version 1.0
|
|
* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
|
|
* in the VirtualBox distribution, in which case the provisions of the
|
|
* CDDL are applicable instead of those of the GPL.
|
|
*
|
|
* You may elect to license modified versions of this file under the
|
|
* terms and conditions of either the GPL or the CDDL or both.
|
|
*
|
|
* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
|
|
*/
|
|
|
|
#ifndef IPRT_INCLUDED_uni_h
|
|
#define IPRT_INCLUDED_uni_h
|
|
#ifndef RT_WITHOUT_PRAGMA_ONCE
|
|
# pragma once
|
|
#endif
|
|
|
|
/** @defgroup grp_rt_uni RTUniCp - Unicode Code Points
|
|
* @ingroup grp_rt
|
|
* @{
|
|
*/
|
|
|
|
/** @def RTUNI_USE_WCTYPE
 * Define RTUNI_USE_WCTYPE to use the character classification and case
 * conversion data provided by the C runtime library (wctype.h) instead of
 * the IPRT unicode data. */
|
|
#ifdef DOXYGEN_RUNNING
|
|
# define RTUNI_USE_WCTYPE
|
|
#endif
|
|
|
|
#include <iprt/types.h>
|
|
#ifdef RTUNI_USE_WCTYPE
|
|
# include <wctype.h>
|
|
#endif
|
|
|
|
RT_C_DECLS_BEGIN
|
|
|
|
|
|
#ifndef RTUNI_USE_WCTYPE
|
|
|
|
/**
|
|
* A unicode flags range.
|
|
* @internal
|
|
*/
|
|
typedef struct RTUNIFLAGSRANGE
|
|
{
|
|
/** The first code point of the range. */
|
|
RTUNICP BeginCP;
|
|
/** The last + 1 code point of the range. */
|
|
RTUNICP EndCP;
|
|
/** Pointer to the array of case folded code points. */
|
|
const uint8_t *pafFlags;
|
|
} RTUNIFLAGSRANGE;
|
|
/** Pointer to a flags range.
|
|
* @internal */
|
|
typedef RTUNIFLAGSRANGE *PRTUNIFLAGSRANGE;
|
|
/** Pointer to a const flags range.
|
|
* @internal */
|
|
typedef const RTUNIFLAGSRANGE *PCRTUNIFLAGSRANGE;
|
|
|
|
/**
|
|
* A unicode case folded range.
|
|
* @internal
|
|
*/
|
|
typedef struct RTUNICASERANGE
|
|
{
|
|
/** The first code point of the range. */
|
|
RTUNICP BeginCP;
|
|
/** The last + 1 code point of the range. */
|
|
RTUNICP EndCP;
|
|
/** Pointer to the array of case folded code points. */
|
|
PCRTUNICP paFoldedCPs;
|
|
} RTUNICASERANGE;
|
|
/** Pointer to a case folded range.
|
|
* @internal */
|
|
typedef RTUNICASERANGE *PRTUNICASERANGE;
|
|
/** Pointer to a const case folded range.
|
|
* @internal */
|
|
typedef const RTUNICASERANGE *PCRTUNICASERANGE;
|
|
|
|
/** @name Unicode Code Point Flags.
 * Bits of the flag mask returned by rtUniCpFlags().
 * @internal
 * @{ */
/** Upper case character (tested by RTUniCpIsUpper). */
#define RTUNI_UPPER  RT_BIT(0)
/** Lower case character (tested by RTUniCpIsLower). */
#define RTUNI_LOWER  RT_BIT(1)
/** Alphabetic character (tested by RTUniCpIsAlphabetic). */
#define RTUNI_ALPHA  RT_BIT(2)
/** Hexadecimal digit (tested by RTUniCpIsHexDigit). */
#define RTUNI_XDIGIT RT_BIT(3)
/** Decimal digit (tested by RTUniCpIsDecDigit). */
#define RTUNI_DDIGIT RT_BIT(4)
/** White space (tested by RTUniCpIsSpace). */
#define RTUNI_WSPACE RT_BIT(5)
/*#define RTUNI_BSPACE RT_BIT(6) - later */
/** When set, the codepoint requires further checking wrt NFC and NFD
 * normalization.  I.e. set when either of QC_NFD and QC_NFC are not Y. */
#define RTUNI_QC_NFX RT_BIT(7)
/** @} */
|
|
|
|
|
|
/**
 * Array of flags ranges.
 *
 * The lookup in rtUniCpFlags() stops at the entry whose EndCP is RTUNICP_MAX.
 * @internal
 */
|
|
extern RTDATADECL(const RTUNIFLAGSRANGE) g_aRTUniFlagsRanges[];
|
|
|
|
/**
|
|
* Gets the flags for a unicode code point.
|
|
*
|
|
* @returns The flag mask. (RTUNI_*)
|
|
* @param CodePoint The unicode code point.
|
|
* @internal
|
|
*/
|
|
DECLINLINE(RTUNICP) rtUniCpFlags(RTUNICP CodePoint)
|
|
{
|
|
PCRTUNIFLAGSRANGE pCur = &g_aRTUniFlagsRanges[0];
|
|
do
|
|
{
|
|
if (pCur->EndCP > CodePoint)
|
|
{
|
|
if (pCur->BeginCP <= CodePoint)
|
|
return pCur->pafFlags[CodePoint - pCur->BeginCP];
|
|
break;
|
|
}
|
|
pCur++;
|
|
} while (pCur->EndCP != RTUNICP_MAX);
|
|
return 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is upper case.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
|
|
{
|
|
return (rtUniCpFlags(CodePoint) & RTUNI_UPPER) != 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is lower case.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
|
|
{
|
|
return (rtUniCpFlags(CodePoint) & RTUNI_LOWER) != 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is case foldable.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
|
|
{
|
|
/* Right enough. */
|
|
return (rtUniCpFlags(CodePoint) & (RTUNI_LOWER | RTUNI_UPPER)) != 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is alphabetic.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
|
|
{
|
|
return (rtUniCpFlags(CodePoint) & RTUNI_ALPHA) != 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is a decimal digit.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
|
|
{
|
|
return (rtUniCpFlags(CodePoint) & RTUNI_DDIGIT) != 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is a hexadecimal digit.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
|
|
{
|
|
return (rtUniCpFlags(CodePoint) & RTUNI_XDIGIT) != 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is white space.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
|
|
{
|
|
return (rtUniCpFlags(CodePoint) & RTUNI_WSPACE) != 0;
|
|
}
|
|
|
|
|
|
|
|
/**
 * Array of uppercase ranges, used by RTUniCpToUpper().
 *
 * The lookup in rtUniCpFold() stops at the entry whose EndCP is RTUNICP_MAX.
 * @internal
 */
|
|
extern RTDATADECL(const RTUNICASERANGE) g_aRTUniUpperRanges[];
|
|
|
|
/**
 * Array of lowercase ranges, used by RTUniCpToLower().
 *
 * The lookup in rtUniCpFold() stops at the entry whose EndCP is RTUNICP_MAX.
 * @internal
 */
|
|
extern RTDATADECL(const RTUNICASERANGE) g_aRTUniLowerRanges[];
|
|
|
|
|
|
/**
|
|
* Folds a unicode code point using the specified range array.
|
|
*
|
|
* @returns FOlded code point.
|
|
* @param CodePoint The unicode code point to fold.
|
|
* @param pCur The case folding range to use.
|
|
*/
|
|
DECLINLINE(RTUNICP) rtUniCpFold(RTUNICP CodePoint, PCRTUNICASERANGE pCur)
|
|
{
|
|
do
|
|
{
|
|
if (pCur->EndCP > CodePoint)
|
|
{
|
|
if (pCur->BeginCP <= CodePoint)
|
|
CodePoint = pCur->paFoldedCPs[CodePoint - pCur->BeginCP];
|
|
break;
|
|
}
|
|
pCur++;
|
|
} while (pCur->EndCP != RTUNICP_MAX);
|
|
return CodePoint;
|
|
}
|
|
|
|
|
|
/**
|
|
* Folds a unicode code point to upper case.
|
|
*
|
|
* @returns Folded code point.
|
|
* @param CodePoint The unicode code point to fold.
|
|
*/
|
|
DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
|
|
{
|
|
return rtUniCpFold(CodePoint, &g_aRTUniUpperRanges[0]);
|
|
}
|
|
|
|
|
|
/**
|
|
* Folds a unicode code point to lower case.
|
|
*
|
|
* @returns Folded code point.
|
|
* @param CodePoint The unicode code point to fold.
|
|
*/
|
|
DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
|
|
{
|
|
return rtUniCpFold(CodePoint, &g_aRTUniLowerRanges[0]);
|
|
}
|
|
|
|
|
|
#else /* RTUNI_USE_WCTYPE */
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is upper case.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
|
|
{
|
|
return !!iswupper(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is lower case.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
|
|
{
|
|
return !!iswlower(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is case foldable.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
|
|
{
|
|
/* Right enough. */
|
|
return iswupper(CodePoint) || iswlower(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is alphabetic.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
|
|
{
|
|
return !!iswalpha(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is a decimal digit.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
|
|
{
|
|
return !!iswdigit(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is a hexadecimal digit.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
|
|
{
|
|
return !!iswxdigit(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if a unicode code point is white space.
|
|
*
|
|
* @returns true if it is.
|
|
* @returns false if it isn't.
|
|
* @param CodePoint The code point.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
|
|
{
|
|
return !!iswspace(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Folds a unicode code point to upper case.
|
|
*
|
|
* @returns Folded code point.
|
|
* @param CodePoint The unicode code point to fold.
|
|
*/
|
|
DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
|
|
{
|
|
return towupper(CodePoint);
|
|
}
|
|
|
|
|
|
/**
|
|
* Folds a unicode code point to lower case.
|
|
*
|
|
* @returns Folded code point.
|
|
* @param CodePoint The unicode code point to fold.
|
|
*/
|
|
DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
|
|
{
|
|
return towlower(CodePoint);
|
|
}
|
|
|
|
|
|
#endif /* RTUNI_USE_WCTYPE */
|
|
|
|
|
|
/**
|
|
* Frees a unicode string.
|
|
*
|
|
* @param pusz The string to free.
|
|
*/
|
|
RTDECL(void) RTUniFree(PRTUNICP pusz);
|
|
|
|
|
|
/**
|
|
* Checks if a code point valid.
|
|
*
|
|
* Any code point (defined or not) within the 17 unicode planes (0 thru 16),
|
|
* except surrogates will be considered valid code points by this function.
|
|
*
|
|
* @returns true if in range, false if not.
|
|
* @param CodePoint The unicode code point to validate.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsValid(RTUNICP CodePoint)
|
|
{
|
|
return CodePoint <= 0x00d7ff
|
|
|| ( CodePoint <= 0x10ffff
|
|
&& CodePoint >= 0x00e000);
|
|
}
|
|
|
|
|
|
/**
|
|
* Checks if the given code point is in the BMP range.
|
|
*
|
|
* Surrogates are not considered in the BMP range by this function.
|
|
*
|
|
* @returns true if in BMP, false if not.
|
|
* @param CodePoint The unicode code point to consider.
|
|
*/
|
|
DECLINLINE(bool) RTUniCpIsBMP(RTUNICP CodePoint)
|
|
{
|
|
return CodePoint <= 0xd7ff
|
|
|| ( CodePoint <= 0xffff
|
|
&& CodePoint >= 0xe000);
|
|
}
|
|
|
|
|
|
/**
|
|
* Folds a unicode code point to lower case.
|
|
*
|
|
* @returns Folded code point.
|
|
* @param CodePoint The unicode code point to fold.
|
|
*/
|
|
DECLINLINE(size_t) RTUniCpCalcUtf8Len(RTUNICP CodePoint)
|
|
{
|
|
if (CodePoint < 0x80)
|
|
return 1;
|
|
return 2
|
|
+ (CodePoint >= 0x00000800)
|
|
+ (CodePoint >= 0x00010000)
|
|
+ (CodePoint >= 0x00200000)
|
|
+ (CodePoint >= 0x04000000)
|
|
+ (CodePoint >= 0x80000000) /* illegal */;
|
|
}
|
|
|
|
|
|
|
|
RT_C_DECLS_END
|
|
/** @} */
|
|
|
|
|
|
#endif /* !IPRT_INCLUDED_uni_h */
|
|
|