summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/nsBidiUtils.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/unicharutil/util/nsBidiUtils.h')
-rw-r--r--intl/unicharutil/util/nsBidiUtils.h295
1 files changed, 295 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsBidiUtils.h b/intl/unicharutil/util/nsBidiUtils.h
new file mode 100644
index 0000000000..21a7ee7ffa
--- /dev/null
+++ b/intl/unicharutil/util/nsBidiUtils.h
@@ -0,0 +1,295 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsBidiUtils_h__
+#define nsBidiUtils_h__
+
+#include "nsString.h"
+#include "encoding_rs_mem.h"
+
+/**
+ * Read ftp://ftp.unicode.org/Public/UNIDATA/ReadMe-Latest.txt
+ * section BIDIRECTIONAL PROPERTIES
+ * for the detailed definition of the following categories
+ *
+ * The values here must match the equivalents in %bidicategorycode in
+ * mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl,
+ * and must also match the values used by ICU's UCharDirection.
+ */
+
+enum nsCharType {
+ eCharType_LeftToRight = 0,
+ eCharType_RightToLeft = 1,
+ eCharType_EuropeanNumber = 2,
+ eCharType_EuropeanNumberSeparator = 3,
+ eCharType_EuropeanNumberTerminator = 4,
+ eCharType_ArabicNumber = 5,
+ eCharType_CommonNumberSeparator = 6,
+ eCharType_BlockSeparator = 7,
+ eCharType_SegmentSeparator = 8,
+ eCharType_WhiteSpaceNeutral = 9,
+ eCharType_OtherNeutral = 10,
+ eCharType_LeftToRightEmbedding = 11,
+ eCharType_LeftToRightOverride = 12,
+ eCharType_RightToLeftArabic = 13,
+ eCharType_RightToLeftEmbedding = 14,
+ eCharType_RightToLeftOverride = 15,
+ eCharType_PopDirectionalFormat = 16,
+ eCharType_DirNonSpacingMark = 17,
+ eCharType_BoundaryNeutral = 18,
+ eCharType_FirstStrongIsolate = 19,
+ eCharType_LeftToRightIsolate = 20,
+ eCharType_RightToLeftIsolate = 21,
+ eCharType_PopDirectionalIsolate = 22,
+ eCharType_CharTypeCount
+};
+
+/**
+ * This specifies the language directional property of a character set.
+ */
+typedef enum nsCharType nsCharType;
+
+/**
+ * Find the direction of an embedding level or paragraph level set by
+ * the Unicode Bidi Algorithm. (Even levels are left-to-right, odd
+ * levels right-to-left.
+ */
+#define IS_LEVEL_RTL(level) (((level)&1) == 1)
+
+/**
+ * Check whether two bidi levels have the same parity and thus the same
+ * directionality
+ */
+#define IS_SAME_DIRECTION(level1, level2) (((level1 ^ level2) & 1) == 0)
+
+/**
+ * Convert from nsBidiLevel to nsBidiDirection
+ */
+#define DIRECTION_FROM_LEVEL(level) \
+ ((IS_LEVEL_RTL(level)) ? NSBIDI_RTL : NSBIDI_LTR)
+
+/**
+ * definitions of bidirection character types by category
+ */
+
+#define CHARTYPE_IS_RTL(val) \
+ (((val) == eCharType_RightToLeft) || ((val) == eCharType_RightToLeftArabic))
+
+#define CHARTYPE_IS_WEAK(val) \
+ (((val) == eCharType_EuropeanNumberSeparator) || \
+ ((val) == eCharType_EuropeanNumberTerminator) || \
+ (((val) > eCharType_ArabicNumber) && \
+ ((val) != eCharType_RightToLeftArabic)))
+
+/**
+ * Inspects a Unichar, converting numbers to Arabic or Hindi forms and
+ * returning them
+ * @param aChar is the character
+ * @param aPrevCharArabic is true if the previous character in the string is
+ * an Arabic char
+ * @param aNumFlag specifies the conversion to perform:
+ * IBMBIDI_NUMERAL_NOMINAL: don't do any conversion
+ * IBMBIDI_NUMERAL_HINDI: convert to Hindi forms
+ * (Unicode 0660-0669)
+ * IBMBIDI_NUMERAL_ARABIC: convert to Arabic forms
+ * (Unicode 0030-0039)
+ * IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
+ * Hindi, otherwise to Arabic
+ * @return the converted Unichar
+ */
+char16_t HandleNumberInChar(char16_t aChar, bool aPrevCharArabic,
+ uint32_t aNumFlag);
+
+/**
+ * Scan a Unichar string, converting numbers to Arabic or Hindi forms in
+ * place
+ * @param aBuffer is the string
+ * @param aSize is the size of aBuffer
+ * @param aNumFlag specifies the conversion to perform:
+ * IBMBIDI_NUMERAL_NOMINAL: don't do any conversion
+ * IBMBIDI_NUMERAL_HINDI: convert to Hindi forms
+ * (Unicode 0660-0669)
+ * IBMBIDI_NUMERAL_ARABIC: convert to Arabic forms
+ * (Unicode 0030-0039)
+ * IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
+ * Hindi, otherwise to Arabic
+ */
+nsresult HandleNumbers(char16_t* aBuffer, uint32_t aSize, uint32_t aNumFlag);
+
+/**
+ * Give a UTF-32 codepoint
+ * return true if the codepoint is a Bidi control character (LRM, RLM, ALM;
+ * LRE, RLE, PDF, LRO, RLO; LRI, RLI, FSI, PDI).
+ * Return false, otherwise
+ */
+#define LRM_CHAR 0x200e
+#define RLM_CHAR 0x200f
+
+#define LRE_CHAR 0x202a
+#define RLE_CHAR 0x202b
+#define PDF_CHAR 0x202c
+#define LRO_CHAR 0x202d
+#define RLO_CHAR 0x202e
+
+#define LRI_CHAR 0x2066
+#define RLI_CHAR 0x2067
+#define FSI_CHAR 0x2068
+#define PDI_CHAR 0x2069
+
+#define ALM_CHAR 0x061C
+inline bool IsBidiControl(uint32_t aChar) {
+ return ((LRE_CHAR <= aChar && aChar <= RLO_CHAR) ||
+ (LRI_CHAR <= aChar && aChar <= PDI_CHAR) || (aChar == ALM_CHAR) ||
+ (aChar & 0xfffffe) == LRM_CHAR);
+}
+
+/**
+ * Give a UTF-32 codepoint
+ * Return true if the codepoint is a Bidi control character that may result
+ * in RTL directionality and therefore needs to trigger bidi resolution;
+ * return false otherwise.
+ */
+inline bool IsBidiControlRTL(uint32_t aChar) {
+ return aChar == RLM_CHAR || aChar == RLE_CHAR || aChar == RLO_CHAR ||
+ aChar == RLI_CHAR || aChar == ALM_CHAR;
+}
+
+/**
+ * Give a 16-bit (UTF-16) text buffer
+ * @return true if the string contains right-to-left characters
+ */
+inline bool HasRTLChars(mozilla::Span<const char16_t> aBuffer) {
+ // Span ensures we never pass a nullptr to Rust--even if the
+ // length of the buffer is zero.
+ return encoding_mem_is_utf16_bidi(aBuffer.Elements(), aBuffer.Length());
+}
+
+// These values are shared with Preferences dialog
+// ------------------
+// If Pref values are to be changed
+// in the XUL file of Prefs. the values
+// Must be changed here too..
+// ------------------
+//
+#define IBMBIDI_TEXTDIRECTION_STR "bidi.direction"
+#define IBMBIDI_TEXTTYPE_STR "bidi.texttype"
+#define IBMBIDI_NUMERAL_STR "bidi.numeral"
+
+// ------------------
+// Text Direction
+// ------------------
+// bidi.direction
+#define IBMBIDI_TEXTDIRECTION_LTR 1 // 1 = directionLTRBidi *
+#define IBMBIDI_TEXTDIRECTION_RTL 2 // 2 = directionRTLBidi
+// ------------------
+// Text Type
+// ------------------
+// bidi.texttype
+#define IBMBIDI_TEXTTYPE_CHARSET 1 // 1 = charsettexttypeBidi *
+#define IBMBIDI_TEXTTYPE_LOGICAL 2 // 2 = logicaltexttypeBidi
+#define IBMBIDI_TEXTTYPE_VISUAL 3 // 3 = visualtexttypeBidi
+// ------------------
+// Numeral Style
+// ------------------
+// bidi.numeral
+#define IBMBIDI_NUMERAL_NOMINAL 0 // 0 = nominalnumeralBidi *
+#define IBMBIDI_NUMERAL_REGULAR 1 // 1 = regularcontextnumeralBidi
+#define IBMBIDI_NUMERAL_HINDICONTEXT 2 // 2 = hindicontextnumeralBidi
+#define IBMBIDI_NUMERAL_ARABIC 3 // 3 = arabicnumeralBidi
+#define IBMBIDI_NUMERAL_HINDI 4 // 4 = hindinumeralBidi
+#define IBMBIDI_NUMERAL_PERSIANCONTEXT 5 // 5 = persiancontextnumeralBidi
+#define IBMBIDI_NUMERAL_PERSIAN 6 // 6 = persiannumeralBidi
+
+#define IBMBIDI_DEFAULT_BIDI_OPTIONS \
+ ((IBMBIDI_TEXTDIRECTION_LTR << 0) | (IBMBIDI_TEXTTYPE_CHARSET << 4) | \
+ (IBMBIDI_NUMERAL_NOMINAL << 8))
+
+#define GET_BIDI_OPTION_DIRECTION(bo) \
+ (((bo) >> 0) & 0x0000000F) /* 4 bits for DIRECTION */
+#define GET_BIDI_OPTION_TEXTTYPE(bo) \
+ (((bo) >> 4) & 0x0000000F) /* 4 bits for TEXTTYPE */
+#define GET_BIDI_OPTION_NUMERAL(bo) \
+ (((bo) >> 8) & 0x0000000F) /* 4 bits for NUMERAL */
+
+#define SET_BIDI_OPTION_DIRECTION(bo, dir) \
+ { (bo) = ((bo)&0xFFFFFFF0) | (((dir)&0x0000000F) << 0); }
+#define SET_BIDI_OPTION_TEXTTYPE(bo, tt) \
+ { (bo) = ((bo)&0xFFFFFF0F) | (((tt)&0x0000000F) << 4); }
+#define SET_BIDI_OPTION_NUMERAL(bo, num) \
+ { (bo) = ((bo)&0xFFFFF0FF) | (((num)&0x0000000F) << 8); }
+
+/* Constants related to the position of numerics in the codepage */
+#define START_HINDI_DIGITS 0x0660
+#define END_HINDI_DIGITS 0x0669
+#define START_ARABIC_DIGITS 0x0030
+#define END_ARABIC_DIGITS 0x0039
+#define START_FARSI_DIGITS 0x06f0
+#define END_FARSI_DIGITS 0x06f9
+#define IS_HINDI_DIGIT(u) \
+ (((u) >= START_HINDI_DIGITS) && ((u) <= END_HINDI_DIGITS))
+#define IS_ARABIC_DIGIT(u) \
+ (((u) >= START_ARABIC_DIGITS) && ((u) <= END_ARABIC_DIGITS))
+#define IS_FARSI_DIGIT(u) \
+ (((u) >= START_FARSI_DIGITS) && ((u) <= END_FARSI_DIGITS))
+/**
+ * Arabic numeric separator and numeric formatting characters:
+ * U+0600;ARABIC NUMBER SIGN
+ * U+0601;ARABIC SIGN SANAH
+ * U+0602;ARABIC FOOTNOTE MARKER
+ * U+0603;ARABIC SIGN SAFHA
+ * U+066A;ARABIC PERCENT SIGN
+ * U+066B;ARABIC DECIMAL SEPARATOR
+ * U+066C;ARABIC THOUSANDS SEPARATOR
+ * U+06DD;ARABIC END OF AYAH
+ */
+#define IS_ARABIC_SEPARATOR(u) \
+ ((/*(u) >= 0x0600 &&*/ (u) <= 0x0603) || ((u) >= 0x066A && (u) <= 0x066C) || \
+ ((u) == 0x06DD))
+
+#define IS_BIDI_DIACRITIC(u) \
+ (((u) >= 0x0591 && (u) <= 0x05A1) || ((u) >= 0x05A3 && (u) <= 0x05B9) || \
+ ((u) >= 0x05BB && (u) <= 0x05BD) || ((u) == 0x05BF) || ((u) == 0x05C1) || \
+ ((u) == 0x05C2) || ((u) == 0x05C4) || ((u) >= 0x064B && (u) <= 0x0652) || \
+ ((u) == 0x0670) || ((u) >= 0x06D7 && (u) <= 0x06E4) || ((u) == 0x06E7) || \
+ ((u) == 0x06E8) || ((u) >= 0x06EA && (u) <= 0x06ED))
+
+#define IS_HEBREW_CHAR(c) \
+ (((0x0590 <= (c)) && ((c) <= 0x05FF)) || (((c) >= 0xfb1d) && ((c) <= 0xfb4f)))
+#define IS_ARABIC_CHAR(c) \
+ ((0x0600 <= (c) && (c) <= 0x08FF) && \
+ ((c) <= 0x06ff || ((c) >= 0x0750 && (c) <= 0x077f) || (c) >= 0x08a0))
+#define IS_ARABIC_ALPHABETIC(c) \
+ (IS_ARABIC_CHAR(c) && \
+ !(IS_HINDI_DIGIT(c) || IS_FARSI_DIGIT(c) || IS_ARABIC_SEPARATOR(c)))
+
+/**
+ * The codepoint ranges in the following macros are based on the blocks
+ * allocated, or planned to be allocated, to right-to-left characters in the
+ * BMP (Basic Multilingual Plane) and SMP (Supplementary Multilingual Plane)
+ * according to
+ * http://unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt and
+ * http://www.unicode.org/roadmaps/
+ */
+
+#define IS_IN_BMP_RTL_BLOCK(c) ((0x590 <= (c)) && ((c) <= 0x8ff))
+#define IS_RTL_PRESENTATION_FORM(c) \
+ (((0xfb1d <= (c)) && ((c) <= 0xfdff)) || ((0xfe70 <= (c)) && ((c) <= 0xfefe)))
+#define IS_IN_SMP_RTL_BLOCK(c) \
+ (((0x10800 <= (c)) && ((c) <= 0x10fff)) || \
+ ((0x1e800 <= (c)) && ((c) <= 0x1eFFF)))
+// Due to the supplementary-plane RTL blocks being identifiable from the
+// high surrogate without examining the low surrogate, it is correct to
+// use this by-code-unit check on potentially astral text without doing
+// the math to decode surrogate pairs into code points. However, unpaired
+// high surrogates that are RTL high surrogates then count as RTL even
+// though, if replaced by the REPLACEMENT CHARACTER, it would not be
+// RTL.
+#define UTF16_CODE_UNIT_IS_BIDI(c) \
+ ((IS_IN_BMP_RTL_BLOCK(c)) || (IS_RTL_PRESENTATION_FORM(c)) || \
+ (c) == 0xD802 || (c) == 0xD803 || (c) == 0xD83A || (c) == 0xD83B)
+#define UTF32_CHAR_IS_BIDI(c) \
+ ((IS_IN_BMP_RTL_BLOCK(c)) || (IS_RTL_PRESENTATION_FORM(c)) || \
+ (IS_IN_SMP_RTL_BLOCK(c)))
+#endif /* nsBidiUtils_h__ */