summaryrefslogtreecommitdiffstats
path: root/src/arabic.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 08:50:31 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 08:50:31 +0000
commitaed8ce9da277f5ecffe968b324f242c41c3b752a (patch)
treed2e538394cb7a8a7c42a4aac6ccf1a8e3256999b /src/arabic.c
parentInitial commit. (diff)
downloadvim-aed8ce9da277f5ecffe968b324f242c41c3b752a.tar.xz
vim-aed8ce9da277f5ecffe968b324f242c41c3b752a.zip
Adding upstream version 2:9.0.1378.upstream/2%9.0.1378upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arabic.c')
-rw-r--r--src/arabic.c396
1 files changed, 396 insertions, 0 deletions
diff --git a/src/arabic.c b/src/arabic.c
new file mode 100644
index 0000000..f64dab2
--- /dev/null
+++ b/src/arabic.c
@@ -0,0 +1,396 @@
+/* vi:set ts=8 sts=4 sw=4 noet:
+ *
+ * VIM - Vi IMproved by Bram Moolenaar
+ *
+ * Do ":help uganda" in Vim to read copying and usage conditions.
+ * Do ":help credits" in Vim to see a list of people who contributed.
+ * See README.txt for an overview of the Vim source code.
+ */
+
+/*
+ * arabic.c: functions for Arabic language
+ *
+ * Author: Nadim Shaikli & Isam Bayazidi
+ * Farsi support and restructuring to make adding new letters easier by Ali
+ * Gholami Rudi. Further work by Ameretat Reith.
+ */
+
+/*
+ * Sorted list of unicode Arabic characters. Each entry holds the
+ * presentation forms of a letter.
+ *
+ * Arabic characters are categorized into following types:
+ *
+ * Isolated - iso-8859-6 form
+ * Initial - unicode form-B start
+ * Medial - unicode form-B middle
+ * Final - unicode form-B final
+ * Stand-Alone - unicode form-B isolated
+ */
+
+#include "vim.h"
+
+#if defined(FEAT_ARABIC) || defined(PROTO)
+
+// Unicode values for Arabic characters.
+#define a_HAMZA 0x0621
+#define a_ALEF_MADDA 0x0622
+#define a_ALEF_HAMZA_ABOVE 0x0623
+#define a_WAW_HAMZA 0x0624
+#define a_ALEF_HAMZA_BELOW 0x0625
+#define a_YEH_HAMZA 0x0626
+#define a_ALEF 0x0627
+#define a_BEH 0x0628
+#define a_TEH_MARBUTA 0x0629
+#define a_TEH 0x062a
+#define a_THEH 0x062b
+#define a_JEEM 0x062c
+#define a_HAH 0x062d
+#define a_KHAH 0x062e
+#define a_DAL 0x062f
+#define a_THAL 0x0630
+#define a_REH 0x0631
+#define a_ZAIN 0x0632
+#define a_SEEN 0x0633
+#define a_SHEEN 0x0634
+#define a_SAD 0x0635
+#define a_DAD 0x0636
+#define a_TAH 0x0637
+#define a_ZAH 0x0638
+#define a_AIN 0x0639
+#define a_GHAIN 0x063a
+#define a_TATWEEL 0x0640
+#define a_FEH 0x0641
+#define a_QAF 0x0642
+#define a_KAF 0x0643
+#define a_LAM 0x0644
+#define a_MEEM 0x0645
+#define a_NOON 0x0646
+#define a_HEH 0x0647
+#define a_WAW 0x0648
+#define a_ALEF_MAKSURA 0x0649
+#define a_YEH 0x064a
+#define a_FATHATAN 0x064b
+#define a_DAMMATAN 0x064c
+#define a_KASRATAN 0x064d
+#define a_FATHA 0x064e
+#define a_DAMMA 0x064f
+#define a_KASRA 0x0650
+#define a_SHADDA 0x0651
+#define a_SUKUN 0x0652
+#define a_MADDA_ABOVE 0x0653
+#define a_HAMZA_ABOVE 0x0654
+#define a_HAMZA_BELOW 0x0655
+
+#define a_PEH 0x067e
+#define a_TCHEH 0x0686
+#define a_JEH 0x0698
+#define a_FKAF 0x06a9
+#define a_GAF 0x06af
+#define a_FYEH 0x06cc
+
+#define a_s_LAM_ALEF_MADDA_ABOVE 0xfef5
+#define a_f_LAM_ALEF_MADDA_ABOVE 0xfef6
+#define a_s_LAM_ALEF_HAMZA_ABOVE 0xfef7
+#define a_f_LAM_ALEF_HAMZA_ABOVE 0xfef8
+#define a_s_LAM_ALEF_HAMZA_BELOW 0xfef9
+#define a_f_LAM_ALEF_HAMZA_BELOW 0xfefa
+#define a_s_LAM_ALEF 0xfefb
+#define a_f_LAM_ALEF 0xfefc
+
+static struct achar {
+ unsigned c;
+ unsigned isolated;
+ unsigned initial;
+ unsigned medial;
+ unsigned final;
+} achars[] = {
+ {a_HAMZA, 0xfe80, 0, 0, 0},
+ {a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82},
+ {a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84},
+ {a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86},
+ {a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88},
+ {a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},
+ {a_ALEF, 0xfe8d, 0, 0, 0xfe8e},
+ {a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90},
+ {a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94},
+ {a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96},
+ {a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},
+ {a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},
+ {a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2},
+ {a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6},
+ {a_DAL, 0xfea9, 0, 0, 0xfeaa},
+ {a_THAL, 0xfeab, 0, 0, 0xfeac},
+ {a_REH, 0xfead, 0, 0, 0xfeae},
+ {a_ZAIN, 0xfeaf, 0, 0, 0xfeb0},
+ {a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},
+ {a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},
+ {a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba},
+ {a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe},
+ {a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2},
+ {a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6},
+ {a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca},
+ {a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece},
+ {a_TATWEEL, 0, 0x0640, 0x0640, 0x0640},
+ {a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2},
+ {a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6},
+ {a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda},
+ {a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede},
+ {a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2},
+ {a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6},
+ {a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea},
+ {a_WAW, 0xfeed, 0, 0, 0xfeee},
+ {a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0},
+ {a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2},
+ {a_FATHATAN, 0xfe70, 0, 0, 0},
+ {a_DAMMATAN, 0xfe72, 0, 0, 0},
+ {a_KASRATAN, 0xfe74, 0, 0, 0},
+ {a_FATHA, 0xfe76, 0, 0xfe77, 0},
+ {a_DAMMA, 0xfe78, 0, 0xfe79, 0},
+ {a_KASRA, 0xfe7a, 0, 0xfe7b, 0},
+ {a_SHADDA, 0xfe7c, 0, 0xfe7c, 0},
+ {a_SUKUN, 0xfe7e, 0, 0xfe7f, 0},
+ {a_MADDA_ABOVE, 0, 0, 0, 0},
+ {a_HAMZA_ABOVE, 0, 0, 0, 0},
+ {a_HAMZA_BELOW, 0, 0, 0, 0},
+ {a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57},
+ {a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},
+ {a_JEH, 0xfb8a, 0, 0, 0xfb8b},
+ {a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f},
+ {a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93},
+ {a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},
+};
+
+#define a_BYTE_ORDER_MARK 0xfeff
+
+/*
+ * Find the struct achar pointer to the given Arabic char.
+ * Returns NULL if not found.
+ */
+ static struct achar *
+find_achar(int c)
+{
+ int h, m, l;
+
+ // using binary search to find c
+ h = ARRAY_LENGTH(achars);
+ l = 0;
+ while (l < h)
+ {
+ m = (h + l) / 2;
+ if (achars[m].c == (unsigned)c)
+ return &achars[m];
+ if ((unsigned)c < achars[m].c)
+ h = m;
+ else
+ l = m + 1;
+ }
+ return NULL;
+}
+
+/*
+ * Change shape - from Combination (2 char) to an Isolated
+ */
+ static int
+chg_c_laa2i(int hid_c)
+{
+ int tempc;
+
+ switch (hid_c)
+ {
+ case a_ALEF_MADDA:
+ tempc = a_s_LAM_ALEF_MADDA_ABOVE;
+ break;
+ case a_ALEF_HAMZA_ABOVE:
+ tempc = a_s_LAM_ALEF_HAMZA_ABOVE;
+ break;
+ case a_ALEF_HAMZA_BELOW:
+ tempc = a_s_LAM_ALEF_HAMZA_BELOW;
+ break;
+ case a_ALEF:
+ tempc = a_s_LAM_ALEF;
+ break;
+ default:
+ tempc = 0;
+ }
+
+ return tempc;
+}
+
+/*
+ * Change shape - from Combination-Isolated to Final
+ */
+ static int
+chg_c_laa2f(int hid_c)
+{
+ int tempc;
+
+ switch (hid_c)
+ {
+ case a_ALEF_MADDA:
+ tempc = a_f_LAM_ALEF_MADDA_ABOVE;
+ break;
+ case a_ALEF_HAMZA_ABOVE:
+ tempc = a_f_LAM_ALEF_HAMZA_ABOVE;
+ break;
+ case a_ALEF_HAMZA_BELOW:
+ tempc = a_f_LAM_ALEF_HAMZA_BELOW;
+ break;
+ case a_ALEF:
+ tempc = a_f_LAM_ALEF;
+ break;
+ default:
+ tempc = 0;
+ }
+
+ return tempc;
+}
+
+/*
+ * Returns whether it is possible to join the given letters
+ */
+ static int
+can_join(int c1, int c2)
+{
+ struct achar *a1 = find_achar(c1);
+ struct achar *a2 = find_achar(c2);
+
+ return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial);
+}
+
+/*
+ * Check whether we are dealing with a character that could be regarded as an
+ * Arabic combining character, need to check the character before this.
+ */
+ int
+arabic_maycombine(int two)
+{
+ if (p_arshape && !p_tbidi)
+ return (two == a_ALEF_MADDA
+ || two == a_ALEF_HAMZA_ABOVE
+ || two == a_ALEF_HAMZA_BELOW
+ || two == a_ALEF);
+ return FALSE;
+}
+
+/*
+ * Check whether we are dealing with Arabic combining characters.
+ * Note: these are NOT really composing characters!
+ */
+ int
+arabic_combine(
+ int one, // first character
+ int two) // character just after "one"
+{
+ if (one == a_LAM)
+ return arabic_maycombine(two);
+ return FALSE;
+}
+
+/*
+ * A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character
+ * (alphabet/number/punctuation)
+ */
+ static int
+A_is_iso(int c)
+{
+ return find_achar(c) != NULL;
+}
+
+/*
+ * A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
+ */
+ static int
+A_is_ok(int c)
+{
+ return (A_is_iso(c) || c == a_BYTE_ORDER_MARK);
+}
+
+/*
+ * A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
+ * with some exceptions/exclusions
+ */
+ static int
+A_is_valid(int c)
+{
+ return (A_is_ok(c) && c != a_HAMZA);
+}
+
+/*
+ * Do Arabic shaping on character "c". Returns the shaped character.
+ * out: "ccp" points to the first byte of the character to be shaped.
+ * in/out: "c1p" points to the first composing char for "c".
+ * in: "prev_c" is the previous character (not shaped)
+ * in: "prev_c1" is the first composing char for the previous char
+ * (not shaped)
+ * in: "next_c" is the next character (not shaped).
+ */
+ int
+arabic_shape(
+ int c,
+ int *ccp,
+ int *c1p,
+ int prev_c,
+ int prev_c1,
+ int next_c)
+{
+ int curr_c;
+ int curr_laa;
+ int prev_laa;
+
+ // Deal only with Arabic characters, pass back all others
+ if (!A_is_ok(c))
+ return c;
+
+ curr_laa = arabic_combine(c, *c1p);
+ prev_laa = arabic_combine(prev_c, prev_c1);
+
+ if (curr_laa)
+ {
+ if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa)
+ curr_c = chg_c_laa2f(*c1p);
+ else
+ curr_c = chg_c_laa2i(*c1p);
+
+ // Remove the composing character
+ *c1p = 0;
+ }
+ else
+ {
+ struct achar *curr_a = find_achar(c);
+ int backward_combine = !prev_laa && can_join(prev_c, c);
+ int forward_combine = can_join(c, next_c);
+
+ if (backward_combine)
+ {
+ if (forward_combine)
+ curr_c = curr_a->medial;
+ else
+ curr_c = curr_a->final;
+ }
+ else
+ {
+ if (forward_combine)
+ curr_c = curr_a->initial;
+ else
+ curr_c = curr_a->isolated;
+ }
+ }
+
+ // Character missing from the table means using original character.
+ if (curr_c == NUL)
+ curr_c = c;
+
+ if (curr_c != c && ccp != NULL)
+ {
+ char_u buf[MB_MAXBYTES + 1];
+
+ // Update the first byte of the character.
+ (*mb_char2bytes)(curr_c, buf);
+ *ccp = buf[0];
+ }
+
+ // Return the shaped character
+ return curr_c;
+}
+#endif // FEAT_ARABIC