diff options
Diffstat (limited to '')
-rw-r--r-- | gfx/harfbuzz/src/hb-ot-tag.cc | 658 |
1 files changed, 658 insertions, 0 deletions
diff --git a/gfx/harfbuzz/src/hb-ot-tag.cc b/gfx/harfbuzz/src/hb-ot-tag.cc new file mode 100644 index 0000000000..547f9573d0 --- /dev/null +++ b/gfx/harfbuzz/src/hb-ot-tag.cc @@ -0,0 +1,658 @@ +/* + * Copyright © 2009 Red Hat, Inc. + * Copyright © 2011 Google, Inc. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. + * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + * + * Red Hat Author(s): Behdad Esfahbod + * Google Author(s): Behdad Esfahbod, Roozbeh Pournader + */ + +#include "hb.hh" + +#ifndef HB_NO_OT_TAG + + +/* hb_script_t */ + +static hb_tag_t +hb_ot_old_tag_from_script (hb_script_t script) +{ + /* This seems to be accurate as of end of 2012. */ + + switch ((hb_tag_t) script) + { + case HB_SCRIPT_INVALID: return HB_OT_TAG_DEFAULT_SCRIPT; + case HB_SCRIPT_MATH: return HB_OT_TAG_MATH_SCRIPT; + + /* KATAKANA and HIRAGANA both map to 'kana' */ + case HB_SCRIPT_HIRAGANA: return HB_TAG('k','a','n','a'); + + /* Spaces at the end are preserved, unlike ISO 15924 */ + case HB_SCRIPT_LAO: return HB_TAG('l','a','o',' '); + case HB_SCRIPT_YI: return HB_TAG('y','i',' ',' '); + /* Unicode-5.0 additions */ + case HB_SCRIPT_NKO: return HB_TAG('n','k','o',' '); + /* Unicode-5.1 additions */ + case HB_SCRIPT_VAI: return HB_TAG('v','a','i',' '); + } + + /* Else, just change first char to lowercase and return */ + return ((hb_tag_t) script) | 0x20000000u; +} + +static hb_script_t +hb_ot_old_tag_to_script (hb_tag_t tag) +{ + if (unlikely (tag == HB_OT_TAG_DEFAULT_SCRIPT)) + return HB_SCRIPT_INVALID; + if (unlikely (tag == HB_OT_TAG_MATH_SCRIPT)) + return HB_SCRIPT_MATH; + + /* This side of the conversion is fully algorithmic. */ + + /* Any spaces at the end of the tag are replaced by repeating the last + * letter. Eg 'nko ' -> 'Nkoo' */ + if (unlikely ((tag & 0x0000FF00u) == 0x00002000u)) + tag |= (tag >> 8) & 0x0000FF00u; /* Copy second letter to third */ + if (unlikely ((tag & 0x000000FFu) == 0x00000020u)) + tag |= (tag >> 8) & 0x000000FFu; /* Copy third letter to fourth */ + + /* Change first char to uppercase and return */ + return (hb_script_t) (tag & ~0x20000000u); +} + +static hb_tag_t +hb_ot_new_tag_from_script (hb_script_t script) +{ + switch ((hb_tag_t) script) { + case HB_SCRIPT_BENGALI: return HB_TAG('b','n','g','2'); + case HB_SCRIPT_DEVANAGARI: return HB_TAG('d','e','v','2'); + case HB_SCRIPT_GUJARATI: return HB_TAG('g','j','r','2'); + case HB_SCRIPT_GURMUKHI: return HB_TAG('g','u','r','2'); + case HB_SCRIPT_KANNADA: return HB_TAG('k','n','d','2'); + case HB_SCRIPT_MALAYALAM: return HB_TAG('m','l','m','2'); + case HB_SCRIPT_ORIYA: return HB_TAG('o','r','y','2'); + case HB_SCRIPT_TAMIL: return HB_TAG('t','m','l','2'); + case HB_SCRIPT_TELUGU: return HB_TAG('t','e','l','2'); + case HB_SCRIPT_MYANMAR: return HB_TAG('m','y','m','2'); + } + + return HB_OT_TAG_DEFAULT_SCRIPT; +} + +static hb_script_t +hb_ot_new_tag_to_script (hb_tag_t tag) +{ + switch (tag) { + case HB_TAG('b','n','g','2'): return HB_SCRIPT_BENGALI; + case HB_TAG('d','e','v','2'): return HB_SCRIPT_DEVANAGARI; + case HB_TAG('g','j','r','2'): return HB_SCRIPT_GUJARATI; + case HB_TAG('g','u','r','2'): return HB_SCRIPT_GURMUKHI; + case HB_TAG('k','n','d','2'): return HB_SCRIPT_KANNADA; + case HB_TAG('m','l','m','2'): return HB_SCRIPT_MALAYALAM; + case HB_TAG('o','r','y','2'): return HB_SCRIPT_ORIYA; + case HB_TAG('t','m','l','2'): return HB_SCRIPT_TAMIL; + case HB_TAG('t','e','l','2'): return HB_SCRIPT_TELUGU; + case HB_TAG('m','y','m','2'): return HB_SCRIPT_MYANMAR; + } + + return HB_SCRIPT_UNKNOWN; +} + +#ifndef HB_DISABLE_DEPRECATED +/** + * hb_ot_tags_from_script: + * @script: an #hb_script_t to convert. + * @script_tag_1: (out): output #hb_tag_t. + * @script_tag_2: (out): output #hb_tag_t. + * + * Converts an #hb_script_t to script tags. + * + * Since: 0.6.0 + * Deprecated: 2.0.0: use hb_ot_tags_from_script_and_language() instead + **/ +void +hb_ot_tags_from_script (hb_script_t script, + hb_tag_t *script_tag_1, + hb_tag_t *script_tag_2) +{ + unsigned int count = 2; + hb_tag_t tags[2]; + hb_ot_tags_from_script_and_language (script, HB_LANGUAGE_INVALID, &count, tags, nullptr, nullptr); + *script_tag_1 = count > 0 ? tags[0] : HB_OT_TAG_DEFAULT_SCRIPT; + *script_tag_2 = count > 1 ? tags[1] : HB_OT_TAG_DEFAULT_SCRIPT; +} +#endif + +/* + * Complete list at: + * https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags + * + * Most of the script tags are the same as the ISO 15924 tag but lowercased. + * So we just do that, and handle the exceptional cases in a switch. + */ + +static void +hb_ot_all_tags_from_script (hb_script_t script, + unsigned int *count /* IN/OUT */, + hb_tag_t *tags /* OUT */) +{ + unsigned int i = 0; + + hb_tag_t new_tag = hb_ot_new_tag_from_script (script); + if (unlikely (new_tag != HB_OT_TAG_DEFAULT_SCRIPT)) + { + /* HB_SCRIPT_MYANMAR maps to 'mym2', but there is no 'mym3'. */ + if (new_tag != HB_TAG('m','y','m','2')) + tags[i++] = new_tag | '3'; + if (*count > i) + tags[i++] = new_tag; + } + + if (*count > i) + { + hb_tag_t old_tag = hb_ot_old_tag_from_script (script); + if (old_tag != HB_OT_TAG_DEFAULT_SCRIPT) + tags[i++] = old_tag; + } + + *count = i; +} + +/** + * hb_ot_tag_to_script: + * @tag: a script tag + * + * Converts a script tag to an #hb_script_t. + * + * Return value: The #hb_script_t corresponding to @tag. + * + **/ +hb_script_t +hb_ot_tag_to_script (hb_tag_t tag) +{ + unsigned char digit = tag & 0x000000FFu; + if (unlikely (digit == '2' || digit == '3')) + return hb_ot_new_tag_to_script (tag & 0xFFFFFF32); + + return hb_ot_old_tag_to_script (tag); +} + + +/* hb_language_t */ + +static inline bool +subtag_matches (const char *lang_str, + const char *limit, + const char *subtag, + unsigned subtag_len) +{ + if (likely ((unsigned) (limit - lang_str) < subtag_len)) + return false; + + do { + const char *s = strstr (lang_str, subtag); + if (!s || s >= limit) + return false; + if (!ISALNUM (s[subtag_len])) + return true; + lang_str = s + subtag_len; + } while (true); +} + +static bool +lang_matches (const char *lang_str, + const char *limit, + const char *spec, + unsigned spec_len) +{ + /* Same as hb_language_matches(); duplicated. */ + + if (likely ((unsigned) (limit - lang_str) < spec_len)) + return false; + + return strncmp (lang_str, spec, spec_len) == 0 && + (lang_str[spec_len] == '\0' || lang_str[spec_len] == '-'); +} + +struct LangTag +{ + hb_tag_t language; + hb_tag_t tag; + + int cmp (hb_tag_t a) const + { + return a < this->language ? -1 : a > this->language ? +1 : 0; + } + int cmp (const LangTag *that) const + { return cmp (that->language); } +}; + +#include "hb-ot-tag-table.hh" + +/* The corresponding languages IDs for the following IDs are unclear, + * overlap, or are architecturally weird. Needs more research. */ + +/*{"??", {HB_TAG('B','C','R',' ')}},*/ /* Bible Cree */ +/*{"zh?", {HB_TAG('C','H','N',' ')}},*/ /* Chinese (seen in Microsoft fonts) */ +/*{"ar-Syrc?", {HB_TAG('G','A','R',' ')}},*/ /* Garshuni */ +/*{"??", {HB_TAG('N','G','R',' ')}},*/ /* Nagari */ +/*{"??", {HB_TAG('Y','I','C',' ')}},*/ /* Yi Classic */ +/*{"zh?", {HB_TAG('Z','H','P',' ')}},*/ /* Chinese Phonetic */ + +#ifndef HB_DISABLE_DEPRECATED +/** + * hb_ot_tag_from_language: + * @language: an #hb_language_t to convert. + * + * Converts an #hb_language_t to an #hb_tag_t. + * + * Since: 0.6.0 + * Deprecated: 2.0.0: use hb_ot_tags_from_script_and_language() instead + **/ +hb_tag_t +hb_ot_tag_from_language (hb_language_t language) +{ + unsigned int count = 1; + hb_tag_t tags[1]; + hb_ot_tags_from_script_and_language (HB_SCRIPT_UNKNOWN, language, nullptr, nullptr, &count, tags); + return count > 0 ? tags[0] : HB_OT_TAG_DEFAULT_LANGUAGE; +} +#endif + +static void +hb_ot_tags_from_language (const char *lang_str, + const char *limit, + unsigned int *count, + hb_tag_t *tags) +{ + +#ifndef HB_NO_LANGUAGE_LONG + /* Check for matches of multiple subtags. */ + if (hb_ot_tags_from_complex_language (lang_str, limit, count, tags)) + return; +#endif + + /* Find a language matching in the first component. */ +#ifndef HB_NO_LANGUAGE_LONG + const char *s; s = strchr (lang_str, '-'); +#endif + { +#ifndef HB_NO_LANGUAGE_LONG + if (s && limit - lang_str >= 6) + { + const char *extlang_end = strchr (s + 1, '-'); + /* If there is an extended language tag, use it. */ + if (3 == (extlang_end ? extlang_end - s - 1 : strlen (s + 1)) && + ISALPHA (s[1])) + lang_str = s + 1; + } +#endif + const LangTag *ot_languages = nullptr; + unsigned ot_languages_len = 0; + const char *dash = strchr (lang_str, '-'); + unsigned first_len = dash ? dash - lang_str : limit - lang_str; + if (first_len == 2) + { + ot_languages = ot_languages2; + ot_languages_len = ARRAY_LENGTH (ot_languages2); + } +#ifndef HB_NO_LANGUAGE_LONG + else if (first_len == 3) + { + ot_languages = ot_languages3; + ot_languages_len = ARRAY_LENGTH (ot_languages3); + } +#endif + + hb_tag_t lang_tag = hb_tag_from_string (lang_str, first_len); + + static hb_atomic_int_t last_tag_idx; /* Poor man's cache. */ + unsigned tag_idx = last_tag_idx; + + if (likely (tag_idx < ot_languages_len && ot_languages[tag_idx].language == lang_tag) || + hb_sorted_array (ot_languages, ot_languages_len).bfind (lang_tag, &tag_idx)) + { + last_tag_idx = tag_idx; + unsigned int i; + while (tag_idx != 0 && + ot_languages[tag_idx].language == ot_languages[tag_idx - 1].language) + tag_idx--; + for (i = 0; + i < *count && + tag_idx + i < ot_languages_len && + ot_languages[tag_idx + i].tag != HB_TAG_NONE && + ot_languages[tag_idx + i].language == ot_languages[tag_idx].language; + i++) + tags[i] = ot_languages[tag_idx + i].tag; + *count = i; + return; + } + } + +#ifndef HB_NO_LANGUAGE_LONG + if (!s) + s = lang_str + strlen (lang_str); + if (s - lang_str == 3) { + /* Assume it's ISO-639-3 and upper-case and use it. */ + tags[0] = hb_tag_from_string (lang_str, s - lang_str) & ~0x20202000u; + *count = 1; + return; + } +#endif + + *count = 0; +} + +static bool +parse_private_use_subtag (const char *private_use_subtag, + unsigned int *count, + hb_tag_t *tags, + const char *prefix, + unsigned char (*normalize) (unsigned char)) +{ +#ifdef HB_NO_LANGUAGE_PRIVATE_SUBTAG + return false; +#endif + + if (!(private_use_subtag && count && tags && *count)) return false; + + const char *s = strstr (private_use_subtag, prefix); + if (!s) return false; + + char tag[4]; + int i; + s += strlen (prefix); + if (s[0] == '-') { + s += 1; + char c; + for (i = 0; i < 8 && ISHEX (s[i]); i++) + { + c = FROMHEX (s[i]); + if (i % 2 == 0) + tag[i / 2] = c << 4; + else + tag[i / 2] += c; + } + if (i != 8) return false; + } else { + for (i = 0; i < 4 && ISALNUM (s[i]); i++) + tag[i] = normalize (s[i]); + if (!i) return false; + + for (; i < 4; i++) + tag[i] = ' '; + } + tags[0] = HB_TAG (tag[0], tag[1], tag[2], tag[3]); + if ((tags[0] & 0xDFDFDFDF) == HB_OT_TAG_DEFAULT_SCRIPT) + tags[0] ^= ~0xDFDFDFDF; + *count = 1; + return true; +} + +/** + * hb_ot_tags_from_script_and_language: + * @script: an #hb_script_t to convert. + * @language: an #hb_language_t to convert. + * @script_count: (inout) (optional): maximum number of script tags to retrieve (IN) + * and actual number of script tags retrieved (OUT) + * @script_tags: (out) (optional): array of size at least @script_count to store the + * script tag results + * @language_count: (inout) (optional): maximum number of language tags to retrieve + * (IN) and actual number of language tags retrieved (OUT) + * @language_tags: (out) (optional): array of size at least @language_count to store + * the language tag results + * + * Converts an #hb_script_t and an #hb_language_t to script and language tags. + * + * Since: 2.0.0 + **/ +void +hb_ot_tags_from_script_and_language (hb_script_t script, + hb_language_t language, + unsigned int *script_count /* IN/OUT */, + hb_tag_t *script_tags /* OUT */, + unsigned int *language_count /* IN/OUT */, + hb_tag_t *language_tags /* OUT */) +{ + bool needs_script = true; + + if (language == HB_LANGUAGE_INVALID) + { + if (language_count && language_tags && *language_count) + *language_count = 0; + } + else + { + const char *lang_str, *s, *limit, *private_use_subtag; + bool needs_language; + + lang_str = hb_language_to_string (language); + limit = nullptr; + private_use_subtag = nullptr; + if (lang_str[0] == 'x' && lang_str[1] == '-') + { + private_use_subtag = lang_str; + } else { + for (s = lang_str + 1; *s; s++) + { + if (s[-1] == '-' && s[1] == '-') + { + if (s[0] == 'x') + { + private_use_subtag = s; + if (!limit) + limit = s - 1; + break; + } else if (!limit) + { + limit = s - 1; + } + } + } + if (!limit) + limit = s; + } + + needs_script = !parse_private_use_subtag (private_use_subtag, script_count, script_tags, "-hbsc", TOLOWER); + needs_language = !parse_private_use_subtag (private_use_subtag, language_count, language_tags, "-hbot", TOUPPER); + + if (needs_language && language_count && language_tags && *language_count) + hb_ot_tags_from_language (lang_str, limit, language_count, language_tags); + } + + if (needs_script && script_count && script_tags && *script_count) + hb_ot_all_tags_from_script (script, script_count, script_tags); +} + +/** + * hb_ot_tag_to_language: + * @tag: an language tag + * + * Converts a language tag to an #hb_language_t. + * + * Return value: (transfer none) (nullable): + * The #hb_language_t corresponding to @tag. + * + * Since: 0.9.2 + **/ +hb_language_t +hb_ot_tag_to_language (hb_tag_t tag) +{ + unsigned int i; + + if (tag == HB_OT_TAG_DEFAULT_LANGUAGE) + return nullptr; + +#ifndef HB_NO_LANGUAGE_LONG + { + hb_language_t disambiguated_tag = hb_ot_ambiguous_tag_to_language (tag); + if (disambiguated_tag != HB_LANGUAGE_INVALID) + return disambiguated_tag; + } +#endif + + char buf[4]; + for (i = 0; i < ARRAY_LENGTH (ot_languages2); i++) + if (ot_languages2[i].tag == tag) + { + hb_tag_to_string (ot_languages2[i].language, buf); + return hb_language_from_string (buf, 2); + } +#ifndef HB_NO_LANGUAGE_LONG + for (i = 0; i < ARRAY_LENGTH (ot_languages3); i++) + if (ot_languages3[i].tag == tag) + { + hb_tag_to_string (ot_languages3[i].language, buf); + return hb_language_from_string (buf, 3); + } +#endif + + /* Return a custom language in the form of "x-hbot-AABBCCDD". + * If it's three letters long, also guess it's ISO 639-3 and lower-case and + * prepend it (if it's not a registered tag, the private use subtags will + * ensure that calling hb_ot_tag_from_language on the result will still return + * the same tag as the original tag). + */ + { + char buf[20]; + char *str = buf; + if (ISALPHA (tag >> 24) + && ISALPHA ((tag >> 16) & 0xFF) + && ISALPHA ((tag >> 8) & 0xFF) + && (tag & 0xFF) == ' ') + { + buf[0] = TOLOWER (tag >> 24); + buf[1] = TOLOWER ((tag >> 16) & 0xFF); + buf[2] = TOLOWER ((tag >> 8) & 0xFF); + buf[3] = '-'; + str += 4; + } + snprintf (str, 16, "x-hbot-%08x", tag); + return hb_language_from_string (&*buf, -1); + } +} + +/** + * hb_ot_tags_to_script_and_language: + * @script_tag: a script tag + * @language_tag: a language tag + * @script: (out) (optional): the #hb_script_t corresponding to @script_tag. + * @language: (out) (optional): the #hb_language_t corresponding to @script_tag and + * @language_tag. + * + * Converts a script tag and a language tag to an #hb_script_t and an + * #hb_language_t. + * + * Since: 2.0.0 + **/ +void +hb_ot_tags_to_script_and_language (hb_tag_t script_tag, + hb_tag_t language_tag, + hb_script_t *script /* OUT */, + hb_language_t *language /* OUT */) +{ + hb_script_t script_out = hb_ot_tag_to_script (script_tag); + if (script) + *script = script_out; + if (language) + { + unsigned int script_count = 1; + hb_tag_t primary_script_tag[1]; + hb_ot_tags_from_script_and_language (script_out, + HB_LANGUAGE_INVALID, + &script_count, + primary_script_tag, + nullptr, nullptr); + *language = hb_ot_tag_to_language (language_tag); + if (script_count == 0 || primary_script_tag[0] != script_tag) + { + unsigned char *buf; + const char *lang_str = hb_language_to_string (*language); + size_t len = strlen (lang_str); + buf = (unsigned char *) hb_malloc (len + 16); + if (unlikely (!buf)) + { + *language = nullptr; + } + else + { + int shift; + hb_memcpy (buf, lang_str, len); + if (lang_str[0] != 'x' || lang_str[1] != '-') { + buf[len++] = '-'; + buf[len++] = 'x'; + } + buf[len++] = '-'; + buf[len++] = 'h'; + buf[len++] = 'b'; + buf[len++] = 's'; + buf[len++] = 'c'; + buf[len++] = '-'; + for (shift = 28; shift >= 0; shift -= 4) + buf[len++] = TOHEX (script_tag >> shift); + *language = hb_language_from_string ((char *) buf, len); + hb_free (buf); + } + } + } +} + +#ifdef MAIN +static inline void +test_langs_sorted () +{ + for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages2); i++) + { + int c = ot_languages2[i].cmp (&ot_languages2[i - 1]); + if (c > 0) + { + fprintf (stderr, "ot_languages2 not sorted at index %u: %08x %d %08x\n", + i, ot_languages2[i-1].language, c, ot_languages2[i].language); + abort(); + } + } +#ifndef HB_NO_LANGUAGE_LONG + for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages3); i++) + { + int c = ot_languages3[i].cmp (&ot_languages3[i - 1]); + if (c > 0) + { + fprintf (stderr, "ot_languages3 not sorted at index %u: %08x %d %08x\n", + i, ot_languages3[i-1].language, c, ot_languages3[i].language); + abort(); + } + } +#endif +} + +int +main () +{ + test_langs_sorted (); + return 0; +} + +#endif + + +#endif |