diff options
Diffstat (limited to 'toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs')
-rw-r--r-- | toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs | 406 |
1 files changed, 406 insertions, 0 deletions
diff --git a/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs b/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs new file mode 100644 index 0000000000..8a1d5ba55e --- /dev/null +++ b/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs @@ -0,0 +1,406 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// FormAutofillNameUtils is initially translated from +// https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817 +export var FormAutofillNameUtils = { + NAME_PREFIXES: [ + "1lt", + "1st", + "2lt", + "2nd", + "3rd", + "admiral", + "capt", + "captain", + "col", + "cpt", + "dr", + "gen", + "general", + "lcdr", + "lt", + "ltc", + "ltg", + "ltjg", + "maj", + "major", + "mg", + "mr", + "mrs", + "ms", + "pastor", + "prof", + "rep", + "reverend", + "rev", + "sen", + "st", + ], + + NAME_SUFFIXES: [ + "b.a", + "ba", + "d.d.s", + "dds", + "i", + "ii", + "iii", + "iv", + "ix", + "jr", + "m.a", + "m.d", + "ma", + "md", + "ms", + "ph.d", + "phd", + "sr", + "v", + "vi", + "vii", + "viii", + "x", + ], + + FAMILY_NAME_PREFIXES: [ + "d'", + "de", + "del", + "der", + "di", + "la", + "le", + "mc", + "san", + "st", + "ter", + "van", + "von", + ], + + // The common and non-ambiguous CJK surnames (last names) that have more than + // one character. + COMMON_CJK_MULTI_CHAR_SURNAMES: [ + // Korean, taken from the list of surnames: + // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D + "남궁", + "사공", + "서문", + "선우", + "제갈", + "황보", + "독고", + "망절", + + // Chinese, taken from the top 10 Chinese 2-character surnames: + // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93 + // Simplified Chinese (mostly mainland China) + "欧阳", + "令狐", + "皇甫", + "上官", + "司徒", + "诸葛", + "司马", + "宇文", + "呼延", + "端木", + // Traditional Chinese (mostly Taiwan) + "張簡", + "歐陽", + "諸葛", + "申屠", + "尉遲", + "司馬", + "軒轅", + "夏侯", + ], + + // All Korean surnames that have more than one character, even the + // rare/ambiguous ones. + KOREAN_MULTI_CHAR_SURNAMES: [ + "강전", + "남궁", + "독고", + "동방", + "망절", + "사공", + "서문", + "선우", + "소봉", + "어금", + "장곡", + "제갈", + "황목", + "황보", + ], + + // The whitespace definition based on + // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817 + WHITESPACE: [ + "\u0009", // CHARACTER TABULATION + "\u000A", // LINE FEED (LF) + "\u000B", // LINE TABULATION + "\u000C", // FORM FEED (FF) + "\u000D", // CARRIAGE RETURN (CR) + "\u0020", // SPACE + "\u0085", // NEXT LINE (NEL) + "\u00A0", // NO-BREAK SPACE + "\u1680", // OGHAM SPACE MARK + "\u2000", // EN QUAD + "\u2001", // EM QUAD + "\u2002", // EN SPACE + "\u2003", // EM SPACE + "\u2004", // THREE-PER-EM SPACE + "\u2005", // FOUR-PER-EM SPACE + "\u2006", // SIX-PER-EM SPACE + "\u2007", // FIGURE SPACE + "\u2008", // PUNCTUATION SPACE + "\u2009", // THIN SPACE + "\u200A", // HAIR SPACE + "\u2028", // LINE SEPARATOR + "\u2029", // PARAGRAPH SEPARATOR + "\u202F", // NARROW NO-BREAK SPACE + "\u205F", // MEDIUM MATHEMATICAL SPACE + "\u3000", // IDEOGRAPHIC SPACE + ], + + // The middle dot is used as a separator for foreign names in Japanese. + MIDDLE_DOT: [ + "\u30FB", // KATAKANA MIDDLE DOT + "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT" + ], + + // The Unicode range is based on Wiki: + // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs + // https://en.wikipedia.org/wiki/Hangul + // https://en.wikipedia.org/wiki/Japanese_writing_system + CJK_RANGE: [ + "\u1100-\u11FF", // Hangul Jamo + "\u3040-\u309F", // Hiragana + "\u30A0-\u30FF", // Katakana + "\u3105-\u312C", // Bopomofo + "\u3130-\u318F", // Hangul Compatibility Jamo + "\u31F0-\u31FF", // Katakana Phonetic Extensions + "\u3200-\u32FF", // Enclosed CJK Letters and Months + "\u3400-\u4DBF", // CJK unified ideographs Extension A + "\u4E00-\u9FFF", // CJK Unified Ideographs + "\uA960-\uA97F", // Hangul Jamo Extended-A + "\uAC00-\uD7AF", // Hangul Syllables + "\uD7B0-\uD7FF", // Hangul Jamo Extended-B + "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms + ], + + HANGUL_RANGE: [ + "\u1100-\u11FF", // Hangul Jamo + "\u3130-\u318F", // Hangul Compatibility Jamo + "\uA960-\uA97F", // Hangul Jamo Extended-A + "\uAC00-\uD7AF", // Hangul Syllables + "\uD7B0-\uD7FF", // Hangul Jamo Extended-B + ], + + _dataLoaded: false, + + // Returns true if |set| contains |token|, modulo a final period. + _containsString(set, token) { + let target = token.replace(/\.$/, "").toLowerCase(); + return set.includes(target); + }, + + // Removes common name prefixes from |name_tokens|. + _stripPrefixes(nameTokens) { + for (let i in nameTokens) { + if (!this._containsString(this.NAME_PREFIXES, nameTokens[i])) { + return nameTokens.slice(i); + } + } + return []; + }, + + // Removes common name suffixes from |name_tokens|. + _stripSuffixes(nameTokens) { + for (let i = nameTokens.length - 1; i >= 0; i--) { + if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) { + return nameTokens.slice(0, i + 1); + } + } + return []; + }, + + _isCJKName(name) { + // The name is considered to be a CJK name if it is only CJK characters, + // spaces, and "middle dot" separators, with at least one CJK character, and + // no more than 2 words. + // + // Chinese and Japanese names are usually spelled out using the Han + // characters (logographs), which constitute the "CJK Unified Ideographs" + // block in Unicode, also referred to as Unihan. Korean names are usually + // spelled out in the Korean alphabet (Hangul), although they do have a Han + // equivalent as well. + + if (!name) { + return false; + } + + let previousWasCJK = false; + let wordCount = 0; + + for (let c of name) { + let isMiddleDot = this.MIDDLE_DOT.includes(c); + let isCJK = !isMiddleDot && this.reCJK.test(c); + if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) { + return false; + } + if (isCJK && !previousWasCJK) { + wordCount++; + } + previousWasCJK = isCJK; + } + + return wordCount > 0 && wordCount < 3; + }, + + // Tries to split a Chinese, Japanese, or Korean name into its given name & + // surname parts. If splitting did not work for whatever reason, returns null. + _splitCJKName(nameTokens) { + // The convention for CJK languages is to put the surname (last name) first, + // and the given name (first name) second. In a continuous text, there is + // normally no space between the two parts of the name. When entering their + // name into a field, though, some people add a space to disambiguate. CJK + // names (almost) never have a middle name. + + let reHangulName = new RegExp( + "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$", + "u" + ); + let nameParts = { + given: "", + middle: "", + family: "", + }; + + if (nameTokens.length == 1) { + // There is no space between the surname and given name. Try to infer + // where to separate between the two. Most Chinese and Korean surnames + // have only one character, but there are a few that have 2. If the name + // does not start with a surname from a known list, default to one + // character. + let name = nameTokens[0]; + let isKorean = reHangulName.test(name); + let surnameLength = 0; + + // 4-character Korean names are more likely to be 2/2 than 1/3, so use + // the full list of Korean 2-char surnames. (instead of only the common + // ones) + let multiCharSurnames = + isKorean && name.length > 3 + ? this.KOREAN_MULTI_CHAR_SURNAMES + : this.COMMON_CJK_MULTI_CHAR_SURNAMES; + + // Default to 1 character if the surname is not in the list. + surnameLength = multiCharSurnames.some(surname => + name.startsWith(surname) + ) + ? 2 + : 1; + + nameParts.family = name.substr(0, surnameLength); + nameParts.given = name.substr(surnameLength); + } else if (nameTokens.length == 2) { + // The user entered a space between the two name parts. This makes our job + // easier. Family name first, given name second. + nameParts.family = nameTokens[0]; + nameParts.given = nameTokens[1]; + } else { + return null; + } + + return nameParts; + }, + + init() { + if (this._dataLoaded) { + return; + } + this._dataLoaded = true; + + this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u"); + }, + + splitName(name) { + let nameParts = { + given: "", + middle: "", + family: "", + }; + + if (!name) { + return nameParts; + } + + let nameTokens = name.trim().split(/[ ,\u3000\u30FB\u00B7]+/); + nameTokens = this._stripPrefixes(nameTokens); + + if (this._isCJKName(name)) { + let parts = this._splitCJKName(nameTokens); + if (parts) { + return parts; + } + } + + // Don't assume "Ma" is a suffix in John Ma. + if (nameTokens.length > 2) { + nameTokens = this._stripSuffixes(nameTokens); + } + + if (!nameTokens.length) { + // Bad things have happened; just assume the whole thing is a given name. + nameParts.given = name; + return nameParts; + } + + // Only one token, assume given name. + if (nameTokens.length == 1) { + nameParts.given = nameTokens[0]; + return nameParts; + } + + // 2 or more tokens. Grab the family, which is the last word plus any + // recognizable family prefixes. + let familyTokens = [nameTokens.pop()]; + while (nameTokens.length) { + let lastToken = nameTokens[nameTokens.length - 1]; + if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) { + break; + } + familyTokens.unshift(lastToken); + nameTokens.pop(); + } + nameParts.family = familyTokens.join(" "); + + // Take the last remaining token as the middle name (if there are at least 2 + // tokens). + if (nameTokens.length >= 2) { + nameParts.middle = nameTokens.pop(); + } + + // Remainder is given name. + nameParts.given = nameTokens.join(" "); + + return nameParts; + }, + + joinNameParts({ given, middle, family }) { + if (this._isCJKName(given) && this._isCJKName(family) && !middle) { + return family + given; + } + return [given, middle, family] + .filter(part => part && part.length) + .join(" "); + }, +}; + +FormAutofillNameUtils.init(); |