summary refs log tree commit diff stats
path: root/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs
diff options
context:
space:
mode:
Diffstat (limited to 'toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs')
-rw-r--r-- toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs 406
1 files changed, 406 insertions, 0 deletions
diff --git a/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs b/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs
new file mode 100644
index 0000000000..8a1d5ba55e
--- /dev/null
+++ b/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs
@@ -0,0 +1,406 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// FormAutofillNameUtils is initially translated from
+// https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
+export var FormAutofillNameUtils = {
+ // Common name prefixes (titles such as "dr" or "mrs") that _stripPrefixes()
+ // removes from the front of a tokenized name. Entries are lowercase and
+ // have no trailing period because _containsString() lowercases the token
+ // and drops one final "." before the lookup.
+ // Note: "ms" also appears in NAME_SUFFIXES and "st" also appears in
+ // FAMILY_NAME_PREFIXES.
+ NAME_PREFIXES: [
+ "1lt",
+ "1st",
+ "2lt",
+ "2nd",
+ "3rd",
+ "admiral",
+ "capt",
+ "captain",
+ "col",
+ "cpt",
+ "dr",
+ "gen",
+ "general",
+ "lcdr",
+ "lt",
+ "ltc",
+ "ltg",
+ "ltjg",
+ "maj",
+ "major",
+ "mg",
+ "mr",
+ "mrs",
+ "ms",
+ "pastor",
+ "prof",
+ "rep",
+ "reverend",
+ "rev",
+ "sen",
+ "st",
+ ],
+
+ // Common name suffixes that _stripSuffixes() removes from the end of a
+ // tokenized name. Dotted forms such as "ph.d" omit their last period
+ // because _containsString() strips one final "." from the token before
+ // comparing. splitName() only strips suffixes when more than two tokens
+ // remain, so a two-word name ending in e.g. "Ma" keeps that token.
+ NAME_SUFFIXES: [
+ "b.a",
+ "ba",
+ "d.d.s",
+ "dds",
+ "i",
+ "ii",
+ "iii",
+ "iv",
+ "ix",
+ "jr",
+ "m.a",
+ "m.d",
+ "ma",
+ "md",
+ "ms",
+ "ph.d",
+ "phd",
+ "sr",
+ "v",
+ "vi",
+ "vii",
+ "viii",
+ "x",
+ ],
+
+ // Tokens that splitName() folds into the family name when they directly
+ // precede it (e.g. "van", "de"). Matching is done per whole token through
+ // _containsString(), so "d'" only matches when it is a token of its own.
+ FAMILY_NAME_PREFIXES: [
+ "d'",
+ "de",
+ "del",
+ "der",
+ "di",
+ "la",
+ "le",
+ "mc",
+ "san",
+ "st",
+ "ter",
+ "van",
+ "von",
+ ],
+
+ // The common and non-ambiguous CJK surnames (last names) that have more than
+ // one character.
+ // _splitCJKName() checks whether an unspaced name starts with one of these
+ // to choose between a 1- and a 2-character surname. It always takes exactly
+ // 2 characters on a match, so every entry here must be 2 characters long.
+ COMMON_CJK_MULTI_CHAR_SURNAMES: [
+ // Korean, taken from the list of surnames:
+ // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
+ "남궁",
+ "사공",
+ "서문",
+ "선우",
+ "제갈",
+ "황보",
+ "독고",
+ "망절",
+
+ // Chinese, taken from the top 10 Chinese 2-character surnames:
+ // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
+ // Simplified Chinese (mostly mainland China)
+ "欧阳",
+ "令狐",
+ "皇甫",
+ "上官",
+ "司徒",
+ "诸葛",
+ "司马",
+ "宇文",
+ "呼延",
+ "端木",
+ // Traditional Chinese (mostly Taiwan)
+ "張簡",
+ "歐陽",
+ "諸葛",
+ "申屠",
+ "尉遲",
+ "司馬",
+ "軒轅",
+ "夏侯",
+ ],
+
+ // All Korean surnames that have more than one character, even the
+ // rare/ambiguous ones.
+ // _splitCJKName() uses this list instead of the common one only when the
+ // unspaced name consists solely of Hangul (plus whitespace) and is longer
+ // than 3 characters.
+ KOREAN_MULTI_CHAR_SURNAMES: [
+ "강전",
+ "남궁",
+ "독고",
+ "동방",
+ "망절",
+ "사공",
+ "서문",
+ "선우",
+ "소봉",
+ "어금",
+ "장곡",
+ "제갈",
+ "황목",
+ "황보",
+ ],
+
+ // The whitespace definition based on
+ // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
+ // Used in two ways: _isCJKName() tests single characters with includes(),
+ // and _splitCJKName() joins the entries into a regex character class.
+ WHITESPACE: [
+ "\u0009", // CHARACTER TABULATION
+ "\u000A", // LINE FEED (LF)
+ "\u000B", // LINE TABULATION
+ "\u000C", // FORM FEED (FF)
+ "\u000D", // CARRIAGE RETURN (CR)
+ "\u0020", // SPACE
+ "\u0085", // NEXT LINE (NEL)
+ "\u00A0", // NO-BREAK SPACE
+ "\u1680", // OGHAM SPACE MARK
+ "\u2000", // EN QUAD
+ "\u2001", // EM QUAD
+ "\u2002", // EN SPACE
+ "\u2003", // EM SPACE
+ "\u2004", // THREE-PER-EM SPACE
+ "\u2005", // FOUR-PER-EM SPACE
+ "\u2006", // SIX-PER-EM SPACE
+ "\u2007", // FIGURE SPACE
+ "\u2008", // PUNCTUATION SPACE
+ "\u2009", // THIN SPACE
+ "\u200A", // HAIR SPACE
+ "\u2028", // LINE SEPARATOR
+ "\u2029", // PARAGRAPH SEPARATOR
+ "\u202F", // NARROW NO-BREAK SPACE
+ "\u205F", // MEDIUM MATHEMATICAL SPACE
+ "\u3000", // IDEOGRAPHIC SPACE
+ ],
+
+ // The middle dot is used as a separator for foreign names in Japanese.
+ // _isCJKName() checks this list before the CJK regex and treats a match as
+ // a separator, even though U+30FB falls inside the Katakana range listed in
+ // CJK_RANGE.
+ MIDDLE_DOT: [
+ "\u30FB", // KATAKANA MIDDLE DOT
+ "\u00B7", // MIDDLE DOT, a (common?) typo for "KATAKANA MIDDLE DOT"
+ ],
+
+ // The Unicode range is based on Wiki:
+ // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
+ // https://en.wikipedia.org/wiki/Hangul
+ // https://en.wikipedia.org/wiki/Japanese_writing_system
+ // Each entry is a "first-last" pair of characters. init() concatenates all
+ // entries inside "[...]" to build the single-character regex this.reCJK.
+ CJK_RANGE: [
+ "\u1100-\u11FF", // Hangul Jamo
+ "\u3040-\u309F", // Hiragana
+ "\u30A0-\u30FF", // Katakana
+ "\u3105-\u312C", // Bopomofo
+ "\u3130-\u318F", // Hangul Compatibility Jamo
+ "\u31F0-\u31FF", // Katakana Phonetic Extensions
+ "\u3200-\u32FF", // Enclosed CJK Letters and Months
+ "\u3400-\u4DBF", // CJK unified ideographs Extension A
+ "\u4E00-\u9FFF", // CJK Unified Ideographs
+ "\uA960-\uA97F", // Hangul Jamo Extended-A
+ "\uAC00-\uD7AF", // Hangul Syllables
+ "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
+ "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
+ ],
+
+ // The Hangul-only subset of CJK_RANGE, in the same "first-last" format.
+ // _splitCJKName() joins these (together with WHITESPACE) into a character
+ // class to decide whether a whole name is written in Hangul.
+ HANGUL_RANGE: [
+ "\u1100-\u11FF", // Hangul Jamo
+ "\u3130-\u318F", // Hangul Compatibility Jamo
+ "\uA960-\uA97F", // Hangul Jamo Extended-A
+ "\uAC00-\uD7AF", // Hangul Syllables
+ "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
+ ],
+
+ // Set to true by init() on its first run so that later calls return early
+ // instead of rebuilding this.reCJK.
+ _dataLoaded: false,
+
+ // Tells whether |token| is listed in |set|. The comparison ignores case and
+ // a single period at the very end of |token| (so "Jr." matches "jr").
+ _containsString(set, token) {
+   const withoutFinalDot = token.endsWith(".") ? token.slice(0, -1) : token;
+   return set.includes(withoutFinalDot.toLowerCase());
+ },
+
+ // Removes common name prefixes from |nameTokens|.
+ //
+ // Returns a new array that starts at the first token which is not listed in
+ // NAME_PREFIXES; |nameTokens| itself is left untouched. If every token is a
+ // prefix (or the array is empty) the result is an empty array.
+ _stripPrefixes(nameTokens) {
+   // findIndex() walks the array by numeric index. The previous for...in loop
+   // produced string keys and would also have visited any extra enumerable
+   // properties on the array.
+   const firstKept = nameTokens.findIndex(
+     token => !this._containsString(this.NAME_PREFIXES, token)
+   );
+   return firstKept == -1 ? [] : nameTokens.slice(firstKept);
+ },
+
+ // Drops trailing tokens of |nameTokens| for as long as they are listed in
+ // NAME_SUFFIXES and returns the remaining leading tokens as a new array
+ // (empty when every token is a suffix).
+ _stripSuffixes(nameTokens) {
+   let keepCount = nameTokens.length;
+   while (
+     keepCount > 0 &&
+     this._containsString(this.NAME_SUFFIXES, nameTokens[keepCount - 1])
+   ) {
+     keepCount--;
+   }
+   return nameTokens.slice(0, keepCount);
+ },
+
+ // Decides whether |name| looks like a CJK name: it may contain only CJK
+ // characters, whitespace and "middle dot" separators, and must consist of
+ // one or two runs ("words") of consecutive CJK characters.
+ //
+ // Chinese and Japanese names are usually written with Han characters (the
+ // "CJK Unified Ideographs" block, a.k.a. Unihan). Korean names are usually
+ // written in Hangul, although they have a Han equivalent as well.
+ //
+ // Relies on this.reCJK, which init() creates.
+ _isCJKName(name) {
+   if (!name) {
+     return false;
+   }
+
+   let words = 0;
+   let insideWord = false;
+
+   for (const ch of name) {
+     let cjk = false;
+     // Middle dots are checked first and never count as CJK characters.
+     if (!this.MIDDLE_DOT.includes(ch)) {
+       cjk = this.reCJK.test(ch);
+       if (!cjk && !this.WHITESPACE.includes(ch)) {
+         // Neither CJK, nor a middle dot, nor whitespace.
+         return false;
+       }
+     }
+     // A CJK character right after a non-CJK one (or at the start) opens a
+     // new word.
+     if (cjk && !insideWord) {
+       words += 1;
+     }
+     insideWord = cjk;
+   }
+
+   return words >= 1 && words <= 2;
+ },
+
+ // Tries to split a Chinese, Japanese, or Korean name into its given name &
+ // surname parts.
+ //
+ // |nameTokens| is the already tokenized name. Returns an object of the form
+ // { given, middle, family } (middle is always ""), or null when there are
+ // neither one nor two tokens.
+ //
+ // The convention for CJK languages is to put the surname (last name) first,
+ // and the given name (first name) second. In a continuous text, there is
+ // normally no space between the two parts of the name. When entering their
+ // name into a field, though, some people add a space to disambiguate. CJK
+ // names (almost) never have a middle name.
+ _splitCJKName(nameTokens) {
+   if (nameTokens.length == 2) {
+     // The user entered a separator between the two name parts. This makes
+     // our job easier. Family name first, given name second.
+     return {
+       given: nameTokens[1],
+       middle: "",
+       family: nameTokens[0],
+     };
+   }
+
+   if (nameTokens.length != 1) {
+     return null;
+   }
+
+   // There is no space between the surname and given name. Try to infer
+   // where to separate between the two. Most Chinese and Korean surnames
+   // have only one character, but there are a few that have 2. If the name
+   // does not start with a surname from a known list, default to one
+   // character.
+   let name = nameTokens[0];
+
+   // Only needed on this path, so it is not built for the other cases.
+   let reHangulName = new RegExp(
+     "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
+     "u"
+   );
+   let isKorean = reHangulName.test(name);
+
+   // 4-character Korean names are more likely to be 2/2 than 1/3, so use
+   // the full list of Korean 2-char surnames (instead of only the common
+   // ones).
+   let multiCharSurnames =
+     isKorean && name.length > 3
+       ? this.KOREAN_MULTI_CHAR_SURNAMES
+       : this.COMMON_CJK_MULTI_CHAR_SURNAMES;
+
+   // Default to 1 character if the surname is not in the list.
+   let surnameLength = multiCharSurnames.some(surname =>
+     name.startsWith(surname)
+   )
+     ? 2
+     : 1;
+
+   // slice() replaces the deprecated substr(); with these non-negative
+   // arguments both return the same substrings.
+   return {
+     given: name.slice(surnameLength),
+     middle: "",
+     family: name.slice(0, surnameLength),
+   };
+ },
+
+ // One-time setup: builds this.reCJK, a regex matching a single character
+ // from any of the CJK_RANGE entries. Calls after the first one do nothing.
+ init() {
+   if (!this._dataLoaded) {
+     this._dataLoaded = true;
+     this.reCJK = new RegExp(`[${this.CJK_RANGE.join("")}]`, "u");
+   }
+ },
+
+ // Splits a full |name| into its parts.
+ //
+ // Returns an object { given, middle, family }; parts that could not be
+ // determined are empty strings. A falsy |name|, or one that contains only
+ // separators/whitespace, yields all-empty parts.
+ //
+ // Tokens are separated by spaces, commas, ideographic spaces and middle
+ // dots. Leading name prefixes are dropped first. CJK names are then handed
+ // to _splitCJKName(); for everything else the last token (plus any family
+ // name prefixes in front of it) becomes the family name, the token before
+ // that the middle name (if at least two remain), and the rest the given
+ // name.
+ splitName(name) {
+   let nameParts = {
+     given: "",
+     middle: "",
+     family: "",
+   };
+
+   if (!name) {
+     return nameParts;
+   }
+
+   // split() yields empty strings for a leading or trailing separator (e.g.
+   // "John Smith," -> ["John", "Smith", ""]). Drop them so an empty token is
+   // never mistaken for the family name.
+   let nameTokens = name
+     .trim()
+     .split(/[ ,\u3000\u30FB\u00B7]+/)
+     .filter(token => token);
+
+   if (!nameTokens.length) {
+     // Nothing but separators or whitespace: there is no name to split.
+     return nameParts;
+   }
+
+   nameTokens = this._stripPrefixes(nameTokens);
+
+   if (this._isCJKName(name)) {
+     let parts = this._splitCJKName(nameTokens);
+     if (parts) {
+       return parts;
+     }
+   }
+
+   // Don't assume "Ma" is a suffix in John Ma.
+   if (nameTokens.length > 2) {
+     nameTokens = this._stripSuffixes(nameTokens);
+   }
+
+   if (!nameTokens.length) {
+     // Every token was a prefix or suffix; just assume the whole thing is a
+     // given name.
+     nameParts.given = name;
+     return nameParts;
+   }
+
+   // Only one token, assume given name.
+   if (nameTokens.length == 1) {
+     nameParts.given = nameTokens[0];
+     return nameParts;
+   }
+
+   // 2 or more tokens. Grab the family, which is the last word plus any
+   // recognizable family prefixes.
+   let familyTokens = [nameTokens.pop()];
+   while (nameTokens.length) {
+     let lastToken = nameTokens[nameTokens.length - 1];
+     if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
+       break;
+     }
+     familyTokens.unshift(lastToken);
+     nameTokens.pop();
+   }
+   nameParts.family = familyTokens.join(" ");
+
+   // Take the last remaining token as the middle name (if there are at least
+   // 2 tokens).
+   if (nameTokens.length >= 2) {
+     nameParts.middle = nameTokens.pop();
+   }
+
+   // Remainder is given name.
+   nameParts.given = nameTokens.join(" ");
+
+   return nameParts;
+ },
+
+ // Builds a full name from its parts. When both |given| and |family| are
+ // CJK names and there is no |middle|, the result is family followed
+ // directly by given, with no separator. Otherwise the non-empty parts are
+ // joined with single spaces in the order given, middle, family.
+ joinNameParts({ given, middle, family }) {
+   if (!(this._isCJKName(given) && this._isCJKName(family) && !middle)) {
+     const pieces = [];
+     for (const part of [given, middle, family]) {
+       if (part && part.length) {
+         pieces.push(part);
+       }
+     }
+     return pieces.join(" ");
+   }
+   return family + given;
+ },
+};
+
+// Run init() as soon as the module is evaluated so that this.reCJK, which
+// _isCJKName() reads, exists before any method is called.
+FormAutofillNameUtils.init();