diff options
Diffstat (limited to 'toolkit/components/formautofill/shared/AddressParser.sys.mjs')
-rw-r--r-- | toolkit/components/formautofill/shared/AddressParser.sys.mjs | 281 |
1 files changed, 281 insertions, 0 deletions
diff --git a/toolkit/components/formautofill/shared/AddressParser.sys.mjs b/toolkit/components/formautofill/shared/AddressParser.sys.mjs new file mode 100644 index 0000000000..8fe0dc7f80 --- /dev/null +++ b/toolkit/components/formautofill/shared/AddressParser.sys.mjs @@ -0,0 +1,281 @@ +/* eslint-disable no-useless-concat */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// NamedCaptureGroup class represents a named capturing group in a regular expression +class NamedCaptureGroup { + // The named of this capturing group + #name = null; + + // The capturing group + #capture = null; + + // The matched result + #match = null; + + constructor(name, capture) { + this.#name = name; + this.#capture = capture; + } + + get name() { + return this.#name; + } + + get capture() { + return this.#capture; + } + + get match() { + return this.#match; + } + + // Setter for the matched result based on the match groups + setMatch(matchGroups) { + this.#match = matchGroups[this.#name]; + } +} + +// Base class for different part of a street address regular expression. +// The regular expression is constructed with prefix, pattern, suffix +// and separator to extract "value" part. +// For examplem, when we write "apt 4." to for floor number, its prefix is `apt`, +// suffix is `.` and value to represent apartment number is `4`. +class StreetAddressPartRegExp extends NamedCaptureGroup { + constructor(name, prefix, pattern, suffix, sep, optional = false) { + prefix = prefix ?? ""; + suffix = suffix ?? ""; + super( + name, + `((?:${prefix})(?<${name}>${pattern})(?:${suffix})(?:${sep})+)${ + optional ? "?" : "" + }` + ); + } +} + +// A regular expression to match the street number portion of a street address, +class StreetNumberRegExp extends StreetAddressPartRegExp { + static PREFIX = "((no|°|º|number)(\\.|-|\\s)*)?"; // From chromium source + + static PATTERN = "\\d+\\w?"; + + // TODO: possible suffix : (th\\.|\\.)? + static SUFFIX = null; + + constructor(sep, optional) { + super( + StreetNumberRegExp.name, + StreetNumberRegExp.PREFIX, + StreetNumberRegExp.PATTERN, + StreetNumberRegExp.SUFFIX, + sep, + optional + ); + } +} + +// A regular expression to match the street name portion of a street address, +class StreetNameRegExp extends StreetAddressPartRegExp { + static PREFIX = null; + + static PATTERN = "(?:[^\\s,]+(?:[^\\S\\r\\n]+[^\\s,]+)*?)"; // From chromium source + + // TODO: Should we consider suffix like (ave|st)? + static SUFFIX = null; + + constructor(sep, optional) { + super( + StreetNameRegExp.name, + StreetNameRegExp.PREFIX, + StreetNameRegExp.PATTERN, + StreetNameRegExp.SUFFIX, + sep, + optional + ); + } +} + +// A regular expression to match the apartment number portion of a street address, +class ApartmentNumberRegExp extends StreetAddressPartRegExp { + static keyword = "apt|apartment|wohnung|apto|-" + "|unit|suite|ste|#|room"; // From chromium source // Firefox specific + static PREFIX = `(${ApartmentNumberRegExp.keyword})(\\.|\\s|-)*`; + + static PATTERN = "\\w*([-|\\/]\\w*)?"; + + static SUFFIX = "(\\.|\\s|-)*(ª)?"; // From chromium source + + constructor(sep, optional) { + super( + ApartmentNumberRegExp.name, + ApartmentNumberRegExp.PREFIX, + ApartmentNumberRegExp.PATTERN, + ApartmentNumberRegExp.SUFFIX, + sep, + optional + ); + } +} + +// A regular expression to match the floor number portion of a street address, +class FloorNumberRegExp extends StreetAddressPartRegExp { + static keyword = + "floor|flur|fl|og|obergeschoss|ug|untergeschoss|geschoss|andar|piso|º" + // From chromium source + "|level|lvl"; // Firefox specific + static PREFIX = `(${FloorNumberRegExp.keyword})?(\\.|\\s|-)*`; // TODO + static PATTERN = "\\d{1,3}\\w?"; + static SUFFIX = `(st|nd|rd|th)?(\\.|\\s|-)*(${FloorNumberRegExp.keyword})?`; // TODO + + constructor(sep, optional) { + super( + FloorNumberRegExp.name, + FloorNumberRegExp.PREFIX, + FloorNumberRegExp.PATTERN, + FloorNumberRegExp.SUFFIX, + sep, + optional + ); + } +} + +/** + * Class represents a street address with the following fields: + * - street number + * - street name + * - apartment number + * - floor number + */ +export class StructuredStreetAddress { + #street_number = null; + #street_name = null; + #apartment_number = null; + #floor_number = null; + + constructor(street_number, street_name, apartment_number, floor_number) { + this.#street_number = street_number?.toString(); + this.#street_name = street_name?.toString(); + this.#apartment_number = apartment_number?.toString(); + this.#floor_number = floor_number?.toString(); + } + + get street_number() { + return this.#street_number; + } + + get street_name() { + return this.#street_name; + } + + get apartment_number() { + return this.#apartment_number; + } + + get floor_number() { + return this.#floor_number; + } + + toString() { + return ` + street number: ${this.#street_number}\n + street name: ${this.#street_name}\n + apartment number: ${this.#apartment_number}\n + floor number: ${this.#floor_number}\n + `; + } +} + +export class AddressParser { + /** + * Parse street address with the following pattern. + * street number, street name, apartment number(optional), floor number(optional) + * For example, 2 Harrison St #175 floor 2 + * + * @param {string} address The street address to be parsed. + * @returns {StructuredStreetAddress} + */ + static parseStreetAddress(address) { + const separator = "(\\s|,|$)"; + + const regexpes = [ + new StreetNumberRegExp(separator), + new StreetNameRegExp(separator), + new ApartmentNumberRegExp(separator, true), + new FloorNumberRegExp(separator, true), + ]; + + return AddressParser.parse(address, regexpes) + ? new StructuredStreetAddress(...regexpes.map(regexp => regexp.match)) + : null; + } + + static parse(address, regexpes) { + const options = { + trim: true, + merge_whitespace: true, + ignore_case: true, + }; + address = AddressParser.normalizeString(address, options); + + const match = address.match( + new RegExp(`^(${regexpes.map(regexp => regexp.capture).join("")})$`) + ); + if (!match) { + return null; + } + + regexpes.forEach(regexp => regexp.setMatch(match.groups)); + return regexpes.reduce((acc, current) => { + return { ...acc, [current.name]: current.match }; + }, {}); + } + + static normalizeString(s, options) { + if (typeof s != "string") { + return s; + } + + if (options.ignore_case) { + s = s.toLowerCase(); + } + + // process punctuation before whitespace because if a punctuation + // is replaced with whitespace, we might want to merge it later + if (options.remove_punctuation) { + s = AddressParser.replacePunctuation(s, ""); + } else if ("replace_punctuation" in options) { + const replace = options.replace_punctuation; + s = AddressParser.replacePunctuation(s, replace); + } + + // process whitespace + if (options.merge_whitespace) { + s = AddressParser.mergeWhitespace(s); + } else if (options.remove_whitespace) { + s = AddressParser.removeWhitespace(s); + } + + return s.trim(); + } + + static replacePunctuation(s, replace) { + const regex = /\p{Punctuation}/gu; + return s?.replace(regex, replace); + } + + static removePunctuation(s) { + return s?.replace(/[.,\/#!$%\^&\*;:{}=\-_~()]/g, ""); + } + + static replaceControlCharacters(s, replace) { + return s?.replace(/[\t\n\r]/g, " "); + } + + static removeWhitespace(s) { + return s?.replace(/[\s]/g, ""); + } + + static mergeWhitespace(s) { + return s?.replace(/\s{2,}/g, " "); + } +} |