diff options
Diffstat (limited to 'comm/calendar/extract')
-rw-r--r-- | comm/calendar/extract/CalExtractParser.jsm | 545 | ||||
-rw-r--r-- | comm/calendar/extract/CalExtractParserService.jsm | 308 | ||||
-rw-r--r-- | comm/calendar/extract/moz.build | 9 |
3 files changed, 862 insertions, 0 deletions
diff --git a/comm/calendar/extract/CalExtractParser.jsm b/comm/calendar/extract/CalExtractParser.jsm new file mode 100644 index 0000000000..355fa3a6da --- /dev/null +++ b/comm/calendar/extract/CalExtractParser.jsm @@ -0,0 +1,545 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +const EXPORTED_SYMBOLS = [ + "CalExtractToken", + "CalExtractParseNode", + "CalExtractParser", + "extendParseRule", + "prepareArguments", +]; + +/** + * CalExtractOptions holds configuration options used by CalExtractParser. + * + * @typedef {object} CalExtractOptions + * @property {RegExp} sentenceBoundary - A pattern used to split text at the + * sentence boundary. This should capture + * the boundary only and not any other + * part of the sentence. Use lookaheads + * if needed. + */ + +/** + * @type {CalExtractOptions} + */ +const defaultOptions = { + sentenceBoundary: /(?<=\w)[.!?]+\s(?=[A-Z0-9])|[.!?]$/, +}; + +const FLAG_OPTIONAL = 1; +const FLAG_MULTIPLE = 2; +const FLAG_NONEMPTY = 4; + +const flagBits = new Map([ + ["?", FLAG_OPTIONAL], + ["+", FLAG_MULTIPLE | FLAG_NONEMPTY], + ["*", FLAG_MULTIPLE | FLAG_OPTIONAL], +]); + +/** + * CalExtractToken represents a lexical unit of valid text. These are produced + * during the tokenisation stage of CalExtractParser by matching regular + * expressions against a text sequence. + */ +class CalExtractToken { + /** + * Identifies the token. Should be in uppercase with no spaces for consistency. + * + * @type {string} + */ + type = ""; + + /** + * The text captured by this token. + * + * @type {string[]} + */ + text = []; + + /** + * Indicates which sentence in the source text the token was found. + * + * @type {number} + */ + sentence = -1; + + /** + * Indicates the position with the sentence the token occurs. + * + * @type {number} + */ + position = -1; + + /** + * @param {string} type + * @param {string[]} text + * @param {number} sentence + * @param {number} position + */ + constructor(type, text, sentence, position) { + this.type = type; + this.text = text; + this.sentence = sentence; + this.position = position; + } +} + +/** + * Function used to produce a value when a CalExtractParseRule is matched. + * + * @callback CallExtractParseRuleAction + * @param {any[]} args - An array containing all the values produced from each + * pattern in the rule when they are matched or the + * CalExtractToken when lexical tokens are used instead. + */ + +/** + * CalExtractParseRule specifies a named pattern that is looked for when parsing + * the tokenized source text. Patterns are a sequence of one or more CalExtactToken + * types or CalExtractParseRule names. Each pattern specified can optionally + * have one (and only one) of the following flags: + * + * 1) "?" - Optional flag, indicates a pattern may be skipped if not matched. + * 2) "*" - Multiple flag, indicates a pattern may match 0 or more times. + * 3) "+" - Non-empty multiple flag, indicates a pattern may match 1 or more times. + * + * Flags must be specified as the last character of the pattern name, example: + * ["subject", "text?", "MEET", "text*", "time+"] + * + * @typedef {object} CalExtractParseRule + * + * @property {string} name - The name of the rule that can + * be used in other patterns. + * Should be lowercase for + * consistency. + * @property {string[]} patterns - The pattern that will be + * searched for on the tokenized + * string. Can contain flags. + * + * @property {CalExtractParseRuleAction} action - Produces the result of the + * rule being satisfied. + */ + +/** + * CalExtractExtParseRule is derived from a CalExtractParseRule to include + * additional information needed during parsing. + * + * @typedef {CalExtractParseRule} CalExtractExtParseRule + * + * @property {string[]} patterns - The patterns here are stripped of + * any flags. + * @property {number[]} flags - An array containing the flags + * specified for each patterns element. + * @property {CalExtractParseNode} graph - A graph used to determine what parse + * rule can be applied for an encountered + * production. + */ + +/** + * CalExtractParseNode is used to represent the patterns of a CalExtractParseRule + * as a graph. This graph is traversed during stack reduction until one of the + * following end conditions are met: + * + * 1) There are no more descendant nodes. + * 2) The only descendant node is the node itself (cyclic). + * 3) All of the descendant nodes are optional, there are no more tokens to + * shift and we have traversed the entire stack. + */ +class CalExtractParseNode { + /** + * @type {string} + */ + symbol = null; + + /** + * @type {number} + */ + flags = null; + + /** + * Contains each possible descendant node of this node. + * + * @type {CalExtractParseNode[]} + */ + descendants = null; + + static FLAG_OPTIONAL = FLAG_OPTIONAL; + static FLAG_MULTIPLE = FLAG_MULTIPLE; + static FLAG_NONEMPTY = FLAG_NONEMPTY; + + /** + * @param {string} symbol - The pattern this node represents. + * @param {number} flags - The computed flags assigned to the pattern. + * @param {CalExtractParseNode[]} descendants - Descendant nodes of this node. + */ + constructor(symbol, flags, descendants = []) { + this.symbol = symbol; + this.flags = flags; + this.descendants = descendants; + } + + /** + * Indicates this is the last node in its graph. This will always be false + * for cyclic nodes. + */ + get isEnd() { + return !this.descendants.length; + } + + /** + * Appends a new descendant to this node. + * + * @param {CalExtractParseNode} node - The node to append. + * + * @returns {CalExtractParseNode} The appended node. + */ + append(node) { + this.descendants.push(node); + return node; + } + + /** + * Provides the descendant CalExtractParseNode of this one given its symbol + * name. The result depends on the following rules: + * 1) If this node has a descendant that matches the name, return that node. + * 2) If the node does not have a matching descendant but has descendants + * with the optional flag set, delegate to those nodes. This implements + * the "?" and optional aspect of "*". + * 3) If none of the above produce a node, null is returned which means this + * graph cannot be traversed any further. + * + * @returns {CalExtractParseNode|null} + */ + getDescendant(name) { + // It is important the direct descendants are checked first. + let node = this.descendants.find(node => node.symbol == name); + if (node) { + return node; + } + + // Now try any optional descendants. + for (let node of this.descendants) { + let hit = node.isOptional() && node != this && node.getDescendant(name); + if (hit) { + return hit; + } + } + return null; + } + + /** + * Indicates this node can terminate the graph if so desired. This is acceptable + * if all of the descendants of this node are optional and there is nothing + * more to match on the stack. + * + * @returns {boolean} + */ + canEnd() { + return this.descendants.filter(desc => desc != this).every(desc => desc.isOptional()); + } + + /** + * Indicates whether this node has the optional flag set. + * + * @returns {boolean} + */ + isOptional() { + return Boolean(this.flags & FLAG_OPTIONAL); + } + + /** + * Indicates whether this node is cyclic. A cyclic node is one whose only + * descendants is itself thus creating a loop. This occurs one a multiple + * flagged node is in the tail position of the graph. + */ + isCyclic() { + return !this.isEnd && this.descendants.every(desc => desc == this); + } +} + +/** + * CalExtractParser provides an API for detecting interesting information within + * a text sequence that can be used for event detection and creation. It is a + * naive implementation of a shift-reduce parser, naive in the sense that not + * too much attention has been paid to optimisation or semantics. + * + * This parser works by first splitting the source string into sentences, then + * tokenizing each using the token rules specified. The boundary for splitting + * into sentences can be specified in the options object. + * + * After tokenisation, the parser uses the parse rules to shift/reduce each + * sentence into a final result. The first parse rule is treated as the intended + * rule to reduce the tokens of each sentence to. If all of the tokens have been + * processed and the result is not the first rule, parsing is considered to have + * failed and null is returned for that sentence. For this reason, it is a good + * idea to specify parse rules that are robust but not too specific in their + * patterns. + */ +class CalExtractParser { + /** + * @type {[RegExp,string?][]} + */ + tokenRules = []; + + /** + * @type {CalExtractParseRule[]} + */ + parseRules = []; + + /** + * @type {CalExtractOptions} + */ + options = null; + + /** + * Use the static createInstance() method instead of this constructor directly. + * + * @param {[RegExp, string?][]} tokenRules + * @param {CalExtractExtParseRule[]} parseRules + * @param {CalExtractOptions} [options] - Configuration object. + * + * @private + */ + constructor(tokenRules, parseRules, options = defaultOptions) { + this.tokenRules = tokenRules; + this.parseRules = parseRules; + this.options = options; + } + + /** + * This method creates a new CalExtractParser instance using the simpler + * CalExtractParseRule interface instead of the extended one. It takes care + * of creating a graph for each rule and normalizing pattern names that may + * be using flags. + * + * @param {[RegExp, string?][]} tokenRules - A list of rules to apply during + * tokenisation. The first element of each rule is a regular expression used + * to detect lexical tokens and the second element is the type to assign to + * the token. Order matters slightly here, in general, more complex but specific + * rules should appear before simpler, more general ones. + * + * When specifying token rules, they should be anchored to the start of the + * string via "^" or tokenize() will produce unexpected results. Some regular + * expressions should also include a word boundary to prevent matching within + * a large string, example: "at" in "attachment". If a string is to be matched, + * but no token is desired you can omit the token type from the rule and it + * will be omitted completely. + * + * @param {CalExtractParseRule[]} parseRules - A list of CalExtractParseRules + * that will be extended then used during parsing. Multiple parse rules can + * share the same name and will all be considered the same when matching patterns. + * Use this to specify variations of the same rule. + * + * @param {CalExtractOptions} [options] - Configuration object. + */ + static createInstance(tokenRules, parseRules, options = defaultOptions) { + return new CalExtractParser(tokenRules, parseRules.map(extendParseRule), options); + } + + /** + * Tokenizes a string to make it easier to match against the parse rule + * patterns. If text is encountered that cannot be tokenized, the result for + * that sentence is null. + * + * @param {string} str - The string to tokenize. + * + * @returns {CalExtractToken[][]} For each sentence encountered, a list of + * CalExtractTokens. + */ + tokenize(str) { + let allTokens = []; + let sentences = str.split(this.options.sentenceBoundary).filter(Boolean); + + for (let i = 0; i < sentences.length; i++) { + let sentence = sentences[i]; + let pos = 0; + let tokens = []; + let buffer = ""; + + let matched; + while (pos < sentence.length) { + buffer = sentence.substr(pos); + for (let [pattern, type] of this.tokenRules) { + matched = pattern.exec(buffer); + if (matched) { + if (type) { + tokens.push(new CalExtractToken(type, matched[0], i, pos)); + } + pos += matched[0].length; + break; + } + } + + if (!matched) { + // No rules for the encountered text, bail out. + tokens = null; + break; + } + } + allTokens.push(tokens); + } + return allTokens; + } + + /** + * Parses a string into an array of values representing the final result of + * parsing each sentence encountered. The elements of the resulting array + * are either the result of applying the action of the first (top) parse rule + * or null if we could not successfully parse the sentence. + * + * @param {string} str + * + * @returns {any[]} + */ + parse(str) { + return this.tokenize(str).map(tokens => { + if (!this.parseRules.length || !tokens) { + return null; + } + + let lookahead = null; + let stack = []; + while (true) { + if (tokens.length) { + let next = tokens.shift(); + stack.push([next.type, next]); + lookahead = tokens[0] ? tokens[0].type : null; + while (this.reduceStack(stack, lookahead)) { + continue; + } + } else { + // Attempt to reduce anything still on the stack now that the + // tokens have all been pushed. + while (this.reduceStack(stack, lookahead)) { + continue; + } + break; + } + } + return stack.length == 1 && stack[0][0] == this.parseRules[0].name ? stack[0][1] : null; + }); + } + + /** + * Attempts to reduce the given stack exactly once using the internal parsing + * rules. If successful, the stack will be modified to contain the matched + * rule at the location it was found. This methods modifies the stack given. + * + * @returns {boolean} - True if the stack was reduced false if otherwise. + */ + reduceStack(stack, lookahead) { + for (let i = 0; i < stack.length; i++) { + for (let rule of this.parseRules) { + let node = rule.graph; + let n = i; + let matchCount = 0; + while (n < stack.length && (node = node.getDescendant(stack[n][0]))) { + matchCount++; + if ( + node.isEnd || + (n == stack.length - 1 && !lookahead && (node.isCyclic() || node.canEnd())) + ) { + let result = [rule.name, null]; + let matched = stack.splice(i, matchCount, result); + result[1] = rule.action(prepareArguments(rule, matched)); + return true; + } + n++; + } + } + } + return false; + } +} + +/** + * Converts a CalExtractParseRule to a CalExtractExtParseRule. + * + * @param {CalExtractParseRule} rule + * + * @returns {CalExtractExtParseRule} + */ +function extendParseRule(rule) { + let { name, action } = rule; + let flags = []; + let patterns = []; + let start = new CalExtractParseNode(null, null); + let graph = start; + + for (let pattern of rule.patterns) { + let patternFlag = pattern[pattern.length - 1]; + let bits = 0; + + // Compute the flag value. + for (let [flag, value] of flagBits) { + if (patternFlag == flag) { + bits = bits | value; + } + } + + // Removes the flag from patterns that have them. + pattern = bits ? pattern.substring(0, pattern.length - 1) : pattern; + patterns.push(pattern); + graph = graph.append(new CalExtractParseNode(pattern, bits)); + + // Create a loop node if this flag is set. + if (bits & FLAG_MULTIPLE) { + graph.append(graph); + } + + flags.push(bits); + } + + return { + name, + action, + patterns, + flags, + graph: start, + }; +} + +/** + * Normalizes the matched arguments to be passed to an CalExtractParseRuleAction + * by ensuring the number is the same as the patterns for the action. This takes + * care of converting multi matches into an array and providing "null" when + * an optional pattern is unmatched. + * + * @param {CalExtractExtRule} rule - The rule the action belongs to. + * @param {string[]} matched - An sub-array of the stack containing what + * was actually matched. This array will be + * modified to match the full rule (inclusive + * of optional patterns). + * + * + * @returns {Array} Arguments for a CalExtractParseRuleAction. + */ +function prepareArguments(rule, matched) { + return rule.patterns.map((pattern, index) => { + if (rule.flags[index] & FLAG_MULTIPLE) { + let c = index; + let arrayArg = []; + + while (c < matched.length && matched[c][0] == pattern) { + arrayArg.push(matched[c][1]); + c++; + } + if (!arrayArg.length) { + // This rule was not matched, make a blank space for it. + matched.splice(index, 0, null); + } else { + // Move all the matches into a single element so we match the pattern. + matched.splice(index, arrayArg.length, arrayArg); + } + return arrayArg; + } else if (matched[index] && matched[index][0] == pattern) { + return matched[index][1]; + } + + // The pattern was unmatched, it should be optional. + matched.splice(index, 0, null); + return null; + }); +} diff --git a/comm/calendar/extract/CalExtractParserService.jsm b/comm/calendar/extract/CalExtractParserService.jsm new file mode 100644 index 0000000000..aa1ca38e1f --- /dev/null +++ b/comm/calendar/extract/CalExtractParserService.jsm @@ -0,0 +1,308 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +const EXPORTED_SYMBOLS = ["CalExtractParserService"]; + +const { CalExtractParser } = ChromeUtils.import( + "resource:///modules/calendar/extract/CalExtractParser.jsm" +); + +const defaultRules = [ + [ + // Start clean up patterns. + + // remove last line preceding quoted message and first line of the quote + [/^(\r?\n[^>].*\r?\n>+.*$)/, ""], + + // remove the rest of quoted content + [/^(>+.*$)/, ""], + + // urls often contain dates dates that can confuse extraction + [/^(https?:\/\/[^\s]+\s)/, ""], + [/^(www\.[^\s]+\s)/, ""], + + // remove phone numbers + [/^(\d-\d\d\d-\d\d\d-\d\d\d\d)/, ""], + + // remove standard signature + [/^(\r?\n-- \r?\n[\S\s]+$)/, ""], + + // XXX remove timezone info, for now + [/^(gmt[+-]\d{2}:\d{2})/i, ""], + + // End clean up patterns. + + [/^meet\b/i, "MEET"], + [/^(we will|we'll|we)\b/i, "WE"], + + // Meridiem + [/^(a[.]?m[.]?)/i, "AM"], + [/^(p[.]?m[.]?)/i, "PM"], + + [/^(hours|hour|hrs|hr)\b/i, "HOURS"], + [/^(minutes|min|mins)\b/i, "MINUTES"], + [/^(days|day)\b/i, "DAYS"], + + // Words commonly used when specifying begin/end time or duration. + [/^at\b/i, "AT"], + [/^until\b/i, "UNTIL"], + [/^for\b/i, "FOR"], + + // Units of time + [/^(((0|1)?[0-9])|(2[0-4]))\b/, "HOUR_VALUE"], + + [/^\d+\b/, "NUMBER"], + + // Any text we don't know the meaning of. + [/^\S+/, "TEXT"], + + // Whitespace + [/^\s+/, ""], + ], + [ + { + name: "event-guess", + patterns: ["subject", "meet", "start-time", "text*", "end-time"], + action: ([, , startTime, , endTime]) => ({ + type: "event-guess", + startTime, + endTime, + priority: 0, + }), + }, + { + name: "event-guess", + patterns: ["subject", "meet", "start-time", "text*", "duration-time"], + action: ([, , startTime, , endTime]) => ({ + type: "event-guess", + startTime, + endTime, + priority: 0, + }), + }, + { + name: "subject", + patterns: ["WE"], + action: ([subject]) => ({ + type: "subject", + subject: subject.text, + }), + }, + { + name: "start-time", + patterns: ["start-time-prefix", "meridiem-time"], + action: ([, time]) => time, + }, + { + name: "start-time-prefix", + patterns: ["AT"], + action: ([prefix]) => prefix.text, + }, + { + name: "end-time", + patterns: ["end-time-prefix", "meridiem-time"], + action: ([, time]) => time, + }, + { + name: "end-time-prefix", + patterns: ["UNTIL"], + action: ([prefix]) => prefix.text, + }, + { + name: "meridiem-time", + patterns: ["HOUR_VALUE", "meridiem"], + action: ([hour, meridiem]) => ({ + type: "meridiem-time", + hour: Number(hour.text), + meridiem, + }), + }, + { + name: "meridiem", + patterns: ["AM"], + action: () => "am", + }, + { + name: "meridiem", + patterns: ["PM"], + action: () => "pm", + }, + + { + name: "duration-time", + patterns: ["duration-prefix", "duration"], + action: ([, duration]) => ({ + type: "duration-time", + duration, + }), + }, + { + name: "duration-prefix", + patterns: ["FOR"], + action: ([prefix]) => prefix.text, + }, + { + name: "duration", + patterns: ["NUMBER", "MINUTES"], + action: ([value]) => Number(value.text), + }, + { + name: "duration", + patterns: ["NUMBER", "HOURS"], + action: ([value]) => Number(value.text) * 60, + }, + { + name: "duration", + patterns: ["NUMBER", "DAYS"], + action: ([value]) => Number(value.text) * 60 * 24, + }, + { + name: "meet", + patterns: ["MEET"], + action: () => "meet", + }, + { + name: "text", + patterns: ["TEXT"], + action: ([text]) => text, + }, + ], +]; + +/** + * CalExtractParserServiceContext represents the context parsing and extraction + * takes place in. It holds values used in various calculations. For example, + * the current date. + * + * @typedef {object} CalExtractParserServiceContext + * @property {Date} now - The Date to use when calculating start and relative + * times. + */ + +/** + * CalExtractParserService provides a frontend to the CalExtractService. + * It holds lexical and parse rules for multiple locales (or any string + * identifier) that can be used on demand when parsing text. + */ +class CalExtractParserService { + rules = new Map([["en-US", defaultRules]]); + + /** + * Parses and extract the most relevant event creation data based on the + * rules of the locale given. + * + * @param {string} source + * @param {CalExtractParserServiceContext} context + * @param {string} locale + */ + extract(source, ctx = { now: new Date() }, locale = "en-US") { + let rules = this.rules.get(locale); + if (!rules) { + return null; + } + + let [lex, parse] = rules; + let parser = CalExtractParser.createInstance(lex, parse); + let result = parser.parse(source).sort((a, b) => a - b)[0]; + return result && convertDurationToEndTime(populateTimes(result, ctx.now)); + } +} + +/** + * Populates the missing values of the startTime and endTime. + * + * @param {object?} guess - The result of CalExtractParserService.extract(). + * @param {Date} now - A Date object representing the contextual date and + * time. + * + * @returns {object} The result with the populated startTime and endTime. + */ +function populateTimes(guess, now) { + return populateTime(populateTime(guess, now, "startTime"), now, "endTime"); +} + +/** + * Populates the missing values of the specified time property based on the Date + * provided. + * + * @param {object?} guess + * @param {Date} now + * @param {string} prop + * + * @returns {object} + */ +function populateTime(guess, now, prop) { + let time = guess[prop]; + + if (!time) { + return guess; + } + if (time.hour && time.meridiem) { + time.hour = normalizeHour(time.hour, time.meridiem); + } + + time.year = time.year || now.getFullYear(); + time.month = time.month || now.getMonth() + 1; + time.day = time.day || now.getDay(); + time.hour = time.hour || now.getHours(); + time.minute = time.minute || now.getMinutes(); + return guess; +} + +/** + * Coverts an hour using the Meridiem to a 24 hour value. + * + * @param {number} hour - The hour value. + * @param {string} meridiem - "am" or "pm" + * + * @returns {number} + */ +function normalizeHour(hour, meridiem) { + if (meridiem == "am" && hour == 12) { + return hour - 12; + } else if (meridiem == "pm" && hour != 12) { + return hour + 12; + } + + let dayStart = Services.prefs.getIntPref("calendar.view.daystarthour", 6); + if (hour < dayStart && hour <= 11) { + return hour + 12; + } + + return hour; +} + +/** + * Takes care of converting an end duration to an actual time relative to the + * start time detected (if any). + * + * @param {object} guess - Results from CalExtractParserService#extract() + * + * @returns {object} The result with the endTime property expanded. + */ +function convertDurationToEndTime(guess) { + if (guess.startTime && guess.endTime && guess.endTime.type == "duration-time") { + let startTime = guess.startTime; + let duration = guess.endTime.duration; + if (duration != 0) { + let startDate = new Date(startTime.year, startTime.month - 1, startTime.day); + if ("hour" in startTime) { + startDate.setHours(startTime.hour); + startDate.setMinutes(startTime.minute); + } + + let endDate = new Date(startDate.getTime() + duration * 60 * 1000); + let endTime = { type: "date-time" }; + endTime.year = endDate.getFullYear(); + endTime.month = endDate.getMonth() + 1; + endTime.day = endDate.getDate(); + if (endDate.getHours() != 0 || endDate.getMinutes() != 0) { + endTime.hour = endDate.getHours(); + endTime.minute = endDate.getMinutes(); + } + guess.endTime = endTime; + } + } + return guess; +} diff --git a/comm/calendar/extract/moz.build b/comm/calendar/extract/moz.build new file mode 100644 index 0000000000..8f5514a49a --- /dev/null +++ b/comm/calendar/extract/moz.build @@ -0,0 +1,9 @@ +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +EXTRA_JS_MODULES.calendar.extract += [ + "CalExtractParser.jsm", + "CalExtractParserService.jsm", +] |