summaryrefslogtreecommitdiffstats
path: root/comm/calendar/extract/CalExtractParser.jsm
diff options
context:
space:
mode:
Diffstat (limited to 'comm/calendar/extract/CalExtractParser.jsm')
-rw-r--r--comm/calendar/extract/CalExtractParser.jsm545
1 files changed, 545 insertions, 0 deletions
diff --git a/comm/calendar/extract/CalExtractParser.jsm b/comm/calendar/extract/CalExtractParser.jsm
new file mode 100644
index 0000000000..355fa3a6da
--- /dev/null
+++ b/comm/calendar/extract/CalExtractParser.jsm
@@ -0,0 +1,545 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const EXPORTED_SYMBOLS = [
+ "CalExtractToken",
+ "CalExtractParseNode",
+ "CalExtractParser",
+ "extendParseRule",
+ "prepareArguments",
+];
+
+/**
+ * CalExtractOptions holds configuration options used by CalExtractParser.
+ *
+ * @typedef {object} CalExtractOptions
+ * @property {RegExp} sentenceBoundary - A pattern used to split text at the
+ * sentence boundary. This should capture
+ * the boundary only and not any other
+ * part of the sentence. Use lookaheads
+ * if needed.
+ */
+
+/**
+ * @type {CalExtractOptions}
+ */
+const defaultOptions = {
+ sentenceBoundary: /(?<=\w)[.!?]+\s(?=[A-Z0-9])|[.!?]$/,
+};
+
+const FLAG_OPTIONAL = 1;
+const FLAG_MULTIPLE = 2;
+const FLAG_NONEMPTY = 4;
+
+const flagBits = new Map([
+ ["?", FLAG_OPTIONAL],
+ ["+", FLAG_MULTIPLE | FLAG_NONEMPTY],
+ ["*", FLAG_MULTIPLE | FLAG_OPTIONAL],
+]);
+
+/**
+ * CalExtractToken represents a lexical unit of valid text. These are produced
+ * during the tokenisation stage of CalExtractParser by matching regular
+ * expressions against a text sequence.
+ */
+class CalExtractToken {
+ /**
+ * Identifies the token. Should be in uppercase with no spaces for consistency.
+ *
+ * @type {string}
+ */
+ type = "";
+
+ /**
+ * The text captured by this token.
+ *
+ * @type {string[]}
+ */
+ text = [];
+
+ /**
+ * Indicates which sentence in the source text the token was found.
+ *
+ * @type {number}
+ */
+ sentence = -1;
+
+ /**
+ * Indicates the position with the sentence the token occurs.
+ *
+ * @type {number}
+ */
+ position = -1;
+
+ /**
+ * @param {string} type
+ * @param {string[]} text
+ * @param {number} sentence
+ * @param {number} position
+ */
+ constructor(type, text, sentence, position) {
+ this.type = type;
+ this.text = text;
+ this.sentence = sentence;
+ this.position = position;
+ }
+}
+
+/**
+ * Function used to produce a value when a CalExtractParseRule is matched.
+ *
+ * @callback CallExtractParseRuleAction
+ * @param {any[]} args - An array containing all the values produced from each
+ * pattern in the rule when they are matched or the
+ * CalExtractToken when lexical tokens are used instead.
+ */
+
+/**
+ * CalExtractParseRule specifies a named pattern that is looked for when parsing
+ * the tokenized source text. Patterns are a sequence of one or more CalExtactToken
+ * types or CalExtractParseRule names. Each pattern specified can optionally
+ * have one (and only one) of the following flags:
+ *
+ * 1) "?" - Optional flag, indicates a pattern may be skipped if not matched.
+ * 2) "*" - Multiple flag, indicates a pattern may match 0 or more times.
+ * 3) "+" - Non-empty multiple flag, indicates a pattern may match 1 or more times.
+ *
+ * Flags must be specified as the last character of the pattern name, example:
+ * ["subject", "text?", "MEET", "text*", "time+"]
+ *
+ * @typedef {object} CalExtractParseRule
+ *
+ * @property {string} name - The name of the rule that can
+ * be used in other patterns.
+ * Should be lowercase for
+ * consistency.
+ * @property {string[]} patterns - The pattern that will be
+ * searched for on the tokenized
+ * string. Can contain flags.
+ *
+ * @property {CalExtractParseRuleAction} action - Produces the result of the
+ * rule being satisfied.
+ */
+
+/**
+ * CalExtractExtParseRule is derived from a CalExtractParseRule to include
+ * additional information needed during parsing.
+ *
+ * @typedef {CalExtractParseRule} CalExtractExtParseRule
+ *
+ * @property {string[]} patterns - The patterns here are stripped of
+ * any flags.
+ * @property {number[]} flags - An array containing the flags
+ * specified for each patterns element.
+ * @property {CalExtractParseNode} graph - A graph used to determine what parse
+ * rule can be applied for an encountered
+ * production.
+ */
+
+/**
+ * CalExtractParseNode is used to represent the patterns of a CalExtractParseRule
+ * as a graph. This graph is traversed during stack reduction until one of the
+ * following end conditions are met:
+ *
+ * 1) There are no more descendant nodes.
+ * 2) The only descendant node is the node itself (cyclic).
+ * 3) All of the descendant nodes are optional, there are no more tokens to
+ * shift and we have traversed the entire stack.
+ */
+class CalExtractParseNode {
+ /**
+ * @type {string}
+ */
+ symbol = null;
+
+ /**
+ * @type {number}
+ */
+ flags = null;
+
+ /**
+ * Contains each possible descendant node of this node.
+ *
+ * @type {CalExtractParseNode[]}
+ */
+ descendants = null;
+
+ static FLAG_OPTIONAL = FLAG_OPTIONAL;
+ static FLAG_MULTIPLE = FLAG_MULTIPLE;
+ static FLAG_NONEMPTY = FLAG_NONEMPTY;
+
+ /**
+ * @param {string} symbol - The pattern this node represents.
+ * @param {number} flags - The computed flags assigned to the pattern.
+ * @param {CalExtractParseNode[]} descendants - Descendant nodes of this node.
+ */
+ constructor(symbol, flags, descendants = []) {
+ this.symbol = symbol;
+ this.flags = flags;
+ this.descendants = descendants;
+ }
+
+ /**
+ * Indicates this is the last node in its graph. This will always be false
+ * for cyclic nodes.
+ */
+ get isEnd() {
+ return !this.descendants.length;
+ }
+
+ /**
+ * Appends a new descendant to this node.
+ *
+ * @param {CalExtractParseNode} node - The node to append.
+ *
+ * @returns {CalExtractParseNode} The appended node.
+ */
+ append(node) {
+ this.descendants.push(node);
+ return node;
+ }
+
+ /**
+ * Provides the descendant CalExtractParseNode of this one given its symbol
+ * name. The result depends on the following rules:
+ * 1) If this node has a descendant that matches the name, return that node.
+ * 2) If the node does not have a matching descendant but has descendants
+ * with the optional flag set, delegate to those nodes. This implements
+ * the "?" and optional aspect of "*".
+ * 3) If none of the above produce a node, null is returned which means this
+ * graph cannot be traversed any further.
+ *
+ * @returns {CalExtractParseNode|null}
+ */
+ getDescendant(name) {
+ // It is important the direct descendants are checked first.
+ let node = this.descendants.find(node => node.symbol == name);
+ if (node) {
+ return node;
+ }
+
+ // Now try any optional descendants.
+ for (let node of this.descendants) {
+ let hit = node.isOptional() && node != this && node.getDescendant(name);
+ if (hit) {
+ return hit;
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Indicates this node can terminate the graph if so desired. This is acceptable
+ * if all of the descendants of this node are optional and there is nothing
+ * more to match on the stack.
+ *
+ * @returns {boolean}
+ */
+ canEnd() {
+ return this.descendants.filter(desc => desc != this).every(desc => desc.isOptional());
+ }
+
+ /**
+ * Indicates whether this node has the optional flag set.
+ *
+ * @returns {boolean}
+ */
+ isOptional() {
+ return Boolean(this.flags & FLAG_OPTIONAL);
+ }
+
+ /**
+ * Indicates whether this node is cyclic. A cyclic node is one whose only
+ * descendants is itself thus creating a loop. This occurs one a multiple
+ * flagged node is in the tail position of the graph.
+ */
+ isCyclic() {
+ return !this.isEnd && this.descendants.every(desc => desc == this);
+ }
+}
+
+/**
+ * CalExtractParser provides an API for detecting interesting information within
+ * a text sequence that can be used for event detection and creation. It is a
+ * naive implementation of a shift-reduce parser, naive in the sense that not
+ * too much attention has been paid to optimisation or semantics.
+ *
+ * This parser works by first splitting the source string into sentences, then
+ * tokenizing each using the token rules specified. The boundary for splitting
+ * into sentences can be specified in the options object.
+ *
+ * After tokenisation, the parser uses the parse rules to shift/reduce each
+ * sentence into a final result. The first parse rule is treated as the intended
+ * rule to reduce the tokens of each sentence to. If all of the tokens have been
+ * processed and the result is not the first rule, parsing is considered to have
+ * failed and null is returned for that sentence. For this reason, it is a good
+ * idea to specify parse rules that are robust but not too specific in their
+ * patterns.
+ */
+class CalExtractParser {
+ /**
+ * @type {[RegExp,string?][]}
+ */
+ tokenRules = [];
+
+ /**
+ * @type {CalExtractParseRule[]}
+ */
+ parseRules = [];
+
+ /**
+ * @type {CalExtractOptions}
+ */
+ options = null;
+
+ /**
+ * Use the static createInstance() method instead of this constructor directly.
+ *
+ * @param {[RegExp, string?][]} tokenRules
+ * @param {CalExtractExtParseRule[]} parseRules
+ * @param {CalExtractOptions} [options] - Configuration object.
+ *
+ * @private
+ */
+ constructor(tokenRules, parseRules, options = defaultOptions) {
+ this.tokenRules = tokenRules;
+ this.parseRules = parseRules;
+ this.options = options;
+ }
+
+ /**
+ * This method creates a new CalExtractParser instance using the simpler
+ * CalExtractParseRule interface instead of the extended one. It takes care
+ * of creating a graph for each rule and normalizing pattern names that may
+ * be using flags.
+ *
+ * @param {[RegExp, string?][]} tokenRules - A list of rules to apply during
+ * tokenisation. The first element of each rule is a regular expression used
+ * to detect lexical tokens and the second element is the type to assign to
+ * the token. Order matters slightly here, in general, more complex but specific
+ * rules should appear before simpler, more general ones.
+ *
+ * When specifying token rules, they should be anchored to the start of the
+ * string via "^" or tokenize() will produce unexpected results. Some regular
+ * expressions should also include a word boundary to prevent matching within
+ * a large string, example: "at" in "attachment". If a string is to be matched,
+ * but no token is desired you can omit the token type from the rule and it
+ * will be omitted completely.
+ *
+ * @param {CalExtractParseRule[]} parseRules - A list of CalExtractParseRules
+ * that will be extended then used during parsing. Multiple parse rules can
+ * share the same name and will all be considered the same when matching patterns.
+ * Use this to specify variations of the same rule.
+ *
+ * @param {CalExtractOptions} [options] - Configuration object.
+ */
+ static createInstance(tokenRules, parseRules, options = defaultOptions) {
+ return new CalExtractParser(tokenRules, parseRules.map(extendParseRule), options);
+ }
+
+ /**
+ * Tokenizes a string to make it easier to match against the parse rule
+ * patterns. If text is encountered that cannot be tokenized, the result for
+ * that sentence is null.
+ *
+ * @param {string} str - The string to tokenize.
+ *
+ * @returns {CalExtractToken[][]} For each sentence encountered, a list of
+ * CalExtractTokens.
+ */
+ tokenize(str) {
+ let allTokens = [];
+ let sentences = str.split(this.options.sentenceBoundary).filter(Boolean);
+
+ for (let i = 0; i < sentences.length; i++) {
+ let sentence = sentences[i];
+ let pos = 0;
+ let tokens = [];
+ let buffer = "";
+
+ let matched;
+ while (pos < sentence.length) {
+ buffer = sentence.substr(pos);
+ for (let [pattern, type] of this.tokenRules) {
+ matched = pattern.exec(buffer);
+ if (matched) {
+ if (type) {
+ tokens.push(new CalExtractToken(type, matched[0], i, pos));
+ }
+ pos += matched[0].length;
+ break;
+ }
+ }
+
+ if (!matched) {
+ // No rules for the encountered text, bail out.
+ tokens = null;
+ break;
+ }
+ }
+ allTokens.push(tokens);
+ }
+ return allTokens;
+ }
+
+ /**
+ * Parses a string into an array of values representing the final result of
+ * parsing each sentence encountered. The elements of the resulting array
+ * are either the result of applying the action of the first (top) parse rule
+ * or null if we could not successfully parse the sentence.
+ *
+ * @param {string} str
+ *
+ * @returns {any[]}
+ */
+ parse(str) {
+ return this.tokenize(str).map(tokens => {
+ if (!this.parseRules.length || !tokens) {
+ return null;
+ }
+
+ let lookahead = null;
+ let stack = [];
+ while (true) {
+ if (tokens.length) {
+ let next = tokens.shift();
+ stack.push([next.type, next]);
+ lookahead = tokens[0] ? tokens[0].type : null;
+ while (this.reduceStack(stack, lookahead)) {
+ continue;
+ }
+ } else {
+ // Attempt to reduce anything still on the stack now that the
+ // tokens have all been pushed.
+ while (this.reduceStack(stack, lookahead)) {
+ continue;
+ }
+ break;
+ }
+ }
+ return stack.length == 1 && stack[0][0] == this.parseRules[0].name ? stack[0][1] : null;
+ });
+ }
+
+ /**
+ * Attempts to reduce the given stack exactly once using the internal parsing
+ * rules. If successful, the stack will be modified to contain the matched
+ * rule at the location it was found. This methods modifies the stack given.
+ *
+ * @returns {boolean} - True if the stack was reduced false if otherwise.
+ */
+ reduceStack(stack, lookahead) {
+ for (let i = 0; i < stack.length; i++) {
+ for (let rule of this.parseRules) {
+ let node = rule.graph;
+ let n = i;
+ let matchCount = 0;
+ while (n < stack.length && (node = node.getDescendant(stack[n][0]))) {
+ matchCount++;
+ if (
+ node.isEnd ||
+ (n == stack.length - 1 && !lookahead && (node.isCyclic() || node.canEnd()))
+ ) {
+ let result = [rule.name, null];
+ let matched = stack.splice(i, matchCount, result);
+ result[1] = rule.action(prepareArguments(rule, matched));
+ return true;
+ }
+ n++;
+ }
+ }
+ }
+ return false;
+ }
+}
+
+/**
+ * Converts a CalExtractParseRule to a CalExtractExtParseRule.
+ *
+ * @param {CalExtractParseRule} rule
+ *
+ * @returns {CalExtractExtParseRule}
+ */
+function extendParseRule(rule) {
+ let { name, action } = rule;
+ let flags = [];
+ let patterns = [];
+ let start = new CalExtractParseNode(null, null);
+ let graph = start;
+
+ for (let pattern of rule.patterns) {
+ let patternFlag = pattern[pattern.length - 1];
+ let bits = 0;
+
+ // Compute the flag value.
+ for (let [flag, value] of flagBits) {
+ if (patternFlag == flag) {
+ bits = bits | value;
+ }
+ }
+
+ // Removes the flag from patterns that have them.
+ pattern = bits ? pattern.substring(0, pattern.length - 1) : pattern;
+ patterns.push(pattern);
+ graph = graph.append(new CalExtractParseNode(pattern, bits));
+
+ // Create a loop node if this flag is set.
+ if (bits & FLAG_MULTIPLE) {
+ graph.append(graph);
+ }
+
+ flags.push(bits);
+ }
+
+ return {
+ name,
+ action,
+ patterns,
+ flags,
+ graph: start,
+ };
+}
+
+/**
+ * Normalizes the matched arguments to be passed to an CalExtractParseRuleAction
+ * by ensuring the number is the same as the patterns for the action. This takes
+ * care of converting multi matches into an array and providing "null" when
+ * an optional pattern is unmatched.
+ *
+ * @param {CalExtractExtRule} rule - The rule the action belongs to.
+ * @param {string[]} matched - An sub-array of the stack containing what
+ * was actually matched. This array will be
+ * modified to match the full rule (inclusive
+ * of optional patterns).
+ *
+ *
+ * @returns {Array} Arguments for a CalExtractParseRuleAction.
+ */
+function prepareArguments(rule, matched) {
+ return rule.patterns.map((pattern, index) => {
+ if (rule.flags[index] & FLAG_MULTIPLE) {
+ let c = index;
+ let arrayArg = [];
+
+ while (c < matched.length && matched[c][0] == pattern) {
+ arrayArg.push(matched[c][1]);
+ c++;
+ }
+ if (!arrayArg.length) {
+ // This rule was not matched, make a blank space for it.
+ matched.splice(index, 0, null);
+ } else {
+ // Move all the matches into a single element so we match the pattern.
+ matched.splice(index, arrayArg.length, arrayArg);
+ }
+ return arrayArg;
+ } else if (matched[index] && matched[index][0] == pattern) {
+ return matched[index][1];
+ }
+
+ // The pattern was unmatched, it should be optional.
+ matched.splice(index, 0, null);
+ return null;
+ });
+}