1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Names this file exposes when loaded as a module. (The EXPORTED_SYMBOLS
// convention suggests a legacy JSM-style loader — TODO confirm against the
// build configuration.)
const EXPORTED_SYMBOLS = ["AttachmentChecker"];
// Public API object. getAttachmentKeywords is a function declaration further
// down in this file, so it is hoisted and already defined at this point.
var AttachmentChecker = {
getAttachmentKeywords,
};
/**
 * Tell whether a character code falls inside one of the ranges this module
 * treats as CJK (text that is not delimited by spaces).
 *
 * @param {number} code - A UTF-16 code unit, as returned by
 *   String.prototype.charCodeAt. NaN (e.g. from an empty string) yields false.
 * @returns {boolean} true if it is a CJK character.
 */
function IsCJK(code) {
  // Inclusive [low, high] bounds.
  const cjkRanges = [
    // Includes Hiragana, Katakana and Kanji.
    [0x2000, 0x9fff],
    // Hangul
    [0xac00, 0xd7ff],
    // Includes compatibility ideographs and half-/full-width forms.
    [0xf900, 0xffff],
  ];
  return cjkRanges.some(([low, high]) => code >= low && code <= high);
}
/**
 * Get the (possibly-empty) list of attachment keywords in this message.
 *
 * Two kinds of keyword are handled:
 *  - Keywords starting with "." (like ".pdf") are treated as file extensions.
 *    Every distinct whitespace-delimited token ending in the keyword is
 *    reported, e.g. "foo.pdf" and "bar.pdf" for the keyword ".pdf".
 *  - Any other keyword is reported at most once, spelled as in keywordsInCsv.
 *
 * Matching is case-insensitive. Matches that begin with "http://",
 * "https://" or "ftp://" are ignored. Empty entries in keywordsInCsv are
 * skipped, since an empty keyword would match any text.
 *
 * @param {string} mailData - The message text to search.
 * @param {string} keywordsInCsv - Comma-separated keywords; may be empty.
 * @returns {string[]} the (possibly-empty) list of attachment keywords in
 *   this message
 */
function getAttachmentKeywords(mailData, keywordsInCsv) {
  // The empty string would get split to an array of size 1. Avoid that...
  const keywordsArray = keywordsInCsv ? keywordsInCsv.split(",") : [];

  /**
   * Turn a literal keyword into regular expression source: regexp
   * metacharacters are backslash-escaped and every run of spaces is
   * replaced with "\s+" so that it matches any whitespace, including
   * line breaks.
   */
  function escapeRegExpSpecials(inputString) {
    const escaped = inputString.replace(/[.\\^$*+?|()[\]{}]/g, "\\$&");
    // Must be a global replace: a string pattern would only rewrite the
    // first space and leave later ones matching a single literal space.
    return escaped.replace(/ +/g, "\\s+");
  }

  // NOT_W is the character class that isn't in the Unicode classes "Ll",
  // "Lu" and "Lt". It should work like \W, if \W knew about Unicode.
  const NOT_W =
    "[^\\u0041-\\u005a\\u0061-\\u007a\\u00aa\\u00b5\\u00ba\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u01ba\\u01bc-\\u01bf\\u01c4-\\u02ad\\u0386\\u0388-\\u0481\\u048c-\\u0556\\u0561-\\u0587\\u10a0-\\u10c5\\u1e00-\\u1fbc\\u1fbe\\u1fc2-\\u1fcc\\u1fd0-\\u1fdb\\u1fe0-\\u1fec\\u1ff2-\\u1ffc\\u207f\\u2102\\u2107\\u210a-\\u2113\\u2115\\u2119-\\u211d\\u2124\\u2126\\u2128\\u212a-\\u212d\\u212f-\\u2131\\u2133\\u2134\\u2139\\ufb00-\\ufb17\\uff21-\\uff3a\\uff41-\\uff5a]";
  // No "g" flag, so test() keeps no state between calls.
  const URL_PREFIX = /^(https?|ftp):\/\//i;

  const keywordsFound = [];
  for (const keyword of keywordsArray) {
    // "a,,b" or a trailing comma yields an empty entry whose pattern would
    // match any text and report a bogus "" keyword.
    if (keyword === "") {
      continue;
    }
    const kw = escapeRegExpSpecials(keyword);
    // If the keyword starts (ends) with a CJK character, we don't care
    // what the previous (next) character is, because the words aren't
    // space delimited.
    const endsWithCJK = IsCJK(kw.charCodeAt(kw.length - 1));
    if (keyword.charAt(0) === ".") {
      // like .pdf
      // For this case we want to match the whole document name.
      const start = "(([^\\s]*)\\b)";
      const end = endsWithCJK ? "" : "(\\s|$)";
      const re = new RegExp(start + kw + end, "ig");
      for (const rawMatch of mailData.match(re) || []) {
        // Ignore the match if it was in a URL.
        if (URL_PREFIX.test(rawMatch)) {
          continue;
        }
        // We can have several *different* matches for one dot-keyword.
        // E.g. foo.pdf and bar.pdf would both match for .pdf.
        // The pattern may have consumed one trailing whitespace character.
        const documentName = rawMatch.trim();
        if (!keywordsFound.includes(documentName)) {
          keywordsFound.push(documentName);
        }
      }
    } else {
      const start = IsCJK(kw.charCodeAt(0)) ? "" : "((^|\\s)\\S*)";
      const end = endsWithCJK ? "" : "(" + NOT_W + "|$)";
      const re = new RegExp(start + kw + end, "ig");
      let matching;
      // Walk the matches until one is found that is not part of a URL;
      // a single such hit is enough to report the keyword.
      while ((matching = re.exec(mailData)) !== null) {
        // Ignore the match if it was in a URL.
        if (!URL_PREFIX.test(matching[0].trim())) {
          keywordsFound.push(keyword);
          break;
        }
      }
    }
  }
  return keywordsFound;
}
// This file is also used as a Worker.
/* exported onmessage */
/* globals postMessage */
// Worker message handler: event.data is indexed as [mailData, keywordsInCsv];
// the keywords found are posted back to the sender.
var onmessage = function (event) {
  const mailData = event.data[0];
  const keywordsInCsv = event.data[1];
  postMessage(AttachmentChecker.getAttachmentKeywords(mailData, keywordsInCsv));
};
|