diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /comm/mailnews/db/gloda/test/unit/test_intl.js | |
parent | Initial commit. (diff) | |
download | thunderbird-upstream.tar.xz thunderbird-upstream.zip |
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'comm/mailnews/db/gloda/test/unit/test_intl.js')
-rw-r--r-- | comm/mailnews/db/gloda/test/unit/test_intl.js | 355 |
1 files changed, 355 insertions, 0 deletions
diff --git a/comm/mailnews/db/gloda/test/unit/test_intl.js b/comm/mailnews/db/gloda/test/unit/test_intl.js new file mode 100644 index 0000000000..e6e9868189 --- /dev/null +++ b/comm/mailnews/db/gloda/test/unit/test_intl.js @@ -0,0 +1,355 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * Sanity check our encoding transforms and make sure the mozporter tokenizer + * is resulting in the expected fulltext search results. Specifically: + * - Check that subject, body, and attachment names are properly indexed; + * previously we screwed up at least one of these in terms of handling + * encodings properly. + * - Check that we can fulltext search on those things afterwards. + */ + +var { + assertExpectedMessagesIndexed, + glodaTestHelperInitialize, + waitForGlodaIndexer, +} = ChromeUtils.import("resource://testing-common/gloda/GlodaTestHelper.jsm"); +var { waitForGlodaDBFlush } = ChromeUtils.import( + "resource://testing-common/gloda/GlodaTestHelperFunctions.jsm" +); +var { queryExpect } = ChromeUtils.import( + "resource://testing-common/gloda/GlodaQueryHelper.jsm" +); +var { Gloda } = ChromeUtils.import("resource:///modules/gloda/GlodaPublic.jsm"); +var { GlodaMsgIndexer } = ChromeUtils.import( + "resource:///modules/gloda/IndexMsg.jsm" +); +var { MessageGenerator, SyntheticMessageSet } = ChromeUtils.import( + "resource://testing-common/mailnews/MessageGenerator.jsm" +); +var { MessageInjection } = ChromeUtils.import( + "resource://testing-common/mailnews/MessageInjection.jsm" +); + +/** + * To make the encoding pairs: + * - For the subject bit: + * import email + * h = email.Header.Header(charset=CHARSET) + * h.append(STRING) + * h.encode() + * - For the body bit + * s.encode(CHARSET) + */ +var intlPhrases = [ + // -- CJK case + { + name: "CJK: Vending Machine", + actual: "\u81ea\u52d5\u552e\u8ca8\u6a5f", + encodings: { + "utf-8": [ + "=?utf-8?b?6Ieq5YuV5ZSu6LKo5qmf?=", + "\xe8\x87\xaa\xe5\x8b\x95\xe5\x94\xae\xe8\xb2\xa8\xe6\xa9\x9f", + ], + "euc-jp": [ + "=?shift-jis?b?jqmTrppTid2LQA==?=", + "\xbc\xab\xc6\xb0\xd3\xb4\xb2\xdf\xb5\xa1", + ], + "shift-jis": [ + "=?shift-jis?b?jqmTrppTid2LQA==?=", + "\x8e\xa9\x93\xae\x9aS\x89\xdd\x8b@", + ], + }, + searchPhrases: [ + // Match bi-gram driven matches starting from the front. + { body: '"\u81ea\u52d5"', match: true }, + { body: '"\u81ea\u52d5\u552e"', match: true }, + { body: '"\u81ea\u52d5\u552e\u8ca8"', match: true }, + { body: '"\u81ea\u52d5\u552e\u8ca8\u6a5f"', match: true }, + // Now match from the back (bi-gram based). + { body: '"\u52d5\u552e\u8ca8\u6a5f"', match: true }, + { body: '"\u552e\u8ca8\u6a5f"', match: true }, + { body: '"\u8ca8\u6a5f"', match: true }, + // Now everybody in the middle! + { body: '"\u52d5\u552e\u8ca8"', match: true }, + { body: '"\u552e\u8ca8"', match: true }, + { body: '"\u52d5\u552e"', match: true }, + // -- Now match nobody! + // Nothing in common with the right answer. + { body: '"\u81eb\u52dc"', match: false }, + // Too long, no match! + { body: '"\u81ea\u52d5\u552e\u8ca8\u6a5f\u6a5f"', match: false }, + // Minor change at the end. + { body: '"\u81ea\u52d5\u552e\u8ca8\u6a5e"', match: false }, + ], + }, + // Use two words where the last character is a multi-byte sequence and one of + // them is the last word in the string. This helps test an off-by-one error + // in both the asymmetric case (query's last character is last character in + // the tokenized string but it is not the last character in the body string) + // and symmetric case (last character in the query and the body). + { + name: "Czech diacritics", + actual: "Slov\u00e1cko Moravsk\u00e9 rodin\u011b", + encodings: { + "utf-8": [ + "=?utf-8?b?U2xvdsOhY2tvIE1vcmF2c2vDqSByb2RpbsSb?=", + "Slov\xc3\xa1cko Moravsk\xc3\xa9 rodin\xc4\x9b", + ], + }, + searchPhrases: [ + // -- Desired + // Match on exact for either word should work + { body: "Slov\u00e1cko", match: true }, + { body: "Moravsk\u00e9", match: true }, + { body: "rodin\u011b", match: true }, + // The ASCII uppercase letters get case-folded + { body: "slov\u00e1cko", match: true }, + { body: "moravsk\u00e9", match: true }, + { body: "rODIN\u011b", match: true }, + ], + }, + // Ignore accent search! + { + name: "having accent: Paris", + actual: "Par\u00eds", + encodings: { + "utf-8": ["=?UTF-8?B?UGFyw61z?=", "Par\xc3\xads"], + }, + searchPhrases: [{ body: "paris", match: true }], + }, + // Case insensitive case for non-ASCII characters. + { + name: "Russian: new", + actual: "\u041d\u043e\u0432\u043e\u0435", + encodings: { + "utf-8": [ + "=?UTF-8?B?0J3QvtCy0L7QtQ==?=", + "\xd0\x9d\xd0\xbe\xd0\xb2\xd0\xbe\xd0\xb5", + ], + }, + searchPhrases: [{ body: "\u043d\u043e\u0432\u043e\u0435", match: true }], + }, + // Case-folding happens after decomposition. + { + name: "Awesome where A has a bar over it", + actual: "\u0100wesome", + encodings: { + "utf-8": ["=?utf-8?q?=C4=80wesome?=", "\xc4\x80wesome"], + }, + searchPhrases: [ + { body: "\u0100wesome", match: true }, // Upper A-bar + { body: "\u0101wesome", match: true }, // Lower a-bar + { body: "Awesome", match: true }, // Upper A + { body: "awesome", match: true }, // Lower a + ], + }, + // Deep decomposition happens and after that, case folding. + { + name: "Upper case upsilon with diaeresis and hook goes to small upsilon", + actual: "\u03d4esterday", + encodings: { + "utf-8": ["=?utf-8?q?=CF=94esterday?=", "\xcf\x94esterday"], + }, + searchPhrases: [ + { body: "\u03d4esterday", match: true }, // Y_: 03d4 => 03d2 (decomposed) + { body: "\u03d3esterday", match: true }, // Y_' 03d3 => 03d2 (decomposed) + { body: "\u03d2esterday", match: true }, // Y_ 03d2 => 03a5 (decomposed) + { body: "\u03a5esterday", match: true }, // Y 03a5 => 03c5 (lowercase) + { body: "\u03c5esterday", match: true }, // y 03c5 (final state) + ], + }, + // Full-width alphabet. + // Even if search phrases are ASCII, it has to hit. + { + name: "Full-width Thunderbird", + actual: + "\uff34\uff48\uff55\uff4e\uff44\uff45\uff52\uff42\uff49\uff52\uff44", + encodings: { + "utf-8": [ + "=?UTF-8?B?77y0772I772V772O772E772F772S772C772J772S772E?=", + "\xef\xbc\xb4\xef\xbd\x88\xef\xbd\x95\xef\xbd\x8e\xef\xbd\x84\xef\xbd\x85\xef\xbd\x92\xef\xbd\x82\xef\xbd\x89\xef\xbd\x92\xef\xbd\x84", + ], + }, + searchPhrases: [ + // Full-width lower. + { + body: "\uff34\uff28\uff35\uff2e\uff24\uff25\uff32\uff22\uff29\uff32\uff24", + match: true, + }, + // Half-width. + { body: "Thunderbird", match: true }, + ], + }, + // Half-width Katakana with voiced sound mark. + // Even if search phrases are full-width, it has to hit. + { + name: "Half-width Katakana: Thunderbird (SANDAABAADO)", + actual: "\uff7b\uff9d\uff80\uff9e\uff70\uff8a\uff9e\uff70\uff84\uff9e", + encodings: { + "utf-8": [ + "=?UTF-8?B?7727776d776A776e772w776K776e772w776E776e?=", + "\xef\xbd\xbb\xef\xbe\x9d\xef\xbe\x80\xef\xbe\x9e\xef\xbd\xb0\xef\xbe\x8a\xef\xbe\x9e\xef\xbd\xb0\xef\xbe\x84\xef\xbe\x9e", + ], + }, + searchPhrases: [ + { body: "\u30b5\u30f3\u30c0\u30fc\u30d0\u30fc\u30c9", match: true }, + ], + }, + // Thai: Would you like to see the movie? + { + name: "Thai: query movie word into Thai language content", + actual: + "\u0e04\u0e38\u0e13\u0e2d\u0e22\u0e32\u0e01\u0e44\u0e1b\u0e14\u0e39\u0e2b\u0e19\u0e31\u0e07", + encodings: { + "utf-8": [ + "=?UTF-8?B?4LiE4Li44LiT4Lit4Lii4Liy4LiB4LmE4Lib4LiU4Li54Lir4LiZ4Lix4LiH?=", + "\xe0\xb8\x84\xe0\xb8\xb8\xe0\xb8\x93\xe0\xb8\xad\xe0\xb8\xa2\xe0\xb8\xb2\xe0\xb8\x81\xe0\xb9\x84\xe0\xb8\x9b\xe0\xb8\x94\xe0\xb8\xb9\xe0\xb8\xab\xe0\xb8\x99\xe0\xb8\xb1\xe0\xb8\x87", + ], + }, + searchPhrases: [{ body: "\u0e2b\u0e19\u0e31\u0e07", match: true }], + }, +]; + +var msgGen; +var messageInjection; + +add_setup(function () { + msgGen = new MessageGenerator(); + // Use mbox injection because the fake server chokes sometimes right now. + messageInjection = new MessageInjection({ mode: "local" }, msgGen); + glodaTestHelperInitialize(messageInjection); +}); + +add_task(async function test_index_all_phrases() { + for (let phrase of intlPhrases) { + await indexPhrase(phrase); + } +}); + +add_task(async function flush_db() { + // Force a db flush so I can investigate the database if I want. + await waitForGlodaDBFlush(); +}); + +add_task(async function test_fulltextsearch_all_phrases() { + for (let phrase of intlPhrases) { + await fulltextsearchPhrase(phrase); + } +}); + +/** + * Names with encoded commas in them can screw up our mail address parsing if + * we perform the mime decoding prior to handing the mail address off for + * parsing. + */ +add_task(async function test_encoding_complications_with_mail_addresses() { + let basePair = msgGen.makeNameAndAddress(); + // The =2C encodes a comma! + let encodedCommaPair = ["=?iso-8859-1?Q?=DFnake=2C_=DFammy?=", basePair[1]]; + // "Snake, Sammy", but with a much cooler looking S-like character! + let decodedName = "\u00dfnake, \u00dfammy"; + // Use the thing with the comma in it for all cases; previously there was an + // asymmetry between to and cc... + let smsg = msgGen.makeMessage({ + from: encodedCommaPair, + to: [encodedCommaPair], + cc: [encodedCommaPair], + }); + function verify_sammy_snake(unused, gmsg) { + Assert.equal(gmsg.from.contact.name, decodedName); + Assert.equal(gmsg.to.length, 1); + Assert.equal(gmsg.to[0].id, gmsg.from.id); + Assert.equal(gmsg.cc.length, 1); + Assert.equal(gmsg.cc[0].id, gmsg.from.id); + } + + let synSet = new SyntheticMessageSet([smsg]); + await messageInjection.addSetsToFolders( + [messageInjection.getInboxFolder()], + [synSet] + ); + await waitForGlodaIndexer(); + Assert.ok( + ...assertExpectedMessagesIndexed([synSet], { verifier: verify_sammy_snake }) + ); +}); + +/** + * For each phrase in the intlPhrases array (we are parameterized over it using + * parameterizeTest in the 'tests' declaration), create a message where the + * subject, body, and attachment name are populated using the encodings in + * the phrase's "encodings" attribute, one encoding per message. Make sure + * that the strings as exposed by the gloda representation are equal to the + * expected/actual value. + * Stash each created synthetic message in a resultList list on the phrase so + * that we can use them as expected query results in + * |fulltextsearchPhrase|. + */ +async function indexPhrase(aPhrase) { + // Create a synthetic message for each of the delightful encoding types. + let messages = []; + aPhrase.resultList = []; + for (let charset in aPhrase.encodings) { + let [quoted, bodyEncoded] = aPhrase.encodings[charset]; + + let smsg = msgGen.makeMessage({ + subject: quoted, + body: { charset, encoding: "8bit", body: bodyEncoded }, + attachments: [{ filename: quoted, body: "gabba gabba hey" }], + // Save off the actual value for checking. + callerData: [charset, aPhrase.actual], + }); + + messages.push(smsg); + aPhrase.resultList.push(smsg); + } + let synSet = new SyntheticMessageSet(messages); + await messageInjection.addSetsToFolders( + [messageInjection.getInboxFolder()], + [synSet] + ); + + await waitForGlodaIndexer(); + Assert.ok( + ...assertExpectedMessagesIndexed([synSet], { verifier: verify_index }) + ); +} + +/** + * Does the per-message verification for indexPhrase. Knows what is right for + * each message because of the callerData attribute on the synthetic message. + */ +function verify_index(smsg, gmsg) { + let [charset, actual] = smsg.callerData; + let subject = gmsg.subject; + let indexedBodyText = gmsg.indexedBodyText.trim(); + let attachmentName = gmsg.attachmentNames[0]; + dump("using character set: " + charset + " actual: " + actual + "\n"); + dump("subject: " + subject + " (len: " + subject.length + ")\n"); + Assert.equal(actual, subject); + dump("Body: " + indexedBodyText + " (len: " + indexedBodyText.length + ")\n"); + Assert.equal(actual, indexedBodyText); + dump( + "Attachment name: " + + attachmentName + + " (len: " + + attachmentName.length + + ")\n" + ); + Assert.equal(actual, attachmentName); +} + +/** + * For each phrase, make sure that all of the searchPhrases either match or fail + * to match as appropriate. + */ +async function fulltextsearchPhrase(aPhrase) { + for (let searchPhrase of aPhrase.searchPhrases) { + let query = Gloda.newQuery(GlodaConstants.NOUN_MESSAGE); + query.bodyMatches(searchPhrase.body); + await queryExpect(query, searchPhrase.match ? aPhrase.resultList : []); + } +} |