From 6bf0a5cb5034a7e684dcc3500e841785237ce2dd Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 19:32:43 +0200
Subject: Adding upstream version 1:115.7.0.

Signed-off-by: Daniel Baumann
---
 .../unit/lib/PersonalityProvider/Tokenize.test.js  | 134 +++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js

diff --git a/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js b/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js
new file mode 100644
index 0000000000..8503c2903b
--- /dev/null
+++ b/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js
@@ -0,0 +1,134 @@
+import {
+  tokenize,
+  toksToTfIdfVector,
+} from "lib/PersonalityProvider/Tokenize.jsm";
+
+const EPSILON = 0.00001;
+
+describe("TF-IDF Term Vectorizer", () => {
+  describe("#tokenize", () => {
+    let testCases = [
+      { input: "HELLO there", expected: ["hello", "there"] },
+      { input: "blah,,,blah,blah", expected: ["blah", "blah", "blah"] },
+      {
+        input: "Call Jenny: 967-5309",
+        expected: ["call", "jenny", "967", "5309"],
+      },
+      {
+        input: "Yo(what)[[hello]]{{jim}}}bob{1:2:1+2=$3",
+        expected: [
+          "yo",
+          "what",
+          "hello",
+          "jim",
+          "bob",
+          "1",
+          "2",
+          "1",
+          "2",
+          "3",
+        ],
+      },
+      { input: "čÄfė 80's", expected: ["čäfė", "80", "s"] },
+      { input: "我知道很多东西。", expected: ["我知道很多东西"] },
+    ];
+    let checkTokenization = tc => {
+      it(`${tc.input} should tokenize to ${tc.expected}`, () => {
+        assert.deepEqual(tc.expected, tokenize(tc.input));
+      });
+    };
+
+    for (let i = 0; i < testCases.length; i++) {
+      checkTokenization(testCases[i]);
+    }
+  });
+
+  describe("#tfidf", () => {
+    let vocab_idfs = {
+      deal: [221, 5.5058519847862275],
+      easy: [269, 5.5058519847862275],
+      tanks: [867, 5.601162164590552],
+      sites: [792, 5.957837108529285],
+      care: [153, 5.957837108529285],
+      needs: [596, 5.824305715904762],
+      finally: [334, 5.706522680248379],
+    };
+    let testCases = [
+      {
+        input: "Finally! Easy care for your tanks!",
+        expected: {
+          finally: [334, 0.5009816295853761],
+          easy: [269, 0.48336453811728713],
+          care: [153, 0.5230447876368227],
+          tanks: [867, 0.49173191907236774],
+        },
+      },
+      {
+        input: "Easy easy EASY",
+        expected: { easy: [269, 1.0] },
+      },
+      {
+        input: "Easy easy care",
+        expected: {
+          easy: [269, 0.8795205218806832],
+          care: [153, 0.4758609582543317],
+        },
+      },
+      {
+        input: "easy care",
+        expected: {
+          easy: [269, 0.6786999710383944],
+          care: [153, 0.7344156515982504],
+        },
+      },
+      {
+        input: "这个空间故意留空。",
+        expected: {
+          /* This space is left intentionally blank. */
+        },
+      },
+    ];
+    let checkTokenGeneration = tc => {
+      describe(`${tc.input} should have only vocabulary tokens`, () => {
+        let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
+
+        it(`${tc.input} should generate exactly ${Object.keys(
+          tc.expected
+        )}`, () => {
+          let seen = {};
+          Object.keys(actual).forEach(actualTok => {
+            assert.isTrue(actualTok in tc.expected);
+            seen[actualTok] = true;
+          });
+          Object.keys(tc.expected).forEach(expectedTok => {
+            assert.isTrue(expectedTok in seen);
+          });
+        });
+
+        it(`${tc.input} should have the correct token ids`, () => {
+          Object.keys(actual).forEach(actualTok => {
+            assert.equal(tc.expected[actualTok][0], actual[actualTok][0]);
+          });
+        });
+      });
+    };
+
+    let checkTfIdfVector = tc => {
+      let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
+      it(`${tc.input} should have the correct tf-idf`, () => {
+        Object.keys(actual).forEach(actualTok => {
+          let delta = Math.abs(
+            tc.expected[actualTok][1] - actual[actualTok][1]
+          );
+          assert.isTrue(delta <= EPSILON);
+        });
+      });
+    };
+
+    // run the tests
+    for (let i = 0; i < testCases.length; i++) {
+      checkTokenGeneration(testCases[i]);
+      checkTfIdfVector(testCases[i]);
+    }
+  });
+});
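
The patch carries only the tests; tokenize itself lives in lib/PersonalityProvider/Tokenize.jsm. As a reading aid, here is a minimal sketch that satisfies every #tokenize case above, assuming the tokenizer lowercases its input and keeps maximal runs of Unicode letters and digits (the regex is an inference from the expected outputs, not the shipped implementation):

// Sketch only, inferred from the test expectations; not the actual
// Tokenize.jsm code. Lowercase, then treat each maximal run of Unicode
// letters/digits as one token; everything else acts as a separator.
function tokenize(text) {
  return text.toLocaleLowerCase().match(/[\p{L}\p{N}]+/gu) || [];
}

Because \p{L} matches CJK ideographs while the full-width period "。" is punctuation, "我知道很多东西。" ("I know many things.") tokenizes to the single token "我知道很多东西", and "čÄfė 80's" comes back as ["čäfė", "80", "s"], exactly as the test cases expect.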
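
The expected vectors likewise pin down what toksToTfIdfVector must do: keep only vocabulary tokens, weight each kept token by its raw count times its idf, and L2-normalize the result. That is why "Easy easy EASY", which reduces to a single vocabulary term, normalizes to exactly 1.0, and why the Chinese input, containing no vocabulary tokens, yields an empty vector. A sketch under those assumptions, again inferred from the expectations rather than taken from the shipped module:

// Sketch inferred from the expected values; not the real Tokenize.jsm.
// vocab_idfs maps token -> [termId, idf]; the result maps each
// in-vocabulary token -> [termId, L2-normalized tf-idf weight].
function toksToTfIdfVector(tokens, vocab_idfs) {
  const vector = {};
  for (const tok of tokens) {
    if (!(tok in vocab_idfs)) {
      continue; // out-of-vocabulary tokens contribute nothing
    }
    const [id, idf] = vocab_idfs[tok];
    if (!(tok in vector)) {
      vector[tok] = [id, 0];
    }
    vector[tok][1] += idf; // adding idf once per occurrence equals count * idf
  }
  // Normalize to unit length (a no-op on an empty vector).
  const norm = Math.sqrt(
    Object.values(vector).reduce((acc, [, weight]) => acc + weight * weight, 0)
  );
  for (const tok of Object.keys(vector)) {
    vector[tok][1] /= norm;
  }
  return vector;
}

Checking this against the "easy care" case: both counts are 1, so the raw weights are the idfs themselves, 5.50585 and 5.95784; the norm is sqrt(5.50585^2 + 5.95784^2) ≈ 8.11234, giving 0.67870 for easy and 0.73442 for care, matching the expected values to well within EPSILON.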