Diffstat (limited to 'browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js')
-rw-r--r--  browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js  134
1 file changed, 134 insertions, 0 deletions
diff --git a/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js b/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js
new file mode 100644
index 0000000000..8503c2903b
--- /dev/null
+++ b/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js
@@ -0,0 +1,134 @@
+import {
+ tokenize,
+ toksToTfIdfVector,
+} from "lib/PersonalityProvider/Tokenize.jsm";
+
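+// Maximum tolerated floating-point difference when comparing tf-idf weights.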
+const EPSILON = 0.00001;
+
+describe("TF-IDF Term Vectorizer", () => {
+ describe("#tokenize", () => {
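+    // Each case pairs raw input with its expected token list: tokenize()
+    // should lower-case and split on runs of non-letter, non-digit
+    // characters, so CJK text with no such separators stays a single token.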
+ let testCases = [
+ { input: "HELLO there", expected: ["hello", "there"] },
+ { input: "blah,,,blah,blah", expected: ["blah", "blah", "blah"] },
+ {
+ input: "Call Jenny: 967-5309",
+ expected: ["call", "jenny", "967", "5309"],
+ },
+ {
+ input: "Yo(what)[[hello]]{{jim}}}bob{1:2:1+2=$3",
+ expected: [
+ "yo",
+ "what",
+ "hello",
+ "jim",
+ "bob",
+ "1",
+ "2",
+ "1",
+ "2",
+ "3",
+ ],
+ },
+ { input: "čÄfė 80's", expected: ["čäfė", "80", "s"] },
+ { input: "我知道很多东西。", expected: ["我知道很多东西"] },
+ ];
+ let checkTokenization = tc => {
+ it(`${tc.input} should tokenize to ${tc.expected}`, () => {
+        assert.deepEqual(tokenize(tc.input), tc.expected);
+ });
+ };
+
+ for (let i = 0; i < testCases.length; i++) {
+ checkTokenization(testCases[i]);
+ }
+ });
+
+ describe("#tfidf", () => {
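+    // Fixture vocabulary mapping token -> [vocabulary id, idf weight].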
+ let vocab_idfs = {
+ deal: [221, 5.5058519847862275],
+ easy: [269, 5.5058519847862275],
+ tanks: [867, 5.601162164590552],
+ sites: [792, 5.957837108529285],
+ care: [153, 5.957837108529285],
+ needs: [596, 5.824305715904762],
+ finally: [334, 5.706522680248379],
+ };
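+    // Expected vectors map token -> [id, weight], where each weight is the
+    // token's tf * idf after L2-normalizing the whole vector; that is why a
+    // single-token input like "Easy easy EASY" scores exactly 1.0.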
+ let testCases = [
+ {
+ input: "Finally! Easy care for your tanks!",
+ expected: {
+ finally: [334, 0.5009816295853761],
+ easy: [269, 0.48336453811728713],
+ care: [153, 0.5230447876368227],
+ tanks: [867, 0.49173191907236774],
+ },
+ },
+ {
+ input: "Easy easy EASY",
+ expected: { easy: [269, 1.0] },
+ },
+ {
+ input: "Easy easy care",
+ expected: {
+ easy: [269, 0.8795205218806832],
+ care: [153, 0.4758609582543317],
+ },
+ },
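+      // Worked example for the "easy care" case below: norm =
+      // sqrt(5.50585^2 + 5.95784^2) ≈ 8.11235, so easy ≈ 5.50585 / 8.11235
+      // ≈ 0.67870 and care ≈ 5.95784 / 8.11235 ≈ 0.73442.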
+ {
+ input: "easy care",
+ expected: {
+ easy: [269, 0.6786999710383944],
+ care: [153, 0.7344156515982504],
+ },
+ },
+ {
+ input: "这个空间故意留空。",
+ expected: {
+ /* This space is left intentionally blank. */
+ },
+ },
+ ];
+ let checkTokenGeneration = tc => {
+ describe(`${tc.input} should have only vocabulary tokens`, () => {
+ let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
+
+ it(`${tc.input} should generate exactly ${Object.keys(
+ tc.expected
+ )}`, () => {
+ let seen = {};
+ Object.keys(actual).forEach(actualTok => {
+ assert.isTrue(actualTok in tc.expected);
+ seen[actualTok] = true;
+ });
+ Object.keys(tc.expected).forEach(expectedTok => {
+ assert.isTrue(expectedTok in seen);
+ });
+ });
+
+ it(`${tc.input} should have the correct token ids`, () => {
+ Object.keys(actual).forEach(actualTok => {
+          assert.equal(actual[actualTok][0], tc.expected[actualTok][0]);
+ });
+ });
+ });
+ };
+
+ let checkTfIdfVector = tc => {
+ let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
+ it(`${tc.input} should have the correct tf-idf`, () => {
+ Object.keys(actual).forEach(actualTok => {
+ let delta = Math.abs(
+ tc.expected[actualTok][1] - actual[actualTok][1]
+ );
+ assert.isTrue(delta <= EPSILON);
+ });
+ });
+ };
+
+    // Run each fixture through both the token-membership and weight checks.
+ for (let i = 0; i < testCases.length; i++) {
+ checkTokenGeneration(testCases[i]);
+ checkTfIdfVector(testCases[i]);
+ }
+ });
+});
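
The fixtures above pin down the contract under test: tokenize() lower-cases its input and keeps maximal runs of Unicode letters and digits, while toksToTfIdfVector() drops out-of-vocabulary tokens, accumulates idf once per occurrence (tf * idf), and L2-normalizes the result. A minimal sketch of that contract, for reference only (the shipped implementation lives in Tokenize.jsm; these names are illustrative):

// Sketch, not the shipped code: reproduces the fixture expectations above.
function tokenizeSketch(text) {
  // Lower-case, then keep maximal runs of Unicode letters and digits.
  return text.toLocaleLowerCase().match(/[\p{L}\p{N}]+/gu) || [];
}

function toksToTfIdfVectorSketch(toks, vocabIdfs) {
  const tfidf = {};
  // Accumulate idf once per occurrence (tf * idf), skipping
  // out-of-vocabulary tokens such as the Chinese-only inputs.
  for (const tok of toks) {
    if (!(tok in vocabIdfs)) {
      continue;
    }
    const [id, idf] = vocabIdfs[tok];
    if (!(tok in tfidf)) {
      tfidf[tok] = [id, 0];
    }
    tfidf[tok][1] += idf;
  }
  // L2-normalize; a single in-vocabulary token therefore scores exactly 1.0.
  const norm = Math.sqrt(
    Object.values(tfidf).reduce((sum, [, w]) => sum + w * w, 0)
  );
  for (const tok of Object.keys(tfidf)) {
    tfidf[tok][1] /= norm;
  }
  return tfidf;
}

Feeding the "easy care" fixture through this sketch reproduces the expected weights (≈ 0.67870 and ≈ 0.73442) to well within EPSILON.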