Diffstat (limited to 'browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js')
-rw-r--r--  browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js  134
1 file changed, 134 insertions, 0 deletions
diff --git a/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js b/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js
new file mode 100644
index 0000000000..8503c2903b
--- /dev/null
+++ b/browser/components/newtab/test/unit/lib/PersonalityProvider/Tokenize.test.js
@@ -0,0 +1,134 @@
+import {
+ tokenize,
+ toksToTfIdfVector,
+} from "lib/PersonalityProvider/Tokenize.jsm";
+
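+// Maximum tolerated floating-point difference when comparing tf-idf weights.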
+const EPSILON = 0.00001;
+
+describe("TF-IDF Term Vectorizer", () => {
+ describe("#tokenize", () => {
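+    // Each case pairs raw input with its expected token list: tokenize()
+    // should lower-case and split on runs of non-letter, non-digit
+    // characters, so CJK text with no such separators stays a single token.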
+ let testCases = [
+ { input: "HELLO there", expected: ["hello", "there"] },
+ { input: "blah,,,blah,blah", expected: ["blah", "blah", "blah"] },
+ {
+ input: "Call Jenny: 967-5309",
+ expected: ["call", "jenny", "967", "5309"],
+ },
+ {
+ input: "Yo(what)[[hello]]{{jim}}}bob{1:2:1+2=$3",
+ expected: [
+ "yo",
+ "what",
+ "hello",
+ "jim",
+ "bob",
+ "1",
+ "2",
+ "1",
+ "2",
+ "3",
+ ],
+ },
+ { input: "čÄfė 80's", expected: ["čäfė", "80", "s"] },
+ { input: "我知道很多东西。", expected: ["我知道很多东西"] },
+ ];
+ let checkTokenization = tc => {
+ it(`${tc.input} should tokenize to ${tc.expected}`, () => {
+        assert.deepEqual(tokenize(tc.input), tc.expected);
+ });
+ };
+
+ for (let i = 0; i < testCases.length; i++) {
+ checkTokenization(testCases[i]);
+ }
+ });
+
+ describe("#tfidf", () => {
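+    // Fixture vocabulary mapping token -> [vocabulary id, idf weight].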
+ let vocab_idfs = {
+ deal: [221, 5.5058519847862275],
+ easy: [269, 5.5058519847862275],
+ tanks: [867, 5.601162164590552],
+ sites: [792, 5.957837108529285],
+ care: [153, 5.957837108529285],
+ needs: [596, 5.824305715904762],
+ finally: [334, 5.706522680248379],
+ };
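+    // Expected vectors map token -> [id, weight], where each weight is the
+    // token's tf * idf after L2-normalizing the whole vector; that is why a
+    // single-token input like "Easy easy EASY" scores exactly 1.0.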
+ let testCases = [
+ {
+ input: "Finally! Easy care for your tanks!",
+ expected: {
+ finally: [334, 0.5009816295853761],
+ easy: [269, 0.48336453811728713],
+ care: [153, 0.5230447876368227],
+ tanks: [867, 0.49173191907236774],
+ },
+ },
+ {
+ input: "Easy easy EASY",
+ expected: { easy: [269, 1.0] },
+ },
+ {
+ input: "Easy easy care",
+ expected: {
+ easy: [269, 0.8795205218806832],
+ care: [153, 0.4758609582543317],
+ },
+ },
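+      // Worked example for the "easy care" case below: norm =
+      // sqrt(5.50585^2 + 5.95784^2) ≈ 8.11235, so easy ≈ 5.50585 / 8.11235
+      // ≈ 0.67870 and care ≈ 5.95784 / 8.11235 ≈ 0.73442.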
+ {
+ input: "easy care",
+ expected: {
+ easy: [269, 0.6786999710383944],
+ care: [153, 0.7344156515982504],
+ },
+ },
+ {
+ input: "这个空间故意留空。",
+ expected: {
+ /* This space is left intentionally blank. */
+ },
+ },
+ ];
+ let checkTokenGeneration = tc => {
+ describe(`${tc.input} should have only vocabulary tokens`, () => {
+ let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
+
+ it(`${tc.input} should generate exactly ${Object.keys(
+ tc.expected
+ )}`, () => {
+ let seen = {};
+ Object.keys(actual).forEach(actualTok => {
+ assert.isTrue(actualTok in tc.expected);
+ seen[actualTok] = true;
+ });
+ Object.keys(tc.expected).forEach(expectedTok => {
+ assert.isTrue(expectedTok in seen);
+ });
+ });
+
+ it(`${tc.input} should have the correct token ids`, () => {
+ Object.keys(actual).forEach(actualTok => {
+          assert.equal(actual[actualTok][0], tc.expected[actualTok][0]);
+ });
+ });
+ });
+ };
+
+ let checkTfIdfVector = tc => {
+ let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
+ it(`${tc.input} should have the correct tf-idf`, () => {
+ Object.keys(actual).forEach(actualTok => {
+ let delta = Math.abs(
+ tc.expected[actualTok][1] - actual[actualTok][1]
+ );
+ assert.isTrue(delta <= EPSILON);
+ });
+ });
+ };
+
+    // Run each fixture through both the token-membership and weight checks.
+ for (let i = 0; i < testCases.length; i++) {
+ checkTokenGeneration(testCases[i]);
+ checkTfIdfVector(testCases[i]);
+ }
+ });
+});
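
The fixtures above pin down the contract under test: tokenize() lower-cases its input and keeps maximal runs of Unicode letters and digits, while toksToTfIdfVector() drops out-of-vocabulary tokens, accumulates idf once per occurrence (tf * idf), and L2-normalizes the result. A minimal sketch of that contract, for reference only (the shipped implementation lives in Tokenize.jsm; these names are illustrative):

// Sketch, not the shipped code: reproduces the fixture expectations above.
function tokenizeSketch(text) {
  // Lower-case, then keep maximal runs of Unicode letters and digits.
  return text.toLocaleLowerCase().match(/[\p{L}\p{N}]+/gu) || [];
}

function toksToTfIdfVectorSketch(toks, vocabIdfs) {
  const tfidf = {};
  // Accumulate idf once per occurrence (tf * idf), skipping
  // out-of-vocabulary tokens such as the Chinese-only inputs.
  for (const tok of toks) {
    if (!(tok in vocabIdfs)) {
      continue;
    }
    const [id, idf] = vocabIdfs[tok];
    if (!(tok in tfidf)) {
      tfidf[tok] = [id, 0];
    }
    tfidf[tok][1] += idf;
  }
  // L2-normalize; a single in-vocabulary token therefore scores exactly 1.0.
  const norm = Math.sqrt(
    Object.values(tfidf).reduce((sum, [, w]) => sum + w * w, 0)
  );
  for (const tok of Object.keys(tfidf)) {
    tfidf[tok][1] /= norm;
  }
  return tfidf;
}

Feeding the "easy care" fixture through this sketch reproduces the expected weights (≈ 0.67870 and ≈ 0.73442) to well within EPSILON.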