/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { ENGLISH_STOP_WORDS } from "chrome://global/content/ml/StopWords.sys.mjs";

// max number of keywords to extract for each document in corpus
const MAX_WORDS_PER_DOCUMENT = 3;

// min threshold to extract a keyword from document in corpus
const MIN_THRESHOLD_FOR_KEYWORD = 0.05;

/**
 * A simple implementation of matrix multiplication
 *
 * @param {number[][]} A 2D matrix
 * @param {number[][]} B 2D matrix
 * @returns {number[][]} 2D matrix multiplication result
 */
export function matMul(A, B) {
  // Check if multiplication is possible (i.e., number of columns in A == number of rows in B)
  if (!A || !B || A[0].length !== B.length) {
    throw new Error("Matrix dimensions are incompatible for multiplication.");
  }

  // Create the result matrix with appropriate dimensions (rows of A, columns of B)
  const result = [];
  for (let i = 0; i < A.length; i++) {
    result[i] = [];
    for (let j = 0; j < B[0].length; j++) {
      let sum = 0;
      // Sum the product of corresponding elements
      for (let k = 0; k < A[0].length; k++) {
        sum += A[i][k] * B[k][j];
      }
      result[i][j] = sum;
    }
  }

  return result;
}
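
// Illustrative usage (a sketch, not part of the original file): a 2x2 matrix
// times a 2x1 matrix yields a 2x1 result.
//
//   matMul(
//     [[1, 2],
//      [3, 4]],
//     [[5], [6]]
//   );
//   // => [[17], [39]]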

/**
 * Calculates cosine similarity between two lists of floats
 * The lists don't need to be normalized
 *
 * @param {number[]} A first list
 * @param {number[]} B second list
 * @returns {number} cosine similarity value
 */
export function cosSim(A, B) {
  if (!A || !B || A.length !== B.length) {
    throw new Error("Lists should have the same length and not be undefined");
  }
  // Both lists are empty (their lengths are already known to match)
  if (A.length === 0) {
    return 0;
  }

  let dotProduct = 0;
  let mA = 0;
  let mB = 0;

  for (let i = 0; i < A.length; i++) {
    dotProduct += A[i] * B[i];
    mA += A[i] * A[i];
    mB += B[i] * B[i];
  }

  mA = Math.sqrt(mA);
  mB = Math.sqrt(mB);

  return mA === 0 || mB === 0 ? 0 : dotProduct / (mA * mB);
}
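
// Illustrative usage (sketch): parallel vectors score 1, orthogonal vectors 0.
//
//   cosSim([1, 2, 3], [2, 4, 6]); // => 1 (up to floating point)
//   cosSim([1, 0], [0, 1]); // => 0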

/**
 * This class is used to generate a vectorized count of keywords
 * that are not part of the chosen language stopwords
 */
export class CountVectorizer {
  constructor(stopWords = "EN") {
    // Only English stop words are bundled; any other value yields an empty stop-word set
    this.stopWords = new Set(stopWords === "EN" ? ENGLISH_STOP_WORDS : []);
  }

  /**
   * Tokenize document based on non-stopwords
   *
   * @param {string} doc String to tokenize
   * @returns {string[]} List containing token strings
   */
  tokenize(doc) {
    return doc
      .replace(/[0-9]/g, "") // remove numbers
      .split(/\W+/) // split on any non-word character
      .map(token => token.toLowerCase()) // convert to lowercase
      .filter(word => !this.stopWords.has(word)); // remove stop-words
  }
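
  // Illustrative usage (sketch, not from the original file; assumes "the" is
  // included in ENGLISH_STOP_WORDS):
  //
  //   new CountVectorizer().tokenize("the 3 cats sat");
  //   // => ["cats", "sat"]  (digits removed, "the" filtered out)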

  /**
   * Fit to corpus and generate the vocabulary
   *
   * @param {string[]} corpus list of text to fit vectorizer to
   */
  fit(corpus) {
    this.tokenizedCorpus = corpus.map(doc => this.tokenize(doc));
    const vocabSet = new Set();
    for (const doc of this.tokenizedCorpus) {
      for (const token of doc) {
        if (token) {
          vocabSet.add(token);
        }
      }
    }
    const vocab = Array.from(vocabSet);
    this.vocabToIdx = {};
    this.vocabLength = vocab.length;
    for (let i = 0; i < vocab.length; i++) {
      this.vocabToIdx[vocab[i]] = i;
    }
  }
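
  // Sketch of the state fit() builds (hypothetical corpus; vocabulary order
  // follows the first appearance of each token):
  //
  //   const cv = new CountVectorizer();
  //   cv.fit(["cats purr loudly", "dogs bark loudly"]);
  //   // cv.vocabToIdx => { cats: 0, purr: 1, loudly: 2, dogs: 3, bark: 4 }
  //   // cv.vocabLength => 5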

  /**
   * Transform tokenized documents into per-document counts of vocabulary tokens
   *
   * @param {string[][]} tokenizedCorpus list of tokenized documents to transform
   * @returns {number[][]} Mapped token counts per document
   */
  #transformTokenized(tokenizedCorpus) {
    return tokenizedCorpus.map(doc => {
      const counts = new Array(this.vocabLength).fill(0);
      for (const token of doc) {
        if (token in this.vocabToIdx) {
          counts[this.vocabToIdx[token]]++;
        }
      }
      return counts;
    });
  }

  /**
   * Transform text based on the existing vocabulary
   *
   * @param {string[]} corpus list of strings to transform
   * @returns {number[][]} transformed counts based on tokens
   */
  transform(corpus) {
    // If fit() already tokenized a corpus, that tokenization is reused and the
    // `corpus` argument is ignored; otherwise the given corpus is tokenized here
    const tokenizedCorpus =
      this.tokenizedCorpus || corpus.map(doc => this.tokenize(doc));
    return this.#transformTokenized(tokenizedCorpus);
  }

  /**
   * Fit to the corpus, then transform it based on the learned vocabulary
   *
   * @param {string[]} corpus list of strings to transform
   * @returns {number[][]} transformed counts based on tokens
   */
  fitTransform(corpus) {
    this.fit(corpus);
    return this.#transformTokenized(this.tokenizedCorpus);
  }
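
  // End-to-end sketch (hypothetical corpus): each row is one document, each
  // column the count of one vocabulary term:
  //
  //   new CountVectorizer().fitTransform(["cats purr loudly", "dogs bark loudly"]);
  //   // => [[1, 1, 1, 0, 0],
  //   //     [0, 0, 1, 1, 1]]  (columns: cats, purr, loudly, dogs, bark)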

  /**
   * Get the vocabulary terms learned during fit
   *
   * @returns {string[]} list of vocabulary terms, in index order
   */
  getFeatureNamesOut() {
    return Object.keys(this.vocabToIdx);
  }
}

/**
 * Simplified c-TF-IDF implementation based on
 * https://github.com/MaartenGr/BERTopic/blob/master/bertopic/vectorizers/_ctfidf.py
 * https://maartengr.github.io/BERTopic/api/ctfidf.html
 *
 * This class identifies keywords important to clusters of documents
 * by assigning scores to each keyword in the cluster relative to all clusters
 * present in the corpus
 */
export class CTfIdf {

  /**
   * Generate diagonal matrix using 1D array
   *
   * @param {number[]} array 1D array to create a diagonalized matrix for
   * @returns {number[][]} 2D matrix with diagonal values
   */
  createDiagonalMatrix(array) {
    const n = array.length; // Length of the input array
    const matrix = Array.from({ length: n }, () => Array(n).fill(0)); // Create an n x n matrix filled with 0

    for (let i = 0; i < n; i++) {
      matrix[i][i] = array[i]; // Place elements of the array along the main diagonal
    }
    return matrix;
  }
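
  // For instance:
  //
  //   new CTfIdf().createDiagonalMatrix([1, 2, 3]);
  //   // => [[1, 0, 0],
  //   //     [0, 2, 0],
  //   //     [0, 0, 3]]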

  /**
   * Normalize a 2D array row-wise using the L1 norm
   *
   * @param {number[][]} array 2D matrix to be normalized
   * @returns {number[][]} 2D matrix with normalized values by row
   */
  normalize(array) {
    // Perform normalization row-wise (axis=1)
    for (let i = 0; i < array.length; i++) {
      const row = array[i];
      // Calculate the L1 norm (sum of absolute values)
      const rowSum = row.reduce((sum, value) => sum + Math.abs(value), 0);

      if (rowSum !== 0) {
        // Prevent division by zero
        for (let j = 0; j < row.length; j++) {
          row[j] = row[j] / rowSum; // Normalize each element in the row
        }
      }
    }
    return array;
  }
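
  // For instance (note that the input rows are modified in place):
  //
  //   new CTfIdf().normalize([[1, 3], [2, 2]]);
  //   // => [[0.25, 0.75],
  //   //     [0.5, 0.5]]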

  /**
   * Learn the "idf" portion (global term weights)
   * by fitting to the count-vectorized array
   *
   * @param {number[][]} array A matrix of term/token counts.
   */
  fit(array) {
    if (array.length === 0 || array[0].length === 0) {
      throw new Error("Non-empty 2D array is required");
    }
    // Document frequency: total count of each term across all rows
    const df = array[0].map((_, colIndex) =>
      array.reduce((sum, row) => sum + row[colIndex], 0)
    );
    // Average number of tokens per row (cluster)
    const avgNrSamples = Math.floor(
      array.reduce(
        (total, row) => total + row.reduce((sum, value) => sum + value, 0),
        0
      ) / array.length
    );
    const idf = df.map(value => Math.log(avgNrSamples / value + 1));
    this.idfDiag = this.createDiagonalMatrix(idf);
  }
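
  // Worked sketch of fit() on the counts from the CountVectorizer example above:
  //
  //   counts = [[1, 1, 1, 0, 0], [0, 0, 1, 1, 1]]
  //   df           => [1, 1, 2, 1, 1]   (per-column sums)
  //   avgNrSamples => floor(6 / 2) = 3  (average tokens per row)
  //   idf          => [log(3/1 + 1), log(3/1 + 1), log(3/2 + 1), ...]
  //                ≈  [1.386, 1.386, 0.916, 1.386, 1.386]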

  /**
   * Score each keyword in the previously fitted clusters: L1-normalize the
   * counts, then scale them by the learned idf weights
   *
   * @param {number[][]} array count-vectorized 2D matrix to transform
   * @returns {number[][]} transformed matrix
   */
  transform(array) {
    // Note: normalize() modifies `array` in place
    const X = this.normalize(array);
    return matMul(X, this.idfDiag);
  }

  /**
   * Apply the fit and transform operations on the same corpus
   *
   * @param {number[][]} array count-vectorized 2D matrix to fit and transform
   * @returns {number[][]} transformed matrix
   */
  fitTransform(array) {
    this.fit(array);
    return this.transform(array);
  }
}
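
// Combined sketch: L1-normalized counts scaled by the idf diagonal. The term
// shared by both rows ("loudly" in the example above) gets the lowest score:
//
//   new CTfIdf().fitTransform([[1, 1, 1, 0, 0], [0, 0, 1, 1, 1]]);
//   // ≈ [[0.462, 0.462, 0.305, 0, 0],
//   //    [0, 0, 0.305, 0.462, 0.462]]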

/**
 * This class extracts unique keywords from a corpus of documents
 * It currently uses the CountVectorizer and CTfIdf classes but can
 * be extended as more methods are added
 */
export class KeywordExtractor {
  constructor() {
    this.cv = new CountVectorizer();
    this.ctfIdf = new CTfIdf();
  }

  /**
   * Generate the c-TF-IDF matrix of keyword scores
   * and return the n most important keywords per document
   *
   * @param {string[]} corpus list of documents
   * @param {number} n max number of keywords to extract
   * @returns {string[][]} top keywords for each document
   */
  fitTransform(corpus, n = MAX_WORDS_PER_DOCUMENT) {
    const transformedCorpus = this.cv.fitTransform(corpus);
    this.transformedCtfIdf = this.ctfIdf.fitTransform(transformedCorpus);
    const { sortedScores, sortedIndices } = this.#getSortedScoresForKeywords();
    return this.#getKeywordsPerCluster(sortedIndices, sortedScores, n);
  }
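
  // End-to-end sketch (hypothetical corpus; assumes "and" is among
  // ENGLISH_STOP_WORDS). Scores below MIN_THRESHOLD_FOR_KEYWORD are dropped:
  //
  //   new KeywordExtractor().fitTransform([
  //     "cats purr and cats nap",
  //     "dogs bark and dogs run",
  //   ]);
  //   // => [["cats", "purr", "nap"], ["dogs", "bark", "run"]]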

  /**
   * Sort the keyword scores of each cluster in descending order,
   * keeping track of the corresponding vocabulary indices
   *
   * @returns {{sortedScores: number[][], sortedIndices: number[][]}}
   */
  #getSortedScoresForKeywords() {
    const sortedIndices = this.transformedCtfIdf.map(row => {
      return row
        .map((value, index) => ({ value, index })) // Pair each value with its index
        .sort((a, b) => b.value - a.value) // Sort in descending order
        .map(item => item.index); // Extract sorted indices
    });
    const sortedScores = this.transformedCtfIdf.map((row, clusterIndex) => {
      return sortedIndices[clusterIndex].map(index => row[index]);
    });
    return { sortedIndices, sortedScores };
  }

  /**
   * For each cluster of documents in the corpus, extract the important keywords
   *
   * @param {number[][]} sortedIndices
   * @param {number[][]} sortedScores
   * @param {number} n max number of keywords to extract
   * @returns {string[][]}
   */
  #getKeywordsPerCluster(sortedIndices, sortedScores, n) {
    const numClusters = this.transformedCtfIdf.length;
    const numWords = this.transformedCtfIdf[0].length;
    const keywords = this.cv.getFeatureNamesOut();

    const keywordsForCluster = [];
    for (let cluster = 0; cluster < numClusters; cluster++) {
      const topicWords = [];

      // fetch the important keywords for each document
      for (let topScoreRef = 0; topScoreRef < numWords; topScoreRef++) {
        // break if we find a score less than the minimum threshold since the array is "score-sorted"
        // also break if we have reached the limit on the number of keywords to return
        if (
          sortedScores[cluster][topScoreRef] < MIN_THRESHOLD_FOR_KEYWORD ||
          topicWords.length >= n
        ) {
          break;
        }
        const wordIndex = sortedIndices[cluster][topScoreRef];
        topicWords.push(keywords[wordIndex]);
      }

      // Add the topic words to the list
      keywordsForCluster.push(topicWords);
    }
    return keywordsForCluster;
  }
}