/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { ENGLISH_STOP_WORDS } from "chrome://global/content/ml/StopWords.sys.mjs";

// max number of keywords to extract for each document in corpus
const MAX_WORDS_PER_DOCUMENT = 3;

// min threshold to extract a keyword from document in corpus
const MIN_THRESHOLD_FOR_KEYWORD = 0.05;

/**
 * A simple implementation of matrix multiplication
 *
 * @param {number[][]} A 2D matrix
 * @param {number[][]} B 2D matrix
 * @returns {number[][]} 2D matrix multiplication result
 */
export function matMul(A, B) {
  // Check if multiplication is possible (i.e., number of columns in A == number of rows in B)
  if (!A || !B || A[0].length !== B.length) {
    throw new Error("Matrix dimensions are incompatible for multiplication.");
  }

  // Create the result matrix with appropriate dimensions (rows of A, columns of B)
  const result = [];
  for (let i = 0; i < A.length; i++) {
    result[i] = [];
    for (let j = 0; j < B[0].length; j++) {
      let sum = 0;
      // Sum the product of corresponding elements
      for (let k = 0; k < A[0].length; k++) {
        sum += A[i][k] * B[k][j];
      }
      result[i][j] = sum;
    }
  }

  return result;
}
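
// Illustrative usage (a sketch, not part of the original file): a 2x2 matrix
// times a 2x1 matrix yields a 2x1 result.
//
//   matMul(
//     [[1, 2],
//      [3, 4]],
//     [[5], [6]]
//   );
//   // => [[17], [39]]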

/**
 * Calculates cosine similarity between two lists of floats
 * The lists don't need to be normalized
 *
 * @param {number[]} A first list
 * @param {number[]} B second list
 * @returns {number} cosine similarity value
 */
export function cosSim(A, B) {
  if (!A || !B || A.length !== B.length) {
    throw new Error("Lists should have the same length and not be undefined");
  }
  // Both lists are empty (their lengths are already known to match)
  if (A.length === 0) {
    return 0;
  }

  let dotProduct = 0;
  let mA = 0;
  let mB = 0;

  for (let i = 0; i < A.length; i++) {
    dotProduct += A[i] * B[i];
    mA += A[i] * A[i];
    mB += B[i] * B[i];
  }

  mA = Math.sqrt(mA);
  mB = Math.sqrt(mB);

  return mA === 0 || mB === 0 ? 0 : dotProduct / (mA * mB);
}
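
// Illustrative usage (sketch): parallel vectors score 1, orthogonal vectors 0.
//
//   cosSim([1, 2, 3], [2, 4, 6]); // => 1 (up to floating point)
//   cosSim([1, 0], [0, 1]); // => 0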

/**
 * This class is used to generate a vectorized count of keywords
 * that are not part of the chosen language stopwords
 */
export class CountVectorizer {
  constructor(stopWords = "EN") {
    // Only English stop words are bundled; any other value yields an empty stop-word set
    this.stopWords = new Set(stopWords === "EN" ? ENGLISH_STOP_WORDS : []);
  }

  /**
   * Tokenize document based on non-stopwords
   *
   * @param {string} doc String to tokenize
   * @returns {string[]} List containing token strings
   */
  tokenize(doc) {
    return doc
      .replace(/[0-9]/g, "") // remove numbers
      .split(/\W+/) // split on any non-word character
      .map(token => token.toLowerCase()) // convert to lowercase
      .filter(word => !this.stopWords.has(word)); // remove stop-words
  }
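
  // Illustrative usage (sketch, not from the original file; assumes "the" is
  // included in ENGLISH_STOP_WORDS):
  //
  //   new CountVectorizer().tokenize("the 3 cats sat");
  //   // => ["cats", "sat"]  (digits removed, "the" filtered out)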

  /**
   * Fit to corpus and generate the vocabulary
   *
   * @param {string[]} corpus list of text to fit vectorizer to
   */
  fit(corpus) {
    this.tokenizedCorpus = corpus.map(doc => this.tokenize(doc));
    const vocabSet = new Set();
    for (const doc of this.tokenizedCorpus) {
      for (const token of doc) {
        if (token) {
          vocabSet.add(token);
        }
      }
    }
    const vocab = Array.from(vocabSet);
    this.vocabToIdx = {};
    this.vocabLength = vocab.length;
    for (let i = 0; i < vocab.length; i++) {
      this.vocabToIdx[vocab[i]] = i;
    }
  }
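
  // Sketch of the state fit() builds (hypothetical corpus; vocabulary order
  // follows the first appearance of each token):
  //
  //   const cv = new CountVectorizer();
  //   cv.fit(["cats purr loudly", "dogs bark loudly"]);
  //   // cv.vocabToIdx => { cats: 0, purr: 1, loudly: 2, dogs: 3, bark: 4 }
  //   // cv.vocabLength => 5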

  /**
   * Transform tokenized documents into per-document counts of vocabulary tokens
   *
   * @param {string[][]} tokenizedCorpus list of tokenized documents to transform
   * @returns {number[][]} Mapped token counts per document
   */
  #transformTokenized(tokenizedCorpus) {
    return tokenizedCorpus.map(doc => {
      const counts = new Array(this.vocabLength).fill(0);
      for (const token of doc) {
        if (token in this.vocabToIdx) {
          counts[this.vocabToIdx[token]]++;
        }
      }
      return counts;
    });
  }

  /**
   * Transform text based on the existing vocabulary
   *
   * @param {string[]} corpus list of strings to transform
   * @returns {number[][]} transformed counts based on tokens
   */
  transform(corpus) {
    // If fit() already tokenized a corpus, that tokenization is reused and the
    // `corpus` argument is ignored; otherwise the given corpus is tokenized here
    const tokenizedCorpus =
      this.tokenizedCorpus || corpus.map(doc => this.tokenize(doc));
    return this.#transformTokenized(tokenizedCorpus);
  }

  /**
   * Fit to the corpus, then transform it based on the learned vocabulary
   *
   * @param {string[]} corpus list of strings to transform
   * @returns {number[][]} transformed counts based on tokens
   */
  fitTransform(corpus) {
    this.fit(corpus);
    return this.#transformTokenized(this.tokenizedCorpus);
  }
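
  // End-to-end sketch (hypothetical corpus): each row is one document, each
  // column the count of one vocabulary term:
  //
  //   new CountVectorizer().fitTransform(["cats purr loudly", "dogs bark loudly"]);
  //   // => [[1, 1, 1, 0, 0],
  //   //     [0, 0, 1, 1, 1]]  (columns: cats, purr, loudly, dogs, bark)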

  /**
   * Get the vocabulary terms learned during fit
   *
   * @returns {string[]} list of vocabulary terms, in index order
   */
  getFeatureNamesOut() {
    return Object.keys(this.vocabToIdx);
  }
}

/**
 * Simplified c-TF-IDF implementation based on
 * https://github.com/MaartenGr/BERTopic/blob/master/bertopic/vectorizers/_ctfidf.py
 * https://maartengr.github.io/BERTopic/api/ctfidf.html
 *
 * This class identifies keywords important to clusters of documents
 * by assigning scores to each keyword in the cluster relative to all clusters
 * present in the corpus
 */
export class CTfIdf {

  /**
   * Generate diagonal matrix using 1D array
   *
   * @param {number[]} array 1D array to create a diagonalized matrix for
   * @returns {number[][]} 2D matrix with diagonal values
   */
  createDiagonalMatrix(array) {
    const n = array.length; // Length of the input array
    const matrix = Array.from({ length: n }, () => Array(n).fill(0)); // Create an n x n matrix filled with 0

    for (let i = 0; i < n; i++) {
      matrix[i][i] = array[i]; // Place elements of the array along the main diagonal
    }
    return matrix;
  }
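
  // For instance:
  //
  //   new CTfIdf().createDiagonalMatrix([1, 2, 3]);
  //   // => [[1, 0, 0],
  //   //     [0, 2, 0],
  //   //     [0, 0, 3]]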

  /**
   * Normalize a 2D array row-wise using the L1 norm
   *
   * @param {number[][]} array 2D matrix to be normalized
   * @returns {number[][]} 2D matrix with normalized values by row
   */
  normalize(array) {
    // Perform normalization row-wise (axis=1)
    for (let i = 0; i < array.length; i++) {
      const row = array[i];
      // Calculate the L1 norm (sum of absolute values)
      const rowSum = row.reduce((sum, value) => sum + Math.abs(value), 0);

      if (rowSum !== 0) {
        // Prevent division by zero
        for (let j = 0; j < row.length; j++) {
          row[j] = row[j] / rowSum; // Normalize each element in the row
        }
      }
    }
    return array;
  }
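
  // For instance (note that the input rows are modified in place):
  //
  //   new CTfIdf().normalize([[1, 3], [2, 2]]);
  //   // => [[0.25, 0.75],
  //   //     [0.5, 0.5]]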

  /**
   * Learn the "idf" portion (global term weights)
   * by fitting to the count-vectorized array
   *
   * @param {number[][]} array A matrix of term/token counts.
   */
  fit(array) {
    if (array.length === 0 || array[0].length === 0) {
      throw new Error("Non-empty 2D array is required");
    }
    // Document frequency: total count of each term across all rows
    const df = array[0].map((_, colIndex) =>
      array.reduce((sum, row) => sum + row[colIndex], 0)
    );
    // Average number of tokens per row (cluster)
    const avgNrSamples = Math.floor(
      array.reduce(
        (total, row) => total + row.reduce((sum, value) => sum + value, 0),
        0
      ) / array.length
    );
    const idf = df.map(value => Math.log(avgNrSamples / value + 1));
    this.idfDiag = this.createDiagonalMatrix(idf);
  }
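
  // Worked sketch of fit() on the counts from the CountVectorizer example above:
  //
  //   counts = [[1, 1, 1, 0, 0], [0, 0, 1, 1, 1]]
  //   df           => [1, 1, 2, 1, 1]   (per-column sums)
  //   avgNrSamples => floor(6 / 2) = 3  (average tokens per row)
  //   idf          => [log(3/1 + 1), log(3/1 + 1), log(3/2 + 1), ...]
  //                ≈  [1.386, 1.386, 0.916, 1.386, 1.386]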

  /**
   * Score each keyword in the previously fitted clusters: L1-normalize the
   * counts, then scale them by the learned idf weights
   *
   * @param {number[][]} array count-vectorized 2D matrix to transform
   * @returns {number[][]} transformed matrix
   */
  transform(array) {
    // Note: normalize() modifies `array` in place
    const X = this.normalize(array);
    return matMul(X, this.idfDiag);
  }

  /**
   * Apply the fit and transform operations on the same corpus
   *
   * @param {number[][]} array count-vectorized 2D matrix to fit and transform
   * @returns {number[][]} transformed matrix
   */
  fitTransform(array) {
    this.fit(array);
    return this.transform(array);
  }
}
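
// Combined sketch: L1-normalized counts scaled by the idf diagonal. The term
// shared by both rows ("loudly" in the example above) gets the lowest score:
//
//   new CTfIdf().fitTransform([[1, 1, 1, 0, 0], [0, 0, 1, 1, 1]]);
//   // ≈ [[0.462, 0.462, 0.305, 0, 0],
//   //    [0, 0, 0.305, 0.462, 0.462]]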

/**
 * This class extracts unique keywords from a corpus of documents
 * It currently uses the CountVectorizer and CTfIdf classes but can
 * be extended as more methods are added
 */
export class KeywordExtractor {
  constructor() {
    this.cv = new CountVectorizer();
    this.ctfIdf = new CTfIdf();
  }

  /**
   * Generate the c-TF-IDF matrix of keyword scores
   * and return the n most important keywords per document
   *
   * @param {string[]} corpus list of documents
   * @param {number} n max number of keywords to extract
   * @returns {string[][]} top keywords for each document
   */
  fitTransform(corpus, n = MAX_WORDS_PER_DOCUMENT) {
    const transformedCorpus = this.cv.fitTransform(corpus);
    this.transformedCtfIdf = this.ctfIdf.fitTransform(transformedCorpus);
    const { sortedScores, sortedIndices } = this.#getSortedScoresForKeywords();
    return this.#getKeywordsPerCluster(sortedIndices, sortedScores, n);
  }
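
  // End-to-end sketch (hypothetical corpus; assumes "and" is among
  // ENGLISH_STOP_WORDS). Scores below MIN_THRESHOLD_FOR_KEYWORD are dropped:
  //
  //   new KeywordExtractor().fitTransform([
  //     "cats purr and cats nap",
  //     "dogs bark and dogs run",
  //   ]);
  //   // => [["cats", "purr", "nap"], ["dogs", "bark", "run"]]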

  /**
   * Sort the keyword scores of each cluster in descending order,
   * keeping track of the corresponding vocabulary indices
   *
   * @returns {{sortedScores: number[][], sortedIndices: number[][]}}
   */
  #getSortedScoresForKeywords() {
    const sortedIndices = this.transformedCtfIdf.map(row => {
      return row
        .map((value, index) => ({ value, index })) // Pair each value with its index
        .sort((a, b) => b.value - a.value) // Sort in descending order
        .map(item => item.index); // Extract sorted indices
    });
    const sortedScores = this.transformedCtfIdf.map((row, clusterIndex) => {
      return sortedIndices[clusterIndex].map(index => row[index]);
    });
    return { sortedIndices, sortedScores };
  }

  /**
   * For each cluster of documents in the corpus, extract the important keywords
   *
   * @param {number[][]} sortedIndices
   * @param {number[][]} sortedScores
   * @param {number} n max number of keywords to extract
   * @returns {string[][]}
   */
  #getKeywordsPerCluster(sortedIndices, sortedScores, n) {
    const numClusters = this.transformedCtfIdf.length;
    const numWords = this.transformedCtfIdf[0].length;
    const keywords = this.cv.getFeatureNamesOut();

    const keywordsForCluster = [];
    for (let cluster = 0; cluster < numClusters; cluster++) {
      const topicWords = [];

      // fetch the important keywords for each document
      for (let topScoreRef = 0; topScoreRef < numWords; topScoreRef++) {
        // break if we find a score less than the minimum threshold since the array is "score-sorted"
        // also break if we have reached the limit on the number of keywords to return
        if (
          sortedScores[cluster][topScoreRef] < MIN_THRESHOLD_FOR_KEYWORD ||
          topicWords.length >= n
        ) {
          break;
        }
        const wordIndex = sortedIndices[cluster][topScoreRef];
        topicWords.push(keywords[wordIndex]);
      }

      // Add the topic words to the list
      keywordsForCluster.push(topicWords);
    }
    return keywordsForCluster;
  }
}