firefox/toolkit/components/ml/content/ClusterAlgos.sys.mjs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * This module has math utility functions around clustering from a list of vectors
 */

const JUST_BELOW_1 = 0.99999;

/**
 * Utility function to measure accuracy and related metrics for a classification
 * task based on values from a confusion matrix.
 *
 * @param {object} confusionMatrix - An object containing the counts of classification outcomes.
 * @param {int[]} confusionMatrix.truePositives - Array of true positive counts.
 * @param {int[]} confusionMatrix.trueNegatives - Array of true negative counts.
 * @param {int[]} confusionMatrix.falsePositives - Array of false positive counts.
 * @param {int[]} confusionMatrix.falseNegatives - Array of false negative counts.
 * @returns {{accuracy: number, kappa: number}} An object containing accuracy and kappa values.
 */
export function getAccuracyStats({
  truePositives,
  trueNegatives,
  falsePositives,
  falseNegatives,
}) {
  const accuracy =
    (truePositives + trueNegatives) /
    (truePositives + trueNegatives + falsePositives + falseNegatives);
  const kappa = cohenKappa2x2({
    truePositives,
    falseNegatives,
    falsePositives,
    trueNegatives,
  });
  return { accuracy, kappa };
}

/**
 * Calculate Cohen's Kappa for a 2x2 confusion matrix.
 *
 * @param {object} confusionMatrix - An object containing the confusion matrix values.
 * @param {number} confusionMatrix.truePositives - True Positives (Actual Positive, Predicted Positive).
 * @param {number} confusionMatrix.falseNegatives - False Negatives (Actual Positive, Predicted Negative).
 * @param {number} confusionMatrix.falsePositives - False Positives (Actual Negative, Predicted Positive).
 * @param {number} confusionMatrix.trueNegatives - True Negatives (Actual Negative, Predicted Negative).
 * @returns {number} The Cohen's Kappa coefficient.
 */
export function cohenKappa2x2({
  truePositives,
  falseNegatives,
  falsePositives,
  trueNegatives,
}) {
  // Total number of samples
  const total = truePositives + falseNegatives + falsePositives + trueNegatives;
  if (total === 0) {
    return 0;
  }

  // Observed agreement (p0)
  const p0 = (truePositives + trueNegatives) / total;

  // Marginal sums (row and column totals)
  const row1 = truePositives + falseNegatives;
  const row2 = falsePositives + trueNegatives;
  const col1 = truePositives + falsePositives;
  const col2 = falseNegatives + trueNegatives;

  // Expected agreement (pe)
  const pe = (row1 * col1 + row2 * col2) / (total * total);

  // Cohen's Kappa
  return (p0 - pe) / (1 - pe);
}

/**
 * Compute inner product (dot product) of two vectors
 *
 * @param {number[]} point1 Array of numbers
 * @param  {number[]} point2 Array of numbers of same dimension
 * @returns {number} The inner product
 */
export function dotProduct(point1, point2) {
  return point1.reduce((acc, value, idx) => acc + value * point2[idx], 0);
}

/**
 * Find euclidean distance between two normalized points
 *
 * @param {number[]} point1 Array of numbers
 * @param  {number[]} point2 Array of numbers of same dimension
 * @returns {number}
 */
export function euclideanDistanceNormalized(point1, point2) {
  // Optimization described here https://math.stackexchange.com/a/2981910
  const dot = dotProduct(point1, point2);
  if (dot >= JUST_BELOW_1) {
    // Prevents edge case of square root of negative number
    return 0;
  }
  return Math.sqrt(2 - 2 * dot);
}

/**
 * Find average of two vectors
 *
 * @param {number[][]} vectors List of vectors
 * @returns {number[]} The mean of the vectors
 */
export function vectorMean(vectors) {
  if (vectors.length === 0) {
    return [];
  }
  const dims = vectors[0].length;
  const sum = new Array(dims).fill(0);
  vectors.forEach(a => {
    for (let i = 0; i < dims; i++) {
      sum[i] += a[i];
    }
  });
  return sum.map(a => a / vectors.length);
}

/**
 * Normalize a vector
 *
 * @param {number[]} vector The vector to normalize
 * @returns {number[]} The normalized vector
 */
export function vectorNormalize(vector) {
  // Calculate the magnitude of the vector
  const magnitude = Math.sqrt(vector.reduce((sum, c) => sum + c ** 2, 0));
  if (magnitude === 0) {
    return vector.map(() => 0); // Return a vector of zeros
  }
  return vector.map(c => c / magnitude);
}

/**
 * Find average distance between point and a list of points in a cluster.
 * If the point referenced is in the cluster it may be excluded
 *
 * @param {number[]} point
 * @param {number[][]} clusterPoints List of vectors
 * @param {boolean} excludeSelf Exclude point that is the same point (reference equality) from the averate
 * @returns {number} The mean distance
 */
export function meanDistance(point, clusterPoints, excludeSelf) {
  let totalDistance = 0;
  let count = 0;
  for (let i = 0; i < clusterPoints.length; i++) {
    // Exclude the point itself when calculating intra-cluster distance
    if (excludeSelf && clusterPoints[i] === point) {
      // This tests reference equality, not same values
      continue;
    }
    totalDistance += euclideanDistance(point, clusterPoints[i]);
    count++;
  }
  return count > 0 ? totalDistance / count : 0;
}

/**
 * Returns silhouette coefficients of each point in all clusters, returning as a list.
 *
 * @param {number[][]} data List of vectors
 * @param {number[][]} clusterRef List of clusters, each a list of references to points
 * @returns {number[]} List of silhouette scores per point
 */
export function silhouetteCoefficients(data, clusterRef) {
  const silhouetteScores = [];
  // Group points into clusters based on labels
  const clusters = clusterRef.map(clusterIds =>
    clusterIds.map(pointID => data[pointID])
  );

  // We need a label array where each index has a label referencing a cluster
  const labels = new Array(data.length).fill(0);
  for (let q = 0; q < clusterRef.length; q++) {
    clusterRef[q].forEach(a => {
      labels[a] = q;
    });
  }
  // Iterate through each point to compute its silhouette coefficient
  for (let i = 0; i < data.length; i++) {
    const point = data[i];
    const clusterLabel = labels[i];
    const ownCluster = clusters[clusterLabel];

    if (ownCluster.length === 1) {
      // Single item cluster has a score of 0
      silhouetteScores.push(0);
      continue;
    }
    // Compute the mean intra-cluster distance (a), excluding the point itself
    const a = meanDistance(point, ownCluster, true);
    // Compute the mean nearest-cluster distance (b)
    let b = Infinity;
    for (let label = 0; label < clusterRef.length; label++) {
      if (label !== clusterLabel) {
        const nearestClusterData = clusters[label];
        const distanceToCluster = meanDistance(
          point,
          nearestClusterData,
          false
        );
        if (distanceToCluster < b) {
          b = distanceToCluster;
        }
      }
    }
    // Compute silhouette score for the current point
    const max = Math.max(a, b);
    const silhouette = max > 0 ? (b - a) / max : 0.0;
    silhouetteScores.push(silhouette);
  }
  return silhouetteScores;
}

/**
 * Performs K-Means clustering with K-Means++ initialization of centroids.
 * If an existing cluster is specified with `anchorIndices`, then one of the centroids
 * is the average of the embeddings of the items in the cluster.
 *
 * @param {object} params - An object containing the parameters for K-Means clustering.
 * @param {number[][]} params.data - The dataset represented as an array of numerical vectors.
 * @param {number} params.k - The number of clusters.
 * @param {number} params.maxIterations - The maximum number of iterations.
 * @param {Function} [params.randomFunc=Math.random] - A randomization function that returns values between 0 and 1.
 * @param {int[]} [params.anchorIndices=[]] - Indices of items that are already in a cluster.
 * @param {int[]} [params.preassignedIndices=[]] - Indices of items with predefined cluster assignments.
 * @param {boolean} [params.freezeAnchorsInZeroCluster=true] - Whether to keep anchored indices fixed in the first cluster.
 * @returns {number[][]} A list of clusters with assigned indices.
 */
export function kmeansPlusPlus({
  data,
  k,
  maxIterations,
  randomFunc = Math.random,
  anchorIndices = [],
  preassignedIndices = [],
  freezeAnchorsInZeroCluster = true,
}) {
  randomFunc = randomFunc || randomGenerator;
  maxIterations = maxIterations || 300; // Default value for maxIterations if not provided
  const dimensions = data[0].length;
  const centroids = initializeCentroidsSorted({
    X: data,
    k,
    randomFunc,
    anchorIndices,
  });
  let resultClusters = [];
  const anchorSet = new Set(anchorIndices);
  const preassignedSet = new Set(preassignedIndices);
  for (let iter = 0; iter < maxIterations; iter++) {
    resultClusters = Array.from({ length: k }, () => []);
    let hasChanged = false;

    // Assign each data point to the nearest centroid
    for (let i = 0; i < data.length; i++) {
      if (freezeAnchorsInZeroCluster && anchorSet.has(i)) {
        resultClusters[0].push(i);
      } else {
        const point = data[i];
        const centroidIndex = getClosestCentroid(
          point,
          centroids,
          preassignedSet.has(i) && !anchorSet.has(i) ? 0 : -1
        );
        resultClusters[centroidIndex].push(i); // Push index instead of the point
      }
    }
    // Recompute centroids
    for (let j = 0; j < k; j++) {
      const newCentroid = computeCentroid(resultClusters[j], data, dimensions);
      if (!arePointsEqual(centroids[j], newCentroid)) {
        centroids[j] = newCentroid;
        hasChanged = true;
      }
    }
    // Stop if centroids don't change
    if (!hasChanged) {
      break;
    }
  }
  return resultClusters;
}

/**
 * Cumulative sum for an array
 *
 * @param {number[]} arr Array of numbers
 * @returns {number[]} List of numbers of the same size, with each a cumulative sum of the previous
 * plus the new number.
 */
export function stableCumsum(arr) {
  let sum = 0;
  return arr.map(value => (sum += value));
}

/**
 * Find distances from a single point to a list of points
 *
 * @param {number[]} point A 2d array
 * @param {number[][]} X List of points (nested 2d array)
 * @returns {number[][]} List of distances
 */
export function euclideanDistancesSquared(point, X) {
  return X.map(row =>
    row.reduce((distSq, val, i) => distSq + Math.pow(val - point[i], 2), 0)
  );
}

/**
 * Generates a random number between 0 and 1
 *
 * @returns {number}
 */
function randomGenerator() {
  return Math.random();
}

/**
 * Kmeans++ initialization of centroids by finding ones farther than one another
 * See https://dl.acm.org/doi/10.5555/1283383.1283494 for details as well as Scikit learn
 * immplementation
 *
 * @param {object} params - An object containing the parameters for centroid initialization.
 * @param {number[][]} params.X - The dataset represented as an array of numerical vectors.
 * @param {number} params.k - The number of clusters (i.e., k).
 * @param {Function} params.randomFunc - A random generator function (e.g., Math.random).
 * @param {number} params.numTrials - The number of trials for initialization.
 * @param {int[]} [params.anchorIndices=[]] - Indices of items that belong to an existing group.
 * @returns {number[][]} A list of `k` initialized cluster centers.
 */
export function initializeCentroidsSorted({
  X,
  k,
  randomFunc,
  numTrials,
  anchorIndices = [],
}) {
  const nSamples = X.length;
  const nFeatures = X[0].length;
  const centers = Array.from({ length: k }, () => new Array(nFeatures).fill(0));
  numTrials = numTrials || 2 + Math.floor(Math.log(k));

  const zeroOutAnchorItems = arr => {
    // We don't want to pick other items
    anchorIndices.forEach(a => (arr[a] = 0));
  };

  // First center is random unless anchor is specified
  let centerId;
  if (anchorIndices.length <= 1) {
    if (anchorIndices.length === 1) {
      centerId = anchorIndices[0];
    } else {
      centerId = Math.floor(randomFunc() * nSamples);
    }
    centers[0] = X[centerId].slice();
  } else {
    centers[0] = vectorNormalize(vectorMean(anchorIndices.map(a => X[a])));
  }

  // Get closest distances
  const closestDistSq = euclideanDistancesSquared(centers[0], X);
  let sumOfDistances = closestDistSq.reduce(function (sum, dist) {
    return sum + dist;
  }, 0);

  // Pick the remaining nClusters-1 points
  for (let c = 1; c < k; c++) {
    // Choose center candidates by sampling
    const randVals = Array(numTrials)
      .fill(0)
      .map(() => randomFunc() * sumOfDistances);
    const closestDistSqForSamples = closestDistSq.slice();
    if (anchorIndices.length > 1) {
      // Don't pick items from anchor cluster. We already have it
      zeroOutAnchorItems(closestDistSqForSamples);
    }
    const cumulativeProbs = stableCumsum(closestDistSqForSamples);
    const candidateIds = randVals
      .map(randVal => searchSorted(cumulativeProbs, randVal))
      .filter(candId => candId < nSamples); // TODO: Filter should not be needed if searchSorted works correctly
    // Compute distances to center candidates
    const distancesToCandidates = candidateIds.map(candidateId =>
      euclideanDistancesSquared(X[candidateId], X)
    );
    // Update closest distances squared and potential for each candidate
    const candidatesSumOfDistances = distancesToCandidates.map(distances =>
      closestDistSq.reduce(
        (sum, dist, j) => sum + Math.min(dist, distances[j]),
        0
      )
    );
    // Choose the best candidate
    const bestCandidateIdx = candidatesSumOfDistances.reduce(
      (bestIdx, pot, i) =>
        pot < candidatesSumOfDistances[bestIdx] ? i : bestIdx,
      0
    );
    const bestCandidate = candidateIds[bestCandidateIdx];
    // Update closest distance and potential
    for (let i = 0; i < closestDistSq.length; i++) {
      closestDistSq[i] = Math.min(
        closestDistSq[i],
        distancesToCandidates[bestCandidateIdx][i]
      );
    }
    sumOfDistances = candidatesSumOfDistances[bestCandidateIdx];
    // Pick best candidate
    centers[c] = X[bestCandidate].slice(); // Copy array
  }
  return centers;
}

/**
 * Binary search
 *
 * @param {number[]} arr List of sorted values (ascending)
 * @param {number} val Value to search for. Return lower index if no exact match
 * @returns {number} Index of found value
 */
export function searchSorted(arr, val) {
  let low = 0;
  let high = arr.length;
  while (low < high) {
    const mid = Math.floor((low + high) / 2);
    if (arr[mid] < val) {
      low = mid + 1;
    } else {
      high = mid;
    }
  }
  return low;
}

/**
 * Gets the centroid from a list of vectors
 *
 * @param {number[][]} vectors
 * @returns {number[]}
 */
export function computeCentroidFrom2DArray(vectors) {
  const numVectors = vectors.length;
  if (numVectors === 0) {
    return null;
  }
  const dimension = vectors[0].length;
  let centroid = new Array(dimension).fill(0);
  for (let i = 0; i < numVectors; i++) {
    for (let j = 0; j < dimension; j++) {
      centroid[j] += vectors[i][j];
    }
  }
  // Divide each component by the number of vectors to get the average
  for (let j = 0; j < dimension; j++) {
    centroid[j] /= numVectors;
  }
  return centroid;
}

/**
 * Helper function to compute Euclidean distance between two points
 *
 * @param {number[]} point1
 * @param {number[]} point2
 * @param {boolean} squareResult Return the square of the distance (i.e. don't apply SQRT)
 * @returns {number}
 */
export function euclideanDistance(point1, point2, squareResult) {
  const sum = point1.reduce(
    (acc, val, i) => acc + Math.pow(val - point2[i], 2),
    0
  );
  return squareResult ? sum : Math.sqrt(sum);
}

/**
 * Helper function to find closest centroid for a given point
 *
 * @param {number[]} point List of vectors
 * @param {number[][]} centroids List of centroids of same dimension as point
 * @param {number} excludeIndex Index of centroid to exclude from consideration
 * @returns {number} Index of the closest centroid
 */
export function getClosestCentroid(point, centroids, excludeIndex = -1) {
  var minDistance = Infinity;
  var closestIndex = -1;

  for (var i = 0; i < centroids.length; i++) {
    var distance = euclideanDistance(point, centroids[i]);
    if (distance < minDistance && i !== excludeIndex) {
      minDistance = distance;
      closestIndex = i;
    }
  }

  return closestIndex;
}

/**
 * Compute centroid of a cluster by reference
 *
 * @param {number[][]} cluster List of clusters, each with a list of indices
 * @param {number[][]} data List of all points
 * @param {number} dimensions
 * @returns {number[]}
 */
function computeCentroid(cluster, data, dimensions) {
  if (cluster.length === 0) {
    return new Array(dimensions).fill(0);
  }
  let centroid = new Array(dimensions).fill(0);
  for (let j = 0; j < cluster.length; j++) {
    let point = data[cluster[j]]; // Use the index to get the actual point
    for (let i = 0; i < dimensions; i++) {
      centroid[i] += point[i];
    }
  }
  for (let p = 0; p < dimensions; p++) {
    centroid[p] /= cluster.length;
  }
  return centroid;
}

/**
 * Returns true if both points have equal values
 *
 * @param {number[]} point1 Array of floats
 * @param {number[]} point2
 * @returns {boolean} points are equal
 */
function arePointsEqual(point1, point2) {
  return point1.every((value, index) => value === point2[index]);
}

/**
 * Computes a non-adjusted Rand score from a list of items, each with a label (true cluster id)
 * and cluster (predicted cluster id)
 *
 * @param {[object]} combinedItems
 * @param {string} clusterKey Key for predicted cluster in items
 * @param {string} labelKey Key for labeled (true) cluster in items
 * @returns {number}
 */
export function computeRandScore(combinedItems, clusterKey, labelKey) {
  let agreement = 0;
  let disagreement = 0;
  for (let i = 0; i < combinedItems.length; i++) {
    for (let j = i + 1; j < combinedItems.length; j++) {
      const itemA = combinedItems[i];
      const itemB = combinedItems[j];
      const predictedSame = itemA[clusterKey] === itemB[clusterKey];
      const labeledSame = itemA[labelKey] === itemB[labelKey];
      if (predictedSame === labeledSame) {
        agreement += 1;
      } else {
        disagreement += 1;
      }
    }
  }
  const total_items = agreement + disagreement;
  if (!total_items) {
    return 1.0;
  }
  return agreement / total_items;
}