/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* * This module has math utility functions around clustering from a list of vectors */ const JUST_BELOW_1 = 0.99999; /** * Utility function to measure accuracy and related metrics for a classification * task based on values from a confusion matrix. * * @param {object} confusionMatrix - An object containing the counts of classification outcomes. * @param {int[]} confusionMatrix.truePositives - Array of true positive counts. * @param {int[]} confusionMatrix.trueNegatives - Array of true negative counts. * @param {int[]} confusionMatrix.falsePositives - Array of false positive counts. * @param {int[]} confusionMatrix.falseNegatives - Array of false negative counts. * @returns {{accuracy: number, kappa: number}} An object containing accuracy and kappa values. */ export function getAccuracyStats({ truePositives, trueNegatives, falsePositives, falseNegatives, }) { const accuracy = (truePositives + trueNegatives) / (truePositives + trueNegatives + falsePositives + falseNegatives); const kappa = cohenKappa2x2({ truePositives, falseNegatives, falsePositives, trueNegatives, }); return { accuracy, kappa }; } /** * Calculate Cohen's Kappa for a 2x2 confusion matrix. * * @param {object} confusionMatrix - An object containing the confusion matrix values. * @param {number} confusionMatrix.truePositives - True Positives (Actual Positive, Predicted Positive). * @param {number} confusionMatrix.falseNegatives - False Negatives (Actual Positive, Predicted Negative). * @param {number} confusionMatrix.falsePositives - False Positives (Actual Negative, Predicted Positive). * @param {number} confusionMatrix.trueNegatives - True Negatives (Actual Negative, Predicted Negative). * @returns {number} The Cohen's Kappa coefficient. */ export function cohenKappa2x2({ truePositives, falseNegatives, falsePositives, trueNegatives, }) { // Total number of samples const total = truePositives + falseNegatives + falsePositives + trueNegatives; if (total === 0) { return 0; } // Observed agreement (p0) const p0 = (truePositives + trueNegatives) / total; // Marginal sums (row and column totals) const row1 = truePositives + falseNegatives; const row2 = falsePositives + trueNegatives; const col1 = truePositives + falsePositives; const col2 = falseNegatives + trueNegatives; // Expected agreement (pe) const pe = (row1 * col1 + row2 * col2) / (total * total); // Cohen's Kappa return (p0 - pe) / (1 - pe); } /** * Compute inner product (dot product) of two vectors * * @param {number[]} point1 Array of numbers * @param {number[]} point2 Array of numbers of same dimension * @returns {number} The inner product */ export function dotProduct(point1, point2) { return point1.reduce((acc, value, idx) => acc + value * point2[idx], 0); } /** * Find euclidean distance between two normalized points * * @param {number[]} point1 Array of numbers * @param {number[]} point2 Array of numbers of same dimension * @returns {number} */ export function euclideanDistanceNormalized(point1, point2) { // Optimization described here https://math.stackexchange.com/a/2981910 const dot = dotProduct(point1, point2); if (dot >= JUST_BELOW_1) { // Prevents edge case of square root of negative number return 0; } return Math.sqrt(2 - 2 * dot); } /** * Find average of two vectors * * @param {number[][]} vectors List of vectors * @returns {number[]} The mean of the vectors */ export function vectorMean(vectors) { if (vectors.length === 0) { return []; } const dims = vectors[0].length; const sum = new Array(dims).fill(0); vectors.forEach(a => { for (let i = 0; i < dims; i++) { sum[i] += a[i]; } }); return sum.map(a => a / vectors.length); } /** * Normalize a vector * * @param {number[]} vector The vector to normalize * @returns {number[]} The normalized vector */ export function vectorNormalize(vector) { // Calculate the magnitude of the vector const magnitude = Math.sqrt(vector.reduce((sum, c) => sum + c ** 2, 0)); if (magnitude === 0) { return vector.map(() => 0); // Return a vector of zeros } return vector.map(c => c / magnitude); } /** * Find average distance between point and a list of points in a cluster. * If the point referenced is in the cluster it may be excluded * * @param {number[]} point * @param {number[][]} clusterPoints List of vectors * @param {boolean} excludeSelf Exclude point that is the same point (reference equality) from the averate * @returns {number} The mean distance */ export function meanDistance(point, clusterPoints, excludeSelf) { let totalDistance = 0; let count = 0; for (let i = 0; i < clusterPoints.length; i++) { // Exclude the point itself when calculating intra-cluster distance if (excludeSelf && clusterPoints[i] === point) { // This tests reference equality, not same values continue; } totalDistance += euclideanDistance(point, clusterPoints[i]); count++; } return count > 0 ? totalDistance / count : 0; } /** * Returns silhouette coefficients of each point in all clusters, returning as a list. * * @param {number[][]} data List of vectors * @param {number[][]} clusterRef List of clusters, each a list of references to points * @returns {number[]} List of silhouette scores per point */ export function silhouetteCoefficients(data, clusterRef) { const silhouetteScores = []; // Group points into clusters based on labels const clusters = clusterRef.map(clusterIds => clusterIds.map(pointID => data[pointID]) ); // We need a label array where each index has a label referencing a cluster const labels = new Array(data.length).fill(0); for (let q = 0; q < clusterRef.length; q++) { clusterRef[q].forEach(a => { labels[a] = q; }); } // Iterate through each point to compute its silhouette coefficient for (let i = 0; i < data.length; i++) { const point = data[i]; const clusterLabel = labels[i]; const ownCluster = clusters[clusterLabel]; if (ownCluster.length === 1) { // Single item cluster has a score of 0 silhouetteScores.push(0); continue; } // Compute the mean intra-cluster distance (a), excluding the point itself const a = meanDistance(point, ownCluster, true); // Compute the mean nearest-cluster distance (b) let b = Infinity; for (let label = 0; label < clusterRef.length; label++) { if (label !== clusterLabel) { const nearestClusterData = clusters[label]; const distanceToCluster = meanDistance( point, nearestClusterData, false ); if (distanceToCluster < b) { b = distanceToCluster; } } } // Compute silhouette score for the current point const max = Math.max(a, b); const silhouette = max > 0 ? (b - a) / max : 0.0; silhouetteScores.push(silhouette); } return silhouetteScores; } /** * Performs K-Means clustering with K-Means++ initialization of centroids. * If an existing cluster is specified with `anchorIndices`, then one of the centroids * is the average of the embeddings of the items in the cluster. * * @param {object} params - An object containing the parameters for K-Means clustering. * @param {number[][]} params.data - The dataset represented as an array of numerical vectors. * @param {number} params.k - The number of clusters. * @param {number} params.maxIterations - The maximum number of iterations. * @param {Function} [params.randomFunc=Math.random] - A randomization function that returns values between 0 and 1. * @param {int[]} [params.anchorIndices=[]] - Indices of items that are already in a cluster. * @param {int[]} [params.preassignedIndices=[]] - Indices of items with predefined cluster assignments. * @param {boolean} [params.freezeAnchorsInZeroCluster=true] - Whether to keep anchored indices fixed in the first cluster. * @returns {number[][]} A list of clusters with assigned indices. */ export function kmeansPlusPlus({ data, k, maxIterations, randomFunc = Math.random, anchorIndices = [], preassignedIndices = [], freezeAnchorsInZeroCluster = true, }) { randomFunc = randomFunc || randomGenerator; maxIterations = maxIterations || 300; // Default value for maxIterations if not provided const dimensions = data[0].length; const centroids = initializeCentroidsSorted({ X: data, k, randomFunc, anchorIndices, }); let resultClusters = []; const anchorSet = new Set(anchorIndices); const preassignedSet = new Set(preassignedIndices); for (let iter = 0; iter < maxIterations; iter++) { resultClusters = Array.from({ length: k }, () => []); let hasChanged = false; // Assign each data point to the nearest centroid for (let i = 0; i < data.length; i++) { if (freezeAnchorsInZeroCluster && anchorSet.has(i)) { resultClusters[0].push(i); } else { const point = data[i]; const centroidIndex = getClosestCentroid( point, centroids, preassignedSet.has(i) && !anchorSet.has(i) ? 0 : -1 ); resultClusters[centroidIndex].push(i); // Push index instead of the point } } // Recompute centroids for (let j = 0; j < k; j++) { const newCentroid = computeCentroid(resultClusters[j], data, dimensions); if (!arePointsEqual(centroids[j], newCentroid)) { centroids[j] = newCentroid; hasChanged = true; } } // Stop if centroids don't change if (!hasChanged) { break; } } return resultClusters; } /** * Cumulative sum for an array * * @param {number[]} arr Array of numbers * @returns {number[]} List of numbers of the same size, with each a cumulative sum of the previous * plus the new number. */ export function stableCumsum(arr) { let sum = 0; return arr.map(value => (sum += value)); } /** * Find distances from a single point to a list of points * * @param {number[]} point A 2d array * @param {number[][]} X List of points (nested 2d array) * @returns {number[][]} List of distances */ export function euclideanDistancesSquared(point, X) { return X.map(row => row.reduce((distSq, val, i) => distSq + Math.pow(val - point[i], 2), 0) ); } /** * Generates a random number between 0 and 1 * * @returns {number} */ function randomGenerator() { return Math.random(); } /** * Kmeans++ initialization of centroids by finding ones farther than one another * See https://dl.acm.org/doi/10.5555/1283383.1283494 for details as well as Scikit learn * immplementation * * @param {object} params - An object containing the parameters for centroid initialization. * @param {number[][]} params.X - The dataset represented as an array of numerical vectors. * @param {number} params.k - The number of clusters (i.e., k). * @param {Function} params.randomFunc - A random generator function (e.g., Math.random). * @param {number} params.numTrials - The number of trials for initialization. * @param {int[]} [params.anchorIndices=[]] - Indices of items that belong to an existing group. * @returns {number[][]} A list of `k` initialized cluster centers. */ export function initializeCentroidsSorted({ X, k, randomFunc, numTrials, anchorIndices = [], }) { const nSamples = X.length; const nFeatures = X[0].length; const centers = Array.from({ length: k }, () => new Array(nFeatures).fill(0)); numTrials = numTrials || 2 + Math.floor(Math.log(k)); const zeroOutAnchorItems = arr => { // We don't want to pick other items anchorIndices.forEach(a => (arr[a] = 0)); }; // First center is random unless anchor is specified let centerId; if (anchorIndices.length <= 1) { if (anchorIndices.length === 1) { centerId = anchorIndices[0]; } else { centerId = Math.floor(randomFunc() * nSamples); } centers[0] = X[centerId].slice(); } else { centers[0] = vectorNormalize(vectorMean(anchorIndices.map(a => X[a]))); } // Get closest distances const closestDistSq = euclideanDistancesSquared(centers[0], X); let sumOfDistances = closestDistSq.reduce(function (sum, dist) { return sum + dist; }, 0); // Pick the remaining nClusters-1 points for (let c = 1; c < k; c++) { // Choose center candidates by sampling const randVals = Array(numTrials) .fill(0) .map(() => randomFunc() * sumOfDistances); const closestDistSqForSamples = closestDistSq.slice(); if (anchorIndices.length > 1) { // Don't pick items from anchor cluster. We already have it zeroOutAnchorItems(closestDistSqForSamples); } const cumulativeProbs = stableCumsum(closestDistSqForSamples); const candidateIds = randVals .map(randVal => searchSorted(cumulativeProbs, randVal)) .filter(candId => candId < nSamples); // TODO: Filter should not be needed if searchSorted works correctly // Compute distances to center candidates const distancesToCandidates = candidateIds.map(candidateId => euclideanDistancesSquared(X[candidateId], X) ); // Update closest distances squared and potential for each candidate const candidatesSumOfDistances = distancesToCandidates.map(distances => closestDistSq.reduce( (sum, dist, j) => sum + Math.min(dist, distances[j]), 0 ) ); // Choose the best candidate const bestCandidateIdx = candidatesSumOfDistances.reduce( (bestIdx, pot, i) => pot < candidatesSumOfDistances[bestIdx] ? i : bestIdx, 0 ); const bestCandidate = candidateIds[bestCandidateIdx]; // Update closest distance and potential for (let i = 0; i < closestDistSq.length; i++) { closestDistSq[i] = Math.min( closestDistSq[i], distancesToCandidates[bestCandidateIdx][i] ); } sumOfDistances = candidatesSumOfDistances[bestCandidateIdx]; // Pick best candidate centers[c] = X[bestCandidate].slice(); // Copy array } return centers; } /** * Binary search * * @param {number[]} arr List of sorted values (ascending) * @param {number} val Value to search for. Return lower index if no exact match * @returns {number} Index of found value */ export function searchSorted(arr, val) { let low = 0; let high = arr.length; while (low < high) { const mid = Math.floor((low + high) / 2); if (arr[mid] < val) { low = mid + 1; } else { high = mid; } } return low; } /** * Gets the centroid from a list of vectors * * @param {number[][]} vectors * @returns {number[]} */ export function computeCentroidFrom2DArray(vectors) { const numVectors = vectors.length; if (numVectors === 0) { return null; } const dimension = vectors[0].length; let centroid = new Array(dimension).fill(0); for (let i = 0; i < numVectors; i++) { for (let j = 0; j < dimension; j++) { centroid[j] += vectors[i][j]; } } // Divide each component by the number of vectors to get the average for (let j = 0; j < dimension; j++) { centroid[j] /= numVectors; } return centroid; } /** * Helper function to compute Euclidean distance between two points * * @param {number[]} point1 * @param {number[]} point2 * @param {boolean} squareResult Return the square of the distance (i.e. don't apply SQRT) * @returns {number} */ export function euclideanDistance(point1, point2, squareResult) { const sum = point1.reduce( (acc, val, i) => acc + Math.pow(val - point2[i], 2), 0 ); return squareResult ? sum : Math.sqrt(sum); } /** * Helper function to find closest centroid for a given point * * @param {number[]} point List of vectors * @param {number[][]} centroids List of centroids of same dimension as point * @param {number} excludeIndex Index of centroid to exclude from consideration * @returns {number} Index of the closest centroid */ export function getClosestCentroid(point, centroids, excludeIndex = -1) { var minDistance = Infinity; var closestIndex = -1; for (var i = 0; i < centroids.length; i++) { var distance = euclideanDistance(point, centroids[i]); if (distance < minDistance && i !== excludeIndex) { minDistance = distance; closestIndex = i; } } return closestIndex; } /** * Compute centroid of a cluster by reference * * @param {number[][]} cluster List of clusters, each with a list of indices * @param {number[][]} data List of all points * @param {number} dimensions * @returns {number[]} */ function computeCentroid(cluster, data, dimensions) { if (cluster.length === 0) { return new Array(dimensions).fill(0); } let centroid = new Array(dimensions).fill(0); for (let j = 0; j < cluster.length; j++) { let point = data[cluster[j]]; // Use the index to get the actual point for (let i = 0; i < dimensions; i++) { centroid[i] += point[i]; } } for (let p = 0; p < dimensions; p++) { centroid[p] /= cluster.length; } return centroid; } /** * Returns true if both points have equal values * * @param {number[]} point1 Array of floats * @param {number[]} point2 * @returns {boolean} points are equal */ function arePointsEqual(point1, point2) { return point1.every((value, index) => value === point2[index]); } /** * Computes a non-adjusted Rand score from a list of items, each with a label (true cluster id) * and cluster (predicted cluster id) * * @param {[object]} combinedItems * @param {string} clusterKey Key for predicted cluster in items * @param {string} labelKey Key for labeled (true) cluster in items * @returns {number} */ export function computeRandScore(combinedItems, clusterKey, labelKey) { let agreement = 0; let disagreement = 0; for (let i = 0; i < combinedItems.length; i++) { for (let j = i + 1; j < combinedItems.length; j++) { const itemA = combinedItems[i]; const itemB = combinedItems[j]; const predictedSame = itemA[clusterKey] === itemB[clusterKey]; const labeledSame = itemA[labelKey] === itemB[labelKey]; if (predictedSame === labeledSame) { agreement += 1; } else { disagreement += 1; } } } const total_items = agreement + disagreement; if (!total_items) { return 1.0; } return agreement / total_items; }