/* Any copyright is dedicated to the Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ */ /// // Load the shared-head file first. Services.scriptloader.loadSubScript( "chrome://mochitests/content/browser/toolkit/components/ml/tests/browser/shared-head.js", this ); /** * @type {import("../../actors/MLEngineParent.sys.mjs")} */ const { MLEngineParent, MLEngine } = ChromeUtils.importESModule( "resource://gre/actors/MLEngineParent.sys.mjs" ); const { ModelHub, TestIndexedDBCache } = ChromeUtils.importESModule( "chrome://global/content/ml/ModelHub.sys.mjs" ); const { getInferenceProcessInfo } = ChromeUtils.importESModule( "chrome://global/content/ml/Utils.sys.mjs" ); const MS_PER_SEC = 1000; const IndexedDBCache = TestIndexedDBCache; const { createEngine, PipelineOptions, QuantizationLevel, ExecutionPriority, InferenceDevice, LogLevel, } = ChromeUtils.importESModule( "chrome://global/content/ml/EngineProcess.sys.mjs" ); // This test suite shares some utility functions with translations as they work in a very // similar fashion. Eventually, the plan is to unify these two components. Services.scriptloader.loadSubScript( "chrome://mochitests/content/browser/toolkit/components/translations/tests/browser/shared-head.js", this ); /** * Sets up the stage for a test * */ async function setup({ disabled = false, prefs = [], records = null, backend, } = {}) { const { removeMocks, remoteClients } = await createAndMockMLRemoteSettings({ autoDownloadFromRemoteSettings: false, records, backend, }); await SpecialPowers.pushPrefEnv({ set: [ // Enabled by default. ["browser.ml.enable", !disabled], ["browser.ml.logLevel", "All"], ["browser.ml.modelCacheTimeout", 1000], ["browser.ml.checkForMemory", false], ["browser.ml.queueWaitTimeout", 2], ["javascript.options.wasm_lazy_tiering", true], ...prefs, ], }); return { remoteClients, async cleanup() { await removeMocks(); await waitForCondition( () => EngineProcess.areAllEnginesTerminated(), "Waiting for all of the engines to be terminated.", 100, 200 ); await SpecialPowers.popPrefEnv(); }, }; } function getDefaultWasmRecords(backend) { return [ { name: MLEngineParent.WASM_FILENAME[ backend || MLEngineParent.DEFAULT_BACKEND ], version: MLEngineParent.WASM_MAJOR_VERSION[ backend || MLEngineParent.DEFAULT_BACKEND ] + ".0", }, ]; } async function createAndMockMLRemoteSettings({ autoDownloadFromRemoteSettings = false, records = null, backend, } = {}) { const wasmRecords = getDefaultWasmRecords(backend).map( ({ name, version }) => ({ id: crypto.randomUUID(), name, version, last_modified: Date.now(), schema: Date.now(), }) ); const runtime = await createRemoteClient({ collectionName: "test-translation-wasm", records: wasmRecords, attachmentMock: true, autoDownloadFromRemoteSettings, }); const options = await createRemoteClient({ records: records || [ { taskName: "moz-echo", modelId: "mozilla/distilvit", processorId: "mozilla/distilvit", tokenizerId: "mozilla/distilvit", modelRevision: "main", processorRevision: "main", tokenizerRevision: "main", dtype: "q8", id: "74a71cfd-1734-44e6-85c0-69cf3e874138", }, ], collectionName: "test-ml-inference-options", }); const allowDeny = await createRemoteClient({ records: [ { filter: "ALLOW", urlPrefix: "https://", id: "74a71cfd-1734-44e6-85c0-69cf3e874138", }, ], collectionName: "test-ml-allow-deny-list", }); const remoteClients = { "ml-onnx-runtime": runtime, "ml-inference-options": options, "ml-model-allow-deny-list": allowDeny, }; MLEngineParent.mockRemoteSettings({ "ml-onnx-runtime": runtime.client, "ml-inference-options": options, "ml-model-allow-deny-list": allowDeny, }); return { async removeMocks() { await runtime.client.attachments.deleteAll(); await runtime.client.db.clear(); await options.db.clear(); await allowDeny.db.clear(); MLEngineParent.removeMocks(); }, remoteClients, }; } /** * Creates a local RemoteSettingsClient for use within tests. * * @returns {RemoteSettings|AttachmentMock} */ async function createRemoteClient({ records, collectionName, attachmentMock = false, autoDownloadFromRemoteSettings = false, }) { const { RemoteSettings } = ChromeUtils.importESModule( "resource://services-settings/remote-settings.sys.mjs" ); const client = RemoteSettings(`${collectionName}-${_remoteSettingsMockId++}`); await client.db.clear(); await client.db.importChanges({}, Date.now(), records); if (attachmentMock) { return createAttachmentMock( client, collectionName, autoDownloadFromRemoteSettings ); } return client; } /* * Perftest related */ const ONE_MIB = 1024 * 1024; const INIT_START = "initializationStart"; const INIT_END = "initializationEnd"; const RUN_START = "runStart"; const RUN_END = "runEnd"; const PIPELINE_READY_START = "ensurePipelineIsReadyStart"; const PIPELINE_READY_END = "ensurePipelineIsReadyEnd"; const PIPELINE_READY_LATENCY = "pipeline-ready-latency"; const INITIALIZATION_LATENCY = "initialization-latency"; const MODEL_RUN_LATENCY = "model-run-latency"; const TOTAL_MEMORY_USAGE = "total-memory-usage"; const COLD_START_PREFIX = "cold-start-"; const PEAK_MEMORY_USAGE = "peak-memory-usage"; const ITERATIONS = 10; const WHEN = "when"; const MEMORY = "memory"; const E2E_INIT_LATENCY = "e2e-init-latency"; const FIRST_TOKEN_LATENCY = "1st-token-latency"; const DECODING_LATENCY = "decoding-latency"; // Token speeds are apppropriate for comparing the speed of the same model. const DECODING_TOKEN_SPEED = "decoding-tokenSpeed"; const PROMPT_TOKEN_SPEED = "prompt-tokenSpeed"; // Characters speed is appropriate for comparing the speed of two different models. const DECODING_CHARACTERS_SPEED = "decoding-charactersSpeed"; const PROMPT_CHARACTERS_SPEED = "prompt-charactersSpeed"; const formatNumber = new Intl.NumberFormat("en-US", { maximumSignificantDigits: 4, }).format; function median(arr) { arr = [...arr].sort((a, b) => a - b); const mid = Math.floor(arr.length / 2); if (arr.length % 2) { return arr[mid]; } return (arr[mid - 1] + arr[mid]) / 2; } function stringify(arr) { function pad(str) { str = str.padStart(7, " "); if (str[0] != " ") { str = " " + str; } return str; } return arr.reduce((acc, elem) => acc + pad(formatNumber(elem)), ""); } function reportMetrics(journal) { let text = "\nResults (ms)\n"; const names = Object.keys(journal); const prefixLen = 1 + Math.max(...names.map(str => str.length)); for (const name in journal) { const med = median(journal[name]); text += (name + ":").padEnd(prefixLen, " ") + stringify(journal[name]); text += " median " + formatNumber(med) + "\n"; } const reportedMetrics = []; for (const [name, values] of Object.entries(journal)) { reportedMetrics.push({ name, values, value: median(values), }); } dump(text); info(`perfMetrics | ${JSON.stringify(reportedMetrics)}`); } /** * Fetches the latest metric entry with the specified name and retrieves its value for the given key. * If multiple metrics share the same name, the function returns the key from the most recent one. * * @param {Array} metrics - The array of metric objects to search through. * @param {string} name - The name of the metric to find. * @param {string} key - The key within the metric object whose value should be returned. * @returns {*} - The value of the specified key in the latest metric with the given name, or undefined if no matching metric is found. */ function fetchMLMetric(metrics, name, key) { const matchingMetrics = metrics.filter(metric => metric.name === name); if (matchingMetrics.length === 0) { return undefined; } // Return undefined if no match found const latestMetric = matchingMetrics[matchingMetrics.length - 1]; return latestMetric[key]; } function fetchLatencyMetrics(metrics, isFirstRun) { const pipelineLatency = fetchMLMetric(metrics, PIPELINE_READY_END, WHEN) - fetchMLMetric(metrics, PIPELINE_READY_START, WHEN); const initLatency = fetchMLMetric(metrics, INIT_END, WHEN) - fetchMLMetric(metrics, INIT_START, WHEN); const runLatency = fetchMLMetric(metrics, RUN_END, WHEN) - fetchMLMetric(metrics, RUN_START, WHEN); return { [`${isFirstRun ? COLD_START_PREFIX : ""}${PIPELINE_READY_LATENCY}`]: pipelineLatency, [`${isFirstRun ? COLD_START_PREFIX : ""}${INITIALIZATION_LATENCY}`]: initLatency, [`${isFirstRun ? COLD_START_PREFIX : ""}${MODEL_RUN_LATENCY}`]: runLatency, }; } function fetchMetrics(metrics, isFirstRun) { return { ...fetchLatencyMetrics(metrics, isFirstRun), }; } async function initializeEngine(pipelineOptions, prefs = null) { const modelDirectory = normalizePathForOS( `${Services.env.get("MOZ_FETCHES_DIR")}/onnx-models` ); info(`Model Directory: ${modelDirectory}`); const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB"); if (!modelHubRootUrl) { throw new Error( "MOZ_MODELS_HUB is not set, you need to run with --hooks toolkit/components/ml/tests/tools/hook_local_hub.py" ); } info(`ModelHubRootUrl: ${modelHubRootUrl}`); var browserPrefs = [["browser.ml.modelHubRootUrl", modelHubRootUrl]]; if (prefs) { browserPrefs = browserPrefs.concat(prefs); } const { cleanup } = await perfSetup({ prefs: browserPrefs, backend: pipelineOptions.backend, }); info("Get the engine process"); const startTime = performance.now(); const mlEngineParent = await EngineProcess.getMLEngineParent(); const engine = await mlEngineParent.getEngine( new PipelineOptions(pipelineOptions) ); const e2eInitTime = performance.now() - startTime; info("Get Pipeline Options"); info("Run the inference"); return { cleanup, engine, e2eInitTime, }; } function normalizePathForOS(path) { if (Services.appinfo.OS === "WINNT") { // On Windows, replace forward slashes with backslashes return path.replace(/\//g, "\\"); } // On Unix-like systems, replace backslashes with forward slashes return path.replace(/\\/g, "/"); } async function perfSetup({ disabled = false, prefs = [], backend } = {}) { const { removeMocks, remoteClients } = await createAndMockMLRemoteSettings({ autoDownloadFromRemoteSettings: false, backend, }); var finalPrefs = [ // Enabled by default. ["browser.ml.enable", !disabled], ["browser.ml.logLevel", "Error"], ["browser.ml.modelCacheTimeout", 1000], ["browser.ml.checkForMemory", false], ["javascript.options.wasm_lazy_tiering", true], ...prefs, ]; await SpecialPowers.pushPrefEnv({ set: finalPrefs, }); let artifactDirectory = normalizePathForOS( `${Services.env.get("MOZ_FETCHES_DIR")}` ); async function pathExists(path) { try { return await IOUtils.exists(path); } catch (e) { return false; } } // Stop immediately if this fails. if (!artifactDirectory) { artifactDirectory = normalizePathForOS( `${Services.env.get("MOZ_ML_LOCAL_DIR")}` ); } if (!artifactDirectory) { throw new Error( `The wasm artifact directory is not set. This usually happens when running locally. " + "Please download all the files from taskcluster/kinds/fetch/onnxruntime-web-fetch.yml. " + "Place them in a directory and rerun the test with the environment variable 'MOZ_ML_LOCAL_DIR' " + "set such that all the files are directly inside 'MOZ_ML_LOCAL_DIR'` ); } if (!PathUtils.isAbsolute(artifactDirectory)) { throw new Error( "Please provide an absolute path for 'MOZ_ML_LOCAL_DIR and not a relative path" ); } async function download(record) { info(`Downloading record: ${record.name}`); const recordPath = normalizePathForOS( `${artifactDirectory}/${record.name}` ); // Stop immediately if this fails. if (!(await pathExists(recordPath))) { throw new Error(`The wasm file <${recordPath}> does not exist. This usually happens when running locally. " + "Please download all the files from taskcluster/kinds/fetch/onnxruntime-web-fetch.yml. " + "Place them in the directory <${artifactDirectory}> " + "such that <${recordPath}> exists.`); } return { buffer: (await IOUtils.read(recordPath)).buffer, }; } remoteClients["ml-onnx-runtime"].client.attachments.download = download; return { remoteClients, async cleanup() { await removeMocks(); await waitForCondition( () => EngineProcess.areAllEnginesTerminated(), "Waiting for all of the engines to be terminated.", 100, 200 ); await SpecialPowers.popPrefEnv(); }, }; } /** * Returns the current total physical memory usage in MiB for the inference process */ async function getTotalMemoryUsage() { const procInfo = await getInferenceProcessInfo(); return Math.round(procInfo.memory / ONE_MIB); } /** * Runs an inference given the options and arguments * */ async function runInference({ pipelineOptions, request, isFirstRun = false, browserPrefs = null, }) { info( `runInference is request null | ${request === null || request === undefined}` ); const { cleanup, engine, e2eInitTime } = await initializeEngine( pipelineOptions, browserPrefs ); const streamerOptions = { perTokens: true, skipPrompt: pipelineOptions.taskName !== "text-generation", returnTokens: true, ...(request.streamerOptions || {}), }; request = { ...request, streamerOptions }; let metrics = {}; let timeToFirstToken; let startTime; let numGeneratedCharacters = 0; let numGeneratedTokens = 0; let numPromptCharacters = 0; if (streamerOptions.skipPrompt && Array.isArray(request?.args)) { numPromptCharacters += request.args .flat() .reduce((sum, item) => sum + (item?.length || 0), 0); } let numPromptTokens = 0; const run = async () => { let isFirstTokenReceived = false; let result; let currentTokenLen = 0; let currentCharLen = 0; startTime = performance.now(); const generator = engine.runWithGenerator(request); do { result = await generator.next(); currentTokenLen = result.value?.tokens?.flat()?.length || 0; currentCharLen = result.value?.text?.length || 0; if (result.value?.isPrompt) { numPromptCharacters += currentCharLen; numPromptTokens += currentTokenLen; } else { numGeneratedCharacters += currentCharLen; numGeneratedTokens += currentTokenLen; if (!isFirstTokenReceived) { timeToFirstToken = performance.now() - startTime; isFirstTokenReceived = true; startTime = performance.now(); } } } while (!result.done); return result.value; }; try { const res = await run(); const decodingTime = performance.now() - startTime; metrics = fetchMetrics(res.metrics || [], isFirstRun); metrics[`${isFirstRun ? COLD_START_PREFIX : ""}${TOTAL_MEMORY_USAGE}`] = await getTotalMemoryUsage(); metrics[`${isFirstRun ? COLD_START_PREFIX : ""}${E2E_INIT_LATENCY}`] = e2eInitTime; metrics[`${isFirstRun ? COLD_START_PREFIX : ""}${FIRST_TOKEN_LATENCY}`] = timeToFirstToken; metrics[`${isFirstRun ? COLD_START_PREFIX : ""}${DECODING_LATENCY}`] = decodingTime; metrics[ `${isFirstRun ? COLD_START_PREFIX : ""}${DECODING_CHARACTERS_SPEED}` ] = numGeneratedCharacters / (decodingTime / MS_PER_SEC); metrics[`${isFirstRun ? COLD_START_PREFIX : ""}${DECODING_TOKEN_SPEED}`] = numGeneratedTokens / (decodingTime / MS_PER_SEC); metrics[ `${isFirstRun ? COLD_START_PREFIX : ""}${PROMPT_CHARACTERS_SPEED}` ] = numPromptCharacters / (timeToFirstToken / MS_PER_SEC); metrics[`${isFirstRun ? COLD_START_PREFIX : ""}${PROMPT_TOKEN_SPEED}`] = numPromptTokens / (timeToFirstToken / MS_PER_SEC); } finally { await engine.terminate(); await EngineProcess.destroyMLEngine(); await cleanup(); } return metrics; } /** * Can be used to track peak memory * */ class PeakMemoryTracker { constructor(interval = 500) { this._memory = 0; this._intervalId = null; this._interval = interval; } async collectPeakMemory() { const procInfo = await getInferenceProcessInfo(); if (procInfo.memory && procInfo.memory > this._memory) { this._memory = procInfo.memory; } } start() { if (this._intervalId !== null) { return; } // Prevent multiple intervals this._intervalId = setInterval(() => { this.collectPeakMemory().catch(console.error); }, this._interval); } stop() { if (this._intervalId !== null) { clearInterval(this._intervalId); this._intervalId = null; } try { return Math.round(this._memory / ONE_MIB); } finally { this._memory = 0; } } } /** * Runs a performance test for the given name, options, and arguments and * reports the results for perfherder. */ async function perfTest({ name, options, request, iterations = ITERATIONS, addColdStart = false, trackPeakMemory = false, peakMemoryInterval = 500, browserPrefs = null, }) { info(`is request null | ${request === null || request === undefined}`); name = name.toUpperCase(); let METRICS; // When tracking peak memory we only do this because we're // stressing the system with 500ms callbacks so other netrics are impacted if (trackPeakMemory) { METRICS = [`${name}-${PEAK_MEMORY_USAGE}`]; } else { METRICS = [ `${name}-${PIPELINE_READY_LATENCY}`, `${name}-${INITIALIZATION_LATENCY}`, `${name}-${MODEL_RUN_LATENCY}`, `${name}-${TOTAL_MEMORY_USAGE}`, `${name}-${E2E_INIT_LATENCY}`, `${name}-${FIRST_TOKEN_LATENCY}`, `${name}-${DECODING_LATENCY}`, `${name}-${DECODING_CHARACTERS_SPEED}`, `${name}-${DECODING_TOKEN_SPEED}`, `${name}-${PROMPT_CHARACTERS_SPEED}`, `${name}-${PROMPT_TOKEN_SPEED}`, ...(addColdStart ? [ `${name}-${COLD_START_PREFIX}${PIPELINE_READY_LATENCY}`, `${name}-${COLD_START_PREFIX}${INITIALIZATION_LATENCY}`, `${name}-${COLD_START_PREFIX}${MODEL_RUN_LATENCY}`, `${name}-${COLD_START_PREFIX}${TOTAL_MEMORY_USAGE}`, ] : []), ]; } const journal = {}; for (let metric of METRICS) { journal[metric] = []; } const pipelineOptions = new PipelineOptions(options); var tracker; let nIterations = addColdStart ? iterations + 1 : iterations; for (let i = 0; i < nIterations; i++) { if (trackPeakMemory) { tracker = new PeakMemoryTracker(peakMemoryInterval); tracker.start(); } const shouldAddColdStart = addColdStart && i === 0; let metrics = await runInference({ pipelineOptions, request, isFirstRun: shouldAddColdStart, browserPrefs, }); if (trackPeakMemory) { journal[`${name}-${PEAK_MEMORY_USAGE}`].push(tracker.stop()); } else { for (let [metricName, metricVal] of Object.entries(metrics)) { if (!Number.isFinite(metricVal) || metricVal < 0) { metricVal = 0; } // Add the metric if it wasn't there if (journal[`${name}-${metricName}`] === undefined) { journal[`${name}-${metricName}`] = []; } journal[`${name}-${metricName}`].push(metricVal); } } } Assert.ok(true); reportMetrics(journal); } /** * Measures floating point value within epsilon tolerance */ function isEqualWithTolerance(A, B, epsilon = 0.000001) { return Math.abs(Math.abs(A) - Math.abs(B)) < epsilon; }