summaryrefslogtreecommitdiffstats
path: root/dom/webgpu/tests/cts/checkout/src/webgpu/util/device_pool.ts
blob: fbcbade7717081b1be04722171e50b92e0104057 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
import { SkipTestCase } from '../../common/framework/fixture.js';
import { attemptGarbageCollection } from '../../common/util/collect_garbage.js';
import { getGPU } from '../../common/util/navigator_gpu.js';
import {
  assert,
  raceWithRejectOnTimeout,
  assertReject,
  unreachable,
} from '../../common/util/util.js';
import { kLimitInfo, kLimits } from '../capability_info.js';

/**
 * Handle through which a test uses a pooled GPUDevice.
 * `DevicePool.acquire()` resolves to one, and the same object is handed back to
 * `DevicePool.release()` when the test is done.
 */
export interface DeviceProvider {
  /** The GPUDevice held by this provider. */
  readonly device: GPUDevice;
  /**
   * Declare that the device is expected to be lost with the given `reason`.
   * On release, a device loss whose reason matches is not reported as an error.
   */
  expectDeviceLost(reason: GPUDeviceLostReason): void;
}

/**
 * Thrown when ending the test scope finds an unexpected validation error.
 * `DevicePool.release()` keeps the device in the pool when it sees this error type.
 */
class TestFailedButDeviceReusable extends Error {}
/**
 * Thrown by `DeviceHolder.create()` when the adapter lacks a required feature.
 * `DescriptorToHolderMap.getOrCreate()` converts it into a `SkipTestCase`.
 */
class FeaturesNotSupported extends Error {}
/**
 * Thrown when ending the test scope finds an unexpected out-of-memory error.
 * `DevicePool.release()` drops the device and attempts a garbage collection when it sees this
 * error type.
 */
export class TestOOMedShouldAttemptGC extends Error {}

/**
 * Pool of GPUDevices shared between tests.
 *
 * Devices are created lazily, keyed by their (canonicalized) descriptor, and handed back to the
 * pool after each test unless something happened that makes the device unsafe to reuse.
 */
export class DevicePool {
  /**
   * - 'uninitialized': no device has been requested yet.
   * - 'failed': creating the default device failed; no further attempts are made.
   * - otherwise: the map of live holders.
   */
  private holders: 'uninitialized' | 'failed' | DescriptorToHolderMap = 'uninitialized';

  /**
   * Acquire a device matching `descriptor` from the pool and open its error scopes.
   *
   * The very first call also creates a device with the default descriptor, as a check that
   * WebGPU works at all. If that fails, this call and every later call fail the assertion below.
   */
  async acquire(descriptor?: UncanonicalizedDeviceDescriptor): Promise<DeviceProvider> {
    // Only populated on the call that actually observed the initialization failure.
    let failureDetail = '';
    if (this.holders === 'uninitialized') {
      const freshMap = new DescriptorToHolderMap();
      this.holders = freshMap;
      try {
        await freshMap.getOrCreate(undefined);
      } catch (err) {
        this.holders = 'failed';
        if (err instanceof Error) {
          failureDetail = ` with ${err.name} "${err.message}"`;
        }
      }
    }

    assert(
      this.holders !== 'failed',
      `WebGPU device failed to initialize${failureDetail}; not retrying`
    );

    const acquired = await this.holders.getOrCreate(descriptor);
    assert(acquired.state === 'free', 'Device was in use on DevicePool.acquire');

    acquired.state = 'acquired';
    acquired.beginTestScope();
    return acquired;
  }

  /**
   * Close the error scopes and check for errors.
   * A device that still looks healthy stays in the pool; otherwise it is removed and dropped.
   *
   * Rethrows whatever went wrong while ending the test scope, except when the failure coincides
   * with a device loss the test announced through `expectDeviceLost()`.
   */
  async release(holder: DeviceProvider): Promise<void> {
    assert(this.holders instanceof DescriptorToHolderMap, 'DevicePool got into a bad state');
    assert(holder instanceof DeviceHolder, 'DeviceProvider should always be a DeviceHolder');
    assert(holder.state === 'acquired', 'trying to release a device while already released');

    try {
      await holder.endTestScope();

      // A device loss should normally have been reported by the time endTestScope() settles
      // (or times out). If it arrives later it may cause a few extra failures in following
      // tests, but the pool recovers once the loss is noticed.
      assert(
        holder.lostInfo === undefined,
        `Device was unexpectedly lost. Reason: ${holder.lostInfo?.reason}, Message: ${holder.lostInfo?.message}`
      );
    } catch (err) {
      // Only TestFailedButDeviceReusable lets the device survive; anything else forces the
      // next test onto a freshly created device.
      const deviceIsReusable = err instanceof TestFailedButDeviceReusable;
      if (!deviceIsReusable) {
        this.holders.delete(holder);
        if ('destroy' in holder.device) {
          holder.device.destroy();
        }

        // Drop the (hopefully only) remaining reference to the GPUDevice.
        holder.releaseGPUDevice();

        // After an OOM there may be stray GPU resources waiting to be collected.
        if (err instanceof TestOOMedShouldAttemptGC) {
          await attemptGarbageCollection();
        }
      }

      // The try block throws on device loss so that the device gets recreated. If the test
      // announced exactly this loss (same reason), the error itself is not a test failure.
      const { expectedLostReason, lostInfo } = holder;
      const lossWasExpected =
        expectedLostReason !== undefined &&
        lostInfo !== undefined &&
        expectedLostReason === lostInfo.reason;
      if (!lossWasExpected) {
        throw err;
      }
    } finally {
      // Mark the holder free so the device can be reused (if it is still in the holders map).
      holder.state = 'free';
    }
  }
}

/**
 * Map from canonicalized GPUDeviceDescriptor to DeviceHolder.
 *
 * Entries are kept in least-recently-used-first order (the iteration order of the underlying
 * Map), and the number of entries is capped.
 */
class DescriptorToHolderMap {
  /** Keys of descriptors already known to be unsupported, so they can be rejected quickly. */
  private unsupported: Set<string> = new Set();
  /** Live holders by descriptor key; the first entry is the least recently used. */
  private holders: Map<string, DeviceHolder> = new Map();

  /** Removes the entry whose value is `holder`. It is an internal error if there is none. */
  delete(holder: DeviceHolder): void {
    for (const [mapKey, candidate] of this.holders) {
      if (candidate !== holder) {
        continue;
      }
      this.holders.delete(mapKey);
      return;
    }
    unreachable("internal error: couldn't find DeviceHolder to delete");
  }

  /**
   * Returns the DeviceHolder for the given descriptor, creating and inserting one if the map
   * has none yet.
   *
   * A provided `uncanonicalizedDescriptor` is canonicalized and the result used as the map key.
   * Without one, the map key is `""` (empty string).
   *
   * Throws SkipTestCase if devices with this descriptor are unsupported.
   */
  async getOrCreate(
    uncanonicalizedDescriptor: UncanonicalizedDeviceDescriptor | undefined
  ): Promise<DeviceHolder> {
    const [descriptor, key] = canonicalizeDescriptor(uncanonicalizedDescriptor);

    // Fast path: this descriptor already failed once.
    if (this.unsupported.has(key)) {
      throw new SkipTestCase(
        `GPUDeviceDescriptor previously failed: ${JSON.stringify(descriptor)}`
      );
    }

    // Reuse an existing device with the same descriptor.
    const cached = this.holders.get(key);
    if (cached) {
      // Re-inserting moves the entry to the end of the Map (most-recently-used).
      this.holders.delete(key);
      this.holders.set(key, cached);
      return cached;
    }

    // Nothing cached; create a new holder.
    let created: DeviceHolder;
    try {
      created = await DeviceHolder.create(descriptor);
    } catch (err) {
      if (!(err instanceof FeaturesNotSupported)) {
        throw err;
      }
      // Remember the failure so later requests are skipped without asking for a device again.
      this.unsupported.add(key);
      throw new SkipTestCase(
        `GPUDeviceDescriptor not supported: ${JSON.stringify(descriptor)}\n${err?.message ?? ''}`
      );
    }
    this.insertAndCleanUp(key, created);
    return created;
  }

  /** Stores an entry, then evicts the least-recently-used one if the map grew past its cap. */
  private insertAndCleanUp(key: string, value: DeviceHolder): void {
    this.holders.set(key, value);

    const kMaxEntries = 5;
    if (this.holders.size <= kMaxEntries) {
      return;
    }
    // The first key in iteration order is the least recently used one.
    const oldest = this.holders.keys().next();
    if (!oldest.done) {
      this.holders.delete(oldest.value);
    }
  }
}

/**
 * Device descriptor as supplied by callers of the pool, before canonicalization.
 * The fields typed as `undefined` are marked deprecated and cannot be given a value.
 */
export type UncanonicalizedDeviceDescriptor = {
  /** Features the device must support; duplicates and ordering are normalized away. */
  requiredFeatures?: Iterable<GPUFeatureName>;
  /** Requested limit values, keyed by limit name. */
  requiredLimits?: Record<string, GPUSize32>;
  /** @deprecated this field cannot be used */
  nonGuaranteedFeatures?: undefined;
  /** @deprecated this field cannot be used */
  nonGuaranteedLimits?: undefined;
  /** @deprecated this field cannot be used */
  extensions?: undefined;
  /** @deprecated this field cannot be used */
  features?: undefined;
};
/**
 * Canonical form of a device descriptor: a GPUDeviceDescriptor with every field required,
 * minus `label` and the two `nonGuaranteed*` keys.
 */
type CanonicalDeviceDescriptor = Omit<
  Required<GPUDeviceDescriptor>,
  'label' | 'nonGuaranteedFeatures' | 'nonGuaranteedLimits'
>;
/**
 * Build the canonical descriptor and its stringified map key for a requested descriptor.
 *
 * Defaults are resolved on a best-effort basis: anything missed only means that some GPUDevice
 * objects which could have been shared are not deduplicated.
 *
 * `undefined` (the "default" descriptor) is deliberately **not** expanded into a fully-qualified
 * GPUDeviceDescriptor; it maps to `[undefined, '']`. It is a common case, and it is used as a
 * sanity check that WebGPU is working.
 *
 * @returns a `[canonical descriptor, map key]` pair
 */
function canonicalizeDescriptor(
  desc: UncanonicalizedDeviceDescriptor | undefined
): [CanonicalDeviceDescriptor | undefined, string] {
  if (desc === undefined) {
    return [undefined, ''];
  }

  // Features: deduplicated, then sorted so that request order does not affect the key.
  let sortedFeatures: GPUFeatureName[] = [];
  if (desc.requiredFeatures) {
    sortedFeatures = Array.from(new Set(desc.requiredFeatures));
    sortedFeatures.sort();
  }

  // Limits: visited in the order of kLimits, keeping only values that were requested _and_
  // differ from the default.
  const nonDefaultLimits: Record<string, number> = {};
  const requestedLimits = desc.requiredLimits;
  if (requestedLimits) {
    for (const limitName of kLimits) {
      const wanted = requestedLimits[limitName];
      const fallback = kLimitInfo[limitName].default;
      if (wanted === undefined || wanted === fallback) {
        continue;
      }
      nonDefaultLimits[limitName] = wanted;
    }
  }

  // The type annotation ensures every descriptor field is carried through.
  const canonical: CanonicalDeviceDescriptor = {
    requiredFeatures: sortedFeatures,
    requiredLimits: nonDefaultLimits,
    defaultQueue: {},
  };
  return [canonical, JSON.stringify(canonical)];
}

/**
 * Tells whether `adapter` offers every feature listed in `descriptor.requiredFeatures`.
 * The default descriptor (`undefined`) requires no features and is always supported.
 */
function supportsFeature(
  adapter: GPUAdapter,
  descriptor: CanonicalDeviceDescriptor | undefined
): boolean {
  if (descriptor === undefined) {
    return true;
  }

  // every() stops at the first feature the adapter does not have.
  return Array.from(descriptor.requiredFeatures).every(feature => adapter.features.has(feature));
}

/**
 * DeviceHolder has two states:
 * - 'free': Free to be used for a new test.
 * - 'acquired': In use by a running test.
 */
type DeviceHolderState = 'free' | 'acquired';

/**
 * Holds a GPUDevice and tracks its state (free/acquired) and handles device loss.
 */
class DeviceHolder implements DeviceProvider {
  /** The device. Will be cleared during cleanup if there were unexpected errors. */
  private _device: GPUDevice | undefined;
  /** Whether the device is in use by a test or not. */
  state: DeviceHolderState = 'free';
  /** Initially undefined; becomes set when the device is lost. */
  lostInfo?: GPUDeviceLostInfo;
  /** Set if the device is expected to be lost. Cleared again when the next test scope begins. */
  expectedLostReason?: GPUDeviceLostReason;

  /**
   * Requests an adapter and a device for `descriptor`, and wraps the device in a DeviceHolder.
   * If the device is lost later on, `lostInfo` gets set.
   *
   * @throws FeaturesNotSupported if the adapter lacks one of the required features.
   */
  static async create(descriptor: CanonicalDeviceDescriptor | undefined): Promise<DeviceHolder> {
    const gpu = getGPU();
    const adapter = await gpu.requestAdapter();
    assert(adapter !== null, 'requestAdapter returned null');
    if (!supportsFeature(adapter, descriptor)) {
      throw new FeaturesNotSupported('One or more features are not supported');
    }
    const device = await adapter.requestDevice(descriptor);
    assert(device !== null, 'requestDevice returned null');

    return new DeviceHolder(device);
  }

  /** Stores the device and starts listening for its loss. Use `create()` to construct. */
  private constructor(device: GPUDevice) {
    this._device = device;
    void this._device.lost.then(ev => {
      this.lostInfo = ev;
    });
  }

  /** The held device. Asserts that it has not been released via `releaseGPUDevice()`. */
  get device(): GPUDevice {
    assert(this._device !== undefined);
    return this._device;
  }

  /** Push error scopes that surround test execution. */
  beginTestScope(): void {
    assert(this.state === 'acquired');
    // The holder is reused across tests. Drop any expectation the previous test registered but
    // that never came true, so it cannot hide an unexpected device loss in this test.
    this.expectedLostReason = undefined;
    this.device.pushErrorScope('out-of-memory');
    this.device.pushErrorScope('validation');
  }

  /** Mark the DeviceHolder as expecting a device loss when the test scope ends. */
  expectDeviceLost(reason: GPUDeviceLostReason): void {
    assert(this.state === 'acquired');
    this.expectedLostReason = reason;
  }

  /**
   * Attempt to end test scopes: Check that there are no extra error scopes, and that no
   * otherwise-uncaptured errors occurred during the test. Time out if it takes too long.
   */
  endTestScope(): Promise<void> {
    assert(this.state === 'acquired');
    const kTimeout = 5000;

    // Time out if attemptEndTestScope (popErrorScope or onSubmittedWorkDone) never completes. If
    // this rejects, the device won't be reused, so it's OK that popErrorScope calls may not have
    // finished.
    //
    // This could happen due to a browser bug - e.g.,
    // as of this writing, on Chrome GPU process crash, popErrorScope just hangs.
    return raceWithRejectOnTimeout(this.attemptEndTestScope(), kTimeout, 'endTestScope timed out');
  }

  /**
   * Pops the two scopes pushed by `beginTestScope()`, waits for the queue, verifies that no
   * extra scope is left, and throws if either scope captured an error.
   *
   * @throws TestOOMedShouldAttemptGC if an out-of-memory error was captured (checked first).
   * @throws TestFailedButDeviceReusable if only a validation error was captured.
   */
  private async attemptEndTestScope(): Promise<void> {
    let gpuValidationError: GPUError | null;
    let gpuOutOfMemoryError: GPUError | null;

    // Submit to the queue to attempt to force a GPU flush.
    this.device.queue.submit([]);

    try {
      // Scopes pop in reverse push order: 'validation' first, then 'out-of-memory'.
      // May reject if the device was lost.
      [gpuValidationError, gpuOutOfMemoryError] = await Promise.all([
        this.device.popErrorScope(),
        this.device.popErrorScope(),
      ]);
    } catch (ex) {
      assert(this.lostInfo !== undefined, 'popErrorScope failed; did beginTestScope get missed?');
      throw ex;
    }

    // Attempt to wait for the queue to be idle.
    if (this.device.queue.onSubmittedWorkDone) {
      await this.device.queue.onSubmittedWorkDone();
    }

    await assertReject(
      this.device.popErrorScope(),
      'There was an extra error scope on the stack after a test'
    );

    // Check OOM before validation: when a test produced both, the OOM must win, because the
    // validation path would hand a possibly broken device back to the pool.
    if (gpuOutOfMemoryError !== null) {
      assert(gpuOutOfMemoryError instanceof GPUOutOfMemoryError);
      // Don't allow the device to be reused; unexpected OOM could break the device.
      throw new TestOOMedShouldAttemptGC('Unexpected out-of-memory error occurred');
    }
    if (gpuValidationError !== null) {
      assert(gpuValidationError instanceof GPUValidationError);
      // Allow the device to be reused.
      throw new TestFailedButDeviceReusable(
        `Unexpected validation error occurred: ${gpuValidationError.message}`
      );
    }
  }

  /**
   * Release the ref to the GPUDevice. This should be the only ref held by the DevicePool or
   * GPUTest, so in theory it can get garbage collected.
   */
  releaseGPUDevice(): void {
    this._device = undefined;
  }
}