2024-01-03 2f7c68cb55ecb7331f2381deb497c27155f32faf
kernel/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -23,33 +23,60 @@
2323 */
2424
2525 #include <linux/prime_numbers.h>
26
+#include <linux/pm_qos.h>
27
+#include <linux/sort.h>
2628
27
-#include "../i915_selftest.h"
29
+#include "gem/i915_gem_pm.h"
30
+#include "gem/selftests/mock_context.h"
2831
29
-#include "mock_context.h"
32
+#include "gt/intel_engine_heartbeat.h"
33
+#include "gt/intel_engine_pm.h"
34
+#include "gt/intel_engine_user.h"
35
+#include "gt/intel_gt.h"
36
+#include "gt/intel_gt_requests.h"
37
+#include "gt/selftest_engine_heartbeat.h"
38
+
39
+#include "i915_random.h"
40
+#include "i915_selftest.h"
41
+#include "igt_flush_test.h"
42
+#include "igt_live_test.h"
43
+#include "igt_spinner.h"
44
+#include "lib_sw_fence.h"
45
+
46
+#include "mock_drm.h"
3047 #include "mock_gem_device.h"
48
+
49
+static unsigned int num_uabi_engines(struct drm_i915_private *i915)
50
+{
51
+ struct intel_engine_cs *engine;
52
+ unsigned int count;
53
+
54
+ count = 0;
55
+ for_each_uabi_engine(engine, i915)
56
+ count++;
57
+
58
+ return count;
59
+}
60
+
61
+static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
62
+{
63
+ return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
64
+}
3165
3266 static int igt_add_request(void *arg)
3367 {
3468 struct drm_i915_private *i915 = arg;
3569 struct i915_request *request;
36
- int err = -ENOMEM;
3770
3871 /* Basic preliminary test to create a request and let it loose! */
3972
40
- mutex_lock(&i915->drm.struct_mutex);
41
- request = mock_request(i915->engine[RCS],
42
- i915->kernel_context,
43
- HZ / 10);
73
+ request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
4474 if (!request)
45
- goto out_unlock;
75
+ return -ENOMEM;
4676
4777 i915_request_add(request);
4878
49
- err = 0;
50
-out_unlock:
51
- mutex_unlock(&i915->drm.struct_mutex);
52
- return err;
79
+ return 0;
5380 }
5481
5582 static int igt_wait_request(void *arg)
@@ -61,64 +88,63 @@
6188
6289 /* Submit a request, then wait upon it */
6390
64
- mutex_lock(&i915->drm.struct_mutex);
65
- request = mock_request(i915->engine[RCS], i915->kernel_context, T);
66
- if (!request) {
67
- err = -ENOMEM;
68
- goto out_unlock;
69
- }
91
+ request = mock_request(rcs0(i915)->kernel_context, T);
92
+ if (!request)
93
+ return -ENOMEM;
7094
71
- if (i915_request_wait(request, I915_WAIT_LOCKED, 0) != -ETIME) {
95
+ i915_request_get(request);
96
+
97
+ if (i915_request_wait(request, 0, 0) != -ETIME) {
7298 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
73
- goto out_unlock;
99
+ goto out_request;
74100 }
75101
76
- if (i915_request_wait(request, I915_WAIT_LOCKED, T) != -ETIME) {
102
+ if (i915_request_wait(request, 0, T) != -ETIME) {
77103 pr_err("request wait succeeded (expected timeout before submit!)\n");
78
- goto out_unlock;
104
+ goto out_request;
79105 }
80106
81107 if (i915_request_completed(request)) {
82108 pr_err("request completed before submit!!\n");
83
- goto out_unlock;
109
+ goto out_request;
84110 }
85111
86112 i915_request_add(request);
87113
88
- if (i915_request_wait(request, I915_WAIT_LOCKED, 0) != -ETIME) {
114
+ if (i915_request_wait(request, 0, 0) != -ETIME) {
89115 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
90
- goto out_unlock;
116
+ goto out_request;
91117 }
92118
93119 if (i915_request_completed(request)) {
94120 pr_err("request completed immediately!\n");
95
- goto out_unlock;
121
+ goto out_request;
96122 }
97123
98
- if (i915_request_wait(request, I915_WAIT_LOCKED, T / 2) != -ETIME) {
124
+ if (i915_request_wait(request, 0, T / 2) != -ETIME) {
99125 pr_err("request wait succeeded (expected timeout!)\n");
100
- goto out_unlock;
126
+ goto out_request;
101127 }
102128
103
- if (i915_request_wait(request, I915_WAIT_LOCKED, T) == -ETIME) {
129
+ if (i915_request_wait(request, 0, T) == -ETIME) {
104130 pr_err("request wait timed out!\n");
105
- goto out_unlock;
131
+ goto out_request;
106132 }
107133
108134 if (!i915_request_completed(request)) {
109135 pr_err("request not complete after waiting!\n");
110
- goto out_unlock;
136
+ goto out_request;
111137 }
112138
113
- if (i915_request_wait(request, I915_WAIT_LOCKED, T) == -ETIME) {
139
+ if (i915_request_wait(request, 0, T) == -ETIME) {
114140 pr_err("request wait timed out when already complete!\n");
115
- goto out_unlock;
141
+ goto out_request;
116142 }
117143
118144 err = 0;
119
-out_unlock:
145
+out_request:
146
+ i915_request_put(request);
120147 mock_device_flush(i915);
121
- mutex_unlock(&i915->drm.struct_mutex);
122148 return err;
123149 }
124150
@@ -131,54 +157,45 @@
131157
132158 /* Submit a request, treat it as a fence and wait upon it */
133159
134
- mutex_lock(&i915->drm.struct_mutex);
135
- request = mock_request(i915->engine[RCS], i915->kernel_context, T);
136
- if (!request) {
137
- err = -ENOMEM;
138
- goto out_locked;
139
- }
140
- mutex_unlock(&i915->drm.struct_mutex); /* safe as we are single user */
160
+ request = mock_request(rcs0(i915)->kernel_context, T);
161
+ if (!request)
162
+ return -ENOMEM;
141163
142164 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
143165 pr_err("fence wait success before submit (expected timeout)!\n");
144
- goto out_device;
166
+ goto out;
145167 }
146168
147
- mutex_lock(&i915->drm.struct_mutex);
148169 i915_request_add(request);
149
- mutex_unlock(&i915->drm.struct_mutex);
150170
151171 if (dma_fence_is_signaled(&request->fence)) {
152172 pr_err("fence signaled immediately!\n");
153
- goto out_device;
173
+ goto out;
154174 }
155175
156176 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
157177 pr_err("fence wait success after submit (expected timeout)!\n");
158
- goto out_device;
178
+ goto out;
159179 }
160180
161181 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
162182 pr_err("fence wait timed out (expected success)!\n");
163
- goto out_device;
183
+ goto out;
164184 }
165185
166186 if (!dma_fence_is_signaled(&request->fence)) {
167187 pr_err("fence unsignaled after waiting!\n");
168
- goto out_device;
188
+ goto out;
169189 }
170190
171191 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
172192 pr_err("fence wait timed out when complete (expected success)!\n");
173
- goto out_device;
193
+ goto out;
174194 }
175195
176196 err = 0;
177
-out_device:
178
- mutex_lock(&i915->drm.struct_mutex);
179
-out_locked:
197
+out:
180198 mock_device_flush(i915);
181
- mutex_unlock(&i915->drm.struct_mutex);
182199 return err;
183200 }
184201
@@ -187,11 +204,15 @@
187204 struct drm_i915_private *i915 = arg;
188205 struct i915_request *request, *vip;
189206 struct i915_gem_context *ctx[2];
207
+ struct intel_context *ce;
190208 int err = -EINVAL;
191209
192
- mutex_lock(&i915->drm.struct_mutex);
193210 ctx[0] = mock_context(i915, "A");
194
- request = mock_request(i915->engine[RCS], ctx[0], 2 * HZ);
211
+
212
+ ce = i915_gem_context_get_engine(ctx[0], RCS0);
213
+ GEM_BUG_ON(IS_ERR(ce));
214
+ request = mock_request(ce, 2 * HZ);
215
+ intel_context_put(ce);
195216 if (!request) {
196217 err = -ENOMEM;
197218 goto err_context_0;
@@ -201,7 +222,11 @@
201222 i915_request_add(request);
202223
203224 ctx[1] = mock_context(i915, "B");
204
- vip = mock_request(i915->engine[RCS], ctx[1], 0);
225
+
226
+ ce = i915_gem_context_get_engine(ctx[1], RCS0);
227
+ GEM_BUG_ON(IS_ERR(ce));
228
+ vip = mock_request(ce, 0);
229
+ intel_context_put(ce);
205230 if (!vip) {
206231 err = -ENOMEM;
207232 goto err_context_1;
@@ -219,11 +244,9 @@
219244 request->engine->submit_request(request);
220245 rcu_read_unlock();
221246
222
- mutex_unlock(&i915->drm.struct_mutex);
223247
224248 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
225
- pr_err("timed out waiting for high priority request, vip.seqno=%d, current seqno=%d\n",
226
- vip->global_seqno, intel_engine_get_seqno(i915->engine[RCS]));
249
+ pr_err("timed out waiting for high priority request\n");
227250 goto err;
228251 }
229252
@@ -235,15 +258,253 @@
235258 err = 0;
236259 err:
237260 i915_request_put(vip);
238
- mutex_lock(&i915->drm.struct_mutex);
239261 err_context_1:
240262 mock_context_close(ctx[1]);
241263 i915_request_put(request);
242264 err_context_0:
243265 mock_context_close(ctx[0]);
244266 mock_device_flush(i915);
245
- mutex_unlock(&i915->drm.struct_mutex);
246267 return err;
268
+}
269
+
270
+struct smoketest {
271
+ struct intel_engine_cs *engine;
272
+ struct i915_gem_context **contexts;
273
+ atomic_long_t num_waits, num_fences;
274
+ int ncontexts, max_batch;
275
+ struct i915_request *(*request_alloc)(struct intel_context *ce);
276
+};
277
+
278
+static struct i915_request *
279
+__mock_request_alloc(struct intel_context *ce)
280
+{
281
+ return mock_request(ce, 0);
282
+}
283
+
284
+static struct i915_request *
285
+__live_request_alloc(struct intel_context *ce)
286
+{
287
+ return intel_context_create_request(ce);
288
+}
289
+
290
+static int __igt_breadcrumbs_smoketest(void *arg)
291
+{
292
+ struct smoketest *t = arg;
293
+ const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
294
+ const unsigned int total = 4 * t->ncontexts + 1;
295
+ unsigned int num_waits = 0, num_fences = 0;
296
+ struct i915_request **requests;
297
+ I915_RND_STATE(prng);
298
+ unsigned int *order;
299
+ int err = 0;
300
+
301
+ /*
302
+ * A very simple test to catch the most egregious of list handling bugs.
303
+ *
304
+ * At its heart, we simply create oodles of requests running across
305
+ * multiple kthreads and enable signaling on them, for the sole purpose
306
+ * of stressing our breadcrumb handling. The only inspection we do is
307
+ * that the fences were marked as signaled.
308
+ */
309
+
310
+ requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
311
+ if (!requests)
312
+ return -ENOMEM;
313
+
314
+ order = i915_random_order(total, &prng);
315
+ if (!order) {
316
+ err = -ENOMEM;
317
+ goto out_requests;
318
+ }
319
+
320
+ while (!kthread_should_stop()) {
321
+ struct i915_sw_fence *submit, *wait;
322
+ unsigned int n, count;
323
+
324
+ submit = heap_fence_create(GFP_KERNEL);
325
+ if (!submit) {
326
+ err = -ENOMEM;
327
+ break;
328
+ }
329
+
330
+ wait = heap_fence_create(GFP_KERNEL);
331
+ if (!wait) {
332
+ i915_sw_fence_commit(submit);
333
+ heap_fence_put(submit);
334
+ err = -ENOMEM;
335
+ break;
336
+ }
337
+
338
+ i915_random_reorder(order, total, &prng);
339
+ count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
340
+
341
+ for (n = 0; n < count; n++) {
342
+ struct i915_gem_context *ctx =
343
+ t->contexts[order[n] % t->ncontexts];
344
+ struct i915_request *rq;
345
+ struct intel_context *ce;
346
+
347
+ ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
348
+ GEM_BUG_ON(IS_ERR(ce));
349
+ rq = t->request_alloc(ce);
350
+ intel_context_put(ce);
351
+ if (IS_ERR(rq)) {
352
+ err = PTR_ERR(rq);
353
+ count = n;
354
+ break;
355
+ }
356
+
357
+ err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
358
+ submit,
359
+ GFP_KERNEL);
360
+
361
+ requests[n] = i915_request_get(rq);
362
+ i915_request_add(rq);
363
+
364
+ if (err >= 0)
365
+ err = i915_sw_fence_await_dma_fence(wait,
366
+ &rq->fence,
367
+ 0,
368
+ GFP_KERNEL);
369
+
370
+ if (err < 0) {
371
+ i915_request_put(rq);
372
+ count = n;
373
+ break;
374
+ }
375
+ }
376
+
377
+ i915_sw_fence_commit(submit);
378
+ i915_sw_fence_commit(wait);
379
+
380
+ if (!wait_event_timeout(wait->wait,
381
+ i915_sw_fence_done(wait),
382
+ 5 * HZ)) {
383
+ struct i915_request *rq = requests[count - 1];
384
+
385
+ pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
386
+ atomic_read(&wait->pending), count,
387
+ rq->fence.context, rq->fence.seqno,
388
+ t->engine->name);
389
+ GEM_TRACE_DUMP();
390
+
391
+ intel_gt_set_wedged(t->engine->gt);
392
+ GEM_BUG_ON(!i915_request_completed(rq));
393
+ i915_sw_fence_wait(wait);
394
+ err = -EIO;
395
+ }
396
+
397
+ for (n = 0; n < count; n++) {
398
+ struct i915_request *rq = requests[n];
399
+
400
+ if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
401
+ &rq->fence.flags)) {
402
+ pr_err("%llu:%llu was not signaled!\n",
403
+ rq->fence.context, rq->fence.seqno);
404
+ err = -EINVAL;
405
+ }
406
+
407
+ i915_request_put(rq);
408
+ }
409
+
410
+ heap_fence_put(wait);
411
+ heap_fence_put(submit);
412
+
413
+ if (err < 0)
414
+ break;
415
+
416
+ num_fences += count;
417
+ num_waits++;
418
+
419
+ cond_resched();
420
+ }
421
+
422
+ atomic_long_add(num_fences, &t->num_fences);
423
+ atomic_long_add(num_waits, &t->num_waits);
424
+
425
+ kfree(order);
426
+out_requests:
427
+ kfree(requests);
428
+ return err;
429
+}
430
+
431
+static int mock_breadcrumbs_smoketest(void *arg)
432
+{
433
+ struct drm_i915_private *i915 = arg;
434
+ struct smoketest t = {
435
+ .engine = rcs0(i915),
436
+ .ncontexts = 1024,
437
+ .max_batch = 1024,
438
+ .request_alloc = __mock_request_alloc
439
+ };
440
+ unsigned int ncpus = num_online_cpus();
441
+ struct task_struct **threads;
442
+ unsigned int n;
443
+ int ret = 0;
444
+
445
+ /*
446
+ * Smoketest our breadcrumb/signal handling for requests across multiple
447
+ * threads. A very simple test to only catch the most egregious of bugs.
448
+ * See __igt_breadcrumbs_smoketest();
449
+ */
450
+
451
+ threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
452
+ if (!threads)
453
+ return -ENOMEM;
454
+
455
+ t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
456
+ if (!t.contexts) {
457
+ ret = -ENOMEM;
458
+ goto out_threads;
459
+ }
460
+
461
+ for (n = 0; n < t.ncontexts; n++) {
462
+ t.contexts[n] = mock_context(t.engine->i915, "mock");
463
+ if (!t.contexts[n]) {
464
+ ret = -ENOMEM;
465
+ goto out_contexts;
466
+ }
467
+ }
468
+
469
+ for (n = 0; n < ncpus; n++) {
470
+ threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
471
+ &t, "igt/%d", n);
472
+ if (IS_ERR(threads[n])) {
473
+ ret = PTR_ERR(threads[n]);
474
+ ncpus = n;
475
+ break;
476
+ }
477
+
478
+ get_task_struct(threads[n]);
479
+ }
480
+
481
+ yield(); /* start all threads before we begin */
482
+ msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
483
+
484
+ for (n = 0; n < ncpus; n++) {
485
+ int err;
486
+
487
+ err = kthread_stop(threads[n]);
488
+ if (err < 0 && !ret)
489
+ ret = err;
490
+
491
+ put_task_struct(threads[n]);
492
+ }
493
+ pr_info("Completed %lu waits for %lu fence across %d cpus\n",
494
+ atomic_long_read(&t.num_waits),
495
+ atomic_long_read(&t.num_fences),
496
+ ncpus);
497
+
498
+out_contexts:
499
+ for (n = 0; n < t.ncontexts; n++) {
500
+ if (!t.contexts[n])
501
+ break;
502
+ mock_context_close(t.contexts[n]);
503
+ }
504
+ kfree(t.contexts);
505
+out_threads:
506
+ kfree(threads);
507
+ return ret;
247508 }
248509
249510 int i915_request_mock_selftests(void)
@@ -253,118 +514,60 @@
253514 SUBTEST(igt_wait_request),
254515 SUBTEST(igt_fence_wait),
255516 SUBTEST(igt_request_rewind),
517
+ SUBTEST(mock_breadcrumbs_smoketest),
256518 };
257519 struct drm_i915_private *i915;
258
- int err;
520
+ intel_wakeref_t wakeref;
521
+ int err = 0;
259522
260523 i915 = mock_gem_device();
261524 if (!i915)
262525 return -ENOMEM;
263526
264
- err = i915_subtests(tests, i915);
265
- drm_dev_put(&i915->drm);
527
+ with_intel_runtime_pm(&i915->runtime_pm, wakeref)
528
+ err = i915_subtests(tests, i915);
529
+
530
+ mock_destroy_device(i915);
266531
267532 return err;
268
-}
269
-
270
-struct live_test {
271
- struct drm_i915_private *i915;
272
- const char *func;
273
- const char *name;
274
-
275
- unsigned int reset_count;
276
-};
277
-
278
-static int begin_live_test(struct live_test *t,
279
- struct drm_i915_private *i915,
280
- const char *func,
281
- const char *name)
282
-{
283
- int err;
284
-
285
- t->i915 = i915;
286
- t->func = func;
287
- t->name = name;
288
-
289
- err = i915_gem_wait_for_idle(i915,
290
- I915_WAIT_LOCKED,
291
- MAX_SCHEDULE_TIMEOUT);
292
- if (err) {
293
- pr_err("%s(%s): failed to idle before, with err=%d!",
294
- func, name, err);
295
- return err;
296
- }
297
-
298
- i915->gpu_error.missed_irq_rings = 0;
299
- t->reset_count = i915_reset_count(&i915->gpu_error);
300
-
301
- return 0;
302
-}
303
-
304
-static int end_live_test(struct live_test *t)
305
-{
306
- struct drm_i915_private *i915 = t->i915;
307
-
308
- i915_retire_requests(i915);
309
-
310
- if (wait_for(intel_engines_are_idle(i915), 10)) {
311
- pr_err("%s(%s): GPU not idle\n", t->func, t->name);
312
- return -EIO;
313
- }
314
-
315
- if (t->reset_count != i915_reset_count(&i915->gpu_error)) {
316
- pr_err("%s(%s): GPU was reset %d times!\n",
317
- t->func, t->name,
318
- i915_reset_count(&i915->gpu_error) - t->reset_count);
319
- return -EIO;
320
- }
321
-
322
- if (i915->gpu_error.missed_irq_rings) {
323
- pr_err("%s(%s): Missed interrupts on engines %lx\n",
324
- t->func, t->name, i915->gpu_error.missed_irq_rings);
325
- return -EIO;
326
- }
327
-
328
- return 0;
329533 }
330534
331535 static int live_nop_request(void *arg)
332536 {
333537 struct drm_i915_private *i915 = arg;
334538 struct intel_engine_cs *engine;
335
- struct live_test t;
336
- unsigned int id;
539
+ struct igt_live_test t;
337540 int err = -ENODEV;
338541
339
- /* Submit various sized batches of empty requests, to each engine
542
+ /*
543
+ * Submit various sized batches of empty requests, to each engine
340544 * (individually), and wait for the batch to complete. We can check
341545 * the overhead of submitting requests to the hardware.
342546 */
343547
344
- mutex_lock(&i915->drm.struct_mutex);
345
-
346
- for_each_engine(engine, i915, id) {
347
- struct i915_request *request = NULL;
548
+ for_each_uabi_engine(engine, i915) {
348549 unsigned long n, prime;
349550 IGT_TIMEOUT(end_time);
350551 ktime_t times[2] = {};
351552
352
- err = begin_live_test(&t, i915, __func__, engine->name);
553
+ err = igt_live_test_begin(&t, i915, __func__, engine->name);
353554 if (err)
354
- goto out_unlock;
555
+ return err;
355556
557
+ intel_engine_pm_get(engine);
356558 for_each_prime_number_from(prime, 1, 8192) {
559
+ struct i915_request *request = NULL;
560
+
357561 times[1] = ktime_get_raw();
358562
359563 for (n = 0; n < prime; n++) {
360
- request = i915_request_alloc(engine,
361
- i915->kernel_context);
362
- if (IS_ERR(request)) {
363
- err = PTR_ERR(request);
364
- goto out_unlock;
365
- }
564
+ i915_request_put(request);
565
+ request = i915_request_create(engine->kernel_context);
566
+ if (IS_ERR(request))
567
+ return PTR_ERR(request);
366568
367
- /* This space is left intentionally blank.
569
+ /*
570
+ * This space is left intentionally blank.
368571 *
369572 * We do not actually want to perform any
370573 * action with this request, we just want
@@ -377,11 +580,11 @@
377580 * for latency.
378581 */
379582
583
+ i915_request_get(request);
380584 i915_request_add(request);
381585 }
382
- i915_request_wait(request,
383
- I915_WAIT_LOCKED,
384
- MAX_SCHEDULE_TIMEOUT);
586
+ i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
587
+ i915_request_put(request);
385588
386589 times[1] = ktime_sub(ktime_get_raw(), times[1]);
387590 if (prime == 1)
@@ -390,10 +593,11 @@
390593 if (__igt_timeout(end_time, NULL))
391594 break;
392595 }
596
+ intel_engine_pm_put(engine);
393597
394
- err = end_live_test(&t);
598
+ err = igt_live_test_end(&t);
395599 if (err)
396
- goto out_unlock;
600
+ return err;
397601
398602 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
399603 engine->name,
@@ -401,8 +605,6 @@
401605 prime, div64_u64(ktime_to_ns(times[1]), prime));
402606 }
403607
404
-out_unlock:
405
- mutex_unlock(&i915->drm.struct_mutex);
406608 return err;
407609 }
408610
@@ -424,13 +626,11 @@
424626 }
425627
426628 *cmd = MI_BATCH_BUFFER_END;
427
- i915_gem_chipset_flush(i915);
428629
630
+ __i915_gem_object_flush_map(obj, 0, 64);
429631 i915_gem_object_unpin_map(obj);
430632
431
- err = i915_gem_object_set_to_gtt_domain(obj, false);
432
- if (err)
433
- goto err;
633
+ intel_gt_chipset_flush(&i915->gt);
434634
435635 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
436636 if (IS_ERR(vma)) {
@@ -442,8 +642,15 @@
442642 if (err)
443643 goto err;
444644
645
+	/* Force the wait now to avoid including it in the benchmark */
646
+ err = i915_vma_sync(vma);
647
+ if (err)
648
+ goto err_pin;
649
+
445650 return vma;
446651
652
+err_pin:
653
+ i915_vma_unpin(vma);
447654 err:
448655 i915_gem_object_put(obj);
449656 return ERR_PTR(err);
@@ -456,7 +663,7 @@
456663 struct i915_request *request;
457664 int err;
458665
459
- request = i915_request_alloc(engine, engine->i915->kernel_context);
666
+ request = i915_request_create(engine->kernel_context);
460667 if (IS_ERR(request))
461668 return request;
462669
@@ -467,6 +674,7 @@
467674 if (err)
468675 goto out_request;
469676
677
+ i915_request_get(request);
470678 out_request:
471679 i915_request_add(request);
472680 return err ? ERR_PTR(err) : request;
@@ -476,57 +684,54 @@
476684 {
477685 struct drm_i915_private *i915 = arg;
478686 struct intel_engine_cs *engine;
479
- struct live_test t;
687
+ struct igt_live_test t;
480688 struct i915_vma *batch;
481
- unsigned int id;
482689 int err = 0;
483690
484
- /* Submit various sized batches of empty requests, to each engine
691
+ /*
692
+ * Submit various sized batches of empty requests, to each engine
485693 * (individually), and wait for the batch to complete. We can check
486694 * the overhead of submitting requests to the hardware.
487695 */
488696
489
- mutex_lock(&i915->drm.struct_mutex);
490
-
491697 batch = empty_batch(i915);
492
- if (IS_ERR(batch)) {
493
- err = PTR_ERR(batch);
494
- goto out_unlock;
495
- }
698
+ if (IS_ERR(batch))
699
+ return PTR_ERR(batch);
496700
497
- for_each_engine(engine, i915, id) {
701
+ for_each_uabi_engine(engine, i915) {
498702 IGT_TIMEOUT(end_time);
499703 struct i915_request *request;
500704 unsigned long n, prime;
501705 ktime_t times[2] = {};
502706
503
- err = begin_live_test(&t, i915, __func__, engine->name);
707
+ err = igt_live_test_begin(&t, i915, __func__, engine->name);
504708 if (err)
505709 goto out_batch;
710
+
711
+ intel_engine_pm_get(engine);
506712
507713 /* Warmup / preload */
508714 request = empty_request(engine, batch);
509715 if (IS_ERR(request)) {
510716 err = PTR_ERR(request);
717
+ intel_engine_pm_put(engine);
511718 goto out_batch;
512719 }
513
- i915_request_wait(request,
514
- I915_WAIT_LOCKED,
515
- MAX_SCHEDULE_TIMEOUT);
720
+ i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
516721
517722 for_each_prime_number_from(prime, 1, 8192) {
518723 times[1] = ktime_get_raw();
519724
520725 for (n = 0; n < prime; n++) {
726
+ i915_request_put(request);
521727 request = empty_request(engine, batch);
522728 if (IS_ERR(request)) {
523729 err = PTR_ERR(request);
730
+ intel_engine_pm_put(engine);
524731 goto out_batch;
525732 }
526733 }
527
- i915_request_wait(request,
528
- I915_WAIT_LOCKED,
529
- MAX_SCHEDULE_TIMEOUT);
734
+ i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
530735
531736 times[1] = ktime_sub(ktime_get_raw(), times[1]);
532737 if (prime == 1)
....@@ -535,8 +740,10 @@
535740 if (__igt_timeout(end_time, NULL))
536741 break;
537742 }
743
+ i915_request_put(request);
744
+ intel_engine_pm_put(engine);
538745
539
- err = end_live_test(&t);
746
+ err = igt_live_test_end(&t);
540747 if (err)
541748 goto out_batch;
542749
@@ -549,16 +756,11 @@
549756 out_batch:
550757 i915_vma_unpin(batch);
551758 i915_vma_put(batch);
552
-out_unlock:
553
- mutex_unlock(&i915->drm.struct_mutex);
554759 return err;
555760 }
556761
557762 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
558763 {
559
- struct i915_gem_context *ctx = i915->kernel_context;
560
- struct i915_address_space *vm =
561
- ctx->ppgtt ? &ctx->ppgtt->vm : &i915->ggtt.vm;
562764 struct drm_i915_gem_object *obj;
563765 const int gen = INTEL_GEN(i915);
564766 struct i915_vma *vma;
@@ -569,17 +771,13 @@
569771 if (IS_ERR(obj))
570772 return ERR_CAST(obj);
571773
572
- vma = i915_vma_instance(obj, vm, NULL);
774
+ vma = i915_vma_instance(obj, i915->gt.vm, NULL);
573775 if (IS_ERR(vma)) {
574776 err = PTR_ERR(vma);
575777 goto err;
576778 }
577779
578780 err = i915_vma_pin(vma, 0, 0, PIN_USER);
579
- if (err)
580
- goto err;
581
-
582
- err = i915_gem_object_set_to_wc_domain(obj, true);
583781 if (err)
584782 goto err;
585783
@@ -601,9 +799,11 @@
601799 *cmd++ = lower_32_bits(vma->node.start);
602800 }
603801 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
604
- i915_gem_chipset_flush(i915);
605802
803
+ __i915_gem_object_flush_map(obj, 0, 64);
606804 i915_gem_object_unpin_map(obj);
805
+
806
+ intel_gt_chipset_flush(&i915->gt);
607807
608808 return vma;
609809
@@ -621,9 +821,11 @@
621821 return PTR_ERR(cmd);
622822
623823 *cmd = MI_BATCH_BUFFER_END;
624
- i915_gem_chipset_flush(batch->vm->i915);
625824
825
+ __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
626826 i915_gem_object_unpin_map(batch->obj);
827
+
828
+ intel_gt_chipset_flush(batch->vm->gt);
627829
628830 return 0;
629831 }
@@ -631,66 +833,75 @@
631833 static int live_all_engines(void *arg)
632834 {
633835 struct drm_i915_private *i915 = arg;
836
+ const unsigned int nengines = num_uabi_engines(i915);
634837 struct intel_engine_cs *engine;
635
- struct i915_request *request[I915_NUM_ENGINES];
838
+ struct i915_request **request;
839
+ struct igt_live_test t;
636840 struct i915_vma *batch;
637
- struct live_test t;
638
- unsigned int id;
841
+ unsigned int idx;
639842 int err;
640843
641
- /* Check we can submit requests to all engines simultaneously. We
844
+ /*
845
+ * Check we can submit requests to all engines simultaneously. We
642846 * send a recursive batch to each engine - checking that we don't
643847 * block doing so, and that they don't complete too soon.
644848 */
645849
646
- mutex_lock(&i915->drm.struct_mutex);
850
+ request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
851
+ if (!request)
852
+ return -ENOMEM;
647853
648
- err = begin_live_test(&t, i915, __func__, "");
854
+ err = igt_live_test_begin(&t, i915, __func__, "");
649855 if (err)
650
- goto out_unlock;
856
+ goto out_free;
651857
652858 batch = recursive_batch(i915);
653859 if (IS_ERR(batch)) {
654860 err = PTR_ERR(batch);
655861 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
656
- goto out_unlock;
862
+ goto out_free;
657863 }
658864
659
- for_each_engine(engine, i915, id) {
660
- request[id] = i915_request_alloc(engine, i915->kernel_context);
661
- if (IS_ERR(request[id])) {
662
- err = PTR_ERR(request[id]);
865
+ i915_vma_lock(batch);
866
+
867
+ idx = 0;
868
+ for_each_uabi_engine(engine, i915) {
869
+ request[idx] = intel_engine_create_kernel_request(engine);
870
+ if (IS_ERR(request[idx])) {
871
+ err = PTR_ERR(request[idx]);
663872 pr_err("%s: Request allocation failed with err=%d\n",
664873 __func__, err);
665874 goto out_request;
666875 }
667876
668
- err = engine->emit_bb_start(request[id],
877
+ err = i915_request_await_object(request[idx], batch->obj, 0);
878
+ if (err == 0)
879
+ err = i915_vma_move_to_active(batch, request[idx], 0);
880
+ GEM_BUG_ON(err);
881
+
882
+ err = engine->emit_bb_start(request[idx],
669883 batch->node.start,
670884 batch->node.size,
671885 0);
672886 GEM_BUG_ON(err);
673
- request[id]->batch = batch;
887
+ request[idx]->batch = batch;
674888
675
- if (!i915_gem_object_has_active_reference(batch->obj)) {
676
- i915_gem_object_get(batch->obj);
677
- i915_gem_object_set_active_reference(batch->obj);
678
- }
679
-
680
- err = i915_vma_move_to_active(batch, request[id], 0);
681
- GEM_BUG_ON(err);
682
-
683
- i915_request_get(request[id]);
684
- i915_request_add(request[id]);
889
+ i915_request_get(request[idx]);
890
+ i915_request_add(request[idx]);
891
+ idx++;
685892 }
686893
687
- for_each_engine(engine, i915, id) {
688
- if (i915_request_completed(request[id])) {
894
+ i915_vma_unlock(batch);
895
+
896
+ idx = 0;
897
+ for_each_uabi_engine(engine, i915) {
898
+ if (i915_request_completed(request[idx])) {
689899 pr_err("%s(%s): request completed too early!\n",
690900 __func__, engine->name);
691901 err = -EINVAL;
692902 goto out_request;
693903 }
904
+ idx++;
694905 }
695906
696907 err = recursive_batch_resolve(batch);
@@ -699,11 +910,11 @@
699910 goto out_request;
700911 }
701912
702
- for_each_engine(engine, i915, id) {
913
+ idx = 0;
914
+ for_each_uabi_engine(engine, i915) {
703915 long timeout;
704916
705
- timeout = i915_request_wait(request[id],
706
- I915_WAIT_LOCKED,
917
+ timeout = i915_request_wait(request[idx], 0,
707918 MAX_SCHEDULE_TIMEOUT);
708919 if (timeout < 0) {
709920 err = timeout;
@@ -712,47 +923,56 @@
712923 goto out_request;
713924 }
714925
715
- GEM_BUG_ON(!i915_request_completed(request[id]));
716
- i915_request_put(request[id]);
717
- request[id] = NULL;
926
+ GEM_BUG_ON(!i915_request_completed(request[idx]));
927
+ i915_request_put(request[idx]);
928
+ request[idx] = NULL;
929
+ idx++;
718930 }
719931
720
- err = end_live_test(&t);
932
+ err = igt_live_test_end(&t);
721933
722934 out_request:
723
- for_each_engine(engine, i915, id)
724
- if (request[id])
725
- i915_request_put(request[id]);
935
+ idx = 0;
936
+ for_each_uabi_engine(engine, i915) {
937
+ if (request[idx])
938
+ i915_request_put(request[idx]);
939
+ idx++;
940
+ }
726941 i915_vma_unpin(batch);
727942 i915_vma_put(batch);
728
-out_unlock:
729
- mutex_unlock(&i915->drm.struct_mutex);
943
+out_free:
944
+ kfree(request);
730945 return err;
731946 }
732947
733948 static int live_sequential_engines(void *arg)
734949 {
735950 struct drm_i915_private *i915 = arg;
736
- struct i915_request *request[I915_NUM_ENGINES] = {};
951
+ const unsigned int nengines = num_uabi_engines(i915);
952
+ struct i915_request **request;
737953 struct i915_request *prev = NULL;
738954 struct intel_engine_cs *engine;
739
- struct live_test t;
740
- unsigned int id;
955
+ struct igt_live_test t;
956
+ unsigned int idx;
741957 int err;
742958
743
- /* Check we can submit requests to all engines sequentially, such
959
+ /*
960
+ * Check we can submit requests to all engines sequentially, such
744961 * that each successive request waits for the earlier ones. This
745962 * tests that we don't execute requests out of order, even though
746963 * they are running on independent engines.
747964 */
748965
749
- mutex_lock(&i915->drm.struct_mutex);
966
+ request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
967
+ if (!request)
968
+ return -ENOMEM;
750969
751
- err = begin_live_test(&t, i915, __func__, "");
970
+ err = igt_live_test_begin(&t, i915, __func__, "");
752971 if (err)
753
- goto out_unlock;
972
+ goto out_free;
754973
755
- for_each_engine(engine, i915, id) {
974
+ idx = 0;
975
+ for_each_uabi_engine(engine, i915) {
756976 struct i915_vma *batch;
757977
758978 batch = recursive_batch(i915);
@@ -760,66 +980,73 @@
760980 err = PTR_ERR(batch);
761981 pr_err("%s: Unable to create batch for %s, err=%d\n",
762982 __func__, engine->name, err);
983
+ goto out_free;
984
+ }
985
+
986
+ i915_vma_lock(batch);
987
+ request[idx] = intel_engine_create_kernel_request(engine);
988
+ if (IS_ERR(request[idx])) {
989
+ err = PTR_ERR(request[idx]);
990
+ pr_err("%s: Request allocation failed for %s with err=%d\n",
991
+ __func__, engine->name, err);
763992 goto out_unlock;
764993 }
765994
766
- request[id] = i915_request_alloc(engine, i915->kernel_context);
767
- if (IS_ERR(request[id])) {
768
- err = PTR_ERR(request[id]);
769
- pr_err("%s: Request allocation failed for %s with err=%d\n",
770
- __func__, engine->name, err);
771
- goto out_request;
772
- }
773
-
774995 if (prev) {
775
- err = i915_request_await_dma_fence(request[id],
996
+ err = i915_request_await_dma_fence(request[idx],
776997 &prev->fence);
777998 if (err) {
778
- i915_request_add(request[id]);
999
+ i915_request_add(request[idx]);
7791000 pr_err("%s: Request await failed for %s with err=%d\n",
7801001 __func__, engine->name, err);
781
- goto out_request;
1002
+ goto out_unlock;
7821003 }
7831004 }
7841005
785
- err = engine->emit_bb_start(request[id],
1006
+ err = i915_request_await_object(request[idx],
1007
+ batch->obj, false);
1008
+ if (err == 0)
1009
+ err = i915_vma_move_to_active(batch, request[idx], 0);
1010
+ GEM_BUG_ON(err);
1011
+
1012
+ err = engine->emit_bb_start(request[idx],
7861013 batch->node.start,
7871014 batch->node.size,
7881015 0);
7891016 GEM_BUG_ON(err);
790
- request[id]->batch = batch;
1017
+ request[idx]->batch = batch;
7911018
792
- err = i915_vma_move_to_active(batch, request[id], 0);
793
- GEM_BUG_ON(err);
1019
+ i915_request_get(request[idx]);
1020
+ i915_request_add(request[idx]);
7941021
795
- i915_gem_object_set_active_reference(batch->obj);
796
- i915_vma_get(batch);
1022
+ prev = request[idx];
1023
+ idx++;
7971024
798
- i915_request_get(request[id]);
799
- i915_request_add(request[id]);
800
-
801
- prev = request[id];
1025
+out_unlock:
1026
+ i915_vma_unlock(batch);
1027
+ if (err)
1028
+ goto out_request;
8021029 }
8031030
804
- for_each_engine(engine, i915, id) {
1031
+ idx = 0;
1032
+ for_each_uabi_engine(engine, i915) {
8051033 long timeout;
8061034
807
- if (i915_request_completed(request[id])) {
1035
+ if (i915_request_completed(request[idx])) {
8081036 pr_err("%s(%s): request completed too early!\n",
8091037 __func__, engine->name);
8101038 err = -EINVAL;
8111039 goto out_request;
8121040 }
8131041
814
- err = recursive_batch_resolve(request[id]->batch);
1042
+ err = recursive_batch_resolve(request[idx]->batch);
8151043 if (err) {
8161044 pr_err("%s: failed to resolve batch, err=%d\n",
8171045 __func__, err);
8181046 goto out_request;
8191047 }
8201048
821
- timeout = i915_request_wait(request[id],
822
- I915_WAIT_LOCKED,
1049
+ timeout = i915_request_wait(request[idx], 0,
8231050 MAX_SCHEDULE_TIMEOUT);
8241051 if (timeout < 0) {
8251052 err = timeout;
@@ -828,33 +1055,426 @@
8281055 goto out_request;
8291056 }
8301057
831
- GEM_BUG_ON(!i915_request_completed(request[id]));
1058
+ GEM_BUG_ON(!i915_request_completed(request[idx]));
1059
+ idx++;
8321060 }
8331061
834
- err = end_live_test(&t);
1062
+ err = igt_live_test_end(&t);
8351063
8361064 out_request:
837
- for_each_engine(engine, i915, id) {
1065
+ idx = 0;
1066
+ for_each_uabi_engine(engine, i915) {
8381067 u32 *cmd;
8391068
840
- if (!request[id])
1069
+ if (!request[idx])
8411070 break;
8421071
843
- cmd = i915_gem_object_pin_map(request[id]->batch->obj,
1072
+ cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
8441073 I915_MAP_WC);
8451074 if (!IS_ERR(cmd)) {
8461075 *cmd = MI_BATCH_BUFFER_END;
847
- i915_gem_chipset_flush(i915);
8481076
849
- i915_gem_object_unpin_map(request[id]->batch->obj);
1077
+ __i915_gem_object_flush_map(request[idx]->batch->obj,
1078
+ 0, sizeof(*cmd));
1079
+ i915_gem_object_unpin_map(request[idx]->batch->obj);
1080
+
1081
+ intel_gt_chipset_flush(engine->gt);
8501082 }
8511083
852
- i915_vma_put(request[id]->batch);
853
- i915_request_put(request[id]);
1084
+ i915_vma_put(request[idx]->batch);
1085
+ i915_request_put(request[idx]);
1086
+ idx++;
8541087 }
855
-out_unlock:
856
- mutex_unlock(&i915->drm.struct_mutex);
1088
+out_free:
1089
+ kfree(request);
8571090 return err;
1091
+}
1092
+
1093
+static int __live_parallel_engine1(void *arg)
1094
+{
1095
+ struct intel_engine_cs *engine = arg;
1096
+ IGT_TIMEOUT(end_time);
1097
+ unsigned long count;
1098
+ int err = 0;
1099
+
1100
+ count = 0;
1101
+ intel_engine_pm_get(engine);
1102
+ do {
1103
+ struct i915_request *rq;
1104
+
1105
+ rq = i915_request_create(engine->kernel_context);
1106
+ if (IS_ERR(rq)) {
1107
+ err = PTR_ERR(rq);
1108
+ break;
1109
+ }
1110
+
1111
+ i915_request_get(rq);
1112
+ i915_request_add(rq);
1113
+
1114
+ err = 0;
1115
+ if (i915_request_wait(rq, 0, HZ / 5) < 0)
1116
+ err = -ETIME;
1117
+ i915_request_put(rq);
1118
+ if (err)
1119
+ break;
1120
+
1121
+ count++;
1122
+ } while (!__igt_timeout(end_time, NULL));
1123
+ intel_engine_pm_put(engine);
1124
+
1125
+ pr_info("%s: %lu request + sync\n", engine->name, count);
1126
+ return err;
1127
+}
1128
+
1129
+static int __live_parallel_engineN(void *arg)
1130
+{
1131
+ struct intel_engine_cs *engine = arg;
1132
+ IGT_TIMEOUT(end_time);
1133
+ unsigned long count;
1134
+ int err = 0;
1135
+
1136
+ count = 0;
1137
+ intel_engine_pm_get(engine);
1138
+ do {
1139
+ struct i915_request *rq;
1140
+
1141
+ rq = i915_request_create(engine->kernel_context);
1142
+ if (IS_ERR(rq)) {
1143
+ err = PTR_ERR(rq);
1144
+ break;
1145
+ }
1146
+
1147
+ i915_request_add(rq);
1148
+ count++;
1149
+ } while (!__igt_timeout(end_time, NULL));
1150
+ intel_engine_pm_put(engine);
1151
+
1152
+ pr_info("%s: %lu requests\n", engine->name, count);
1153
+ return err;
1154
+}
1155
+
1156
+static bool wake_all(struct drm_i915_private *i915)
1157
+{
1158
+ if (atomic_dec_and_test(&i915->selftest.counter)) {
1159
+ wake_up_var(&i915->selftest.counter);
1160
+ return true;
1161
+ }
1162
+
1163
+ return false;
1164
+}
1165
+
1166
+static int wait_for_all(struct drm_i915_private *i915)
1167
+{
1168
+ if (wake_all(i915))
1169
+ return 0;
1170
+
1171
+ if (wait_var_event_timeout(&i915->selftest.counter,
1172
+ !atomic_read(&i915->selftest.counter),
1173
+ i915_selftest.timeout_jiffies))
1174
+ return 0;
1175
+
1176
+ return -ETIME;
1177
+}
1178
+
1179
+static int __live_parallel_spin(void *arg)
1180
+{
1181
+ struct intel_engine_cs *engine = arg;
1182
+ struct igt_spinner spin;
1183
+ struct i915_request *rq;
1184
+ int err = 0;
1185
+
1186
+ /*
1187
+ * Create a spinner running for eternity on each engine. If a second
1188
+ * spinner is incorrectly placed on the same engine, it will not be
1189
+ * able to start in time.
1190
+ */
1191
+
1192
+ if (igt_spinner_init(&spin, engine->gt)) {
1193
+ wake_all(engine->i915);
1194
+ return -ENOMEM;
1195
+ }
1196
+
1197
+ intel_engine_pm_get(engine);
1198
+ rq = igt_spinner_create_request(&spin,
1199
+ engine->kernel_context,
1200
+ MI_NOOP); /* no preemption */
1201
+ intel_engine_pm_put(engine);
1202
+ if (IS_ERR(rq)) {
1203
+ err = PTR_ERR(rq);
1204
+ if (err == -ENODEV)
1205
+ err = 0;
1206
+ wake_all(engine->i915);
1207
+ goto out_spin;
1208
+ }
1209
+
1210
+ i915_request_get(rq);
1211
+ i915_request_add(rq);
1212
+ if (igt_wait_for_spinner(&spin, rq)) {
1213
+ /* Occupy this engine for the whole test */
1214
+ err = wait_for_all(engine->i915);
1215
+ } else {
1216
+ pr_err("Failed to start spinner on %s\n", engine->name);
1217
+ err = -EINVAL;
1218
+ }
1219
+ igt_spinner_end(&spin);
1220
+
1221
+ if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1222
+ err = -EIO;
1223
+ i915_request_put(rq);
1224
+
1225
+out_spin:
1226
+ igt_spinner_fini(&spin);
1227
+ return err;
1228
+}
1229
+
1230
+static int live_parallel_engines(void *arg)
1231
+{
1232
+ struct drm_i915_private *i915 = arg;
1233
+ static int (* const func[])(void *arg) = {
1234
+ __live_parallel_engine1,
1235
+ __live_parallel_engineN,
1236
+ __live_parallel_spin,
1237
+ NULL,
1238
+ };
1239
+ const unsigned int nengines = num_uabi_engines(i915);
1240
+ struct intel_engine_cs *engine;
1241
+ int (* const *fn)(void *arg);
1242
+ struct task_struct **tsk;
1243
+ int err = 0;
1244
+
1245
+ /*
1246
+ * Check we can submit requests to all engines concurrently. This
1247
+ * tests that we load up the system maximally.
1248
+ */
1249
+
1250
+ tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1251
+ if (!tsk)
1252
+ return -ENOMEM;
1253
+
1254
+ for (fn = func; !err && *fn; fn++) {
1255
+ char name[KSYM_NAME_LEN];
1256
+ struct igt_live_test t;
1257
+ unsigned int idx;
1258
+
1259
+ snprintf(name, sizeof(name), "%ps", *fn);
1260
+ err = igt_live_test_begin(&t, i915, __func__, name);
1261
+ if (err)
1262
+ break;
1263
+
1264
+ atomic_set(&i915->selftest.counter, nengines);
1265
+
1266
+ idx = 0;
1267
+ for_each_uabi_engine(engine, i915) {
1268
+ tsk[idx] = kthread_run(*fn, engine,
1269
+ "igt/parallel:%s",
1270
+ engine->name);
1271
+ if (IS_ERR(tsk[idx])) {
1272
+ err = PTR_ERR(tsk[idx]);
1273
+ break;
1274
+ }
1275
+ get_task_struct(tsk[idx++]);
1276
+ }
1277
+
1278
+ yield(); /* start all threads before we kthread_stop() */
1279
+
1280
+ idx = 0;
1281
+ for_each_uabi_engine(engine, i915) {
1282
+ int status;
1283
+
1284
+ if (IS_ERR(tsk[idx]))
1285
+ break;
1286
+
1287
+ status = kthread_stop(tsk[idx]);
1288
+ if (status && !err)
1289
+ err = status;
1290
+
1291
+ put_task_struct(tsk[idx++]);
1292
+ }
1293
+
1294
+ if (igt_live_test_end(&t))
1295
+ err = -EIO;
1296
+ }
1297
+
1298
+ kfree(tsk);
1299
+ return err;
1300
+}
1301
+
1302
+static int
1303
+max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1304
+{
1305
+ struct i915_request *rq;
1306
+ int ret;
1307
+
1308
+ /*
1309
+ * Before execlists, all contexts share the same ringbuffer. With
1310
+ * execlists, each context/engine has a separate ringbuffer and
1311
+ * for the purposes of this test, inexhaustible.
1312
+ *
1313
+ * For the global ringbuffer though, we have to be very careful
1314
+ * that we do not wrap while preventing the execution of requests
1315
+ * with a unsignaled fence.
1316
+ */
1317
+ if (HAS_EXECLISTS(ctx->i915))
1318
+ return INT_MAX;
1319
+
1320
+ rq = igt_request_alloc(ctx, engine);
1321
+ if (IS_ERR(rq)) {
1322
+ ret = PTR_ERR(rq);
1323
+ } else {
1324
+ int sz;
1325
+
1326
+ ret = rq->ring->size - rq->reserved_space;
1327
+ i915_request_add(rq);
1328
+
1329
+ sz = rq->ring->emit - rq->head;
1330
+ if (sz < 0)
1331
+ sz += rq->ring->size;
1332
+ ret /= sz;
1333
+ ret /= 2; /* leave half spare, in case of emergency! */
1334
+ }
1335
+
1336
+ return ret;
1337
+}
1338
+
1339
+static int live_breadcrumbs_smoketest(void *arg)
1340
+{
1341
+ struct drm_i915_private *i915 = arg;
1342
+ const unsigned int nengines = num_uabi_engines(i915);
1343
+ const unsigned int ncpus = num_online_cpus();
1344
+ unsigned long num_waits, num_fences;
1345
+ struct intel_engine_cs *engine;
1346
+ struct task_struct **threads;
1347
+ struct igt_live_test live;
1348
+ intel_wakeref_t wakeref;
1349
+ struct smoketest *smoke;
1350
+ unsigned int n, idx;
1351
+ struct file *file;
1352
+ int ret = 0;
1353
+
1354
+ /*
1355
+ * Smoketest our breadcrumb/signal handling for requests across multiple
1356
+ * threads. A very simple test to only catch the most egregious of bugs.
1357
+ * See __igt_breadcrumbs_smoketest();
1358
+ *
1359
+ * On real hardware this time.
1360
+ */
1361
+
1362
+ wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1363
+
1364
+ file = mock_file(i915);
1365
+ if (IS_ERR(file)) {
1366
+ ret = PTR_ERR(file);
1367
+ goto out_rpm;
1368
+ }
1369
+
1370
+ smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1371
+ if (!smoke) {
1372
+ ret = -ENOMEM;
1373
+ goto out_file;
1374
+ }
1375
+
1376
+ threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1377
+ if (!threads) {
1378
+ ret = -ENOMEM;
1379
+ goto out_smoke;
1380
+ }
1381
+
1382
+ smoke[0].request_alloc = __live_request_alloc;
1383
+ smoke[0].ncontexts = 64;
1384
+ smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1385
+ sizeof(*smoke[0].contexts),
1386
+ GFP_KERNEL);
1387
+ if (!smoke[0].contexts) {
1388
+ ret = -ENOMEM;
1389
+ goto out_threads;
1390
+ }
1391
+
1392
+ for (n = 0; n < smoke[0].ncontexts; n++) {
1393
+ smoke[0].contexts[n] = live_context(i915, file);
1394
+ if (IS_ERR(smoke[0].contexts[n])) {
1395
+ ret = PTR_ERR(smoke[0].contexts[n]);
1396
+ goto out_contexts;
1397
+ }
1398
+ }
1399
+
1400
+ ret = igt_live_test_begin(&live, i915, __func__, "");
1401
+ if (ret)
1402
+ goto out_contexts;
1403
+
1404
+ idx = 0;
1405
+ for_each_uabi_engine(engine, i915) {
1406
+ smoke[idx] = smoke[0];
1407
+ smoke[idx].engine = engine;
1408
+ smoke[idx].max_batch =
1409
+ max_batches(smoke[0].contexts[0], engine);
1410
+ if (smoke[idx].max_batch < 0) {
1411
+ ret = smoke[idx].max_batch;
1412
+ goto out_flush;
1413
+ }
1414
+ /* One ring interleaved between requests from all cpus */
1415
+ smoke[idx].max_batch /= num_online_cpus() + 1;
1416
+ pr_debug("Limiting batches to %d requests on %s\n",
1417
+ smoke[idx].max_batch, engine->name);
1418
+
1419
+ for (n = 0; n < ncpus; n++) {
1420
+ struct task_struct *tsk;
1421
+
1422
+ tsk = kthread_run(__igt_breadcrumbs_smoketest,
1423
+ &smoke[idx], "igt/%d.%d", idx, n);
1424
+ if (IS_ERR(tsk)) {
1425
+ ret = PTR_ERR(tsk);
1426
+ goto out_flush;
1427
+ }
1428
+
1429
+ get_task_struct(tsk);
1430
+ threads[idx * ncpus + n] = tsk;
1431
+ }
1432
+
1433
+ idx++;
1434
+ }
1435
+
1436
+ yield(); /* start all threads before we begin */
1437
+ msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1438
+
1439
+out_flush:
1440
+ idx = 0;
1441
+ num_waits = 0;
1442
+ num_fences = 0;
1443
+ for_each_uabi_engine(engine, i915) {
1444
+ for (n = 0; n < ncpus; n++) {
1445
+ struct task_struct *tsk = threads[idx * ncpus + n];
1446
+ int err;
1447
+
1448
+ if (!tsk)
1449
+ continue;
1450
+
1451
+ err = kthread_stop(tsk);
1452
+ if (err < 0 && !ret)
1453
+ ret = err;
1454
+
1455
+ put_task_struct(tsk);
1456
+ }
1457
+
1458
+ num_waits += atomic_long_read(&smoke[idx].num_waits);
1459
+ num_fences += atomic_long_read(&smoke[idx].num_fences);
1460
+ idx++;
1461
+ }
1462
+ pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1463
+ num_waits, num_fences, idx, ncpus);
1464
+
1465
+ ret = igt_live_test_end(&live) ?: ret;
1466
+out_contexts:
1467
+ kfree(smoke[0].contexts);
1468
+out_threads:
1469
+ kfree(threads);
1470
+out_smoke:
1471
+ kfree(smoke);
1472
+out_file:
1473
+ fput(file);
1474
+out_rpm:
1475
+ intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1476
+
1477
+ return ret;
8581478 }
8591479
8601480 int i915_request_live_selftests(struct drm_i915_private *i915)
@@ -863,10 +1483,1407 @@
8631483 SUBTEST(live_nop_request),
8641484 SUBTEST(live_all_engines),
8651485 SUBTEST(live_sequential_engines),
1486
+ SUBTEST(live_parallel_engines),
8661487 SUBTEST(live_empty_request),
1488
+ SUBTEST(live_breadcrumbs_smoketest),
8671489 };
8681490
869
- if (i915_terminally_wedged(&i915->gpu_error))
1491
+ if (intel_gt_is_wedged(&i915->gt))
1492
+ return 0;
1493
+
1494
+ return i915_subtests(tests, i915);
1495
+}
1496
+
1497
+static int switch_to_kernel_sync(struct intel_context *ce, int err)
1498
+{
1499
+ struct i915_request *rq;
1500
+ struct dma_fence *fence;
1501
+
1502
+ rq = intel_engine_create_kernel_request(ce->engine);
1503
+ if (IS_ERR(rq))
1504
+ return PTR_ERR(rq);
1505
+
1506
+ fence = i915_active_fence_get(&ce->timeline->last_request);
1507
+ if (fence) {
1508
+ i915_request_await_dma_fence(rq, fence);
1509
+ dma_fence_put(fence);
1510
+ }
1511
+
1512
+ rq = i915_request_get(rq);
1513
+ i915_request_add(rq);
1514
+ if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1515
+ err = -ETIME;
1516
+ i915_request_put(rq);
1517
+
1518
+ while (!err && !intel_engine_is_idle(ce->engine))
1519
+ intel_engine_flush_submission(ce->engine);
1520
+
1521
+ return err;
1522
+}
1523
+
1524
+struct perf_stats {
1525
+ struct intel_engine_cs *engine;
1526
+ unsigned long count;
1527
+ ktime_t time;
1528
+ ktime_t busy;
1529
+ u64 runtime;
1530
+};
1531
+
1532
+struct perf_series {
1533
+ struct drm_i915_private *i915;
1534
+ unsigned int nengines;
1535
+ struct intel_context *ce[];
1536
+};
1537
+
1538
+static int cmp_u32(const void *A, const void *B)
1539
+{
1540
+ const u32 *a = A, *b = B;
1541
+
1542
+ return *a - *b;
1543
+}
1544
+
1545
+static u32 trifilter(u32 *a)
1546
+{
1547
+ u64 sum;
1548
+
1549
+#define TF_COUNT 5
1550
+ sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1551
+
1552
+ sum = mul_u32_u32(a[2], 2);
1553
+ sum += a[1];
1554
+ sum += a[3];
1555
+
1556
+ GEM_BUG_ON(sum > U32_MAX);
1557
+ return sum;
1558
+#define TF_BIAS 2
1559
+}
1560
+
1561
+static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1562
+{
1563
+ u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
1564
+
1565
+ return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1566
+}
1567
+
1568
+static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1569
+{
1570
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1571
+ *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1572
+ *cs++ = offset;
1573
+ *cs++ = 0;
1574
+
1575
+ return cs;
1576
+}
1577
+
1578
+static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1579
+{
1580
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1581
+ *cs++ = offset;
1582
+ *cs++ = 0;
1583
+ *cs++ = value;
1584
+
1585
+ return cs;
1586
+}
1587
+
1588
+static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1589
+{
1590
+ *cs++ = MI_SEMAPHORE_WAIT |
1591
+ MI_SEMAPHORE_GLOBAL_GTT |
1592
+ MI_SEMAPHORE_POLL |
1593
+ mode;
1594
+ *cs++ = value;
1595
+ *cs++ = offset;
1596
+ *cs++ = 0;
1597
+
1598
+ return cs;
1599
+}
1600
+
1601
+static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1602
+{
1603
+ return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1604
+}
1605
+
1606
+static void semaphore_set(u32 *sema, u32 value)
1607
+{
1608
+ WRITE_ONCE(*sema, value);
1609
+ wmb(); /* flush the update to the cache, and beyond */
1610
+}
1611
+
1612
+static u32 *hwsp_scratch(const struct intel_context *ce)
1613
+{
1614
+ return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1615
+}
1616
+
1617
+static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1618
+{
1619
+ return (i915_ggtt_offset(ce->engine->status_page.vma) +
1620
+ offset_in_page(dw));
1621
+}
1622
+
1623
+static int measure_semaphore_response(struct intel_context *ce)
1624
+{
1625
+ u32 *sema = hwsp_scratch(ce);
1626
+ const u32 offset = hwsp_offset(ce, sema);
1627
+ u32 elapsed[TF_COUNT], cycles;
1628
+ struct i915_request *rq;
1629
+ u32 *cs;
1630
+ int err;
1631
+ int i;
1632
+
1633
+ /*
1634
+ * Measure how many cycles it takes for the HW to detect the change
1635
+ * in a semaphore value.
1636
+ *
1637
+ * A: read CS_TIMESTAMP from CPU
1638
+ * poke semaphore
1639
+ * B: read CS_TIMESTAMP on GPU
1640
+ *
1641
+ * Semaphore latency: B - A
1642
+ */
1643
+
1644
+ semaphore_set(sema, -1);
1645
+
1646
+ rq = i915_request_create(ce);
1647
+ if (IS_ERR(rq))
1648
+ return PTR_ERR(rq);
1649
+
1650
+ cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1651
+ if (IS_ERR(cs)) {
1652
+ i915_request_add(rq);
1653
+ err = PTR_ERR(cs);
1654
+ goto err;
1655
+ }
1656
+
1657
+ cs = emit_store_dw(cs, offset, 0);
1658
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1659
+ cs = emit_semaphore_poll_until(cs, offset, i);
1660
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1661
+ cs = emit_store_dw(cs, offset, 0);
1662
+ }
1663
+
1664
+ intel_ring_advance(rq, cs);
1665
+ i915_request_add(rq);
1666
+
1667
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1668
+ err = -EIO;
1669
+ goto err;
1670
+ }
1671
+
1672
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1673
+ preempt_disable();
1674
+ cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1675
+ semaphore_set(sema, i);
1676
+ preempt_enable();
1677
+
1678
+ if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1679
+ err = -EIO;
1680
+ goto err;
1681
+ }
1682
+
1683
+ elapsed[i - 1] = sema[i] - cycles;
1684
+ }
1685
+
1686
+ cycles = trifilter(elapsed);
1687
+ pr_info("%s: semaphore response %d cycles, %lluns\n",
1688
+ ce->engine->name, cycles >> TF_BIAS,
1689
+ cycles_to_ns(ce->engine, cycles));
1690
+
1691
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1692
+
1693
+err:
1694
+ intel_gt_set_wedged(ce->engine->gt);
1695
+ return err;
1696
+}
1697
+
1698
+static int measure_idle_dispatch(struct intel_context *ce)
1699
+{
1700
+ u32 *sema = hwsp_scratch(ce);
1701
+ const u32 offset = hwsp_offset(ce, sema);
1702
+ u32 elapsed[TF_COUNT], cycles;
1703
+ u32 *cs;
1704
+ int err;
1705
+ int i;
1706
+
1707
+ /*
1708
+ * Measure how long it takes for us to submit a request while the
1709
+ * engine is idle, but is resting in our context.
1710
+ *
1711
+ * A: read CS_TIMESTAMP from CPU
1712
+ * submit request
1713
+ * B: read CS_TIMESTAMP on GPU
1714
+ *
1715
+ * Submission latency: B - A
1716
+ */
1717
+
1718
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1719
+ struct i915_request *rq;
1720
+
1721
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1722
+ if (err)
1723
+ return err;
1724
+
1725
+ rq = i915_request_create(ce);
1726
+ if (IS_ERR(rq)) {
1727
+ err = PTR_ERR(rq);
1728
+ goto err;
1729
+ }
1730
+
1731
+ cs = intel_ring_begin(rq, 4);
1732
+ if (IS_ERR(cs)) {
1733
+ i915_request_add(rq);
1734
+ err = PTR_ERR(cs);
1735
+ goto err;
1736
+ }
1737
+
1738
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1739
+
1740
+ intel_ring_advance(rq, cs);
1741
+
1742
+ preempt_disable();
1743
+ local_bh_disable();
1744
+ elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1745
+ i915_request_add(rq);
1746
+ local_bh_enable();
1747
+ preempt_enable();
1748
+ }
1749
+
1750
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1751
+ if (err)
1752
+ goto err;
1753
+
1754
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1755
+ elapsed[i] = sema[i] - elapsed[i];
1756
+
1757
+ cycles = trifilter(elapsed);
1758
+ pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1759
+ ce->engine->name, cycles >> TF_BIAS,
1760
+ cycles_to_ns(ce->engine, cycles));
1761
+
1762
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1763
+
1764
+err:
1765
+ intel_gt_set_wedged(ce->engine->gt);
1766
+ return err;
1767
+}
1768
+
1769
+static int measure_busy_dispatch(struct intel_context *ce)
1770
+{
1771
+ u32 *sema = hwsp_scratch(ce);
1772
+ const u32 offset = hwsp_offset(ce, sema);
1773
+ u32 elapsed[TF_COUNT + 1], cycles;
1774
+ u32 *cs;
1775
+ int err;
1776
+ int i;
1777
+
1778
+ /*
1779
+ * Measure how long it takes for us to submit a request while the
1780
+ * engine is busy, polling on a semaphore in our context. With
1781
+ * direct submission, this will include the cost of a lite restore.
1782
+ *
1783
+ * A: read CS_TIMESTAMP from CPU
1784
+ * submit request
1785
+ * B: read CS_TIMESTAMP on GPU
1786
+ *
1787
+ * Submission latency: B - A
1788
+ */
1789
+
1790
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1791
+ struct i915_request *rq;
1792
+
1793
+ rq = i915_request_create(ce);
1794
+ if (IS_ERR(rq)) {
1795
+ err = PTR_ERR(rq);
1796
+ goto err;
1797
+ }
1798
+
1799
+ cs = intel_ring_begin(rq, 12);
1800
+ if (IS_ERR(cs)) {
1801
+ i915_request_add(rq);
1802
+ err = PTR_ERR(cs);
1803
+ goto err;
1804
+ }
1805
+
1806
+ cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1807
+ cs = emit_semaphore_poll_until(cs, offset, i);
1808
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1809
+
1810
+ intel_ring_advance(rq, cs);
1811
+
1812
+ if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1813
+ err = -EIO;
1814
+ goto err;
1815
+ }
1816
+
1817
+ preempt_disable();
1818
+ local_bh_disable();
1819
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1820
+ i915_request_add(rq);
1821
+ local_bh_enable();
1822
+ semaphore_set(sema, i - 1);
1823
+ preempt_enable();
1824
+ }
1825
+
1826
+ wait_for(READ_ONCE(sema[i - 1]), 500);
1827
+ semaphore_set(sema, i - 1);
1828
+
1829
+ for (i = 1; i <= TF_COUNT; i++) {
1830
+ GEM_BUG_ON(sema[i] == -1);
1831
+ elapsed[i - 1] = sema[i] - elapsed[i];
1832
+ }
1833
+
1834
+ cycles = trifilter(elapsed);
1835
+ pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1836
+ ce->engine->name, cycles >> TF_BIAS,
1837
+ cycles_to_ns(ce->engine, cycles));
1838
+
1839
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1840
+
1841
+err:
1842
+ intel_gt_set_wedged(ce->engine->gt);
1843
+ return err;
1844
+}
1845
+
1846
+static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1847
+{
1848
+ const u32 offset =
1849
+ i915_ggtt_offset(engine->status_page.vma) +
1850
+ offset_in_page(sema);
1851
+ struct i915_request *rq;
1852
+ u32 *cs;
1853
+
1854
+ rq = i915_request_create(engine->kernel_context);
1855
+ if (IS_ERR(rq))
1856
+ return PTR_ERR(rq);
1857
+
1858
+ cs = intel_ring_begin(rq, 4);
1859
+ if (IS_ERR(cs)) {
1860
+ i915_request_add(rq);
1861
+ return PTR_ERR(cs);
1862
+ }
1863
+
1864
+ cs = emit_semaphore_poll(cs, mode, value, offset);
1865
+
1866
+ intel_ring_advance(rq, cs);
1867
+ i915_request_add(rq);
1868
+
1869
+ return 0;
1870
+}
1871
+
1872
+static int measure_inter_request(struct intel_context *ce)
1873
+{
1874
+ u32 *sema = hwsp_scratch(ce);
1875
+ const u32 offset = hwsp_offset(ce, sema);
1876
+ u32 elapsed[TF_COUNT + 1], cycles;
1877
+ struct i915_sw_fence *submit;
1878
+ int i, err;
1879
+
1880
+ /*
1881
+ * Measure how long it takes to advance from one request into the
1882
+ * next. Between each request we flush the GPU caches to memory,
1883
+ * update the breadcrumbs, and then invalidate those caches.
1884
+ * We queue up all the requests to be submitted in one batch so
1885
+ * it should be one set of contiguous measurements.
1886
+ *
1887
+ * A: read CS_TIMESTAMP on GPU
1888
+ * advance request
1889
+ * B: read CS_TIMESTAMP on GPU
1890
+ *
1891
+ * Request latency: B - A
1892
+ */
1893
+
1894
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1895
+ if (err)
1896
+ return err;
1897
+
1898
+ submit = heap_fence_create(GFP_KERNEL);
1899
+ if (!submit) {
1900
+ semaphore_set(sema, 1);
1901
+ return -ENOMEM;
1902
+ }
1903
+
1904
+ intel_engine_flush_submission(ce->engine);
1905
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1906
+ struct i915_request *rq;
1907
+ u32 *cs;
1908
+
1909
+ rq = i915_request_create(ce);
1910
+ if (IS_ERR(rq)) {
1911
+ err = PTR_ERR(rq);
1912
+ goto err_submit;
1913
+ }
1914
+
1915
+ err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1916
+ submit,
1917
+ GFP_KERNEL);
1918
+ if (err < 0) {
1919
+ i915_request_add(rq);
1920
+ goto err_submit;
1921
+ }
1922
+
1923
+ cs = intel_ring_begin(rq, 4);
1924
+ if (IS_ERR(cs)) {
1925
+ i915_request_add(rq);
1926
+ err = PTR_ERR(cs);
1927
+ goto err_submit;
1928
+ }
1929
+
1930
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1931
+
1932
+ intel_ring_advance(rq, cs);
1933
+ i915_request_add(rq);
1934
+ }
1935
+ local_bh_disable();
1936
+ i915_sw_fence_commit(submit);
1937
+ local_bh_enable();
1938
+ intel_engine_flush_submission(ce->engine);
1939
+ heap_fence_put(submit);
1940
+
1941
+ semaphore_set(sema, 1);
1942
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1943
+ if (err)
1944
+ goto err;
1945
+
1946
+ for (i = 1; i <= TF_COUNT; i++)
1947
+ elapsed[i - 1] = sema[i + 1] - sema[i];
1948
+
1949
+ cycles = trifilter(elapsed);
1950
+ pr_info("%s: inter-request latency %d cycles, %lluns\n",
1951
+ ce->engine->name, cycles >> TF_BIAS,
1952
+ cycles_to_ns(ce->engine, cycles));
1953
+
1954
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1955
+
1956
+err_submit:
1957
+ i915_sw_fence_commit(submit);
1958
+ heap_fence_put(submit);
1959
+ semaphore_set(sema, 1);
1960
+err:
1961
+ intel_gt_set_wedged(ce->engine->gt);
1962
+ return err;
1963
+}
1964
+
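/*
 * The i915_sw_fence created by heap_fence_create() above acts as a gate on
 * submission: every request's ->submit fence awaits it, so the requests
 * are all constructed and queued but none reaches the backend until
 * i915_sw_fence_commit() releases the whole batch at once. The commit is
 * bracketed by local_bh_disable()/local_bh_enable(), presumably so the
 * softirq-driven submission tasklet only runs after the entire batch has
 * been released, keeping the measured requests contiguous.
 */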
1965
+static int measure_context_switch(struct intel_context *ce)
1966
+{
1967
+ u32 *sema = hwsp_scratch(ce);
1968
+ const u32 offset = hwsp_offset(ce, sema);
1969
+ struct i915_request *fence = NULL;
1970
+ u32 elapsed[TF_COUNT + 1], cycles;
1971
+ int i, j, err;
1972
+ u32 *cs;
1973
+
1974
+ /*
1975
+ * Measure how long it takes to advance from one request in one
1976
+ * context to a request in another context. This allows us to
1977
+ * measure how long the context save/restore take, along with all
1978
+ * the inter-context setup we require.
1979
+ *
1980
+ * A: read CS_TIMESTAMP on GPU
1981
+ * switch context
1982
+ * B: read CS_TIMESTAMP on GPU
1983
+ *
1984
+ * Context switch latency: B - A
1985
+ */
1986
+
1987
+ err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1988
+ if (err)
1989
+ return err;
1990
+
1991
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1992
+ struct intel_context *arr[] = {
1993
+ ce, ce->engine->kernel_context
1994
+ };
1995
+ u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1996
+
1997
+ for (j = 0; j < ARRAY_SIZE(arr); j++) {
1998
+ struct i915_request *rq;
1999
+
2000
+ rq = i915_request_create(arr[j]);
2001
+ if (IS_ERR(rq)) {
2002
+ err = PTR_ERR(rq);
2003
+ goto err_fence;
2004
+ }
2005
+
2006
+ if (fence) {
2007
+ err = i915_request_await_dma_fence(rq,
2008
+ &fence->fence);
2009
+ if (err) {
2010
+ i915_request_add(rq);
2011
+ goto err_fence;
2012
+ }
2013
+ }
2014
+
2015
+ cs = intel_ring_begin(rq, 4);
2016
+ if (IS_ERR(cs)) {
2017
+ i915_request_add(rq);
2018
+ err = PTR_ERR(cs);
2019
+ goto err_fence;
2020
+ }
2021
+
2022
+ cs = emit_timestamp_store(cs, ce, addr);
2023
+ addr += sizeof(u32);
2024
+
2025
+ intel_ring_advance(rq, cs);
2026
+
2027
+ i915_request_put(fence);
2028
+ fence = i915_request_get(rq);
2029
+
2030
+ i915_request_add(rq);
2031
+ }
2032
+ }
2033
+ i915_request_put(fence);
2034
+ intel_engine_flush_submission(ce->engine);
2035
+
2036
+ semaphore_set(sema, 1);
2037
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2038
+ if (err)
2039
+ goto err;
2040
+
2041
+ for (i = 1; i <= TF_COUNT; i++)
2042
+ elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2043
+
2044
+ cycles = trifilter(elapsed);
2045
+ pr_info("%s: context switch latency %d cycles, %lluns\n",
2046
+ ce->engine->name, cycles >> TF_BIAS,
2047
+ cycles_to_ns(ce->engine, cycles));
2048
+
2049
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2050
+
2051
+err_fence:
2052
+ i915_request_put(fence);
2053
+ semaphore_set(sema, 1);
2054
+err:
2055
+ intel_gt_set_wedged(ce->engine->gt);
2056
+ return err;
2057
+}
2058
+
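/*
 * Each iteration above emits two timestamps: sema[2 * i] from ce and
 * sema[2 * i + 1] from the kernel context. The delta loop therefore takes
 * sema[2 * i + 2] - sema[2 * i + 1], the gap between the kernel context's
 * timestamp in one iteration and ce's timestamp in the next, which spans a
 * single kernel-context -> ce switch.
 */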
2059
+static int measure_preemption(struct intel_context *ce)
2060
+{
2061
+ u32 *sema = hwsp_scratch(ce);
2062
+ const u32 offset = hwsp_offset(ce, sema);
2063
+ u32 elapsed[TF_COUNT], cycles;
2064
+ u32 *cs;
2065
+ int err;
2066
+ int i;
2067
+
2068
+ /*
2069
+ * We measure two latencies while triggering preemption. The first
2070
+ * latency is how long it takes for us to submit a preempting request.
2071
+ * The second latency is how long it takes for us to return from the
2072
+ * preemption back to the original context.
2073
+ *
2074
+ * A: read CS_TIMESTAMP from CPU
2075
+ * submit preemption
2076
+ * B: read CS_TIMESTAMP on GPU (in preempting context)
2077
+ * context switch
2078
+ * C: read CS_TIMESTAMP on GPU (in original context)
2079
+ *
2080
+ * Preemption dispatch latency: B - A
2081
+ * Preemption switch latency: C - B
2082
+ */
2083
+
2084
+ if (!intel_engine_has_preemption(ce->engine))
2085
+ return 0;
2086
+
2087
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2088
+ u32 addr = offset + 2 * i * sizeof(u32);
2089
+ struct i915_request *rq;
2090
+
2091
+ rq = i915_request_create(ce);
2092
+ if (IS_ERR(rq)) {
2093
+ err = PTR_ERR(rq);
2094
+ goto err;
2095
+ }
2096
+
2097
+ cs = intel_ring_begin(rq, 12);
2098
+ if (IS_ERR(cs)) {
2099
+ i915_request_add(rq);
2100
+ err = PTR_ERR(cs);
2101
+ goto err;
2102
+ }
2103
+
2104
+ cs = emit_store_dw(cs, addr, -1);
2105
+ cs = emit_semaphore_poll_until(cs, offset, i);
2106
+ cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2107
+
2108
+ intel_ring_advance(rq, cs);
2109
+ i915_request_add(rq);
2110
+
2111
+ if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2112
+ err = -EIO;
2113
+ goto err;
2114
+ }
2115
+
2116
+ rq = i915_request_create(ce->engine->kernel_context);
2117
+ if (IS_ERR(rq)) {
2118
+ err = PTR_ERR(rq);
2119
+ goto err;
2120
+ }
2121
+
2122
+ cs = intel_ring_begin(rq, 8);
2123
+ if (IS_ERR(cs)) {
2124
+ i915_request_add(rq);
2125
+ err = PTR_ERR(cs);
2126
+ goto err;
2127
+ }
2128
+
2129
+ cs = emit_timestamp_store(cs, ce, addr);
2130
+ cs = emit_store_dw(cs, offset, i);
2131
+
2132
+ intel_ring_advance(rq, cs);
2133
+ rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2134
+
2135
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2136
+ i915_request_add(rq);
2137
+ }
2138
+
2139
+ if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2140
+ err = -EIO;
2141
+ goto err;
2142
+ }
2143
+
2144
+ for (i = 1; i <= TF_COUNT; i++)
2145
+ elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2146
+
2147
+ cycles = trifilter(elapsed);
2148
+ pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2149
+ ce->engine->name, cycles >> TF_BIAS,
2150
+ cycles_to_ns(ce->engine, cycles));
2151
+
2152
+ for (i = 1; i <= TF_COUNT; i++)
2153
+ elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2154
+
2155
+ cycles = trifilter(elapsed);
2156
+ pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2157
+ ce->engine->name, cycles >> TF_BIAS,
2158
+ cycles_to_ns(ce->engine, cycles));
2159
+
2160
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2161
+
2162
+err:
2163
+ intel_gt_set_wedged(ce->engine->gt);
2164
+ return err;
2165
+}
2166
+
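/*
 * For sample i above: sema[2 * i] receives the preempting (kernel context)
 * request's GPU timestamp, sema[2 * i + 1] the original context's
 * timestamp once it resumes, and elapsed[i - 1] initially holds the CPU
 * timestamp taken just before the preempting request was submitted. The
 * two delta loops then recover B - A (dispatch) and C - B (switch) as
 * described in the comment at the top of the function.
 */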
2167
+struct signal_cb {
2168
+ struct dma_fence_cb base;
2169
+ bool seen;
2170
+};
2171
+
2172
+static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2173
+{
2174
+ struct signal_cb *s = container_of(cb, typeof(*s), base);
2175
+
2176
+ smp_store_mb(s->seen, true); /* be safe, be strong */
2177
+}
2178
+
2179
+static int measure_completion(struct intel_context *ce)
2180
+{
2181
+ u32 *sema = hwsp_scratch(ce);
2182
+ const u32 offset = hwsp_offset(ce, sema);
2183
+ u32 elapsed[TF_COUNT], cycles;
2184
+ u32 *cs;
2185
+ int err;
2186
+ int i;
2187
+
2188
+ /*
2189
+ * Measure how long it takes for the signal (interrupt) to be
2190
+ * sent from the GPU and processed by the CPU.
2191
+ *
2192
+ * A: read CS_TIMESTAMP on GPU
2193
+ * signal
2194
+ * B: read CS_TIMESTAMP from CPU
2195
+ *
2196
+ * Completion latency: B - A
2197
+ */
2198
+
2199
+ for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2200
+ struct signal_cb cb = { .seen = false };
2201
+ struct i915_request *rq;
2202
+
2203
+ rq = i915_request_create(ce);
2204
+ if (IS_ERR(rq)) {
2205
+ err = PTR_ERR(rq);
2206
+ goto err;
2207
+ }
2208
+
2209
+ cs = intel_ring_begin(rq, 12);
2210
+ if (IS_ERR(cs)) {
2211
+ i915_request_add(rq);
2212
+ err = PTR_ERR(cs);
2213
+ goto err;
2214
+ }
2215
+
2216
+ cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2217
+ cs = emit_semaphore_poll_until(cs, offset, i);
2218
+ cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2219
+
2220
+ intel_ring_advance(rq, cs);
2221
+
2222
+ dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2223
+
2224
+ local_bh_disable();
2225
+ i915_request_add(rq);
2226
+ local_bh_enable();
2227
+
2228
+ if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2229
+ err = -EIO;
2230
+ goto err;
2231
+ }
2232
+
2233
+ preempt_disable();
2234
+ semaphore_set(sema, i);
2235
+ while (!READ_ONCE(cb.seen))
2236
+ cpu_relax();
2237
+
2238
+ elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2239
+ preempt_enable();
2240
+ }
2241
+
2242
+ err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2243
+ if (err)
2244
+ goto err;
2245
+
2246
+ for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2247
+ GEM_BUG_ON(sema[i + 1] == -1);
2248
+ elapsed[i] = elapsed[i] - sema[i + 1];
2249
+ }
2250
+
2251
+ cycles = trifilter(elapsed);
2252
+ pr_info("%s: completion latency %d cycles, %lluns\n",
2253
+ ce->engine->name, cycles >> TF_BIAS,
2254
+ cycles_to_ns(ce->engine, cycles));
2255
+
2256
+ return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2257
+
2258
+err:
2259
+ intel_gt_set_wedged(ce->engine->gt);
2260
+ return err;
2261
+}
2262
+
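/*
 * In the loop above, request i writes its GPU timestamp into sema[i] as
 * soon as the CPU releases the semaphore, while elapsed[i - 1] records the
 * CPU's RING_TIMESTAMP read once the dma-fence callback has fired; the
 * final loop subtracts the two, so the reported latency covers the
 * breadcrumb write, interrupt and fence-signalling path from the GPU back
 * to the CPU.
 */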
2263
+static void rps_pin(struct intel_gt *gt)
2264
+{
2265
+ /* Pin the frequency to max */
2266
+ atomic_inc(&gt->rps.num_waiters);
2267
+ intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2268
+
2269
+ mutex_lock(&gt->rps.lock);
2270
+ intel_rps_set(&gt->rps, gt->rps.max_freq);
2271
+ mutex_unlock(&gt->rps.lock);
2272
+}
2273
+
2274
+static void rps_unpin(struct intel_gt *gt)
2275
+{
2276
+ intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2277
+ atomic_dec(&gt->rps.num_waiters);
2278
+}
2279
+
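/*
 * rps_pin()/rps_unpin() bracket each engine's measurements in
 * perf_request_latency() below: taking an rps waiter reference, holding
 * forcewake and requesting max_freq presumably keeps the GT awake and at a
 * fixed (maximum) frequency, so the latency samples are not skewed by
 * frequency ramping in the middle of a run.
 */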
2280
+static int perf_request_latency(void *arg)
2281
+{
2282
+ struct drm_i915_private *i915 = arg;
2283
+ struct intel_engine_cs *engine;
2284
+ struct pm_qos_request qos;
2285
+ int err = 0;
2286
+
2287
+ if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2288
+ return 0;
2289
+
2290
+ cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2291
+
2292
+ for_each_uabi_engine(engine, i915) {
2293
+ struct intel_context *ce;
2294
+
2295
+ ce = intel_context_create(engine);
2296
+ if (IS_ERR(ce)) {
2297
+ err = PTR_ERR(ce);
2298
+ goto out;
2299
+ }
2300
+
2301
+ err = intel_context_pin(ce);
2302
+ if (err) {
2303
+ intel_context_put(ce);
2304
+ goto out;
2305
+ }
2306
+
2307
+ st_engine_heartbeat_disable(engine);
2308
+ rps_pin(engine->gt);
2309
+
2310
+ if (err == 0)
2311
+ err = measure_semaphore_response(ce);
2312
+ if (err == 0)
2313
+ err = measure_idle_dispatch(ce);
2314
+ if (err == 0)
2315
+ err = measure_busy_dispatch(ce);
2316
+ if (err == 0)
2317
+ err = measure_inter_request(ce);
2318
+ if (err == 0)
2319
+ err = measure_context_switch(ce);
2320
+ if (err == 0)
2321
+ err = measure_preemption(ce);
2322
+ if (err == 0)
2323
+ err = measure_completion(ce);
2324
+
2325
+ rps_unpin(engine->gt);
2326
+ st_engine_heartbeat_enable(engine);
2327
+
2328
+ intel_context_unpin(ce);
2329
+ intel_context_put(ce);
2330
+ if (err)
2331
+ goto out;
2332
+ }
2333
+
2334
+out:
2335
+ if (igt_flush_test(i915))
2336
+ err = -EIO;
2337
+
2338
+ cpu_latency_qos_remove_request(&qos);
2339
+ return err;
2340
+}
2341
+
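/*
 * The three series workloads below drive the pre-pinned contexts of
 * perf_series_engines() with different submission patterns: s_sync0()
 * submits a single request and waits for it before moving to the next
 * engine, s_sync1() keeps one request in flight by waiting on the previous
 * request instead, and s_many() submits continuously without waiting at
 * all.
 */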
2342
+static int s_sync0(void *arg)
2343
+{
2344
+ struct perf_series *ps = arg;
2345
+ IGT_TIMEOUT(end_time);
2346
+ unsigned int idx = 0;
2347
+ int err = 0;
2348
+
2349
+ GEM_BUG_ON(!ps->nengines);
2350
+ do {
2351
+ struct i915_request *rq;
2352
+
2353
+ rq = i915_request_create(ps->ce[idx]);
2354
+ if (IS_ERR(rq)) {
2355
+ err = PTR_ERR(rq);
2356
+ break;
2357
+ }
2358
+
2359
+ i915_request_get(rq);
2360
+ i915_request_add(rq);
2361
+
2362
+ if (i915_request_wait(rq, 0, HZ / 5) < 0)
2363
+ err = -ETIME;
2364
+ i915_request_put(rq);
2365
+ if (err)
2366
+ break;
2367
+
2368
+ if (++idx == ps->nengines)
2369
+ idx = 0;
2370
+ } while (!__igt_timeout(end_time, NULL));
2371
+
2372
+ return err;
2373
+}
2374
+
2375
+static int s_sync1(void *arg)
2376
+{
2377
+ struct perf_series *ps = arg;
2378
+ struct i915_request *prev = NULL;
2379
+ IGT_TIMEOUT(end_time);
2380
+ unsigned int idx = 0;
2381
+ int err = 0;
2382
+
2383
+ GEM_BUG_ON(!ps->nengines);
2384
+ do {
2385
+ struct i915_request *rq;
2386
+
2387
+ rq = i915_request_create(ps->ce[idx]);
2388
+ if (IS_ERR(rq)) {
2389
+ err = PTR_ERR(rq);
2390
+ break;
2391
+ }
2392
+
2393
+ i915_request_get(rq);
2394
+ i915_request_add(rq);
2395
+
2396
+ if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2397
+ err = -ETIME;
2398
+ i915_request_put(prev);
2399
+ prev = rq;
2400
+ if (err)
2401
+ break;
2402
+
2403
+ if (++idx == ps->nengines)
2404
+ idx = 0;
2405
+ } while (!__igt_timeout(end_time, NULL));
2406
+ i915_request_put(prev);
2407
+
2408
+ return err;
2409
+}
2410
+
2411
+static int s_many(void *arg)
2412
+{
2413
+ struct perf_series *ps = arg;
2414
+ IGT_TIMEOUT(end_time);
2415
+ unsigned int idx = 0;
2416
+
2417
+ GEM_BUG_ON(!ps->nengines);
2418
+ do {
2419
+ struct i915_request *rq;
2420
+
2421
+ rq = i915_request_create(ps->ce[idx]);
2422
+ if (IS_ERR(rq))
2423
+ return PTR_ERR(rq);
2424
+
2425
+ i915_request_add(rq);
2426
+
2427
+ if (++idx == ps->nengines)
2428
+ idx = 0;
2429
+ } while (!__igt_timeout(end_time, NULL));
2430
+
2431
+ return 0;
2432
+}
2433
+
2434
+static int perf_series_engines(void *arg)
2435
+{
2436
+ struct drm_i915_private *i915 = arg;
2437
+ static int (* const func[])(void *arg) = {
2438
+ s_sync0,
2439
+ s_sync1,
2440
+ s_many,
2441
+ NULL,
2442
+ };
2443
+ const unsigned int nengines = num_uabi_engines(i915);
2444
+ struct intel_engine_cs *engine;
2445
+ int (* const *fn)(void *arg);
2446
+ struct pm_qos_request qos;
2447
+ struct perf_stats *stats;
2448
+ struct perf_series *ps;
2449
+ unsigned int idx;
2450
+ int err = 0;
2451
+
2452
+ stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2453
+ if (!stats)
2454
+ return -ENOMEM;
2455
+
2456
+ ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2457
+ if (!ps) {
2458
+ kfree(stats);
2459
+ return -ENOMEM;
2460
+ }
2461
+
2462
+ cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2463
+
2464
+ ps->i915 = i915;
2465
+ ps->nengines = nengines;
2466
+
2467
+ idx = 0;
2468
+ for_each_uabi_engine(engine, i915) {
2469
+ struct intel_context *ce;
2470
+
2471
+ ce = intel_context_create(engine);
2472
+ if (IS_ERR(ce)) {
2473
+ err = PTR_ERR(ce);
2474
+ goto out;
2475
+ }
2476
+
2477
+ err = intel_context_pin(ce);
2478
+ if (err) {
2479
+ intel_context_put(ce);
2480
+ goto out;
2481
+ }
2482
+
2483
+ ps->ce[idx++] = ce;
2484
+ }
2485
+ GEM_BUG_ON(idx != ps->nengines);
2486
+
2487
+ for (fn = func; *fn && !err; fn++) {
2488
+ char name[KSYM_NAME_LEN];
2489
+ struct igt_live_test t;
2490
+
2491
+ snprintf(name, sizeof(name), "%ps", *fn);
2492
+ err = igt_live_test_begin(&t, i915, __func__, name);
2493
+ if (err)
2494
+ break;
2495
+
2496
+ for (idx = 0; idx < nengines; idx++) {
2497
+ struct perf_stats *p =
2498
+ memset(&stats[idx], 0, sizeof(stats[idx]));
2499
+ struct intel_context *ce = ps->ce[idx];
2500
+
2501
+ p->engine = ps->ce[idx]->engine;
2502
+ intel_engine_pm_get(p->engine);
2503
+
2504
+ if (intel_engine_supports_stats(p->engine))
2505
+ p->busy = intel_engine_get_busy_time(p->engine,
2506
+ &p->time) + 1;
2507
+ else
2508
+ p->time = ktime_get();
2509
+ p->runtime = -intel_context_get_total_runtime_ns(ce);
2510
+ }
2511
+
2512
+ err = (*fn)(ps);
2513
+ if (igt_live_test_end(&t))
2514
+ err = -EIO;
2515
+
2516
+ for (idx = 0; idx < nengines; idx++) {
2517
+ struct perf_stats *p = &stats[idx];
2518
+ struct intel_context *ce = ps->ce[idx];
2519
+ int integer, decimal;
2520
+ u64 busy, dt, now;
2521
+
2522
+ if (p->busy)
2523
+ p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2524
+ &now),
2525
+ p->busy - 1);
2526
+ else
2527
+ now = ktime_get();
2528
+ p->time = ktime_sub(now, p->time);
2529
+
2530
+ err = switch_to_kernel_sync(ce, err);
2531
+ p->runtime += intel_context_get_total_runtime_ns(ce);
2532
+ intel_engine_pm_put(p->engine);
2533
+
2534
+ busy = 100 * ktime_to_ns(p->busy);
2535
+ dt = ktime_to_ns(p->time);
2536
+ if (dt) {
2537
+ integer = div64_u64(busy, dt);
2538
+ busy -= integer * dt;
2539
+ decimal = div64_u64(100 * busy, dt);
2540
+ } else {
2541
+ integer = 0;
2542
+ decimal = 0;
2543
+ }
2544
+
2545
+ pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2546
+ name, p->engine->name, ce->timeline->seqno,
2547
+ integer, decimal,
2548
+ div_u64(p->runtime, 1000 * 1000),
2549
+ div_u64(ktime_to_ns(p->time), 1000 * 1000));
2550
+ }
2551
+ }
2552
+
2553
+out:
2554
+ for (idx = 0; idx < nengines; idx++) {
2555
+ if (IS_ERR_OR_NULL(ps->ce[idx]))
2556
+ break;
2557
+
2558
+ intel_context_unpin(ps->ce[idx]);
2559
+ intel_context_put(ps->ce[idx]);
2560
+ }
2561
+ kfree(ps);
2562
+
2563
+ cpu_latency_qos_remove_request(&qos);
2564
+ kfree(stats);
2565
+ return err;
2566
+}
2567
+
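/*
 * The per-engine threads spawned by perf_parallel_engines() below run one
 * of the following three bodies. They mirror s_sync0()/s_sync1()/s_many(),
 * but each creates and pins its own context and records its throughput
 * (count), busy time and context runtime into a struct perf_stats.
 */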
2568
+static int p_sync0(void *arg)
2569
+{
2570
+ struct perf_stats *p = arg;
2571
+ struct intel_engine_cs *engine = p->engine;
2572
+ struct intel_context *ce;
2573
+ IGT_TIMEOUT(end_time);
2574
+ unsigned long count;
2575
+ bool busy;
2576
+ int err = 0;
2577
+
2578
+ ce = intel_context_create(engine);
2579
+ if (IS_ERR(ce))
2580
+ return PTR_ERR(ce);
2581
+
2582
+ err = intel_context_pin(ce);
2583
+ if (err) {
2584
+ intel_context_put(ce);
2585
+ return err;
2586
+ }
2587
+
2588
+ if (intel_engine_supports_stats(engine)) {
2589
+ p->busy = intel_engine_get_busy_time(engine, &p->time);
2590
+ busy = true;
2591
+ } else {
2592
+ p->time = ktime_get();
2593
+ busy = false;
2594
+ }
2595
+
2596
+ count = 0;
2597
+ do {
2598
+ struct i915_request *rq;
2599
+
2600
+ rq = i915_request_create(ce);
2601
+ if (IS_ERR(rq)) {
2602
+ err = PTR_ERR(rq);
2603
+ break;
2604
+ }
2605
+
2606
+ i915_request_get(rq);
2607
+ i915_request_add(rq);
2608
+
2609
+ err = 0;
2610
+ if (i915_request_wait(rq, 0, HZ / 5) < 0)
2611
+ err = -ETIME;
2612
+ i915_request_put(rq);
2613
+ if (err)
2614
+ break;
2615
+
2616
+ count++;
2617
+ } while (!__igt_timeout(end_time, NULL));
2618
+
2619
+ if (busy) {
2620
+ ktime_t now;
2621
+
2622
+ p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2623
+ p->busy);
2624
+ p->time = ktime_sub(now, p->time);
2625
+ } else {
2626
+ p->time = ktime_sub(ktime_get(), p->time);
2627
+ }
2628
+
2629
+ err = switch_to_kernel_sync(ce, err);
2630
+ p->runtime = intel_context_get_total_runtime_ns(ce);
2631
+ p->count = count;
2632
+
2633
+ intel_context_unpin(ce);
2634
+ intel_context_put(ce);
2635
+ return err;
2636
+}
2637
+
2638
+static int p_sync1(void *arg)
2639
+{
2640
+ struct perf_stats *p = arg;
2641
+ struct intel_engine_cs *engine = p->engine;
2642
+ struct i915_request *prev = NULL;
2643
+ struct intel_context *ce;
2644
+ IGT_TIMEOUT(end_time);
2645
+ unsigned long count;
2646
+ bool busy;
2647
+ int err = 0;
2648
+
2649
+ ce = intel_context_create(engine);
2650
+ if (IS_ERR(ce))
2651
+ return PTR_ERR(ce);
2652
+
2653
+ err = intel_context_pin(ce);
2654
+ if (err) {
2655
+ intel_context_put(ce);
2656
+ return err;
2657
+ }
2658
+
2659
+ if (intel_engine_supports_stats(engine)) {
2660
+ p->busy = intel_engine_get_busy_time(engine, &p->time);
2661
+ busy = true;
2662
+ } else {
2663
+ p->time = ktime_get();
2664
+ busy = false;
2665
+ }
2666
+
2667
+ count = 0;
2668
+ do {
2669
+ struct i915_request *rq;
2670
+
2671
+ rq = i915_request_create(ce);
2672
+ if (IS_ERR(rq)) {
2673
+ err = PTR_ERR(rq);
2674
+ break;
2675
+ }
2676
+
2677
+ i915_request_get(rq);
2678
+ i915_request_add(rq);
2679
+
2680
+ err = 0;
2681
+ if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2682
+ err = -ETIME;
2683
+ i915_request_put(prev);
2684
+ prev = rq;
2685
+ if (err)
2686
+ break;
2687
+
2688
+ count++;
2689
+ } while (!__igt_timeout(end_time, NULL));
2690
+ i915_request_put(prev);
2691
+
2692
+ if (busy) {
2693
+ ktime_t now;
2694
+
2695
+ p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2696
+ p->busy);
2697
+ p->time = ktime_sub(now, p->time);
2698
+ } else {
2699
+ p->time = ktime_sub(ktime_get(), p->time);
2700
+ }
2701
+
2702
+ err = switch_to_kernel_sync(ce, err);
2703
+ p->runtime = intel_context_get_total_runtime_ns(ce);
2704
+ p->count = count;
2705
+
2706
+ intel_context_unpin(ce);
2707
+ intel_context_put(ce);
2708
+ return err;
2709
+}
2710
+
2711
+static int p_many(void *arg)
2712
+{
2713
+ struct perf_stats *p = arg;
2714
+ struct intel_engine_cs *engine = p->engine;
2715
+ struct intel_context *ce;
2716
+ IGT_TIMEOUT(end_time);
2717
+ unsigned long count;
2718
+ int err = 0;
2719
+ bool busy;
2720
+
2721
+ ce = intel_context_create(engine);
2722
+ if (IS_ERR(ce))
2723
+ return PTR_ERR(ce);
2724
+
2725
+ err = intel_context_pin(ce);
2726
+ if (err) {
2727
+ intel_context_put(ce);
2728
+ return err;
2729
+ }
2730
+
2731
+ if (intel_engine_supports_stats(engine)) {
2732
+ p->busy = intel_engine_get_busy_time(engine, &p->time);
2733
+ busy = true;
2734
+ } else {
2735
+ p->time = ktime_get();
2736
+ busy = false;
2737
+ }
2738
+
2739
+ count = 0;
2740
+ do {
2741
+ struct i915_request *rq;
2742
+
2743
+ rq = i915_request_create(ce);
2744
+ if (IS_ERR(rq)) {
2745
+ err = PTR_ERR(rq);
2746
+ break;
2747
+ }
2748
+
2749
+ i915_request_add(rq);
2750
+ count++;
2751
+ } while (!__igt_timeout(end_time, NULL));
2752
+
2753
+ if (busy) {
2754
+ ktime_t now;
2755
+
2756
+ p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2757
+ p->busy);
2758
+ p->time = ktime_sub(now, p->time);
2759
+ } else {
2760
+ p->time = ktime_sub(ktime_get(), p->time);
2761
+ }
2762
+
2763
+ err = switch_to_kernel_sync(ce, err);
2764
+ p->runtime = intel_context_get_total_runtime_ns(ce);
2765
+ p->count = count;
2766
+
2767
+ intel_context_unpin(ce);
2768
+ intel_context_put(ce);
2769
+ return err;
2770
+}
2771
+
2772
+static int perf_parallel_engines(void *arg)
2773
+{
2774
+ struct drm_i915_private *i915 = arg;
2775
+ static int (* const func[])(void *arg) = {
2776
+ p_sync0,
2777
+ p_sync1,
2778
+ p_many,
2779
+ NULL,
2780
+ };
2781
+ const unsigned int nengines = num_uabi_engines(i915);
2782
+ struct intel_engine_cs *engine;
2783
+ int (* const *fn)(void *arg);
2784
+ struct pm_qos_request qos;
2785
+ struct {
2786
+ struct perf_stats p;
2787
+ struct task_struct *tsk;
2788
+ } *engines;
2789
+ int err = 0;
2790
+
2791
+ engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2792
+ if (!engines)
2793
+ return -ENOMEM;
2794
+
2795
+ cpu_latency_qos_add_request(&qos, 0);
2796
+
2797
+ for (fn = func; *fn; fn++) {
2798
+ char name[KSYM_NAME_LEN];
2799
+ struct igt_live_test t;
2800
+ unsigned int idx;
2801
+
2802
+ snprintf(name, sizeof(name), "%ps", *fn);
2803
+ err = igt_live_test_begin(&t, i915, __func__, name);
2804
+ if (err)
2805
+ break;
2806
+
2807
+ atomic_set(&i915->selftest.counter, nengines);
2808
+
2809
+ idx = 0;
2810
+ for_each_uabi_engine(engine, i915) {
2811
+ intel_engine_pm_get(engine);
2812
+
2813
+ memset(&engines[idx].p, 0, sizeof(engines[idx].p));
2814
+ engines[idx].p.engine = engine;
2815
+
2816
+ engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
2817
+ "igt:%s", engine->name);
2818
+ if (IS_ERR(engines[idx].tsk)) {
2819
+ err = PTR_ERR(engines[idx].tsk);
2820
+ intel_engine_pm_put(engine);
2821
+ break;
2822
+ }
2823
+ get_task_struct(engines[idx++].tsk);
2824
+ }
2825
+
2826
+ yield(); /* start all threads before we kthread_stop() */
2827
+
2828
+ idx = 0;
2829
+ for_each_uabi_engine(engine, i915) {
2830
+ int status;
2831
+
2832
+ if (IS_ERR(engines[idx].tsk))
2833
+ break;
2834
+
2835
+ status = kthread_stop(engines[idx].tsk);
2836
+ if (status && !err)
2837
+ err = status;
2838
+
2839
+ intel_engine_pm_put(engine);
2840
+ put_task_struct(engines[idx++].tsk);
2841
+ }
2842
+
2843
+ if (igt_live_test_end(&t))
2844
+ err = -EIO;
2845
+ if (err)
2846
+ break;
2847
+
2848
+ idx = 0;
2849
+ for_each_uabi_engine(engine, i915) {
2850
+ struct perf_stats *p = &engines[idx].p;
2851
+ u64 busy = 100 * ktime_to_ns(p->busy);
2852
+ u64 dt = ktime_to_ns(p->time);
2853
+ int integer, decimal;
2854
+
2855
+ if (dt) {
2856
+ integer = div64_u64(busy, dt);
2857
+ busy -= integer * dt;
2858
+ decimal = div64_u64(100 * busy, dt);
2859
+ } else {
2860
+ integer = 0;
2861
+ decimal = 0;
2862
+ }
2863
+
2864
+ GEM_BUG_ON(engine != p->engine);
2865
+ pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2866
+ name, engine->name, p->count, integer, decimal,
2867
+ div_u64(p->runtime, 1000 * 1000),
2868
+ div_u64(ktime_to_ns(p->time), 1000 * 1000));
2869
+ idx++;
2870
+ }
2871
+ }
2872
+
2873
+ cpu_latency_qos_remove_request(&qos);
2874
+ kfree(engines);
2875
+ return err;
2876
+}
2877
+
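/*
 * The busy:%d.%02d%% figures printed by perf_series_engines() and
 * perf_parallel_engines() come from the same fixed-point split of busy
 * time over wall time. A minimal userspace sketch of that arithmetic
 * follows; the function name and the sample numbers are illustrative only,
 * not part of the selftest.
 */
#include <stdint.h>
#include <stdio.h>

static void busy_percent(uint64_t busy_ns, uint64_t dt_ns,
			 int *integer, int *decimal)
{
	uint64_t busy = 100 * busy_ns;		/* scale to whole percent */

	if (!dt_ns) {				/* guard, as in the selftest */
		*integer = 0;
		*decimal = 0;
		return;
	}

	*integer = busy / dt_ns;		/* whole percent */
	busy -= (uint64_t)*integer * dt_ns;	/* leftover fraction */
	*decimal = (100 * busy) / dt_ns;	/* two decimal places */
}

int main(void)
{
	int integer, decimal;

	/* e.g. 73.4ms busy over a 100ms sample reports "busy:73.40%" */
	busy_percent(73400000ull, 100000000ull, &integer, &decimal);
	printf("busy:%d.%02d%%\n", integer, decimal);
	return 0;
}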
2878
+int i915_request_perf_selftests(struct drm_i915_private *i915)
2879
+{
2880
+ static const struct i915_subtest tests[] = {
2881
+ SUBTEST(perf_request_latency),
2882
+ SUBTEST(perf_series_engines),
2883
+ SUBTEST(perf_parallel_engines),
2884
+ };
2885
+
2886
+ if (intel_gt_is_wedged(&i915->gt))
8702887 return 0;
8712888
8722889 return i915_subtests(tests, i915);