2023-12-06 08f87f769b595151be1afeff53e144f543faa614
--- a/kernel/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c
+++ b/kernel/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
  *
- * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@@ -21,7 +21,7 @@
 
 #include <mali_kbase.h>
 #include <mali_kbase_ctx_sched.h>
-#include <mali_kbase_hwcnt_context.h>
+#include <hwcnt/mali_kbase_hwcnt_context.h>
 #include <device/mali_kbase_device.h>
 #include <backend/gpu/mali_kbase_irq_internal.h>
 #include <backend/gpu/mali_kbase_pm_internal.h>
@@ -29,14 +29,14 @@
 #include <csf/mali_kbase_csf_trace_buffer.h>
 #include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
 #include <mali_kbase_reset_gpu.h>
+#include <csf/mali_kbase_csf_firmware_log.h>
 
-/* Waiting timeout for GPU reset to complete */
-#define GPU_RESET_TIMEOUT_MS (5000) /* 5 seconds */
-#define DUMP_DWORDS_PER_LINE (4)
-/* 16 characters needed for a 8 byte value in hex & 1 character for space */
-#define DUMP_HEX_CHARS_PER_DWORD ((2 * 8) + 1)
-#define DUMP_HEX_CHARS_PER_LINE \
-	(DUMP_DWORDS_PER_LINE * DUMP_HEX_CHARS_PER_DWORD)
+enum kbasep_soft_reset_status {
+	RESET_SUCCESS = 0,
+	SOFT_RESET_FAILED,
+	L2_ON_FAILED,
+	MCU_REINIT_FAILED
+};
 
 static inline bool
 kbase_csf_reset_state_is_silent(enum kbase_csf_reset_gpu_state state)
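Note on the new status codes: the kbasep_soft_reset_status values introduced above are consumed by kbase_csf_reset_gpu_now() in the hunks further down. As a quick orientation, the retry policy they encode boils down to the sketch below; the reset_retry_policy() helper is hypothetical, written here only to summarise that mapping, and is not part of the patch.

/* Hypothetical distillation of the caller's reaction to each status; the
 * real logic lives in kbase_csf_reset_gpu_now() later in this patch.
 */
static int reset_retry_policy(enum kbasep_soft_reset_status status)
{
	switch (status) {
	case RESET_SUCCESS:
		return 0;	/* done: re-enable HW counters and report success */
	case MCU_REINIT_FAILED:
		return 1;	/* soft-reset itself worked: retry once with a full firmware reload */
	case SOFT_RESET_FAILED:
	case L2_ON_FAILED:
	default:
		return -1;	/* unrecoverable: take the error path */
	}
}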
@@ -257,56 +257,6 @@
 		kbase_reg_read(kbdev, GPU_CONTROL_REG(TILER_CONFIG)));
 }
 
-static void kbase_csf_dump_firmware_trace_buffer(struct kbase_device *kbdev)
-{
-	u8 *buf, *line_str;
-	unsigned int read_size;
-	struct firmware_trace_buffer *tb =
-		kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME);
-
-	if (tb == NULL) {
-		dev_dbg(kbdev->dev, "Can't get the trace buffer, firmware trace dump skipped");
-		return;
-	}
-
-	buf = kmalloc(PAGE_SIZE + DUMP_HEX_CHARS_PER_LINE + 1, GFP_KERNEL);
-	if (buf == NULL) {
-		dev_err(kbdev->dev, "Short of memory, firmware trace dump skipped");
-		return;
-	}
-	line_str = &buf[PAGE_SIZE];
-
-	dev_err(kbdev->dev, "Firmware trace buffer dump:");
-	while ((read_size = kbase_csf_firmware_trace_buffer_read_data(tb, buf,
-								      PAGE_SIZE))) {
-		u64 *ptr = (u64 *)buf;
-		u32 num_dwords;
-
-		for (num_dwords = read_size / sizeof(u64);
-		     num_dwords >= DUMP_DWORDS_PER_LINE;
-		     num_dwords -= DUMP_DWORDS_PER_LINE) {
-			dev_err(kbdev->dev, "%016llx %016llx %016llx %016llx",
-				ptr[0], ptr[1], ptr[2], ptr[3]);
-			ptr += DUMP_DWORDS_PER_LINE;
-		}
-
-		if (num_dwords) {
-			int pos = 0;
-
-			while (num_dwords--) {
-				pos += snprintf(line_str + pos,
-						DUMP_HEX_CHARS_PER_DWORD + 1,
-						"%016llx ", ptr[0]);
-				ptr++;
-			}
-
-			dev_err(kbdev->dev, "%s", line_str);
-		}
-	}
-
-	kfree(buf);
-}
-
 /**
  * kbase_csf_hwcnt_on_reset_error() - Sets HWCNT to appropriate state in the
  *                                    event of an error during GPU reset.
@@ -332,11 +282,102 @@
 	kbase_csf_scheduler_spin_unlock(kbdev, flags);
 }
 
-static int kbase_csf_reset_gpu_now(struct kbase_device *kbdev,
-				   bool firmware_inited, bool silent)
+static enum kbasep_soft_reset_status kbase_csf_reset_gpu_once(struct kbase_device *kbdev,
+							      bool firmware_inited, bool silent)
 {
 	unsigned long flags;
 	int err;
+	enum kbasep_soft_reset_status ret = RESET_SUCCESS;
+
+	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+	spin_lock(&kbdev->mmu_mask_change);
+	kbase_pm_reset_start_locked(kbdev);
+
+	dev_dbg(kbdev->dev,
+		"We're about to flush out the IRQs and their bottom halves\n");
+	kbdev->irq_reset_flush = true;
+
+	/* Disable IRQ to avoid IRQ handlers to kick in after releasing the
+	 * spinlock; this also clears any outstanding interrupts
+	 */
+	kbase_pm_disable_interrupts_nolock(kbdev);
+
+	spin_unlock(&kbdev->mmu_mask_change);
+	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+	dev_dbg(kbdev->dev, "Ensure that any IRQ handlers have finished\n");
+	/* Must be done without any locks IRQ handlers will take. */
+	kbase_synchronize_irqs(kbdev);
+
+	dev_dbg(kbdev->dev, "Flush out any in-flight work items\n");
+	kbase_flush_mmu_wqs(kbdev);
+
+	dev_dbg(kbdev->dev,
+		"The flush has completed so reset the active indicator\n");
+	kbdev->irq_reset_flush = false;
+
+	if (!silent)
+		dev_err(kbdev->dev, "Resetting GPU (allowing up to %d ms)",
+			RESET_TIMEOUT);
+
+	/* Output the state of some interesting registers to help in the
+	 * debugging of GPU resets, and dump the firmware trace buffer
+	 */
+	if (!silent) {
+		kbase_csf_debug_dump_registers(kbdev);
+		if (likely(firmware_inited))
+			kbase_csf_firmware_log_dump_buffer(kbdev);
+	}
+
+	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+	kbase_ipa_control_handle_gpu_reset_pre(kbdev);
+	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+	/* Tell hardware counters a reset is about to occur.
+	 * If the backend is in an unrecoverable error state (e.g. due to
+	 * firmware being unresponsive) this will transition the backend out of
+	 * it, on the assumption a reset will fix whatever problem there was.
+	 */
+	kbase_hwcnt_backend_csf_on_before_reset(&kbdev->hwcnt_gpu_iface);
+
+	mutex_lock(&kbdev->pm.lock);
+	/* Reset the GPU */
+	err = kbase_pm_init_hw(kbdev, 0);
+
+	mutex_unlock(&kbdev->pm.lock);
+
+	if (WARN_ON(err))
+		return SOFT_RESET_FAILED;
+
+	mutex_lock(&kbdev->mmu_hw_mutex);
+	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+	kbase_ctx_sched_restore_all_as(kbdev);
+	kbase_ipa_control_handle_gpu_reset_post(kbdev);
+	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+	mutex_unlock(&kbdev->mmu_hw_mutex);
+
+	kbase_pm_enable_interrupts(kbdev);
+
+	mutex_lock(&kbdev->pm.lock);
+	kbase_pm_reset_complete(kbdev);
+	/* Synchronously wait for the reload of firmware to complete */
+	err = kbase_pm_wait_for_desired_state(kbdev);
+	mutex_unlock(&kbdev->pm.lock);
+
+	if (err) {
+		if (!kbase_pm_l2_is_in_desired_state(kbdev))
+			ret = L2_ON_FAILED;
+		else if (!kbase_pm_mcu_is_in_desired_state(kbdev))
+			ret = MCU_REINIT_FAILED;
+	}
+
+	return ret;
+}
+
+static int kbase_csf_reset_gpu_now(struct kbase_device *kbdev, bool firmware_inited, bool silent)
+{
+	unsigned long flags;
+	enum kbasep_soft_reset_status ret;
 
 	WARN_ON(kbdev->irq_reset_flush);
 	/* The reset must now be happening otherwise other threads will not
@@ -359,113 +400,56 @@
 	cancel_work_sync(&kbdev->csf.firmware_reload_work);
 
 	dev_dbg(kbdev->dev, "Disable GPU hardware counters.\n");
-	/* This call will block until counters are disabled.
-	 */
+	/* This call will block until counters are disabled. */
 	kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx);
 
-	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
-	spin_lock(&kbdev->mmu_mask_change);
-	kbase_pm_reset_start_locked(kbdev);
-
-	dev_dbg(kbdev->dev,
-		"We're about to flush out the IRQs and their bottom halves\n");
-	kbdev->irq_reset_flush = true;
-
-	/* Disable IRQ to avoid IRQ handlers to kick in after releasing the
-	 * spinlock; this also clears any outstanding interrupts
-	 */
-	kbase_pm_disable_interrupts_nolock(kbdev);
-
-	spin_unlock(&kbdev->mmu_mask_change);
-	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
-
-	dev_dbg(kbdev->dev, "Ensure that any IRQ handlers have finished\n");
-	/* Must be done without any locks IRQ handlers will take.
-	 */
-	kbase_synchronize_irqs(kbdev);
-
-	dev_dbg(kbdev->dev, "Flush out any in-flight work items\n");
-	kbase_flush_mmu_wqs(kbdev);
-
-	dev_dbg(kbdev->dev,
-		"The flush has completed so reset the active indicator\n");
-	kbdev->irq_reset_flush = false;
-
-	mutex_lock(&kbdev->pm.lock);
-	if (!silent)
-		dev_err(kbdev->dev, "Resetting GPU (allowing up to %d ms)",
-			RESET_TIMEOUT);
-
-	/* Output the state of some interesting registers to help in the
-	 * debugging of GPU resets, and dump the firmware trace buffer
-	 */
-	if (!silent) {
-		kbase_csf_debug_dump_registers(kbdev);
-		if (likely(firmware_inited))
-			kbase_csf_dump_firmware_trace_buffer(kbdev);
-	}
-
-	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
-	kbase_ipa_control_handle_gpu_reset_pre(kbdev);
-	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
-
-	/* Tell hardware counters a reset is about to occur.
-	 * If the backend is in an unrecoverable error state (e.g. due to
-	 * firmware being unresponsive) this will transition the backend out of
-	 * it, on the assumption a reset will fix whatever problem there was.
-	 */
-	kbase_hwcnt_backend_csf_on_before_reset(&kbdev->hwcnt_gpu_iface);
-
-	/* Reset the GPU */
-	err = kbase_pm_init_hw(kbdev, 0);
-
-	mutex_unlock(&kbdev->pm.lock);
-
-	if (WARN_ON(err)) {
-		kbase_csf_hwcnt_on_reset_error(kbdev);
-		return err;
-	}
-
-	mutex_lock(&kbdev->mmu_hw_mutex);
-	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
-	kbase_ctx_sched_restore_all_as(kbdev);
-	kbase_ipa_control_handle_gpu_reset_post(kbdev);
-	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
-	mutex_unlock(&kbdev->mmu_hw_mutex);
-
-	kbase_pm_enable_interrupts(kbdev);
-
-	mutex_lock(&kbdev->pm.lock);
-	kbase_pm_reset_complete(kbdev);
-	/* Synchronously wait for the reload of firmware to complete */
-	err = kbase_pm_wait_for_desired_state(kbdev);
-	mutex_unlock(&kbdev->pm.lock);
-
-	if (WARN_ON(err)) {
-		kbase_csf_hwcnt_on_reset_error(kbdev);
-		return err;
+	ret = kbase_csf_reset_gpu_once(kbdev, firmware_inited, silent);
+	if (ret == SOFT_RESET_FAILED) {
+		dev_err(kbdev->dev, "Soft-reset failed");
+		goto err;
+	} else if (ret == L2_ON_FAILED) {
+		dev_err(kbdev->dev, "L2 power up failed after the soft-reset");
+		goto err;
+	} else if (ret == MCU_REINIT_FAILED) {
+		dev_err(kbdev->dev, "MCU re-init failed trying full firmware reload");
+		/* Since MCU reinit failed despite successful soft reset, we can try
+		 * the firmware full reload.
+		 */
+		kbdev->csf.firmware_full_reload_needed = true;
+		ret = kbase_csf_reset_gpu_once(kbdev, firmware_inited, true);
+		if (ret != RESET_SUCCESS) {
+			dev_err(kbdev->dev,
+				"MCU Re-init failed even after trying full firmware reload, ret = [%d]",
+				ret);
+			goto err;
+		}
 	}
 
 	/* Re-enable GPU hardware counters */
 	kbase_csf_scheduler_spin_lock(kbdev, &flags);
 	kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx);
 	kbase_csf_scheduler_spin_unlock(kbdev, flags);
-
 	if (!silent)
 		dev_err(kbdev->dev, "Reset complete");
-
 	return 0;
+err:
+
+	kbase_csf_hwcnt_on_reset_error(kbdev);
+	return -1;
 }
 
 static void kbase_csf_reset_gpu_worker(struct work_struct *data)
 {
 	struct kbase_device *kbdev = container_of(data, struct kbase_device,
 						  csf.reset.work);
+	bool gpu_sleep_mode_active = false;
 	bool firmware_inited;
 	unsigned long flags;
 	int err = 0;
 	const enum kbase_csf_reset_gpu_state initial_reset_state =
 		atomic_read(&kbdev->csf.reset.state);
+	const bool silent =
+		kbase_csf_reset_state_is_silent(initial_reset_state);
 
 	/* Ensure any threads (e.g. executing the CSF scheduler) have finished
 	 * using the HW
@@ -474,13 +458,29 @@
 
 	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
 	firmware_inited = kbdev->csf.firmware_inited;
+#ifdef KBASE_PM_RUNTIME
+	gpu_sleep_mode_active = kbdev->pm.backend.gpu_sleep_mode_active;
+#endif
 	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
 
-	if (!kbase_pm_context_active_handle_suspend(kbdev,
-			KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) {
-		bool silent =
-			kbase_csf_reset_state_is_silent(initial_reset_state);
+	if (unlikely(gpu_sleep_mode_active)) {
+#ifdef KBASE_PM_RUNTIME
+		/* As prior to GPU reset all on-slot groups are suspended,
+		 * need to wake up the MCU from sleep.
+		 * No pm active reference is taken here since GPU is in sleep
+		 * state and both runtime & system suspend synchronize with the
+		 * GPU reset before they wake up the GPU to suspend on-slot
+		 * groups. GPUCORE-29850 would add the proper handling.
+		 */
+		kbase_pm_lock(kbdev);
+		if (kbase_pm_force_mcu_wakeup_after_sleep(kbdev))
+			dev_warn(kbdev->dev, "Wait for MCU wake up failed on GPU reset");
+		kbase_pm_unlock(kbdev);
 
+		err = kbase_csf_reset_gpu_now(kbdev, firmware_inited, silent);
+#endif
+	} else if (!kbase_pm_context_active_handle_suspend(kbdev,
+			KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) {
 		err = kbase_csf_reset_gpu_now(kbdev, firmware_inited, silent);
 		kbase_pm_context_idle(kbdev);
 	}
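For context, this worker is not invoked directly; callers elsewhere in the driver queue it through the reset entry points declared in mali_kbase_reset_gpu.h. A minimal usage sketch, assuming the existing kbase_prepare_to_reset_gpu() / kbase_reset_gpu() pair and the RESET_FLAGS_NONE flag from that header (an illustration, not part of the patch):

/* Sketch: request a (non-silent) GPU reset that the worker above services. */
static void example_request_gpu_reset(struct kbase_device *kbdev)
{
	/* Returns false if another thread has already initiated a reset */
	if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
		kbase_reset_gpu(kbdev);
}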
@@ -557,6 +557,7 @@
 
 	return 0;
 }
+KBASE_EXPORT_TEST_API(kbase_reset_gpu_silent);
 
 bool kbase_reset_gpu_is_active(struct kbase_device *kbdev)
 {
@@ -570,10 +571,15 @@
 	return kbase_csf_reset_state_is_active(reset_state);
 }
 
+bool kbase_reset_gpu_is_not_pending(struct kbase_device *kbdev)
+{
+	return atomic_read(&kbdev->csf.reset.state) == KBASE_CSF_RESET_GPU_NOT_PENDING;
+}
+
 int kbase_reset_gpu_wait(struct kbase_device *kbdev)
 {
 	const long wait_timeout =
-		kbase_csf_timeout_in_jiffies(GPU_RESET_TIMEOUT_MS);
+		kbase_csf_timeout_in_jiffies(kbase_get_timeout_ms(kbdev, CSF_GPU_RESET_TIMEOUT));
 	long remaining;
 
 	/* Inform lockdep we might be trying to wait on a reset (as
@@ -599,6 +605,8 @@
 
 	if (!remaining) {
 		dev_warn(kbdev->dev, "Timed out waiting for the GPU reset to complete");
+
+
 		return -ETIMEDOUT;
 	} else if (atomic_read(&kbdev->csf.reset.state) ==
 		   KBASE_CSF_RESET_GPU_FAILED) {
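Taken together with the KBASE_EXPORT_TEST_API(kbase_reset_gpu_silent) export above, the timeout change means a synchronous trigger-and-wait sequence is now bounded by the scalable CSF_GPU_RESET_TIMEOUT (resolved through kbase_get_timeout_ms()) instead of the removed fixed GPU_RESET_TIMEOUT_MS of 5000 ms. A minimal sketch of that pattern, e.g. from test code; example_silent_reset_sync() is hypothetical and the error codes are indicative only:

/* Sketch: trigger a silent reset and block until it completes or times out. */
static int example_silent_reset_sync(struct kbase_device *kbdev)
{
	int err = kbase_reset_gpu_silent(kbdev);

	if (err)
		return err;	/* e.g. -EAGAIN if a reset was already in flight */

	/* 0 once the reset completes, -ETIMEDOUT after CSF_GPU_RESET_TIMEOUT ms */
	return kbase_reset_gpu_wait(kbdev);
}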