forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -7,15 +7,18 @@
 #ifndef _I915_GPU_ERROR_H_
 #define _I915_GPU_ERROR_H_
 
+#include <linux/atomic.h>
 #include <linux/kref.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
 
 #include <drm/drm_mm.h>
 
+#include "gt/intel_engine.h"
+#include "gt/intel_gt_types.h"
+#include "gt/uc/intel_uc_fw.h"
+
 #include "intel_device_info.h"
-#include "intel_ringbuffer.h"
-#include "intel_uc_fw.h"
 
 #include "i915_gem.h"
 #include "i915_gem_gtt.h"
@@ -23,43 +26,106 @@
 #include "i915_scheduler.h"
 
 struct drm_i915_private;
+struct i915_vma_compress;
+struct intel_engine_capture_vma;
 struct intel_overlay_error_state;
 struct intel_display_error_state;
 
-struct i915_gpu_state {
-	struct kref ref;
-	ktime_t time;
-	ktime_t boottime;
-	ktime_t uptime;
-	unsigned long capture;
-	unsigned long epoch;
+struct i915_vma_coredump {
+	struct i915_vma_coredump *next;
 
-	struct drm_i915_private *i915;
+	char name[20];
 
-	char error_msg[128];
+	u64 gtt_offset;
+	u64 gtt_size;
+	u32 gtt_page_sizes;
+
+	int num_pages;
+	int page_count;
+	int unused;
+	u32 *pages[];
+};
+
+struct i915_request_coredump {
+	unsigned long flags;
+	pid_t pid;
+	u32 context;
+	u32 seqno;
+	u32 head;
+	u32 tail;
+	struct i915_sched_attr sched_attr;
+};
+
+struct intel_engine_coredump {
+	const struct intel_engine_cs *engine;
+
 	bool simulated;
-	bool awake;
-	bool wakelock;
-	bool suspended;
-	int iommu;
 	u32 reset_count;
-	u32 suspend_count;
-	struct intel_device_info device_info;
-	struct intel_driver_caps driver_caps;
-	struct i915_params params;
 
-	struct i915_error_uc {
-		struct intel_uc_fw guc_fw;
-		struct intel_uc_fw huc_fw;
-		struct drm_i915_error_object *guc_log;
-	} uc;
+	/* position of active request inside the ring */
+	u32 rq_head, rq_post, rq_tail;
+
+	/* Register state */
+	u32 ccid;
+	u32 start;
+	u32 tail;
+	u32 head;
+	u32 ctl;
+	u32 mode;
+	u32 hws;
+	u32 ipeir;
+	u32 ipehr;
+	u32 esr;
+	u32 bbstate;
+	u32 instpm;
+	u32 instps;
+	u64 bbaddr;
+	u64 acthd;
+	u32 fault_reg;
+	u64 faddr;
+	u32 rc_psmi; /* sleep state */
+	struct intel_instdone instdone;
+
+	struct i915_gem_context_coredump {
+		char comm[TASK_COMM_LEN];
+
+		u64 total_runtime;
+		u32 avg_runtime;
+
+		pid_t pid;
+		int active;
+		int guilty;
+		struct i915_sched_attr sched_attr;
+	} context;
+
+	struct i915_vma_coredump *vma;
+
+	struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
+	unsigned int num_ports;
+
+	struct {
+		u32 gfx_mode;
+		union {
+			u64 pdp[4];
+			u32 pp_dir_base;
+		};
+	} vm_info;
+
+	struct intel_engine_coredump *next;
+};
+
+struct intel_gt_coredump {
+	const struct intel_gt *_gt;
+	bool awake;
+	bool simulated;
+
+	struct intel_gt_info info;
 
 	/* Generic register state */
 	u32 eir;
 	u32 pgtbl_er;
 	u32 ier;
 	u32 gtier[6], ngtier;
-	u32 ccid;
 	u32 derrmr;
 	u32 forcewake;
 	u32 error; /* gen6+ */
@@ -71,297 +137,213 @@
 	u32 gam_ecochk;
 	u32 gab_ctl;
 	u32 gfx_mode;
+	u32 gtt_cache;
+	u32 aux_err; /* gen12 */
+	u32 sfc_done[GEN12_SFC_DONE_MAX]; /* gen12 */
+	u32 gam_done; /* gen12 */
 
 	u32 nfence;
 	u64 fence[I915_MAX_NUM_FENCES];
+
+	struct intel_engine_coredump *engine;
+
+	struct intel_uc_coredump {
+		struct intel_uc_fw guc_fw;
+		struct intel_uc_fw huc_fw;
+		struct i915_vma_coredump *guc_log;
+	} *uc;
+
+	struct intel_gt_coredump *next;
+};
+
+struct i915_gpu_coredump {
+	struct kref ref;
+	ktime_t time;
+	ktime_t boottime;
+	ktime_t uptime;
+	unsigned long capture;
+
+	struct drm_i915_private *i915;
+
+	struct intel_gt_coredump *gt;
+
+	char error_msg[128];
+	bool simulated;
+	bool wakelock;
+	bool suspended;
+	int iommu;
+	u32 reset_count;
+	u32 suspend_count;
+
+	struct intel_device_info device_info;
+	struct intel_runtime_info runtime_info;
+	struct intel_driver_caps driver_caps;
+	struct i915_params params;
+
 	struct intel_overlay_error_state *overlay;
 	struct intel_display_error_state *display;
 
-	struct drm_i915_error_engine {
-		int engine_id;
-		/* Software tracked state */
-		bool idle;
-		bool waiting;
-		int num_waiters;
-		unsigned long hangcheck_timestamp;
-		bool hangcheck_stalled;
-		enum intel_engine_hangcheck_action hangcheck_action;
-		struct i915_address_space *vm;
-		int num_requests;
-		u32 reset_count;
-
-		/* position of active request inside the ring */
-		u32 rq_head, rq_post, rq_tail;
-
-		/* our own tracking of ring head and tail */
-		u32 cpu_ring_head;
-		u32 cpu_ring_tail;
-
-		u32 last_seqno;
-
-		/* Register state */
-		u32 start;
-		u32 tail;
-		u32 head;
-		u32 ctl;
-		u32 mode;
-		u32 hws;
-		u32 ipeir;
-		u32 ipehr;
-		u32 bbstate;
-		u32 instpm;
-		u32 instps;
-		u32 seqno;
-		u64 bbaddr;
-		u64 acthd;
-		u32 fault_reg;
-		u64 faddr;
-		u32 rc_psmi; /* sleep state */
-		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
-		struct intel_instdone instdone;
-
-		struct drm_i915_error_context {
-			char comm[TASK_COMM_LEN];
-			pid_t pid;
-			u32 handle;
-			u32 hw_id;
-			int ban_score;
-			int active;
-			int guilty;
-			bool bannable;
-			struct i915_sched_attr sched_attr;
-		} context;
-
-		struct drm_i915_error_object {
-			u64 gtt_offset;
-			u64 gtt_size;
-			int num_pages;
-			int page_count;
-			int unused;
-			u32 *pages[0];
-		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
-
-		struct drm_i915_error_object **user_bo;
-		long user_bo_count;
-
-		struct drm_i915_error_object *wa_ctx;
-		struct drm_i915_error_object *default_state;
-
-		struct drm_i915_error_request {
-			long jiffies;
-			pid_t pid;
-			u32 context;
-			int ban_score;
-			u32 seqno;
-			u32 start;
-			u32 head;
-			u32 tail;
-			struct i915_sched_attr sched_attr;
-		} *requests, execlist[EXECLIST_MAX_PORTS];
-		unsigned int num_ports;
-
-		struct drm_i915_error_waiter {
-			char comm[TASK_COMM_LEN];
-			pid_t pid;
-			u32 seqno;
-		} *waiters;
-
-		struct {
-			u32 gfx_mode;
-			union {
-				u64 pdp[4];
-				u32 pp_dir_base;
-			};
-		} vm_info;
-	} engine[I915_NUM_ENGINES];
-
-	struct drm_i915_error_buffer {
-		u32 size;
-		u32 name;
-		u32 wseqno;
-		u64 gtt_offset;
-		u32 read_domains;
-		u32 write_domain;
-		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
-		u32 tiling:2;
-		u32 dirty:1;
-		u32 purgeable:1;
-		u32 userptr:1;
-		s32 engine:4;
-		u32 cache_level:3;
-	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
-	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
-	struct i915_address_space *active_vm[I915_NUM_ENGINES];
+	struct scatterlist *sgl, *fit;
 };
 
 struct i915_gpu_error {
-	/* For hangcheck timer */
-#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
-#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
-
-	struct delayed_work hangcheck_work;
-
 	/* For reset and error_state handling. */
 	spinlock_t lock;
 	/* Protected by the above dev->gpu_error.lock. */
-	struct i915_gpu_state *first_error;
+	struct i915_gpu_coredump *first_error;
 
 	atomic_t pending_fb_pin;
 
-	unsigned long missed_irq_rings;
-
-	/**
-	 * State variable controlling the reset flow and count
-	 *
-	 * This is a counter which gets incremented when reset is triggered,
-	 *
-	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
-	 * meaning that any waiters holding onto the struct_mutex should
-	 * relinquish the lock immediately in order for the reset to start.
-	 *
-	 * If reset is not completed successfully, the I915_WEDGE bit is
-	 * set meaning that hardware is terminally sour and there is no
-	 * recovery. All waiters on the reset_queue will be woken when
-	 * that happens.
-	 *
-	 * This counter is used by the wait_seqno code to notice that reset
-	 * event happened and it needs to restart the entire ioctl (since most
-	 * likely the seqno it waited for won't ever signal anytime soon).
-	 *
-	 * This is important for lock-free wait paths, where no contended lock
-	 * naturally enforces the correct ordering between the bail-out of the
-	 * waiter and the gpu reset work code.
-	 */
-	unsigned long reset_count;
-
-	/**
-	 * flags: Control various stages of the GPU reset
-	 *
-	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
-	 * other users acquiring the struct_mutex. To do this we set the
-	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
-	 * and then check for that bit before acquiring the struct_mutex (in
-	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
-	 * secondary role in preventing two concurrent global reset attempts.
-	 *
-	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
-	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
-	 * but it may be held by some long running waiter (that we cannot
-	 * interrupt without causing trouble). Once we are ready to do the GPU
-	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
-	 * they already hold the struct_mutex and want to participate they can
-	 * inspect the bit and do the reset directly, otherwise the worker
-	 * waits for the struct_mutex.
-	 *
-	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
-	 * acquire the struct_mutex to reset an engine, we need an explicit
-	 * flag to prevent two concurrent reset attempts in the same engine.
-	 * As the number of engines continues to grow, allocate the flags from
-	 * the most significant bits.
-	 *
-	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
-	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
-	 * i915_request_alloc(), this bit is checked and the sequence
-	 * aborted (with -EIO reported to userspace) if set.
-	 */
-	unsigned long flags;
-#define I915_RESET_BACKOFF	0
-#define I915_RESET_HANDOFF	1
-#define I915_RESET_MODESET	2
-#define I915_WEDGED		(BITS_PER_LONG - 1)
-#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
+	/** Number of times the device has been reset (global) */
+	atomic_t reset_count;
 
 	/** Number of times an engine has been reset */
-	u32 reset_engine_count[I915_NUM_ENGINES];
-
-	/** Set of stalled engines with guilty requests, in the current reset */
-	u32 stalled_mask;
-
-	/** Reason for the current *global* reset */
-	const char *reason;
-
-	/**
-	 * Waitqueue to signal when a hang is detected. Used to for waiters
-	 * to release the struct_mutex for the reset to procede.
-	 */
-	wait_queue_head_t wait_queue;
-
-	/**
-	 * Waitqueue to signal when the reset has completed. Used by clients
-	 * that wait for dev_priv->mm.wedged to settle.
-	 */
-	wait_queue_head_t reset_queue;
-
-	/* For missed irq/seqno simulation. */
-	unsigned long test_irq_rings;
+	atomic_t reset_engine_count[I915_NUM_ENGINES];
 };
 
 struct drm_i915_error_state_buf {
 	struct drm_i915_private *i915;
-	unsigned int bytes;
-	unsigned int size;
+	struct scatterlist *sgl, *cur, *end;
+
+	char *buf;
+	size_t bytes;
+	size_t size;
+	loff_t iter;
+
 	int err;
-	u8 *buf;
-	loff_t start;
-	loff_t pos;
 };
 
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 
 __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
-int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
-			    const struct i915_gpu_state *gpu);
-int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
-			      struct drm_i915_private *i915,
-			      size_t count, loff_t pos);
 
-static inline void
-i915_error_state_buf_release(struct drm_i915_error_state_buf *eb)
-{
-	kfree(eb->buf);
-}
+struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
+void i915_capture_error_state(struct drm_i915_private *i915);
 
-struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
-void i915_capture_error_state(struct drm_i915_private *dev_priv,
-			      u32 engine_mask,
-			      const char *error_msg);
+struct i915_gpu_coredump *
+i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
 
-static inline struct i915_gpu_state *
-i915_gpu_state_get(struct i915_gpu_state *gpu)
+struct intel_gt_coredump *
+intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp);
+
+struct intel_engine_coredump *
+intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp);
+
+struct intel_engine_capture_vma *
+intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
+				  struct i915_request *rq,
+				  gfp_t gfp);
+
+void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
+				   struct intel_engine_capture_vma *capture,
+				   struct i915_vma_compress *compress);
+
+struct i915_vma_compress *
+i915_vma_capture_prepare(struct intel_gt_coredump *gt);
+
+void i915_vma_capture_finish(struct intel_gt_coredump *gt,
+			     struct i915_vma_compress *compress);
+
+void i915_error_state_store(struct i915_gpu_coredump *error);
+
+static inline struct i915_gpu_coredump *
+i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
 {
 	kref_get(&gpu->ref);
 	return gpu;
 }
 
-void __i915_gpu_state_free(struct kref *kref);
-static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
+ssize_t
+i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
+				 char *buf, loff_t offset, size_t count);
+
+void __i915_gpu_coredump_free(struct kref *kref);
+static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
 {
 	if (gpu)
-		kref_put(&gpu->ref, __i915_gpu_state_free);
+		kref_put(&gpu->ref, __i915_gpu_coredump_free);
 }
 
-struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
+struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);
 void i915_reset_error_state(struct drm_i915_private *i915);
+void i915_disable_error_state(struct drm_i915_private *i915, int err);
 
 #else
 
-static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
-					    u32 engine_mask,
-					    const char *error_msg)
+static inline void i915_capture_error_state(struct drm_i915_private *i915)
 {
 }
 
-static inline struct i915_gpu_state *
-i915_first_error_state(struct drm_i915_private *i915)
+static inline struct i915_gpu_coredump *
+i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
 {
 	return NULL;
 }
 
+static inline struct intel_gt_coredump *
+intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
+{
+	return NULL;
+}
+
+static inline struct intel_engine_coredump *
+intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
+{
+	return NULL;
+}
+
+static inline struct intel_engine_capture_vma *
+intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
+				  struct i915_request *rq,
+				  gfp_t gfp)
+{
+	return NULL;
+}
+
+static inline void
+intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
+			      struct intel_engine_capture_vma *capture,
+			      struct i915_vma_compress *compress)
+{
+}
+
+static inline struct i915_vma_compress *
+i915_vma_capture_prepare(struct intel_gt_coredump *gt)
+{
+	return NULL;
+}
+
+static inline void
+i915_vma_capture_finish(struct intel_gt_coredump *gt,
+			struct i915_vma_compress *compress)
+{
+}
+
+static inline void
+i915_error_state_store(struct i915_gpu_coredump *error)
+{
+}
+
+static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
+{
+}
+
+static inline struct i915_gpu_coredump *
+i915_first_error_state(struct drm_i915_private *i915)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline void i915_reset_error_state(struct drm_i915_private *i915)
 {
 }
 
+static inline void i915_disable_error_state(struct drm_i915_private *i915,
+					    int err)
+{
+}
+
 #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
 
 #endif /* _I915_GPU_ERROR_H_ */
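
As a rough illustration of how the renamed API above fits together, here is a minimal sketch based only on the declarations in this header (assuming CONFIG_DRM_I915_CAPTURE_ERROR is enabled). The function name example_capture_and_store is hypothetical; the real capture path lives in i915_gpu_error.c and may differ.

/*
 * Hedged sketch, not part of this commit: capture a coredump, publish it,
 * then drop our reference.
 */
static void example_capture_and_store(struct drm_i915_private *i915)
{
	struct i915_gpu_coredump *error;

	/* Capture device, GT and engine state into the coredump chain. */
	error = i915_gpu_coredump(i915);
	if (IS_ERR_OR_NULL(error))
		return;

	/* Publish the capture (presumably as gpu_error.first_error). */
	i915_error_state_store(error);

	/* Drop our kref; may free via __i915_gpu_coredump_free(). */
	i915_gpu_coredump_put(error);
}

Readback would presumably go through i915_first_error_state() followed by i915_gpu_coredump_copy_to_buffer() and a final i915_gpu_coredump_put().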