...
  *
  */
 
-#include <linux/prefetch.h>
 #include <linux/dma-fence-array.h>
+#include <linux/dma-fence-chain.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
 
+#include "gem/i915_gem_context.h"
+#include "gt/intel_breadcrumbs.h"
+#include "gt/intel_context.h"
+#include "gt/intel_ring.h"
+#include "gt/intel_rps.h"
+
+#include "i915_active.h"
 #include "i915_drv.h"
+#include "i915_globals.h"
+#include "i915_trace.h"
+#include "intel_pm.h"
+
+struct execute_cb {
+        struct irq_work work;
+        struct i915_sw_fence *fence;
+        void (*hook)(struct i915_request *rq, struct dma_fence *signal);
+        struct i915_request *signal;
+};
+
+static struct i915_global_request {
+        struct i915_global base;
+        struct kmem_cache *slab_requests;
+        struct kmem_cache *slab_execute_cbs;
+} global;
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 {
-        return "i915";
+        return dev_name(to_request(fence)->engine->i915->drm.dev);
 }
 
 static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
 {
+        const struct i915_gem_context *ctx;
+
         /*
          * The timeline struct (as part of the ppgtt underneath a context)
          * may be freed when the request is no longer in use by the GPU.
...
         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
                 return "signaled";
 
-        return to_request(fence)->timeline->name;
+        ctx = i915_request_gem_context(to_request(fence));
+        if (!ctx)
+                return "[" DRIVER_NAME "]";
+
+        return ctx->name;
 }
 
 static bool i915_fence_signaled(struct dma_fence *fence)
...
 
 static bool i915_fence_enable_signaling(struct dma_fence *fence)
 {
-        return intel_engine_enable_signaling(to_request(fence), true);
+        return i915_request_enable_breadcrumb(to_request(fence));
 }
 
 static signed long i915_fence_wait(struct dma_fence *fence,
                                    bool interruptible,
                                    signed long timeout)
 {
-        return i915_request_wait(to_request(fence), interruptible, timeout);
+        return i915_request_wait(to_request(fence),
+                                 interruptible | I915_WAIT_PRIORITY,
+                                 timeout);
+}
+
+struct kmem_cache *i915_request_slab_cache(void)
+{
+        return global.slab_requests;
 }
 
 static void i915_fence_release(struct dma_fence *fence)
...
          * caught trying to reuse dead objects.
          */
         i915_sw_fence_fini(&rq->submit);
+        i915_sw_fence_fini(&rq->semaphore);
 
-        kmem_cache_free(rq->i915->requests, rq);
+        /*
+         * Keep one request on each engine for reserved use under mempressure
+         *
+         * We do not hold a reference to the engine here and so have to be
+         * very careful in what rq->engine we poke. The virtual engine is
+         * referenced via the rq->context and we released that ref during
+         * i915_request_retire(), ergo we must not dereference a virtual
+         * engine here. Not that we would want to, as the only consumer of
+         * the reserved engine->request_pool is the power management parking,
+         * which must-not-fail, and that is only run on the physical engines.
+         *
+         * Since the request must have been executed to have completed,
+         * we know that it will have been processed by the HW and will
+         * not be unsubmitted again, so rq->engine and rq->execution_mask
+         * at this point are stable. rq->execution_mask will be a single
+         * bit if the last and _only_ engine it could execute on was a
+         * physical engine; if it's multiple bits then it started on and
+         * could still be on a virtual engine. Thus if the mask is not a
+         * power-of-two we assume that rq->engine may still be a virtual
+         * engine and so a dangling invalid pointer that we cannot dereference.
+         *
+         * For example, consider the flow of a bonded request through a virtual
+         * engine. The request is created with a wide engine mask (all engines
+         * that we might execute on). On processing the bond, the request mask
+         * is reduced to one or more engines. If the request is subsequently
+         * bound to a single engine, it will then be constrained to only
+         * execute on that engine and never returned to the virtual engine
+         * after timeslicing away, see __unwind_incomplete_requests(). Thus we
+         * know that if the rq->execution_mask is a single bit, rq->engine
+         * can be a physical engine with the exact corresponding mask.
+         */
+        if (is_power_of_2(rq->execution_mask) &&
+            !cmpxchg(&rq->engine->request_pool, NULL, rq))
+                return;
+
+        kmem_cache_free(global.slab_requests, rq);
 }
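The release path above parks at most one completed request per engine (a cmpxchg of engine->request_pool from NULL) so that the must-not-fail power-management parking path can still allocate under memory pressure. Below is a minimal standalone sketch of that single-slot reserve; the names are illustrative and C11 atomics stand in for the kernel's cmpxchg()/xchg().

#include <stdatomic.h>
#include <stdlib.h>

struct request { int payload; };

struct engine {
        struct request *_Atomic request_pool;   /* at most one parked request */
};

/* On release: park the request in an empty slot instead of freeing it. */
static void request_release(struct engine *e, struct request *rq)
{
        struct request *expected = NULL;

        if (atomic_compare_exchange_strong(&e->request_pool, &expected, rq))
                return;         /* slot was empty; rq is now the reserve */

        free(rq);               /* slot already occupied; really free it */
}

/* On allocation failure: take the reserve, if any (may return NULL). */
static struct request *request_alloc_reserved(struct engine *e)
{
        return atomic_exchange(&e->request_pool, NULL);
}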
 
 const struct dma_fence_ops i915_fence_ops = {
...
         .release = i915_fence_release,
 };
 
-static inline void
-i915_request_remove_from_client(struct i915_request *request)
+static void irq_execute_cb(struct irq_work *wrk)
 {
-        struct drm_i915_file_private *file_priv;
+        struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
 
-        file_priv = request->file_priv;
-        if (!file_priv)
+        i915_sw_fence_complete(cb->fence);
+        kmem_cache_free(global.slab_execute_cbs, cb);
+}
+
+static void irq_execute_cb_hook(struct irq_work *wrk)
+{
+        struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+        cb->hook(container_of(cb->fence, struct i915_request, submit),
+                 &cb->signal->fence);
+        i915_request_put(cb->signal);
+
+        irq_execute_cb(wrk);
+}
+
+static __always_inline void
+__notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk))
+{
+        struct execute_cb *cb, *cn;
+
+        if (llist_empty(&rq->execute_cb))
                 return;
 
-        spin_lock(&file_priv->mm.lock);
-        if (request->file_priv) {
-                list_del(&request->client_link);
-                request->file_priv = NULL;
+        llist_for_each_entry_safe(cb, cn,
+                                  llist_del_all(&rq->execute_cb),
+                                  work.llnode)
+                fn(&cb->work);
+}
+
+static void __notify_execute_cb_irq(struct i915_request *rq)
+{
+        __notify_execute_cb(rq, irq_work_queue);
+}
+
+static bool irq_work_imm(struct irq_work *wrk)
+{
+        wrk->func(wrk);
+        return false;
+}
+
+static void __notify_execute_cb_imm(struct i915_request *rq)
+{
+        __notify_execute_cb(rq, irq_work_imm);
+}
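__notify_execute_cb() above drains a lock-free callback list: producers push nodes with llist_add(), and the notifier detaches the whole list with llist_del_all() and runs each entry once. The following is a self-contained sketch of that grab-all-and-run pattern, using C11 atomics and illustrative names rather than the kernel's llist API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct cb_node {
        struct cb_node *next;
        void (*func)(struct cb_node *cb);
};

struct cb_list {
        struct cb_node *_Atomic head;
};

/* Push a callback; returns true if the list was empty beforehand. */
static bool cb_list_add(struct cb_list *l, struct cb_node *cb)
{
        struct cb_node *old = atomic_load(&l->head);

        do {
                cb->next = old;
        } while (!atomic_compare_exchange_weak(&l->head, &old, cb));

        return old == NULL;
}

/* Detach everything queued so far and invoke each callback exactly once. */
static void cb_list_notify(struct cb_list *l)
{
        struct cb_node *cb = atomic_exchange(&l->head, NULL);

        while (cb) {
                struct cb_node *next = cb->next;        /* cb may be freed by func */

                cb->func(cb);
                cb = next;
        }
}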
+
+static void free_capture_list(struct i915_request *request)
+{
+        struct i915_capture_list *capture;
+
+        capture = fetch_and_zero(&request->capture_list);
+        while (capture) {
+                struct i915_capture_list *next = capture->next;
+
+                kfree(capture);
+                capture = next;
         }
-        spin_unlock(&file_priv->mm.lock);
 }
 
-static struct i915_dependency *
-i915_dependency_alloc(struct drm_i915_private *i915)
+static void __i915_request_fill(struct i915_request *rq, u8 val)
 {
-        return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
+        void *vaddr = rq->ring->vaddr;
+        u32 head;
+
+        head = rq->infix;
+        if (rq->postfix < head) {
+                memset(vaddr + head, val, rq->ring->size - head);
+                head = 0;
+        }
+        memset(vaddr + head, val, rq->postfix - head);
 }
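__i915_request_fill() above poisons the payload of a request that lives in a circular ring: if the region wraps past the end of the ring, it is filled with two memset() calls. The same arithmetic in a standalone form (plain C, illustrative names only):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Fill [infix, postfix) of a ring buffer of 'size' bytes, handling wrap. */
static void ring_fill(uint8_t *ring, size_t size,
                      size_t infix, size_t postfix, uint8_t val)
{
        size_t head = infix;

        if (postfix < head) {                           /* region wraps */
                memset(ring + head, val, size - head);  /* fill to the end */
                head = 0;                               /* restart at offset 0 */
        }
        memset(ring + head, val, postfix - head);
}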
 
-static void
-i915_dependency_free(struct drm_i915_private *i915,
-                     struct i915_dependency *dep)
+static void remove_from_engine(struct i915_request *rq)
 {
-        kmem_cache_free(i915->dependencies, dep);
-}
-
-static void
-__i915_sched_node_add_dependency(struct i915_sched_node *node,
-                                 struct i915_sched_node *signal,
-                                 struct i915_dependency *dep,
-                                 unsigned long flags)
-{
-        INIT_LIST_HEAD(&dep->dfs_link);
-        list_add(&dep->wait_link, &signal->waiters_list);
-        list_add(&dep->signal_link, &node->signalers_list);
-        dep->signaler = signal;
-        dep->flags = flags;
-}
-
-static int
-i915_sched_node_add_dependency(struct drm_i915_private *i915,
-                               struct i915_sched_node *node,
-                               struct i915_sched_node *signal)
-{
-        struct i915_dependency *dep;
-
-        dep = i915_dependency_alloc(i915);
-        if (!dep)
-                return -ENOMEM;
-
-        __i915_sched_node_add_dependency(node, signal, dep,
-                                         I915_DEPENDENCY_ALLOC);
-        return 0;
-}
-
-static void
-i915_sched_node_fini(struct drm_i915_private *i915,
-                     struct i915_sched_node *node)
-{
-        struct i915_dependency *dep, *tmp;
-
-        GEM_BUG_ON(!list_empty(&node->link));
+        struct intel_engine_cs *engine, *locked;
 
         /*
-         * Everyone we depended upon (the fences we wait to be signaled)
-         * should retire before us and remove themselves from our list.
-         * However, retirement is run independently on each timeline and
-         * so we may be called out-of-order.
+         * Virtual engines complicate acquiring the engine timeline lock,
+         * as their rq->engine pointer is not stable until under that
+         * engine lock. The simple ploy we use is to take the lock then
+         * check that the rq still belongs to the newly locked engine.
          */
-        list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) {
-                GEM_BUG_ON(!i915_sched_node_signaled(dep->signaler));
-                GEM_BUG_ON(!list_empty(&dep->dfs_link));
-
-                list_del(&dep->wait_link);
-                if (dep->flags & I915_DEPENDENCY_ALLOC)
-                        i915_dependency_free(i915, dep);
+        locked = READ_ONCE(rq->engine);
+        spin_lock_irq(&locked->active.lock);
+        while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
+                spin_unlock(&locked->active.lock);
+                spin_lock(&engine->active.lock);
+                locked = engine;
         }
+        list_del_init(&rq->sched.link);
 
-        /* Remove ourselves from everyone who depends upon us */
-        list_for_each_entry_safe(dep, tmp, &node->waiters_list, wait_link) {
-                GEM_BUG_ON(dep->signaler != node);
-                GEM_BUG_ON(!list_empty(&dep->dfs_link));
+        clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
+        clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
 
-                list_del(&dep->signal_link);
-                if (dep->flags & I915_DEPENDENCY_ALLOC)
-                        i915_dependency_free(i915, dep);
-        }
+        /* Prevent further __await_execution() registering a cb, then flush */
+        set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
+
+        spin_unlock_irq(&locked->active.lock);
+
+        __notify_execute_cb_imm(rq);
 }
 
-static void
-i915_sched_node_init(struct i915_sched_node *node)
+bool i915_request_retire(struct i915_request *rq)
 {
-        INIT_LIST_HEAD(&node->signalers_list);
-        INIT_LIST_HEAD(&node->waiters_list);
-        INIT_LIST_HEAD(&node->link);
-        node->attr.priority = I915_PRIORITY_INVALID;
-}
+        if (!i915_request_completed(rq))
+                return false;
 
-static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
-{
-        struct intel_engine_cs *engine;
-        struct i915_timeline *timeline;
-        enum intel_engine_id id;
-        int ret;
+        RQ_TRACE(rq, "\n");
 
-        /* Carefully retire all requests without writing to the rings */
-        ret = i915_gem_wait_for_idle(i915,
-                                     I915_WAIT_INTERRUPTIBLE |
-                                     I915_WAIT_LOCKED,
-                                     MAX_SCHEDULE_TIMEOUT);
-        if (ret)
-                return ret;
-
-        GEM_BUG_ON(i915->gt.active_requests);
-
-        /* If the seqno wraps around, we need to clear the breadcrumb rbtree */
-        for_each_engine(engine, i915, id) {
-                GEM_TRACE("%s seqno %d (current %d) -> %d\n",
-                          engine->name,
-                          engine->timeline.seqno,
-                          intel_engine_get_seqno(engine),
-                          seqno);
-
-                if (!i915_seqno_passed(seqno, engine->timeline.seqno)) {
-                        /* Flush any waiters before we reuse the seqno */
-                        intel_engine_disarm_breadcrumbs(engine);
-                        intel_engine_init_hangcheck(engine);
-                        GEM_BUG_ON(!list_empty(&engine->breadcrumbs.signals));
-                }
-
-                /* Check we are idle before we fiddle with hw state! */
-                GEM_BUG_ON(!intel_engine_is_idle(engine));
-                GEM_BUG_ON(i915_gem_active_isset(&engine->timeline.last_request));
-
-                /* Finally reset hw state */
-                intel_engine_init_global_seqno(engine, seqno);
-                engine->timeline.seqno = seqno;
-        }
-
-        list_for_each_entry(timeline, &i915->gt.timelines, link)
-                memset(timeline->global_sync, 0, sizeof(timeline->global_sync));
-
-        i915->gt.request_serial = seqno;
-
-        return 0;
-}
-
-int i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno)
-{
-        struct drm_i915_private *i915 = to_i915(dev);
-
-        lockdep_assert_held(&i915->drm.struct_mutex);
-
-        if (seqno == 0)
-                return -EINVAL;
-
-        /* HWS page needs to be set less than what we will inject to ring */
-        return reset_all_global_seqno(i915, seqno - 1);
-}
-
-static int reserve_gt(struct drm_i915_private *i915)
-{
-        int ret;
-
-        /*
-         * Reservation is fine until we may need to wrap around
-         *
-         * By incrementing the serial for every request, we know that no
-         * individual engine may exceed that serial (as each is reset to 0
-         * on any wrap). This protects even the most pessimistic of migrations
-         * of every request from all engines onto just one.
-         */
-        while (unlikely(++i915->gt.request_serial == 0)) {
-                ret = reset_all_global_seqno(i915, 0);
-                if (ret) {
-                        i915->gt.request_serial--;
-                        return ret;
-                }
-        }
-
-        if (!i915->gt.active_requests++)
-                i915_gem_unpark(i915);
-
-        return 0;
-}
-
-static void unreserve_gt(struct drm_i915_private *i915)
-{
-        GEM_BUG_ON(!i915->gt.active_requests);
-        if (!--i915->gt.active_requests)
-                i915_gem_park(i915);
-}
-
-void i915_gem_retire_noop(struct i915_gem_active *active,
-                          struct i915_request *request)
-{
-        /* Space left intentionally blank */
-}
-
-static void advance_ring(struct i915_request *request)
-{
-        struct intel_ring *ring = request->ring;
-        unsigned int tail;
+        GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
+        trace_i915_request_retire(rq);
+        i915_request_mark_complete(rq);
 
         /*
          * We know the GPU must have read the request to have
...
          * Note this requires that we are always called in request
          * completion order.
          */
-        GEM_BUG_ON(!list_is_first(&request->ring_link, &ring->request_list));
-        if (list_is_last(&request->ring_link, &ring->request_list)) {
-                /*
-                 * We may race here with execlists resubmitting this request
-                 * as we retire it. The resubmission will move the ring->tail
-                 * forwards (to request->wa_tail). We either read the
-                 * current value that was written to hw, or the value that
-                 * is just about to be. Either works, if we miss the last two
-                 * noops - they are safe to be replayed on a reset.
-                 */
-                GEM_TRACE("marking %s as inactive\n", ring->timeline->name);
-                tail = READ_ONCE(request->tail);
-                list_del(&ring->active_link);
-        } else {
-                tail = request->postfix;
-        }
-        list_del_init(&request->ring_link);
+        GEM_BUG_ON(!list_is_first(&rq->link,
+                                  &i915_request_timeline(rq)->requests));
+        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+                /* Poison before we release our space in the ring */
+                __i915_request_fill(rq, POISON_FREE);
+        rq->ring->head = rq->postfix;
 
-        ring->head = tail;
-}
-
-static void free_capture_list(struct i915_request *request)
-{
-        struct i915_capture_list *capture;
-
-        capture = request->capture_list;
-        while (capture) {
-                struct i915_capture_list *next = capture->next;
-
-                kfree(capture);
-                capture = next;
-        }
-}
-
-static void __retire_engine_request(struct intel_engine_cs *engine,
-                                    struct i915_request *rq)
-{
-        GEM_TRACE("%s(%s) fence %llx:%d, global=%d, current %d\n",
-                  __func__, engine->name,
-                  rq->fence.context, rq->fence.seqno,
-                  rq->global_seqno,
-                  intel_engine_get_seqno(engine));
-
-        GEM_BUG_ON(!i915_request_completed(rq));
-
-        local_irq_disable();
-
-        spin_lock(&engine->timeline.lock);
-        GEM_BUG_ON(!list_is_first(&rq->link, &engine->timeline.requests));
-        list_del_init(&rq->link);
-        spin_unlock(&engine->timeline.lock);
-
-        spin_lock(&rq->lock);
-        if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
+        if (!i915_request_signaled(rq)) {
+                spin_lock_irq(&rq->lock);
                 dma_fence_signal_locked(&rq->fence);
-        if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
-                intel_engine_cancel_signaling(rq);
-        if (rq->waitboost) {
-                GEM_BUG_ON(!atomic_read(&rq->i915->gt_pm.rps.num_waiters));
-                atomic_dec(&rq->i915->gt_pm.rps.num_waiters);
+                spin_unlock_irq(&rq->lock);
         }
-        spin_unlock(&rq->lock);
 
-        local_irq_enable();
+        if (i915_request_has_waitboost(rq)) {
+                GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters));
+                atomic_dec(&rq->engine->gt->rps.num_waiters);
+        }
 
         /*
-         * The backing object for the context is done after switching to the
-         * *next* context. Therefore we cannot retire the previous context until
-         * the next context has already started running. However, since we
-         * cannot take the required locks at i915_request_submit() we
-         * defer the unpinning of the active context to now, retirement of
-         * the subsequent request.
-         */
-        if (engine->last_retired_context)
-                intel_context_unpin(engine->last_retired_context);
-        engine->last_retired_context = rq->hw_context;
-}
-
-static void __retire_engine_upto(struct intel_engine_cs *engine,
-                                 struct i915_request *rq)
-{
-        struct i915_request *tmp;
-
-        if (list_empty(&rq->link))
-                return;
-
-        do {
-                tmp = list_first_entry(&engine->timeline.requests,
-                                       typeof(*tmp), link);
-
-                GEM_BUG_ON(tmp->engine != engine);
-                __retire_engine_request(engine, tmp);
-        } while (tmp != rq);
-}
-
-static void i915_request_retire(struct i915_request *request)
-{
-        struct i915_gem_active *active, *next;
-
-        GEM_TRACE("%s fence %llx:%d, global=%d, current %d\n",
-                  request->engine->name,
-                  request->fence.context, request->fence.seqno,
-                  request->global_seqno,
-                  intel_engine_get_seqno(request->engine));
-
-        lockdep_assert_held(&request->i915->drm.struct_mutex);
-        GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
-        GEM_BUG_ON(!i915_request_completed(request));
-
-        trace_i915_request_retire(request);
-
-        advance_ring(request);
-        free_capture_list(request);
-
-        /*
-         * Walk through the active list, calling retire on each. This allows
-         * objects to track their GPU activity and mark themselves as idle
-         * when their *last* active request is completed (updating state
-         * tracking lists for eviction, active references for GEM, etc).
+         * We only loosely track inflight requests across preemption,
+         * and so we may find ourselves attempting to retire a _completed_
+         * request that we have removed from the HW and put back on a run
+         * queue.
          *
-         * As the ->retire() may free the node, we decouple it first and
-         * pass along the auxiliary information (to avoid dereferencing
-         * the node after the callback).
+         * As we set I915_FENCE_FLAG_ACTIVE on the request, this should be
+         * after removing the breadcrumb and signaling it, so that we do not
+         * inadvertently attach the breadcrumb to a completed request.
          */
-        list_for_each_entry_safe(active, next, &request->active_list, link) {
-                /*
-                 * In microbenchmarks or focusing upon time inside the kernel,
-                 * we may spend an inordinate amount of time simply handling
-                 * the retirement of requests and processing their callbacks.
-                 * Of which, this loop itself is particularly hot due to the
-                 * cache misses when jumping around the list of i915_gem_active.
-                 * So we try to keep this loop as streamlined as possible and
-                 * also prefetch the next i915_gem_active to try and hide
-                 * the likely cache miss.
-                 */
-                prefetchw(next);
+        remove_from_engine(rq);
+        GEM_BUG_ON(!llist_empty(&rq->execute_cb));
 
-                INIT_LIST_HEAD(&active->link);
-                RCU_INIT_POINTER(active->request, NULL);
+        __list_del_entry(&rq->link); /* poison neither prev/next (RCU walks) */
 
-                active->retire(active, request);
-        }
+        intel_context_exit(rq->context);
+        intel_context_unpin(rq->context);
 
-        i915_request_remove_from_client(request);
+        free_capture_list(rq);
+        i915_sched_node_fini(&rq->sched);
+        i915_request_put(rq);
 
-        /* Retirement decays the ban score as it is a sign of ctx progress */
-        atomic_dec_if_positive(&request->gem_context->ban_score);
-        intel_context_unpin(request->hw_context);
-
-        __retire_engine_upto(request->engine, request);
-
-        unreserve_gt(request->i915);
-
-        i915_sched_node_fini(request->i915, &request->sched);
-        i915_request_put(request);
+        return true;
 }
 
 void i915_request_retire_upto(struct i915_request *rq)
 {
-        struct intel_ring *ring = rq->ring;
+        struct intel_timeline * const tl = i915_request_timeline(rq);
         struct i915_request *tmp;
 
-        GEM_TRACE("%s fence %llx:%d, global=%d, current %d\n",
-                  rq->engine->name,
-                  rq->fence.context, rq->fence.seqno,
-                  rq->global_seqno,
-                  intel_engine_get_seqno(rq->engine));
+        RQ_TRACE(rq, "\n");
 
-        lockdep_assert_held(&rq->i915->drm.struct_mutex);
         GEM_BUG_ON(!i915_request_completed(rq));
 
-        if (list_empty(&rq->ring_link))
+        do {
+                tmp = list_first_entry(&tl->requests, typeof(*tmp), link);
+        } while (i915_request_retire(tmp) && tmp != rq);
+}
+
+static struct i915_request * const *
+__engine_active(struct intel_engine_cs *engine)
+{
+        return READ_ONCE(engine->execlists.active);
+}
+
+static bool __request_in_flight(const struct i915_request *signal)
+{
+        struct i915_request * const *port, *rq;
+        bool inflight = false;
+
+        if (!i915_request_is_ready(signal))
+                return false;
+
+        /*
+         * Even if we have unwound the request, it may still be on
+         * the GPU (preempt-to-busy). If that request is inside an
+         * unpreemptible critical section, it will not be removed. Some
+         * GPU functions may even be stuck waiting for the paired request
+         * (__await_execution) to be submitted and cannot be preempted
+         * until the bond is executing.
+         *
+         * As we know that there are always preemption points between
+         * requests, we know that only the currently executing request
+         * may be still active even though we have cleared the flag.
+         * However, we can't rely on our tracking of ELSP[0] to know
+         * which request is currently active and so may be stuck, as
+         * the tracking may be an event behind. Instead assume that
+         * if the context is still inflight, then it is still active
+         * even if the active flag has been cleared.
+         *
+         * To further complicate matters, if there is a pending promotion, the HW
+         * may either perform a context switch to the second inflight execlists,
+         * or it may switch to the pending set of execlists. In the case of the
+         * latter, it may send the ACK and we process the event copying the
+         * pending[] over top of inflight[], _overwriting_ our *active. Since
+         * this implies the HW is arbitrating and not stuck in *active, we do
+         * not worry about complete accuracy, but we do require no read/write
+         * tearing of the pointer [the read of the pointer must be valid, even
+         * as the array is being overwritten, for which we require the writes
+         * to avoid tearing.]
+         *
+         * Note that the read of *execlists->active may race with the promotion
+         * of execlists->pending[] to execlists->inflight[], overwriting
+         * the value at *execlists->active. This is fine. The promotion implies
+         * that we received an ACK from the HW, and so the context is not
+         * stuck -- if we do not see ourselves in *active, the inflight status
+         * is valid. If instead we see ourselves being copied into *active,
+         * we are inflight and may signal the callback.
+         */
+        if (!intel_context_inflight(signal->context))
+                return false;
+
+        rcu_read_lock();
+        for (port = __engine_active(signal->engine);
+             (rq = READ_ONCE(*port)); /* may race with promotion of pending[] */
+             port++) {
+                if (rq->context == signal->context) {
+                        inflight = i915_seqno_passed(rq->fence.seqno,
+                                                     signal->fence.seqno);
+                        break;
+                }
+        }
+        rcu_read_unlock();
+
+        return inflight;
+}
+
+static int
+__await_execution(struct i915_request *rq,
+                  struct i915_request *signal,
+                  void (*hook)(struct i915_request *rq,
+                               struct dma_fence *signal),
+                  gfp_t gfp)
+{
+        struct execute_cb *cb;
+
+        if (i915_request_is_active(signal)) {
+                if (hook)
+                        hook(rq, &signal->fence);
+                return 0;
+        }
+
+        cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
+        if (!cb)
+                return -ENOMEM;
+
+        cb->fence = &rq->submit;
+        i915_sw_fence_await(cb->fence);
+        init_irq_work(&cb->work, irq_execute_cb);
+
+        if (hook) {
+                cb->hook = hook;
+                cb->signal = i915_request_get(signal);
+                cb->work.func = irq_execute_cb_hook;
+        }
+
+        /*
+         * Register the callback first, then see if the signaler is already
+         * active. This ensures that if we race with the
+         * __notify_execute_cb from i915_request_submit() and we are not
+         * included in that list, we get a second bite of the cherry and
+         * execute it ourselves. After this point, a future
+         * i915_request_submit() will notify us.
+         *
+         * In i915_request_retire() we set the ACTIVE bit on a completed
+         * request (then flush the execute_cb). So by registering the
+         * callback first, then checking the ACTIVE bit, we serialise with
+         * the completed/retired request.
+         */
+        if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
+                if (i915_request_is_active(signal) ||
+                    __request_in_flight(signal))
+                        __notify_execute_cb_imm(signal);
+        }
+
+        return 0;
+}
+
+static bool fatal_error(int error)
+{
+        switch (error) {
+        case 0: /* not an error! */
+        case -EAGAIN: /* innocent victim of a GT reset (__i915_request_reset) */
+        case -ETIMEDOUT: /* waiting for Godot (timer_i915_sw_fence_wake) */
+                return false;
+        default:
+                return true;
+        }
+}
+
+void __i915_request_skip(struct i915_request *rq)
+{
+        GEM_BUG_ON(!fatal_error(rq->fence.error));
+
+        if (rq->infix == rq->postfix)
                 return;
 
+        /*
+         * As this request likely depends on state from the lost
+         * context, clear out all the user operations leaving the
+         * breadcrumb at the end (so we get the fence notifications).
+         */
+        __i915_request_fill(rq, 0);
+        rq->infix = rq->postfix;
+}
+
+void i915_request_set_error_once(struct i915_request *rq, int error)
+{
+        int old;
+
+        GEM_BUG_ON(!IS_ERR_VALUE((long)error));
+
+        if (i915_request_signaled(rq))
+                return;
+
+        old = READ_ONCE(rq->fence.error);
         do {
-                tmp = list_first_entry(&ring->request_list,
-                                       typeof(*tmp), ring_link);
-
-                i915_request_retire(tmp);
-        } while (tmp != rq);
+                if (fatal_error(old))
+                        return;
+        } while (!try_cmpxchg(&rq->fence.error, &old, error));
 }
 
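i915_request_set_error_once() above records only the first fatal error on a fence: the update loops on a compare-and-swap and gives up as soon as a fatal value is already present. A minimal sketch of that idiom, with C11 atomics standing in for try_cmpxchg() and illustrative names:

#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>

static bool fatal(int error)
{
        /* 0 and -EAGAIN are treated as non-fatal; everything else sticks */
        return error != 0 && error != -EAGAIN;
}

static void set_error_once(_Atomic int *slot, int error)
{
        int old = atomic_load(slot);

        do {
                if (fatal(old))
                        return;         /* first fatal error wins */
        } while (!atomic_compare_exchange_weak(slot, &old, error));
}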
-static u32 timeline_get_seqno(struct i915_timeline *tl)
-{
-        return ++tl->seqno;
-}
-
-static void move_to_timeline(struct i915_request *request,
-                             struct i915_timeline *timeline)
-{
-        GEM_BUG_ON(request->timeline == &request->engine->timeline);
-        lockdep_assert_held(&request->engine->timeline.lock);
-
-        spin_lock(&request->timeline->lock);
-        list_move_tail(&request->link, &timeline->requests);
-        spin_unlock(&request->timeline->lock);
-}
-
-void __i915_request_submit(struct i915_request *request)
+bool __i915_request_submit(struct i915_request *request)
 {
         struct intel_engine_cs *engine = request->engine;
-        u32 seqno;
+        bool result = false;
 
-        GEM_TRACE("%s fence %llx:%d -> global=%d, current %d\n",
-                  engine->name,
-                  request->fence.context, request->fence.seqno,
-                  engine->timeline.seqno + 1,
-                  intel_engine_get_seqno(engine));
+        RQ_TRACE(request, "\n");
 
         GEM_BUG_ON(!irqs_disabled());
-        lockdep_assert_held(&engine->timeline.lock);
+        lockdep_assert_held(&engine->active.lock);
 
-        GEM_BUG_ON(request->global_seqno);
+        /*
+         * With the advent of preempt-to-busy, we frequently encounter
+         * requests that we have unsubmitted from HW, but left running
+         * until the next ack and so have completed in the meantime. On
+         * resubmission of that completed request, we can skip
+         * updating the payload, and execlists can even skip submitting
+         * the request.
+         *
+         * We must remove the request from the caller's priority queue,
+         * and the caller must only call us when the request is in their
+         * priority queue, under the active.lock. This ensures that the
+         * request has *not* yet been retired and we can safely move
+         * the request into the engine->active.list where it will be
+         * dropped upon retiring. (Otherwise, if we resubmit a *retired*
+         * request, this would be a horrible use-after-free.)
+         */
+        if (i915_request_completed(request))
+                goto xfer;
 
-        seqno = timeline_get_seqno(&engine->timeline);
-        GEM_BUG_ON(!seqno);
-        GEM_BUG_ON(i915_seqno_passed(intel_engine_get_seqno(engine), seqno));
+        if (unlikely(intel_context_is_closed(request->context) &&
+                     !intel_engine_has_heartbeat(engine)))
+                intel_context_set_banned(request->context);
 
-        /* We may be recursing from the signal callback of another i915 fence */
-        spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
-        request->global_seqno = seqno;
-        if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
-                intel_engine_enable_signaling(request, false);
-        spin_unlock(&request->lock);
+        if (unlikely(intel_context_is_banned(request->context)))
+                i915_request_set_error_once(request, -EIO);
 
-        engine->emit_breadcrumb(request,
-                                request->ring->vaddr + request->postfix);
+        if (unlikely(fatal_error(request->fence.error)))
+                __i915_request_skip(request);
 
-        /* Transfer from per-context onto the global per-engine timeline */
-        move_to_timeline(request, &engine->timeline);
+        /*
+         * Are we using semaphores when the gpu is already saturated?
+         *
+         * Using semaphores incurs a cost in having the GPU poll a
+         * memory location, busywaiting for it to change. The continual
+         * memory reads can have a noticeable impact on the rest of the
+         * system with the extra bus traffic, stalling the cpu as it too
+         * tries to access memory across the bus (perf stat -e bus-cycles).
+         *
+         * If we installed a semaphore on this request and we only submit
+         * the request after the signaler completed, that indicates the
+         * system is overloaded and using semaphores at this time only
+         * increases the amount of work we are doing. If so, we disable
+         * further use of semaphores until we are idle again, whence we
+         * optimistically try again.
+         */
+        if (request->sched.semaphores &&
+            i915_sw_fence_signaled(&request->semaphore))
+                engine->saturated |= request->sched.semaphores;
+
+        engine->emit_fini_breadcrumb(request,
+                                     request->ring->vaddr + request->postfix);
 
         trace_i915_request_execute(request);
+        engine->serial++;
+        result = true;
 
-        wake_up_all(&request->execute);
+xfer:
+        if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) {
+                list_move_tail(&request->sched.link, &engine->active.requests);
+                clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
+        }
+
+        /*
+         * XXX Rollback bonded-execution on __i915_request_unsubmit()?
+         *
+         * In the future, perhaps when we have an active time-slicing scheduler,
+         * it will be interesting to unsubmit parallel execution and remove
+         * busywaits from the GPU until their master is restarted. This is
+         * quite hairy, we have to carefully rollback the fence and do a
+         * preempt-to-idle cycle on the target engine, all the while the
+         * master execute_cb may refire.
+         */
+        __notify_execute_cb_irq(request);
+
+        /* We may be recursing from the signal callback of another i915 fence */
+        if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
+                i915_request_enable_breadcrumb(request);
+
+        return result;
 }
 
 void i915_request_submit(struct i915_request *request)
...
         unsigned long flags;
 
         /* Will be called from irq-context when using foreign fences. */
-        spin_lock_irqsave(&engine->timeline.lock, flags);
+        spin_lock_irqsave(&engine->active.lock, flags);
 
         __i915_request_submit(request);
 
-        spin_unlock_irqrestore(&engine->timeline.lock, flags);
+        spin_unlock_irqrestore(&engine->active.lock, flags);
 }
 
 void __i915_request_unsubmit(struct i915_request *request)
 {
         struct intel_engine_cs *engine = request->engine;
 
-        GEM_TRACE("%s fence %llx:%d <- global=%d, current %d\n",
-                  engine->name,
-                  request->fence.context, request->fence.seqno,
-                  request->global_seqno,
-                  intel_engine_get_seqno(engine));
-
-        GEM_BUG_ON(!irqs_disabled());
-        lockdep_assert_held(&engine->timeline.lock);
-
         /*
          * Only unwind in reverse order, required so that the per-context list
          * is kept in seqno/ring order.
          */
-        GEM_BUG_ON(!request->global_seqno);
-        GEM_BUG_ON(request->global_seqno != engine->timeline.seqno);
-        GEM_BUG_ON(i915_seqno_passed(intel_engine_get_seqno(engine),
-                                     request->global_seqno));
-        engine->timeline.seqno--;
+        RQ_TRACE(request, "\n");
 
-        /* We may be recursing from the signal callback of another i915 fence */
-        spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
-        request->global_seqno = 0;
+        GEM_BUG_ON(!irqs_disabled());
+        lockdep_assert_held(&engine->active.lock);
+
+        /*
+         * Before we remove this breadcrumb from the signal list, we have
+         * to ensure that a concurrent dma_fence_enable_signaling() does not
+         * attach itself. We first mark the request as no longer active and
+         * make sure that is visible to other cores, and then remove the
+         * breadcrumb if attached.
+         */
+        GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
+        clear_bit_unlock(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
-                intel_engine_cancel_signaling(request);
-        spin_unlock(&request->lock);
+                i915_request_cancel_breadcrumb(request);
 
-        /* Transfer back from the global per-engine timeline to per-context */
-        move_to_timeline(request, request->timeline);
+        /* We've already spun, don't charge on resubmitting. */
+        if (request->sched.semaphores && i915_request_started(request))
+                request->sched.semaphores = 0;
 
         /*
          * We don't need to wake_up any waiters on request->execute, they
...
         unsigned long flags;
 
         /* Will be called from irq-context when using foreign fences. */
-        spin_lock_irqsave(&engine->timeline.lock, flags);
+        spin_lock_irqsave(&engine->active.lock, flags);
 
         __i915_request_unsubmit(request);
 
-        spin_unlock_irqrestore(&engine->timeline.lock, flags);
+        spin_unlock_irqrestore(&engine->active.lock, flags);
 }
 
 static int __i915_sw_fence_call
...
         switch (state) {
         case FENCE_COMPLETE:
                 trace_i915_request_submit(request);
+
+                if (unlikely(fence->error))
+                        i915_request_set_error_once(request, fence->error);
+
                 /*
                  * We need to serialize use of the submit_request() callback
                  * with its hotplugging performed during an emergency
...
                 return NOTIFY_DONE;
 }
 
-/**
- * i915_request_alloc - allocate a request structure
- *
- * @engine: engine that we wish to issue the request on.
- * @ctx: context that the request will be associated with.
- *
- * Returns a pointer to the allocated request if successful,
- * or an error code if not.
- */
-struct i915_request *
-i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
+static int __i915_sw_fence_call
+semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 {
-        struct drm_i915_private *i915 = engine->i915;
+        struct i915_request *rq = container_of(fence, typeof(*rq), semaphore);
+
+        switch (state) {
+        case FENCE_COMPLETE:
+                break;
+
+        case FENCE_FREE:
+                i915_request_put(rq);
+                break;
+        }
+
+        return NOTIFY_DONE;
+}
+
+static void retire_requests(struct intel_timeline *tl)
+{
+        struct i915_request *rq, *rn;
+
+        list_for_each_entry_safe(rq, rn, &tl->requests, link)
+                if (!i915_request_retire(rq))
+                        break;
+}
+
+static noinline struct i915_request *
+request_alloc_slow(struct intel_timeline *tl,
+                   struct i915_request **rsvd,
+                   gfp_t gfp)
+{
         struct i915_request *rq;
-        struct intel_context *ce;
-        int ret;
 
-        lockdep_assert_held(&i915->drm.struct_mutex);
+        /* If we cannot wait, dip into our reserves */
+        if (!gfpflags_allow_blocking(gfp)) {
+                rq = xchg(rsvd, NULL);
+                if (!rq) /* Use the normal failure path for one final WARN */
+                        goto out;
 
-        /*
-         * Preempt contexts are reserved for exclusive use to inject a
-         * preemption context switch. They are never to be used for any trivial
-         * request!
-         */
-        GEM_BUG_ON(ctx == i915->preempt_context);
+                return rq;
+        }
 
-        /*
-         * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
-         * EIO if the GPU is already wedged.
-         */
-        if (i915_terminally_wedged(&i915->gpu_error))
-                return ERR_PTR(-EIO);
-
-        /*
-         * Pinning the contexts may generate requests in order to acquire
-         * GGTT space, so do this first before we reserve a seqno for
-         * ourselves.
-         */
-        ce = intel_context_pin(ctx, engine);
-        if (IS_ERR(ce))
-                return ERR_CAST(ce);
-
-        ret = reserve_gt(i915);
-        if (ret)
-                goto err_unpin;
-
-        ret = intel_ring_wait_for_space(ce->ring, MIN_SPACE_FOR_ADD_REQUEST);
-        if (ret)
-                goto err_unreserve;
+        if (list_empty(&tl->requests))
+                goto out;
 
         /* Move our oldest request to the slab-cache (if not in use!) */
-        rq = list_first_entry(&ce->ring->request_list, typeof(*rq), ring_link);
-        if (!list_is_last(&rq->ring_link, &ce->ring->request_list) &&
-            i915_request_completed(rq))
-                i915_request_retire(rq);
+        rq = list_first_entry(&tl->requests, typeof(*rq), link);
+        i915_request_retire(rq);
+
+        rq = kmem_cache_alloc(global.slab_requests,
+                              gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
+        if (rq)
+                return rq;
+
+        /* Ratelimit ourselves to prevent oom from malicious clients */
+        rq = list_last_entry(&tl->requests, typeof(*rq), link);
+        cond_synchronize_rcu(rq->rcustate);
+
+        /* Retire our old requests in the hope that we free some */
+        retire_requests(tl);
+
+out:
+        return kmem_cache_alloc(global.slab_requests, gfp);
+}
+
+static void __i915_request_ctor(void *arg)
+{
+        struct i915_request *rq = arg;
+
+        spin_lock_init(&rq->lock);
+        i915_sched_node_init(&rq->sched);
+        i915_sw_fence_init(&rq->submit, submit_notify);
+        i915_sw_fence_init(&rq->semaphore, semaphore_notify);
+
+        rq->capture_list = NULL;
+
+        init_llist_head(&rq->execute_cb);
+}
+
+struct i915_request *
+__i915_request_create(struct intel_context *ce, gfp_t gfp)
+{
+        struct intel_timeline *tl = ce->timeline;
+        struct i915_request *rq;
+        u32 seqno;
+        int ret;
+
+        might_sleep_if(gfpflags_allow_blocking(gfp));
+
+        /* Check that the caller provided an already pinned context */
+        __intel_context_pin(ce);
 
         /*
          * Beware: Dragons be flying overhead.
...
          * We use RCU to look up requests in flight. The lookups may
          * race with the request being allocated from the slab freelist.
          * That is the request we are writing to here, may be in the process
-         * of being read by __i915_gem_active_get_rcu(). As such,
+         * of being read by __i915_active_request_get_rcu(). As such,
          * we have to be very careful when overwriting the contents. During
          * the RCU lookup, we chase the request->engine pointer,
713 | 806 | * read the request->global_seqno and increment the reference count. |
---|
.. | .. |
---|
730 | 823 | * |
---|
731 | 824 | * Do not use kmem_cache_zalloc() here! |
---|
732 | 825 | */ |
---|
733 | | - rq = kmem_cache_alloc(i915->requests, |
---|
734 | | - GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); |
---|
| 826 | + rq = kmem_cache_alloc(global.slab_requests, |
---|
| 827 | + gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); |
---|
735 | 828 | if (unlikely(!rq)) { |
---|
736 | | - /* Ratelimit ourselves to prevent oom from malicious clients */ |
---|
737 | | - ret = i915_gem_wait_for_idle(i915, |
---|
738 | | - I915_WAIT_LOCKED | |
---|
739 | | - I915_WAIT_INTERRUPTIBLE, |
---|
740 | | - MAX_SCHEDULE_TIMEOUT); |
---|
741 | | - if (ret) |
---|
742 | | - goto err_unreserve; |
---|
743 | | - |
---|
744 | | - /* |
---|
745 | | - * We've forced the client to stall and catch up with whatever |
---|
746 | | - * backlog there might have been. As we are assuming that we |
---|
747 | | - * caused the mempressure, now is an opportune time to |
---|
748 | | - * recover as much memory from the request pool as is possible. |
---|
749 | | - * Having already penalized the client to stall, we spend |
---|
750 | | - * a little extra time to re-optimise page allocation. |
---|
751 | | - */ |
---|
752 | | - kmem_cache_shrink(i915->requests); |
---|
753 | | - rcu_barrier(); /* Recover the TYPESAFE_BY_RCU pages */ |
---|
754 | | - |
---|
755 | | - rq = kmem_cache_alloc(i915->requests, GFP_KERNEL); |
---|
| 829 | + rq = request_alloc_slow(tl, &ce->engine->request_pool, gfp); |
---|
756 | 830 | if (!rq) { |
---|
757 | 831 | ret = -ENOMEM; |
---|
758 | 832 | goto err_unreserve; |
---|
759 | 833 | } |
---|
760 | 834 | } |
---|
761 | 835 | |
---|
762 | | - INIT_LIST_HEAD(&rq->active_list); |
---|
763 | | - rq->i915 = i915; |
---|
764 | | - rq->engine = engine; |
---|
765 | | - rq->gem_context = ctx; |
---|
766 | | - rq->hw_context = ce; |
---|
| 836 | + rq->context = ce; |
---|
| 837 | + rq->engine = ce->engine; |
---|
767 | 838 | rq->ring = ce->ring; |
---|
768 | | - rq->timeline = ce->ring->timeline; |
---|
769 | | - GEM_BUG_ON(rq->timeline == &engine->timeline); |
---|
| 839 | + rq->execution_mask = ce->engine->mask; |
---|
770 | 840 | |
---|
771 | | - spin_lock_init(&rq->lock); |
---|
772 | | - dma_fence_init(&rq->fence, |
---|
773 | | - &i915_fence_ops, |
---|
774 | | - &rq->lock, |
---|
775 | | - rq->timeline->fence_context, |
---|
776 | | - timeline_get_seqno(rq->timeline)); |
---|
| 841 | + ret = intel_timeline_get_seqno(tl, rq, &seqno); |
---|
| 842 | + if (ret) |
---|
| 843 | + goto err_free; |
---|
| 844 | + |
---|
| 845 | + dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, |
---|
| 846 | + tl->fence_context, seqno); |
---|
| 847 | + |
---|
| 848 | + RCU_INIT_POINTER(rq->timeline, tl); |
---|
| 849 | + RCU_INIT_POINTER(rq->hwsp_cacheline, tl->hwsp_cacheline); |
---|
| 850 | + rq->hwsp_seqno = tl->hwsp_seqno; |
---|
| 851 | + GEM_BUG_ON(i915_request_completed(rq)); |
---|
| 852 | + |
---|
| 853 | + rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */ |
---|
777 | 854 | |
---|
778 | 855 | /* We bump the ref for the fence chain */ |
---|
779 | | - i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify); |
---|
780 | | - init_waitqueue_head(&rq->execute); |
---|
| 856 | + i915_sw_fence_reinit(&i915_request_get(rq)->submit); |
---|
| 857 | + i915_sw_fence_reinit(&i915_request_get(rq)->semaphore); |
---|
781 | 858 | |
---|
782 | | - i915_sched_node_init(&rq->sched); |
---|
| 859 | + i915_sched_node_reinit(&rq->sched); |
---|
783 | 860 | |
---|
784 | | - /* No zalloc, must clear what we need by hand */ |
---|
785 | | - rq->global_seqno = 0; |
---|
786 | | - rq->signaling.wait.seqno = 0; |
---|
787 | | - rq->file_priv = NULL; |
---|
| 861 | + /* No zalloc, everything must be cleared after use */ |
---|
788 | 862 | rq->batch = NULL; |
---|
789 | | - rq->capture_list = NULL; |
---|
790 | | - rq->waitboost = false; |
---|
| 863 | + GEM_BUG_ON(rq->capture_list); |
---|
| 864 | + GEM_BUG_ON(!llist_empty(&rq->execute_cb)); |
---|
791 | 865 | |
---|
792 | 866 | /* |
---|
793 | 867 | * Reserve space in the ring buffer for all the commands required to |
---|
.. | .. |
---|
795 | 869 | * i915_request_add() call can't fail. Note that the reserve may need |
---|
796 | 870 | * to be redone if the request is not actually submitted straight |
---|
797 | 871 | * away, e.g. because a GPU scheduler has deferred it. |
---|
| 872 | + * |
---|
| 873 | + * Note that due to how we add reserved_space to intel_ring_begin() |
---|
| 874 | + * we need to double our reservation to ensure that if we need to wrap |
---|
| 875 | + * around inside i915_request_add() there is sufficient space at |
---|
| 876 | + * the beginning of the ring as well. |
---|
798 | 877 | */ |
---|
799 | | - rq->reserved_space = MIN_SPACE_FOR_ADD_REQUEST; |
---|
800 | | - GEM_BUG_ON(rq->reserved_space < engine->emit_breadcrumb_sz); |
---|
| 878 | + rq->reserved_space = |
---|
| 879 | + 2 * rq->engine->emit_fini_breadcrumb_dw * sizeof(u32); |
---|
801 | 880 | |
---|
802 | 881 | /* |
---|
803 | 882 | * Record the position of the start of the request so that |
---|
.. | .. |
---|
807 | 886 | */ |
---|
808 | 887 | rq->head = rq->ring->emit; |
---|
809 | 888 | |
---|
810 | | - /* Unconditionally invalidate GPU caches and TLBs. */ |
---|
811 | | - ret = engine->emit_flush(rq, EMIT_INVALIDATE); |
---|
| 889 | + ret = rq->engine->request_alloc(rq); |
---|
812 | 890 | if (ret) |
---|
813 | 891 | goto err_unwind; |
---|
814 | | - |
---|
815 | | - ret = engine->request_alloc(rq); |
---|
816 | | - if (ret) |
---|
817 | | - goto err_unwind; |
---|
818 | | - |
---|
819 | | - /* Keep a second pin for the dual retirement along engine and ring */ |
---|
820 | | - __intel_context_pin(ce); |
---|
821 | 892 | |
---|
822 | 893 | rq->infix = rq->ring->emit; /* end of header; start of user payload */ |
---|
823 | 894 | |
---|
824 | | - /* Check that we didn't interrupt ourselves with a new request */ |
---|
825 | | - GEM_BUG_ON(rq->timeline->seqno != rq->fence.seqno); |
---|
| 895 | + intel_context_mark_active(ce); |
---|
| 896 | + list_add_tail_rcu(&rq->link, &tl->requests); |
---|
| 897 | + |
---|
826 | 898 | return rq; |
---|
827 | 899 | |
---|
828 | 900 | err_unwind: |
---|
829 | 901 | ce->ring->emit = rq->head; |
---|
830 | 902 | |
---|
831 | 903 | /* Make sure we didn't add ourselves to external state before freeing */ |
---|
832 | | - GEM_BUG_ON(!list_empty(&rq->active_list)); |
---|
833 | 904 | GEM_BUG_ON(!list_empty(&rq->sched.signalers_list)); |
---|
834 | 905 | GEM_BUG_ON(!list_empty(&rq->sched.waiters_list)); |
---|
835 | 906 | |
---|
836 | | - kmem_cache_free(i915->requests, rq); |
---|
| 907 | +err_free: |
---|
| 908 | + kmem_cache_free(global.slab_requests, rq); |
---|
837 | 909 | err_unreserve: |
---|
838 | | - unreserve_gt(i915); |
---|
839 | | -err_unpin: |
---|
840 | 910 | intel_context_unpin(ce); |
---|
841 | 911 | return ERR_PTR(ret); |
---|
| 912 | +} |
---|
| 913 | + |
---|
| 914 | +struct i915_request * |
---|
| 915 | +i915_request_create(struct intel_context *ce) |
---|
| 916 | +{ |
---|
| 917 | + struct i915_request *rq; |
---|
| 918 | + struct intel_timeline *tl; |
---|
| 919 | + |
---|
| 920 | + tl = intel_context_timeline_lock(ce); |
---|
| 921 | + if (IS_ERR(tl)) |
---|
| 922 | + return ERR_CAST(tl); |
---|
| 923 | + |
---|
| 924 | + /* Move our oldest request to the slab-cache (if not in use!) */ |
---|
| 925 | + rq = list_first_entry(&tl->requests, typeof(*rq), link); |
---|
| 926 | + if (!list_is_last(&rq->link, &tl->requests)) |
---|
| 927 | + i915_request_retire(rq); |
---|
| 928 | + |
---|
| 929 | + intel_context_enter(ce); |
---|
| 930 | + rq = __i915_request_create(ce, GFP_KERNEL); |
---|
| 931 | + intel_context_exit(ce); /* active reference transferred to request */ |
---|
| 932 | + if (IS_ERR(rq)) |
---|
| 933 | + goto err_unlock; |
---|
| 934 | + |
---|
| 935 | + /* Check that we do not interrupt ourselves with a new request */ |
---|
| 936 | + rq->cookie = lockdep_pin_lock(&tl->mutex); |
---|
| 937 | + |
---|
| 938 | + return rq; |
---|
| 939 | + |
---|
| 940 | +err_unlock: |
---|
| 941 | + intel_context_timeline_unlock(tl); |
---|
| 942 | + return rq; |
---|
| 943 | +} |
---|
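
As context for the two create paths above: i915_request_create() is the driver-internal entry point that locks the context's timeline, retires the oldest completed request on it, and hands back a request with ring space already reserved for the final breadcrumb. Below is a minimal sketch of how a caller inside the driver might use it; the helper name, the MI_NOOP payload and the bounded wait are illustrative only, not part of this patch.

```c
/*
 * Illustrative sketch only (not from this patch): emit a trivial request
 * on an already-pinned intel_context and wait for it to complete.
 */
static int emit_noop_request(struct intel_context *ce)
{
	struct i915_request *rq;
	long timeout;
	u32 *cs;

	rq = i915_request_create(ce);		/* takes ce->timeline->mutex */
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4);		/* breadcrumb space is already reserved */
	if (IS_ERR(cs)) {
		i915_request_add(rq);		/* must still close the request and unlock */
		return PTR_ERR(cs);
	}

	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	i915_request_get(rq);			/* keep a reference across the wait */
	i915_request_add(rq);			/* commit, drop the timeline lock, queue */

	timeout = i915_request_wait(rq, 0, HZ);
	i915_request_put(rq);

	return timeout < 0 ? timeout : 0;
}
```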
| 944 | + |
---|
| 945 | +static int |
---|
| 946 | +i915_request_await_start(struct i915_request *rq, struct i915_request *signal) |
---|
| 947 | +{ |
---|
| 948 | + struct dma_fence *fence; |
---|
| 949 | + int err; |
---|
| 950 | + |
---|
| 951 | + if (i915_request_timeline(rq) == rcu_access_pointer(signal->timeline)) |
---|
| 952 | + return 0; |
---|
| 953 | + |
---|
| 954 | + if (i915_request_started(signal)) |
---|
| 955 | + return 0; |
---|
| 956 | + |
---|
| 957 | + fence = NULL; |
---|
| 958 | + rcu_read_lock(); |
---|
| 959 | + spin_lock_irq(&signal->lock); |
---|
| 960 | + do { |
---|
| 961 | + struct list_head *pos = READ_ONCE(signal->link.prev); |
---|
| 962 | + struct i915_request *prev; |
---|
| 963 | + |
---|
| 964 | + /* Confirm signal has not been retired, the link is valid */ |
---|
| 965 | + if (unlikely(i915_request_started(signal))) |
---|
| 966 | + break; |
---|
| 967 | + |
---|
| 968 | + /* Is signal the earliest request on its timeline? */ |
---|
| 969 | + if (pos == &rcu_dereference(signal->timeline)->requests) |
---|
| 970 | + break; |
---|
| 971 | + |
---|
| 972 | + /* |
---|
| 973 | + * Peek at the request before us in the timeline. That |
---|
| 974 | + * request will only be valid before it is retired, so |
---|
| 975 | + * after acquiring a reference to it, confirm that it is |
---|
| 976 | + * still part of the signaler's timeline. |
---|
| 977 | + */ |
---|
| 978 | + prev = list_entry(pos, typeof(*prev), link); |
---|
| 979 | + if (!i915_request_get_rcu(prev)) |
---|
| 980 | + break; |
---|
| 981 | + |
---|
| 982 | + /* After the strong barrier, confirm prev is still attached */ |
---|
| 983 | + if (unlikely(READ_ONCE(prev->link.next) != &signal->link)) { |
---|
| 984 | + i915_request_put(prev); |
---|
| 985 | + break; |
---|
| 986 | + } |
---|
| 987 | + |
---|
| 988 | + fence = &prev->fence; |
---|
| 989 | + } while (0); |
---|
| 990 | + spin_unlock_irq(&signal->lock); |
---|
| 991 | + rcu_read_unlock(); |
---|
| 992 | + if (!fence) |
---|
| 993 | + return 0; |
---|
| 994 | + |
---|
| 995 | + err = 0; |
---|
| 996 | + if (!intel_timeline_sync_is_later(i915_request_timeline(rq), fence)) |
---|
| 997 | + err = i915_sw_fence_await_dma_fence(&rq->submit, |
---|
| 998 | + fence, 0, |
---|
| 999 | + I915_FENCE_GFP); |
---|
| 1000 | + dma_fence_put(fence); |
---|
| 1001 | + |
---|
| 1002 | + return err; |
---|
| 1003 | +} |
---|
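
The peek at the signaler's predecessor above is an instance of the acquire-and-revalidate pattern demanded by the SLAB_TYPESAFE_BY_RCU cache described in __i915_request_create(): the memory stays valid for the duration of the RCU read section, but the object living in it may have been freed and reused, so after taking a reference we must re-check that it is still the neighbour we intended. A generic, hypothetical sketch of the same pattern follows (struct node, node_cache and peek_prev are stand-ins, not driver types):

```c
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct node {
	refcount_t ref;
	struct list_head link;		/* protected by the owner's lock */
};

static struct kmem_cache *node_cache;	/* created with SLAB_TYPESAFE_BY_RCU */

static void node_put(struct node *n)
{
	if (refcount_dec_and_test(&n->ref))
		kmem_cache_free(node_cache, n);	/* memory may be reused immediately */
}

static struct node *peek_prev(struct node *n, struct list_head *head)
{
	struct node *prev = NULL;

	rcu_read_lock();
	do {
		struct list_head *pos = READ_ONCE(n->link.prev);

		if (pos == head)			/* n is the oldest entry */
			break;

		prev = list_entry(pos, struct node, link);
		if (!refcount_inc_not_zero(&prev->ref)) {	/* already being freed */
			prev = NULL;
			break;
		}

		/* Revalidate after taking the reference: still immediately before n? */
		if (READ_ONCE(prev->link.next) != &n->link) {
			node_put(prev);			/* raced with removal/reuse */
			prev = NULL;
		}
	} while (0);
	rcu_read_unlock();

	return prev;
}
```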
| 1004 | + |
---|
| 1005 | +static intel_engine_mask_t |
---|
| 1006 | +already_busywaiting(struct i915_request *rq) |
---|
| 1007 | +{ |
---|
| 1008 | + /* |
---|
| 1009 | + * Polling a semaphore causes bus traffic, delaying other users of |
---|
| 1010 | + * both the GPU and CPU. We want to limit the impact on others, |
---|
| 1011 | + * while taking advantage of early submission to reduce GPU |
---|
| 1012 | + * latency. Therefore we restrict ourselves to not using more |
---|
| 1013 | + * than one semaphore from each source, and not using a semaphore |
---|
| 1014 | + * if we have detected the engine is saturated (i.e. would not be |
---|
| 1015 | + * submitted early and cause bus traffic reading an already passed |
---|
| 1016 | + * semaphore). |
---|
| 1017 | + * |
---|
| 1018 | + * See the are-we-too-late? check in __i915_request_submit(). |
---|
| 1019 | + */ |
---|
| 1020 | + return rq->sched.semaphores | READ_ONCE(rq->engine->saturated); |
---|
| 1021 | +} |
---|
| 1022 | + |
---|
| 1023 | +static int |
---|
| 1024 | +__emit_semaphore_wait(struct i915_request *to, |
---|
| 1025 | + struct i915_request *from, |
---|
| 1026 | + u32 seqno) |
---|
| 1027 | +{ |
---|
| 1028 | + const int has_token = INTEL_GEN(to->engine->i915) >= 12; |
---|
| 1029 | + u32 hwsp_offset; |
---|
| 1030 | + int len, err; |
---|
| 1031 | + u32 *cs; |
---|
| 1032 | + |
---|
| 1033 | + GEM_BUG_ON(INTEL_GEN(to->engine->i915) < 8); |
---|
| 1034 | + GEM_BUG_ON(i915_request_has_initial_breadcrumb(to)); |
---|
| 1035 | + |
---|
| 1036 | + /* We need to pin the signaler's HWSP until we are finished reading. */ |
---|
| 1037 | + err = intel_timeline_read_hwsp(from, to, &hwsp_offset); |
---|
| 1038 | + if (err) |
---|
| 1039 | + return err; |
---|
| 1040 | + |
---|
| 1041 | + len = 4; |
---|
| 1042 | + if (has_token) |
---|
| 1043 | + len += 2; |
---|
| 1044 | + |
---|
| 1045 | + cs = intel_ring_begin(to, len); |
---|
| 1046 | + if (IS_ERR(cs)) |
---|
| 1047 | + return PTR_ERR(cs); |
---|
| 1048 | + |
---|
| 1049 | + /* |
---|
| 1050 | + * Using greater-than-or-equal here means we have to worry |
---|
| 1051 | + * about seqno wraparound. To sidestep that issue, we swap |
---|
| 1052 | + * the timeline HWSP upon wrapping, so that anyone listening |
---|
| 1053 | + * for the old (pre-wrap) values does not see much smaller |
---|
| 1054 | + * (post-wrap) values than they were expecting (and so wait |
---|
| 1055 | + * forever). |
---|
| 1056 | + */ |
---|
| 1057 | + *cs++ = (MI_SEMAPHORE_WAIT | |
---|
| 1058 | + MI_SEMAPHORE_GLOBAL_GTT | |
---|
| 1059 | + MI_SEMAPHORE_POLL | |
---|
| 1060 | + MI_SEMAPHORE_SAD_GTE_SDD) + |
---|
| 1061 | + has_token; |
---|
| 1062 | + *cs++ = seqno; |
---|
| 1063 | + *cs++ = hwsp_offset; |
---|
| 1064 | + *cs++ = 0; |
---|
| 1065 | + if (has_token) { |
---|
| 1066 | + *cs++ = 0; |
---|
| 1067 | + *cs++ = MI_NOOP; |
---|
| 1068 | + } |
---|
| 1069 | + |
---|
| 1070 | + intel_ring_advance(to, cs); |
---|
| 1071 | + return 0; |
---|
| 1072 | +} |
---|
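
The MI_SEMAPHORE_SAD_GTE_SDD poll above is an unsigned greater-or-equal comparison in hardware, which is why the comment worries about seqno wraparound and the driver swaps HWSP cachelines when the timeline wraps. On the CPU side the equivalent problem is avoided with the signed-difference comparison used by i915_seqno_passed(). A small stand-alone userspace illustration of why the two comparisons differ at the wrap point (the values are arbitrary):

```c
#include <stdint.h>
#include <stdio.h>

/* Mirrors i915_seqno_passed(): true if a is at or after b, modulo 2^32. */
static int seqno_passed(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	uint32_t before_wrap = 0xfffffffeu;	/* just before the seqno wraps */
	uint32_t after_wrap  = 0x00000003u;	/* a later seqno, after the wrap */

	/* A raw unsigned >= (what a GTE semaphore poll does) misorders these. */
	printf("unsigned >=  : %d\n", after_wrap >= before_wrap);		/* 0 */

	/* The signed-difference trick still orders them correctly. */
	printf("seqno_passed : %d\n", seqno_passed(after_wrap, before_wrap));	/* 1 */
	printf("seqno_passed : %d\n", seqno_passed(before_wrap, after_wrap));	/* 0 */
	return 0;
}
```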
| 1073 | + |
---|
| 1074 | +static int |
---|
| 1075 | +emit_semaphore_wait(struct i915_request *to, |
---|
| 1076 | + struct i915_request *from, |
---|
| 1077 | + gfp_t gfp) |
---|
| 1078 | +{ |
---|
| 1079 | + const intel_engine_mask_t mask = READ_ONCE(from->engine)->mask; |
---|
| 1080 | + struct i915_sw_fence *wait = &to->submit; |
---|
| 1081 | + |
---|
| 1082 | + if (!intel_context_use_semaphores(to->context)) |
---|
| 1083 | + goto await_fence; |
---|
| 1084 | + |
---|
| 1085 | + if (i915_request_has_initial_breadcrumb(to)) |
---|
| 1086 | + goto await_fence; |
---|
| 1087 | + |
---|
| 1088 | + if (!rcu_access_pointer(from->hwsp_cacheline)) |
---|
| 1089 | + goto await_fence; |
---|
| 1090 | + |
---|
| 1091 | + /* |
---|
| 1092 | + * If this or its dependents are waiting on an external fence |
---|
| 1093 | + * that may fail catastrophically, then we want to avoid using |
---|
| 1094 | + * semaphores as they bypass the fence signaling metadata, and we |
---|
| 1095 | + * lose the fence->error propagation. |
---|
| 1096 | + */ |
---|
| 1097 | + if (from->sched.flags & I915_SCHED_HAS_EXTERNAL_CHAIN) |
---|
| 1098 | + goto await_fence; |
---|
| 1099 | + |
---|
| 1100 | + /* Just emit the first semaphore we see as request space is limited. */ |
---|
| 1101 | + if (already_busywaiting(to) & mask) |
---|
| 1102 | + goto await_fence; |
---|
| 1103 | + |
---|
| 1104 | + if (i915_request_await_start(to, from) < 0) |
---|
| 1105 | + goto await_fence; |
---|
| 1106 | + |
---|
| 1107 | + /* Only submit our spinner after the signaler is running! */ |
---|
| 1108 | + if (__await_execution(to, from, NULL, gfp)) |
---|
| 1109 | + goto await_fence; |
---|
| 1110 | + |
---|
| 1111 | + if (__emit_semaphore_wait(to, from, from->fence.seqno)) |
---|
| 1112 | + goto await_fence; |
---|
| 1113 | + |
---|
| 1114 | + to->sched.semaphores |= mask; |
---|
| 1115 | + wait = &to->semaphore; |
---|
| 1116 | + |
---|
| 1117 | +await_fence: |
---|
| 1118 | + return i915_sw_fence_await_dma_fence(wait, |
---|
| 1119 | + &from->fence, 0, |
---|
| 1120 | + I915_FENCE_GFP); |
---|
| 1121 | +} |
---|
| 1122 | + |
---|
| 1123 | +static bool intel_timeline_sync_has_start(struct intel_timeline *tl, |
---|
| 1124 | + struct dma_fence *fence) |
---|
| 1125 | +{ |
---|
| 1126 | + return __intel_timeline_sync_is_later(tl, |
---|
| 1127 | + fence->context, |
---|
| 1128 | + fence->seqno - 1); |
---|
| 1129 | +} |
---|
| 1130 | + |
---|
| 1131 | +static int intel_timeline_sync_set_start(struct intel_timeline *tl, |
---|
| 1132 | + const struct dma_fence *fence) |
---|
| 1133 | +{ |
---|
| 1134 | + return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1); |
---|
| 1135 | +} |
---|
| 1136 | + |
---|
| 1137 | +static int |
---|
| 1138 | +__i915_request_await_execution(struct i915_request *to, |
---|
| 1139 | + struct i915_request *from, |
---|
| 1140 | + void (*hook)(struct i915_request *rq, |
---|
| 1141 | + struct dma_fence *signal)) |
---|
| 1142 | +{ |
---|
| 1143 | + int err; |
---|
| 1144 | + |
---|
| 1145 | + GEM_BUG_ON(intel_context_is_barrier(from->context)); |
---|
| 1146 | + |
---|
| 1147 | + /* Submit both requests at the same time */ |
---|
| 1148 | + err = __await_execution(to, from, hook, I915_FENCE_GFP); |
---|
| 1149 | + if (err) |
---|
| 1150 | + return err; |
---|
| 1151 | + |
---|
| 1152 | + /* Squash repeated dependencies to the same timelines */ |
---|
| 1153 | + if (intel_timeline_sync_has_start(i915_request_timeline(to), |
---|
| 1154 | + &from->fence)) |
---|
| 1155 | + return 0; |
---|
| 1156 | + |
---|
| 1157 | + /* |
---|
| 1158 | + * Wait until the start of this request. |
---|
| 1159 | + * |
---|
| 1160 | + * The execution cb fires when we submit the request to HW. But in |
---|
| 1161 | + * many cases this may be long before the request itself is ready to |
---|
| 1162 | + * run (consider that we submit 2 requests for the same context, where |
---|
| 1163 | + * the request of interest is behind an indefinite spinner). So we hook |
---|
| 1164 | + * up to both to reduce our queues and keep the execution lag minimised |
---|
| 1165 | + * in the worst case, though we hope that the await_start is elided. |
---|
| 1166 | + */ |
---|
| 1167 | + err = i915_request_await_start(to, from); |
---|
| 1168 | + if (err < 0) |
---|
| 1169 | + return err; |
---|
| 1170 | + |
---|
| 1171 | + /* |
---|
| 1172 | + * Ensure both start together [after all semaphores in signal] |
---|
| 1173 | + * |
---|
| 1174 | + * Now that we are queued to the HW at roughly the same time (thanks |
---|
| 1175 | + * to the execute cb) and are ready to run at roughly the same time |
---|
| 1176 | + * (thanks to the await start), our signaler may still be indefinitely |
---|
| 1177 | + * delayed by waiting on a semaphore from a remote engine. If our |
---|
| 1178 | + * signaler depends on a semaphore, so indirectly do we, and we do not |
---|
| 1179 | + * want to start our payload until our signaler also starts theirs. |
---|
| 1180 | + * So we wait. |
---|
| 1181 | + * |
---|
| 1182 | + * However, there is also a second condition for which we need to wait |
---|
| 1183 | + * for the precise start of the signaler. Consider that the signaler |
---|
| 1184 | + * was submitted in a chain of requests following another context |
---|
| 1185 | + * (with just an ordinary intra-engine fence dependency between the |
---|
| 1186 | + * two). In this case the signaler is queued to HW, but not for |
---|
| 1187 | + * immediate execution, and so we must wait until it reaches the |
---|
| 1188 | + * active slot. |
---|
| 1189 | + */ |
---|
| 1190 | + if (intel_engine_has_semaphores(to->engine) && |
---|
| 1191 | + !i915_request_has_initial_breadcrumb(to)) { |
---|
| 1192 | + err = __emit_semaphore_wait(to, from, from->fence.seqno - 1); |
---|
| 1193 | + if (err < 0) |
---|
| 1194 | + return err; |
---|
| 1195 | + } |
---|
| 1196 | + |
---|
| 1197 | + /* Couple the dependency tree for PI on this exposed to->fence */ |
---|
| 1198 | + if (to->engine->schedule) { |
---|
| 1199 | + err = i915_sched_node_add_dependency(&to->sched, |
---|
| 1200 | + &from->sched, |
---|
| 1201 | + I915_DEPENDENCY_WEAK); |
---|
| 1202 | + if (err < 0) |
---|
| 1203 | + return err; |
---|
| 1204 | + } |
---|
| 1205 | + |
---|
| 1206 | + return intel_timeline_sync_set_start(i915_request_timeline(to), |
---|
| 1207 | + &from->fence); |
---|
| 1208 | +} |
---|
| 1209 | + |
---|
| 1210 | +static void mark_external(struct i915_request *rq) |
---|
| 1211 | +{ |
---|
| 1212 | + /* |
---|
| 1213 | + * The downside of using semaphores is that we lose metadata passing |
---|
| 1214 | + * along the signaling chain. This is particularly nasty when we |
---|
| 1215 | + * need to pass along a fatal error such as EFAULT or EDEADLK. For |
---|
| 1216 | + * fatal errors we want to scrub the request before it is executed, |
---|
| 1217 | + * which means that we cannot preload the request onto HW and have |
---|
| 1218 | + * it wait upon a semaphore. |
---|
| 1219 | + */ |
---|
| 1220 | + rq->sched.flags |= I915_SCHED_HAS_EXTERNAL_CHAIN; |
---|
| 1221 | +} |
---|
| 1222 | + |
---|
| 1223 | +static int |
---|
| 1224 | +__i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) |
---|
| 1225 | +{ |
---|
| 1226 | + mark_external(rq); |
---|
| 1227 | + return i915_sw_fence_await_dma_fence(&rq->submit, fence, |
---|
| 1228 | + i915_fence_context_timeout(rq->engine->i915, |
---|
| 1229 | + fence->context), |
---|
| 1230 | + I915_FENCE_GFP); |
---|
| 1231 | +} |
---|
| 1232 | + |
---|
| 1233 | +static int |
---|
| 1234 | +i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) |
---|
| 1235 | +{ |
---|
| 1236 | + struct dma_fence *iter; |
---|
| 1237 | + int err = 0; |
---|
| 1238 | + |
---|
| 1239 | + if (!to_dma_fence_chain(fence)) |
---|
| 1240 | + return __i915_request_await_external(rq, fence); |
---|
| 1241 | + |
---|
| 1242 | + dma_fence_chain_for_each(iter, fence) { |
---|
| 1243 | + struct dma_fence_chain *chain = to_dma_fence_chain(iter); |
---|
| 1244 | + |
---|
| 1245 | + if (!dma_fence_is_i915(chain->fence)) { |
---|
| 1246 | + err = __i915_request_await_external(rq, iter); |
---|
| 1247 | + break; |
---|
| 1248 | + } |
---|
| 1249 | + |
---|
| 1250 | + err = i915_request_await_dma_fence(rq, chain->fence); |
---|
| 1251 | + if (err < 0) |
---|
| 1252 | + break; |
---|
| 1253 | + } |
---|
| 1254 | + |
---|
| 1255 | + dma_fence_put(iter); |
---|
| 1256 | + return err; |
---|
| 1257 | +} |
---|
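
A note on the chain walk above: dma_fence_chain_for_each() holds a reference on the fence it hands out for each iteration, and that reference is still held if the loop is exited early, which is what the trailing dma_fence_put(iter) pays off (on normal completion iter is NULL and the put is a no-op). A minimal illustrative sketch of the same idiom, with a hypothetical helper that hands the first unsignalled link back to its caller:

```c
#include <linux/dma-fence-chain.h>

/*
 * Illustrative only: return the first fence in a dma_fence_chain that has
 * not yet signalled. On early exit the reference taken by the walk is
 * transferred to the caller, who must dma_fence_put() it when done.
 */
static struct dma_fence *first_unsignaled(struct dma_fence *head)
{
	struct dma_fence *iter;

	dma_fence_chain_for_each(iter, head) {
		if (!dma_fence_is_signaled(iter))
			return iter;	/* caller now owns the walk's reference */
	}

	return NULL;			/* walk completed; nothing left to put */
}
```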
| 1258 | + |
---|
| 1259 | +int |
---|
| 1260 | +i915_request_await_execution(struct i915_request *rq, |
---|
| 1261 | + struct dma_fence *fence, |
---|
| 1262 | + void (*hook)(struct i915_request *rq, |
---|
| 1263 | + struct dma_fence *signal)) |
---|
| 1264 | +{ |
---|
| 1265 | + struct dma_fence **child = &fence; |
---|
| 1266 | + unsigned int nchild = 1; |
---|
| 1267 | + int ret; |
---|
| 1268 | + |
---|
| 1269 | + if (dma_fence_is_array(fence)) { |
---|
| 1270 | + struct dma_fence_array *array = to_dma_fence_array(fence); |
---|
| 1271 | + |
---|
| 1272 | + /* XXX Error for signal-on-any fence arrays */ |
---|
| 1273 | + |
---|
| 1274 | + child = array->fences; |
---|
| 1275 | + nchild = array->num_fences; |
---|
| 1276 | + GEM_BUG_ON(!nchild); |
---|
| 1277 | + } |
---|
| 1278 | + |
---|
| 1279 | + do { |
---|
| 1280 | + fence = *child++; |
---|
| 1281 | + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) |
---|
| 1282 | + continue; |
---|
| 1283 | + |
---|
| 1284 | + if (fence->context == rq->fence.context) |
---|
| 1285 | + continue; |
---|
| 1286 | + |
---|
| 1287 | + /* |
---|
| 1288 | + * We don't squash repeated fence dependencies here as we |
---|
| 1289 | + * want to run our callback in all cases. |
---|
| 1290 | + */ |
---|
| 1291 | + |
---|
| 1292 | + if (dma_fence_is_i915(fence)) |
---|
| 1293 | + ret = __i915_request_await_execution(rq, |
---|
| 1294 | + to_request(fence), |
---|
| 1295 | + hook); |
---|
| 1296 | + else |
---|
| 1297 | + ret = i915_request_await_external(rq, fence); |
---|
| 1298 | + if (ret < 0) |
---|
| 1299 | + return ret; |
---|
| 1300 | + } while (--nchild); |
---|
| 1301 | + |
---|
| 1302 | + return 0; |
---|
| 1303 | +} |
---|
| 1304 | + |
---|
| 1305 | +static int |
---|
| 1306 | +await_request_submit(struct i915_request *to, struct i915_request *from) |
---|
| 1307 | +{ |
---|
| 1308 | + /* |
---|
| 1309 | + * If we are waiting on a virtual engine, then it may be |
---|
| 1310 | + * constrained to execute on a single engine *prior* to submission. |
---|
| 1311 | + * When it is submitted, it will be first submitted to the virtual |
---|
| 1312 | + * engine and then passed to the physical engine. We cannot allow |
---|
| 1313 | + * the waiter to be submitted immediately to the physical engine |
---|
| 1314 | + * as it may then bypass the virtual request. |
---|
| 1315 | + */ |
---|
| 1316 | + if (to->engine == READ_ONCE(from->engine)) |
---|
| 1317 | + return i915_sw_fence_await_sw_fence_gfp(&to->submit, |
---|
| 1318 | + &from->submit, |
---|
| 1319 | + I915_FENCE_GFP); |
---|
| 1320 | + else |
---|
| 1321 | + return __i915_request_await_execution(to, from, NULL); |
---|
842 | 1322 | } |
---|
843 | 1323 | |
---|
844 | 1324 | static int |
---|
.. | .. |
---|
849 | 1329 | GEM_BUG_ON(to == from); |
---|
850 | 1330 | GEM_BUG_ON(to->timeline == from->timeline); |
---|
851 | 1331 | |
---|
852 | | - if (i915_request_completed(from)) |
---|
| 1332 | + if (i915_request_completed(from)) { |
---|
| 1333 | + i915_sw_fence_set_error_once(&to->submit, from->fence.error); |
---|
853 | 1334 | return 0; |
---|
| 1335 | + } |
---|
854 | 1336 | |
---|
855 | 1337 | if (to->engine->schedule) { |
---|
856 | | - ret = i915_sched_node_add_dependency(to->i915, |
---|
857 | | - &to->sched, |
---|
858 | | - &from->sched); |
---|
| 1338 | + ret = i915_sched_node_add_dependency(&to->sched, |
---|
| 1339 | + &from->sched, |
---|
| 1340 | + I915_DEPENDENCY_EXTERNAL); |
---|
859 | 1341 | if (ret < 0) |
---|
860 | 1342 | return ret; |
---|
861 | 1343 | } |
---|
862 | 1344 | |
---|
863 | | - if (to->engine == from->engine) { |
---|
864 | | - ret = i915_sw_fence_await_sw_fence_gfp(&to->submit, |
---|
865 | | - &from->submit, |
---|
866 | | - I915_FENCE_GFP); |
---|
867 | | - return ret < 0 ? ret : 0; |
---|
868 | | - } |
---|
| 1345 | + if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask))) |
---|
| 1346 | + ret = await_request_submit(to, from); |
---|
| 1347 | + else |
---|
| 1348 | + ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); |
---|
| 1349 | + if (ret < 0) |
---|
| 1350 | + return ret; |
---|
869 | 1351 | |
---|
870 | | - if (to->engine->semaphore.sync_to) { |
---|
871 | | - u32 seqno; |
---|
872 | | - |
---|
873 | | - GEM_BUG_ON(!from->engine->semaphore.signal); |
---|
874 | | - |
---|
875 | | - seqno = i915_request_global_seqno(from); |
---|
876 | | - if (!seqno) |
---|
877 | | - goto await_dma_fence; |
---|
878 | | - |
---|
879 | | - if (seqno <= to->timeline->global_sync[from->engine->id]) |
---|
880 | | - return 0; |
---|
881 | | - |
---|
882 | | - trace_i915_gem_ring_sync_to(to, from); |
---|
883 | | - ret = to->engine->semaphore.sync_to(to, from); |
---|
884 | | - if (ret) |
---|
885 | | - return ret; |
---|
886 | | - |
---|
887 | | - to->timeline->global_sync[from->engine->id] = seqno; |
---|
888 | | - return 0; |
---|
889 | | - } |
---|
890 | | - |
---|
891 | | -await_dma_fence: |
---|
892 | | - ret = i915_sw_fence_await_dma_fence(&to->submit, |
---|
893 | | - &from->fence, 0, |
---|
894 | | - I915_FENCE_GFP); |
---|
895 | | - return ret < 0 ? ret : 0; |
---|
| 1352 | + return 0; |
---|
896 | 1353 | } |
---|
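
The is_power_of_2() test above is a compact way of asking whether both requests are pinned to one and the same physical engine: each bit in execution_mask denotes an engine the request may run on, so the union of the two masks having exactly one bit set means neither side can migrate (for example onto another member of a virtual engine), and the cheaper submit-fence coupling is safe. A small userspace illustration of the mask test (the engine bit assignments are made up):

```c
#include <stdint.h>
#include <stdio.h>

/* True if exactly one bit is set, i.e. only one physical engine is possible. */
static int single_engine(uint32_t mask)
{
	return mask && !(mask & (mask - 1));
}

int main(void)
{
	uint32_t rcs0 = 1u << 0;			/* illustrative engine bits */
	uint32_t vcs0 = 1u << 2;
	uint32_t virtual_vcs = (1u << 2) | (1u << 3);	/* virtual engine over two engines */

	printf("%d\n", single_engine(rcs0 | rcs0));		/* 1: same engine, submit fence OK */
	printf("%d\n", single_engine(rcs0 | vcs0));		/* 0: different engines */
	printf("%d\n", single_engine(vcs0 | virtual_vcs));	/* 0: the signaler may move */
	return 0;
}
```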
897 | 1354 | |
---|
898 | 1355 | int |
---|
.. | .. |
---|
932 | 1389 | continue; |
---|
933 | 1390 | |
---|
934 | 1391 | /* Squash repeated waits to the same timelines */ |
---|
935 | | - if (fence->context != rq->i915->mm.unordered_timeline && |
---|
936 | | - i915_timeline_sync_is_later(rq->timeline, fence)) |
---|
| 1392 | + if (fence->context && |
---|
| 1393 | + intel_timeline_sync_is_later(i915_request_timeline(rq), |
---|
| 1394 | + fence)) |
---|
937 | 1395 | continue; |
---|
938 | 1396 | |
---|
939 | 1397 | if (dma_fence_is_i915(fence)) |
---|
940 | 1398 | ret = i915_request_await_request(rq, to_request(fence)); |
---|
941 | 1399 | else |
---|
942 | | - ret = i915_sw_fence_await_dma_fence(&rq->submit, fence, |
---|
943 | | - I915_FENCE_TIMEOUT, |
---|
944 | | - I915_FENCE_GFP); |
---|
| 1400 | + ret = i915_request_await_external(rq, fence); |
---|
945 | 1401 | if (ret < 0) |
---|
946 | 1402 | return ret; |
---|
947 | 1403 | |
---|
948 | 1404 | /* Record the latest fence used against each timeline */ |
---|
949 | | - if (fence->context != rq->i915->mm.unordered_timeline) |
---|
950 | | - i915_timeline_sync_set(rq->timeline, fence); |
---|
| 1405 | + if (fence->context) |
---|
| 1406 | + intel_timeline_sync_set(i915_request_timeline(rq), |
---|
| 1407 | + fence); |
---|
951 | 1408 | } while (--nchild); |
---|
952 | 1409 | |
---|
953 | 1410 | return 0; |
---|
.. | .. |
---|
985 | 1442 | struct dma_fence **shared; |
---|
986 | 1443 | unsigned int count, i; |
---|
987 | 1444 | |
---|
988 | | - ret = reservation_object_get_fences_rcu(obj->resv, |
---|
| 1445 | + ret = dma_resv_get_fences_rcu(obj->base.resv, |
---|
989 | 1446 | &excl, &count, &shared); |
---|
990 | 1447 | if (ret) |
---|
991 | 1448 | return ret; |
---|
.. | .. |
---|
1002 | 1459 | dma_fence_put(shared[i]); |
---|
1003 | 1460 | kfree(shared); |
---|
1004 | 1461 | } else { |
---|
1005 | | - excl = reservation_object_get_excl_rcu(obj->resv); |
---|
| 1462 | + excl = dma_resv_get_excl_rcu(obj->base.resv); |
---|
1006 | 1463 | } |
---|
1007 | 1464 | |
---|
1008 | 1465 | if (excl) { |
---|
.. | .. |
---|
1015 | 1472 | return ret; |
---|
1016 | 1473 | } |
---|
1017 | 1474 | |
---|
1018 | | -void i915_request_skip(struct i915_request *rq, int error) |
---|
| 1475 | +static struct i915_request * |
---|
| 1476 | +__i915_request_add_to_timeline(struct i915_request *rq) |
---|
1019 | 1477 | { |
---|
1020 | | - void *vaddr = rq->ring->vaddr; |
---|
1021 | | - u32 head; |
---|
1022 | | - |
---|
1023 | | - GEM_BUG_ON(!IS_ERR_VALUE((long)error)); |
---|
1024 | | - dma_fence_set_error(&rq->fence, error); |
---|
| 1478 | + struct intel_timeline *timeline = i915_request_timeline(rq); |
---|
| 1479 | + struct i915_request *prev; |
---|
1025 | 1480 | |
---|
1026 | 1481 | /* |
---|
1027 | | - * As this request likely depends on state from the lost |
---|
1028 | | - * context, clear out all the user operations leaving the |
---|
1029 | | - * breadcrumb at the end (so we get the fence notifications). |
---|
| 1482 | + * Dependency tracking and request ordering along the timeline |
---|
| 1483 | + * is special cased so that we can eliminate redundant ordering |
---|
| 1484 | + * operations while building the request (we know that the timeline |
---|
| 1485 | + * itself is ordered, and here we guarantee it). |
---|
| 1486 | + * |
---|
| 1487 | + * As we know we will need to emit tracking along the timeline, |
---|
| 1488 | + * we embed the hooks into our request struct -- at the cost of |
---|
| 1489 | + * having to have specialised no-allocation interfaces (which will |
---|
| 1490 | + * be beneficial elsewhere). |
---|
| 1491 | + * |
---|
| 1492 | + * A second benefit to open-coding i915_request_await_request is |
---|
| 1493 | + * that we can apply a slight variant of the rules specialised |
---|
| 1494 | + * for timelines that jump between engines (such as virtual engines). |
---|
| 1495 | + * If we consider the case of a virtual engine, we must emit a dma-fence |
---|
| 1496 | + * to prevent scheduling of the second request until the first is |
---|
| 1497 | + * complete (to maximise our greedy late load balancing) and this |
---|
| 1498 | + * precludes optimising to use semaphore serialisation of a single |
---|
| 1499 | + * timeline across engines. |
---|
1030 | 1500 | */ |
---|
1031 | | - head = rq->infix; |
---|
1032 | | - if (rq->postfix < head) { |
---|
1033 | | - memset(vaddr + head, 0, rq->ring->size - head); |
---|
1034 | | - head = 0; |
---|
| 1501 | + prev = to_request(__i915_active_fence_set(&timeline->last_request, |
---|
| 1502 | + &rq->fence)); |
---|
| 1503 | + if (prev && !i915_request_completed(prev)) { |
---|
| 1504 | + /* |
---|
| 1505 | + * The requests are supposed to be kept in order. However, |
---|
| 1506 | + * we need to be wary in case the timeline->last_request |
---|
| 1507 | + * is used as a barrier for external modification to this |
---|
| 1508 | + * context. |
---|
| 1509 | + */ |
---|
| 1510 | + GEM_BUG_ON(prev->context == rq->context && |
---|
| 1511 | + i915_seqno_passed(prev->fence.seqno, |
---|
| 1512 | + rq->fence.seqno)); |
---|
| 1513 | + |
---|
| 1514 | + if (is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) |
---|
| 1515 | + i915_sw_fence_await_sw_fence(&rq->submit, |
---|
| 1516 | + &prev->submit, |
---|
| 1517 | + &rq->submitq); |
---|
| 1518 | + else |
---|
| 1519 | + __i915_sw_fence_await_dma_fence(&rq->submit, |
---|
| 1520 | + &prev->fence, |
---|
| 1521 | + &rq->dmaq); |
---|
| 1522 | + if (rq->engine->schedule) |
---|
| 1523 | + __i915_sched_node_add_dependency(&rq->sched, |
---|
| 1524 | + &prev->sched, |
---|
| 1525 | + &rq->dep, |
---|
| 1526 | + 0); |
---|
1035 | 1527 | } |
---|
1036 | | - memset(vaddr + head, 0, rq->postfix - head); |
---|
| 1528 | + if (prev) |
---|
| 1529 | + i915_request_put(prev); |
---|
| 1530 | + |
---|
| 1531 | + /* |
---|
| 1532 | + * Make sure that no request gazumped us - if it was allocated after |
---|
| 1533 | + * our i915_request_alloc() and called __i915_request_add() before |
---|
| 1534 | + * us, the timeline will hold its seqno which is later than ours. |
---|
| 1535 | + */ |
---|
| 1536 | + GEM_BUG_ON(timeline->seqno != rq->fence.seqno); |
---|
| 1537 | + |
---|
| 1538 | + return prev; |
---|
1037 | 1539 | } |
---|
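
Stripped of the fence machinery, the core of the function above is an atomic exchange: __i915_active_fence_set() swaps the new request into the timeline's last-request slot and returns the previous occupant, which the new request then orders itself behind (choosing submit fence vs dma fence with the same single-engine mask test as in i915_request_await_request()). A tiny userspace sketch of that exchange, with hypothetical names:

```c
#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *wait_on;	/* the previous tail this node must run after */
};

/* Swap ourselves in as the newest entry and return the previous tail (if any). */
static struct node *timeline_advance(_Atomic(struct node *) *last, struct node *n)
{
	struct node *prev = atomic_exchange(last, n);

	n->wait_on = prev;	/* caller picks the ordering mechanism */
	return prev;
}
```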
1038 | 1540 | |
---|
1039 | 1541 | /* |
---|
.. | .. |
---|
1041 | 1543 | * request is not being tracked for completion but the work itself is |
---|
1042 | 1544 | * going to happen on the hardware. This would be a Bad Thing(tm). |
---|
1043 | 1545 | */ |
---|
1044 | | -void i915_request_add(struct i915_request *request) |
---|
| 1546 | +struct i915_request *__i915_request_commit(struct i915_request *rq) |
---|
1045 | 1547 | { |
---|
1046 | | - struct intel_engine_cs *engine = request->engine; |
---|
1047 | | - struct i915_timeline *timeline = request->timeline; |
---|
1048 | | - struct intel_ring *ring = request->ring; |
---|
1049 | | - struct i915_request *prev; |
---|
| 1548 | + struct intel_engine_cs *engine = rq->engine; |
---|
| 1549 | + struct intel_ring *ring = rq->ring; |
---|
1050 | 1550 | u32 *cs; |
---|
1051 | 1551 | |
---|
1052 | | - GEM_TRACE("%s fence %llx:%d\n", |
---|
1053 | | - engine->name, request->fence.context, request->fence.seqno); |
---|
1054 | | - |
---|
1055 | | - lockdep_assert_held(&request->i915->drm.struct_mutex); |
---|
1056 | | - trace_i915_request_add(request); |
---|
1057 | | - |
---|
1058 | | - /* |
---|
1059 | | - * Make sure that no request gazumped us - if it was allocated after |
---|
1060 | | - * our i915_request_alloc() and called __i915_request_add() before |
---|
1061 | | - * us, the timeline will hold its seqno which is later than ours. |
---|
1062 | | - */ |
---|
1063 | | - GEM_BUG_ON(timeline->seqno != request->fence.seqno); |
---|
| 1552 | + RQ_TRACE(rq, "\n"); |
---|
1064 | 1553 | |
---|
1065 | 1554 | /* |
---|
1066 | 1555 | * To ensure that this call will not fail, space for its emissions |
---|
1067 | 1556 | * should already have been reserved in the ring buffer. Let the ring |
---|
1068 | 1557 | * know that it is time to use that space up. |
---|
1069 | 1558 | */ |
---|
1070 | | - request->reserved_space = 0; |
---|
1071 | | - engine->emit_flush(request, EMIT_FLUSH); |
---|
| 1559 | + GEM_BUG_ON(rq->reserved_space > ring->space); |
---|
| 1560 | + rq->reserved_space = 0; |
---|
| 1561 | + rq->emitted_jiffies = jiffies; |
---|
1072 | 1562 | |
---|
1073 | 1563 | /* |
---|
1074 | 1564 | * Record the position of the start of the breadcrumb so that |
---|
.. | .. |
---|
1076 | 1566 | * GPU processing the request, we never over-estimate the |
---|
1077 | 1567 | * position of the ring's HEAD. |
---|
1078 | 1568 | */ |
---|
1079 | | - cs = intel_ring_begin(request, engine->emit_breadcrumb_sz); |
---|
| 1569 | + cs = intel_ring_begin(rq, engine->emit_fini_breadcrumb_dw); |
---|
1080 | 1570 | GEM_BUG_ON(IS_ERR(cs)); |
---|
1081 | | - request->postfix = intel_ring_offset(request, cs); |
---|
| 1571 | + rq->postfix = intel_ring_offset(rq, cs); |
---|
1082 | 1572 | |
---|
1083 | | - /* |
---|
1084 | | - * Seal the request and mark it as pending execution. Note that |
---|
1085 | | - * we may inspect this state, without holding any locks, during |
---|
1086 | | - * hangcheck. Hence we apply the barrier to ensure that we do not |
---|
1087 | | - * see a more recent value in the hws than we are tracking. |
---|
1088 | | - */ |
---|
| 1573 | + return __i915_request_add_to_timeline(rq); |
---|
| 1574 | +} |
---|
1089 | 1575 | |
---|
1090 | | - prev = i915_gem_active_raw(&timeline->last_request, |
---|
1091 | | - &request->i915->drm.struct_mutex); |
---|
1092 | | - if (prev && !i915_request_completed(prev)) { |
---|
1093 | | - i915_sw_fence_await_sw_fence(&request->submit, &prev->submit, |
---|
1094 | | - &request->submitq); |
---|
1095 | | - if (engine->schedule) |
---|
1096 | | - __i915_sched_node_add_dependency(&request->sched, |
---|
1097 | | - &prev->sched, |
---|
1098 | | - &request->dep, |
---|
1099 | | - 0); |
---|
1100 | | - } |
---|
1101 | | - |
---|
1102 | | - spin_lock_irq(&timeline->lock); |
---|
1103 | | - list_add_tail(&request->link, &timeline->requests); |
---|
1104 | | - spin_unlock_irq(&timeline->lock); |
---|
1105 | | - |
---|
1106 | | - GEM_BUG_ON(timeline->seqno != request->fence.seqno); |
---|
1107 | | - i915_gem_active_set(&timeline->last_request, request); |
---|
1108 | | - |
---|
1109 | | - list_add_tail(&request->ring_link, &ring->request_list); |
---|
1110 | | - if (list_is_first(&request->ring_link, &ring->request_list)) { |
---|
1111 | | - GEM_TRACE("marking %s as active\n", ring->timeline->name); |
---|
1112 | | - list_add(&ring->active_link, &request->i915->gt.active_rings); |
---|
1113 | | - } |
---|
1114 | | - request->emitted_jiffies = jiffies; |
---|
1115 | | - |
---|
| 1576 | +void __i915_request_queue(struct i915_request *rq, |
---|
| 1577 | + const struct i915_sched_attr *attr) |
---|
| 1578 | +{ |
---|
1116 | 1579 | /* |
---|
1117 | 1580 | * Let the backend know a new request has arrived that may need |
---|
1118 | 1581 | * to adjust the existing execution schedule due to a high priority |
---|
.. | .. |
---|
1124 | 1587 | * decide whether to preempt the entire chain so that it is ready to |
---|
1125 | 1588 | * run at the earliest possible convenience. |
---|
1126 | 1589 | */ |
---|
1127 | | - local_bh_disable(); |
---|
1128 | | - rcu_read_lock(); /* RCU serialisation for set-wedged protection */ |
---|
1129 | | - if (engine->schedule) |
---|
1130 | | - engine->schedule(request, &request->gem_context->sched); |
---|
1131 | | - rcu_read_unlock(); |
---|
1132 | | - i915_sw_fence_commit(&request->submit); |
---|
1133 | | - local_bh_enable(); /* Kick the execlists tasklet if just scheduled */ |
---|
1134 | | - |
---|
1135 | | - /* |
---|
1136 | | - * In typical scenarios, we do not expect the previous request on |
---|
1137 | | - * the timeline to be still tracked by timeline->last_request if it |
---|
1138 | | - * has been completed. If the completed request is still here, that |
---|
1139 | | - * implies that request retirement is a long way behind submission, |
---|
1140 | | - * suggesting that we haven't been retiring frequently enough from |
---|
1141 | | - * the combination of retire-before-alloc, waiters and the background |
---|
1142 | | - * retirement worker. So if the last request on this timeline was |
---|
1143 | | - * already completed, do a catch up pass, flushing the retirement queue |
---|
1144 | | - * up to this client. Since we have now moved the heaviest operations |
---|
1145 | | - * during retirement onto secondary workers, such as freeing objects |
---|
1146 | | - * or contexts, retiring a bunch of requests is mostly list management |
---|
1147 | | - * (and cache misses), and so we should not be overly penalizing this |
---|
1148 | | - * client by performing excess work, though we may still performing |
---|
1149 | | - * work on behalf of others -- but instead we should benefit from |
---|
1150 | | - * improved resource management. (Well, that's the theory at least.) |
---|
1151 | | - */ |
---|
1152 | | - if (prev && i915_request_completed(prev)) |
---|
1153 | | - i915_request_retire_upto(prev); |
---|
| 1590 | + if (attr && rq->engine->schedule) |
---|
| 1591 | + rq->engine->schedule(rq, attr); |
---|
| 1592 | + i915_sw_fence_commit(&rq->semaphore); |
---|
| 1593 | + i915_sw_fence_commit(&rq->submit); |
---|
1154 | 1594 | } |
---|
1155 | 1595 | |
---|
1156 | | -static unsigned long local_clock_us(unsigned int *cpu) |
---|
| 1596 | +void i915_request_add(struct i915_request *rq) |
---|
| 1597 | +{ |
---|
| 1598 | + struct intel_timeline * const tl = i915_request_timeline(rq); |
---|
| 1599 | + struct i915_sched_attr attr = {}; |
---|
| 1600 | + struct i915_gem_context *ctx; |
---|
| 1601 | + |
---|
| 1602 | + lockdep_assert_held(&tl->mutex); |
---|
| 1603 | + lockdep_unpin_lock(&tl->mutex, rq->cookie); |
---|
| 1604 | + |
---|
| 1605 | + trace_i915_request_add(rq); |
---|
| 1606 | + __i915_request_commit(rq); |
---|
| 1607 | + |
---|
| 1608 | + /* XXX placeholder for selftests */ |
---|
| 1609 | + rcu_read_lock(); |
---|
| 1610 | + ctx = rcu_dereference(rq->context->gem_context); |
---|
| 1611 | + if (ctx) |
---|
| 1612 | + attr = ctx->sched; |
---|
| 1613 | + rcu_read_unlock(); |
---|
| 1614 | + |
---|
| 1615 | + __i915_request_queue(rq, &attr); |
---|
| 1616 | + |
---|
| 1617 | + mutex_unlock(&tl->mutex); |
---|
| 1618 | +} |
---|
| 1619 | + |
---|
| 1620 | +static unsigned long local_clock_ns(unsigned int *cpu) |
---|
1157 | 1621 | { |
---|
1158 | 1622 | unsigned long t; |
---|
1159 | 1623 | |
---|
.. | .. |
---|
1170 | 1634 | * stop busywaiting, see busywait_stop(). |
---|
1171 | 1635 | */ |
---|
1172 | 1636 | *cpu = get_cpu(); |
---|
1173 | | - t = local_clock() >> 10; |
---|
| 1637 | + t = local_clock(); |
---|
1174 | 1638 | put_cpu(); |
---|
1175 | 1639 | |
---|
1176 | 1640 | return t; |
---|
.. | .. |
---|
1180 | 1644 | { |
---|
1181 | 1645 | unsigned int this_cpu; |
---|
1182 | 1646 | |
---|
1183 | | - if (time_after(local_clock_us(&this_cpu), timeout)) |
---|
| 1647 | + if (time_after(local_clock_ns(&this_cpu), timeout)) |
---|
1184 | 1648 | return true; |
---|
1185 | 1649 | |
---|
1186 | 1650 | return this_cpu != cpu; |
---|
1187 | 1651 | } |
---|
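
Because local_clock() is only guaranteed to be monotonic on the CPU that sampled it, the deadline computed for the busywait is meaningful only while the task stays on that CPU; being migrated is simply treated as "stop spinning". A userspace model of the same idea, substituting CLOCK_MONOTONIC and sched_getcpu() for local_clock() (the names and the spin loop are illustrative):

```c
#define _GNU_SOURCE
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

/* Sample a clock in ns and remember which CPU we were on. */
static uint64_t now_ns(int *cpu)
{
	struct timespec ts;

	*cpu = sched_getcpu();
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Stop when the deadline passes or we were migrated off the sampling CPU. */
static bool busywait_stop(uint64_t deadline, int cpu)
{
	int this_cpu;

	if (now_ns(&this_cpu) > deadline)
		return true;

	return this_cpu != cpu;
}

/* Spin on a completion flag for at most timeout_ns before giving up. */
static bool spin_until(const volatile int *done, uint64_t timeout_ns)
{
	int cpu;
	uint64_t deadline = now_ns(&cpu) + timeout_ns;

	while (!*done) {	/* a real implementation would use atomics */
		if (busywait_stop(deadline, cpu))
			return false;	/* caller falls back to sleeping */
	}
	return true;
}
```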
1188 | 1652 | |
---|
1189 | | -static bool __i915_spin_request(const struct i915_request *rq, |
---|
1190 | | - u32 seqno, int state, unsigned long timeout_us) |
---|
| 1653 | +static bool __i915_spin_request(struct i915_request * const rq, int state) |
---|
1191 | 1654 | { |
---|
1192 | | - struct intel_engine_cs *engine = rq->engine; |
---|
1193 | | - unsigned int irq, cpu; |
---|
1194 | | - |
---|
1195 | | - GEM_BUG_ON(!seqno); |
---|
| 1655 | + unsigned long timeout_ns; |
---|
| 1656 | + unsigned int cpu; |
---|
1196 | 1657 | |
---|
1197 | 1658 | /* |
---|
1198 | 1659 | * Only wait for the request if we know it is likely to complete. |
---|
.. | .. |
---|
1200 | 1661 | * We don't track the timestamps around requests, nor the average |
---|
1201 | 1662 | * request length, so we do not have a good indicator that this |
---|
1202 | 1663 | * request will complete within the timeout. What we do know is the |
---|
1203 | | - * order in which requests are executed by the engine and so we can |
---|
1204 | | - * tell if the request has started. If the request hasn't started yet, |
---|
1205 | | - * it is a fair assumption that it will not complete within our |
---|
1206 | | - * relatively short timeout. |
---|
| 1664 | + * order in which requests are executed by the context and so we can |
---|
| 1665 | + * tell if the request has been started. If the request is not even |
---|
| 1666 | + * running yet, it is a fair assumption that it will not complete |
---|
| 1667 | + * within our relatively short timeout. |
---|
1207 | 1668 | */ |
---|
1208 | | - if (!i915_seqno_passed(intel_engine_get_seqno(engine), seqno - 1)) |
---|
| 1669 | + if (!i915_request_is_running(rq)) |
---|
1209 | 1670 | return false; |
---|
1210 | 1671 | |
---|
1211 | 1672 | /* |
---|
.. | .. |
---|
1219 | 1680 | * takes to sleep on a request, on the order of a microsecond. |
---|
1220 | 1681 | */ |
---|
1221 | 1682 | |
---|
1222 | | - irq = READ_ONCE(engine->breadcrumbs.irq_count); |
---|
1223 | | - timeout_us += local_clock_us(&cpu); |
---|
| 1683 | + timeout_ns = READ_ONCE(rq->engine->props.max_busywait_duration_ns); |
---|
| 1684 | + timeout_ns += local_clock_ns(&cpu); |
---|
1224 | 1685 | do { |
---|
1225 | | - if (i915_seqno_passed(intel_engine_get_seqno(engine), seqno)) |
---|
1226 | | - return seqno == i915_request_global_seqno(rq); |
---|
1227 | | - |
---|
1228 | | - /* |
---|
1229 | | - * Seqno are meant to be ordered *before* the interrupt. If |
---|
1230 | | - * we see an interrupt without a corresponding seqno advance, |
---|
1231 | | - * assume we won't see one in the near future but require |
---|
1232 | | - * the engine->seqno_barrier() to fixup coherency. |
---|
1233 | | - */ |
---|
1234 | | - if (READ_ONCE(engine->breadcrumbs.irq_count) != irq) |
---|
1235 | | - break; |
---|
| 1686 | + if (dma_fence_is_signaled(&rq->fence)) |
---|
| 1687 | + return true; |
---|
1236 | 1688 | |
---|
1237 | 1689 | if (signal_pending_state(state, current)) |
---|
1238 | 1690 | break; |
---|
1239 | 1691 | |
---|
1240 | | - if (busywait_stop(timeout_us, cpu)) |
---|
| 1692 | + if (busywait_stop(timeout_ns, cpu)) |
---|
1241 | 1693 | break; |
---|
1242 | 1694 | |
---|
1243 | 1695 | cpu_relax(); |
---|
.. | .. |
---|
1246 | 1698 | return false; |
---|
1247 | 1699 | } |
---|
1248 | 1700 | |
---|
1249 | | -static bool __i915_wait_request_check_and_reset(struct i915_request *request) |
---|
| 1701 | +struct request_wait { |
---|
| 1702 | + struct dma_fence_cb cb; |
---|
| 1703 | + struct task_struct *tsk; |
---|
| 1704 | +}; |
---|
| 1705 | + |
---|
| 1706 | +static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb) |
---|
1250 | 1707 | { |
---|
1251 | | - struct i915_gpu_error *error = &request->i915->gpu_error; |
---|
| 1708 | + struct request_wait *wait = container_of(cb, typeof(*wait), cb); |
---|
1252 | 1709 | |
---|
1253 | | - if (likely(!i915_reset_handoff(error))) |
---|
1254 | | - return false; |
---|
1255 | | - |
---|
1256 | | - __set_current_state(TASK_RUNNING); |
---|
1257 | | - i915_reset(request->i915, error->stalled_mask, error->reason); |
---|
1258 | | - return true; |
---|
| 1710 | + wake_up_process(fetch_and_zero(&wait->tsk)); |
---|
1259 | 1711 | } |
---|
1260 | 1712 | |
---|
1261 | 1713 | /** |
---|
.. | .. |
---|
1268 | 1720 | * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an |
---|
1269 | 1721 | * unbounded wait). |
---|
1270 | 1722 | * |
---|
1271 | | - * If the caller holds the struct_mutex, the caller must pass I915_WAIT_LOCKED |
---|
1272 | | - * in via the flags, and vice versa if the struct_mutex is not held, the caller |
---|
1273 | | - * must not specify that the wait is locked. |
---|
1274 | | - * |
---|
1275 | 1723 | * Returns the remaining time (in jiffies) if the request completed, which may |
---|
1276 | 1724 | * be zero or -ETIME if the request is unfinished after the timeout expires. |
---|
1277 | 1725 | * May return -EINTR is called with I915_WAIT_INTERRUPTIBLE and a signal is |
---|
.. | .. |
---|
1283 | 1731 | { |
---|
1284 | 1732 | const int state = flags & I915_WAIT_INTERRUPTIBLE ? |
---|
1285 | 1733 | TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; |
---|
1286 | | - wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue; |
---|
1287 | | - DEFINE_WAIT_FUNC(reset, default_wake_function); |
---|
1288 | | - DEFINE_WAIT_FUNC(exec, default_wake_function); |
---|
1289 | | - struct intel_wait wait; |
---|
| 1734 | + struct request_wait wait; |
---|
1290 | 1735 | |
---|
1291 | 1736 | might_sleep(); |
---|
1292 | | -#if IS_ENABLED(CONFIG_LOCKDEP) |
---|
1293 | | - GEM_BUG_ON(debug_locks && |
---|
1294 | | - !!lockdep_is_held(&rq->i915->drm.struct_mutex) != |
---|
1295 | | - !!(flags & I915_WAIT_LOCKED)); |
---|
1296 | | -#endif |
---|
1297 | 1737 | GEM_BUG_ON(timeout < 0); |
---|
1298 | 1738 | |
---|
1299 | | - if (i915_request_completed(rq)) |
---|
| 1739 | + if (dma_fence_is_signaled(&rq->fence)) |
---|
1300 | 1740 | return timeout; |
---|
1301 | 1741 | |
---|
1302 | 1742 | if (!timeout) |
---|
.. | .. |
---|
1304 | 1744 | |
---|
1305 | 1745 | trace_i915_request_wait_begin(rq, flags); |
---|
1306 | 1746 | |
---|
1307 | | - add_wait_queue(&rq->execute, &exec); |
---|
1308 | | - if (flags & I915_WAIT_LOCKED) |
---|
1309 | | - add_wait_queue(errq, &reset); |
---|
| 1747 | + /* |
---|
| 1748 | + * We must never wait on the GPU while holding a lock as we |
---|
| 1749 | + * may need to perform a GPU reset. So while we don't need to |
---|
| 1750 | + * serialise wait/reset with an explicit lock, we do want |
---|
| 1751 | + * lockdep to detect potential dependency cycles. |
---|
| 1752 | + */ |
---|
| 1753 | + mutex_acquire(&rq->engine->gt->reset.mutex.dep_map, 0, 0, _THIS_IP_); |
---|
1310 | 1754 | |
---|
1311 | | - intel_wait_init(&wait); |
---|
| 1755 | + /* |
---|
| 1756 | + * Optimistic spin before touching IRQs. |
---|
| 1757 | + * |
---|
| 1758 | + * We may use a rather large value here to offset the penalty of |
---|
| 1759 | + * switching away from the active task. Frequently, the client will |
---|
| 1760 | + * wait upon an old swapbuffer to throttle itself to remain within a |
---|
| 1761 | + * frame of the gpu. If the client is running in lockstep with the gpu, |
---|
| 1762 | + * then it should not be waiting long at all, and a sleep now will incur |
---|
| 1763 | + * extra scheduler latency in producing the next frame. To try to |
---|
| 1764 | + * avoid adding the cost of enabling/disabling the interrupt to the |
---|
| 1765 | + * short wait, we first spin to see if the request would have completed |
---|
| 1766 | + * in the time taken to setup the interrupt. |
---|
| 1767 | + * |
---|
| 1768 | + * We need up to 5us to enable the irq, and up to 20us to hide the |
---|
| 1769 | + * scheduler latency of a context switch, ignoring the secondary |
---|
| 1770 | + * impacts from a context switch such as cache eviction. |
---|
| 1771 | + * |
---|
| 1772 | + * The scheme used for low-latency IO is called "hybrid interrupt |
---|
| 1773 | + * polling". The suggestion there is to sleep until just before you |
---|
| 1774 | + * expect to be woken by the device interrupt and then poll for its |
---|
| 1775 | + * completion. That requires having a good predictor for the request |
---|
| 1776 | + * duration, which we currently lack. |
---|
| 1777 | + */ |
---|
| 1778 | + if (IS_ACTIVE(CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT) && |
---|
| 1779 | + __i915_spin_request(rq, state)) |
---|
| 1780 | + goto out; |
---|
1312 | 1781 | |
---|
1313 | | -restart: |
---|
1314 | | - do { |
---|
1315 | | - set_current_state(state); |
---|
1316 | | - if (intel_wait_update_request(&wait, rq)) |
---|
1317 | | - break; |
---|
| 1782 | + /* |
---|
| 1783 | + * This client is about to stall waiting for the GPU. In many cases |
---|
| 1784 | + * this is undesirable and limits the throughput of the system, as |
---|
| 1785 | + * many clients cannot continue processing user input/output whilst |
---|
| 1786 | + * blocked. RPS autotuning may take tens of milliseconds to respond |
---|
| 1787 | + * to the GPU load and thus incurs additional latency for the client. |
---|
| 1788 | + * We can circumvent that by promoting the GPU frequency to maximum |
---|
| 1789 | + * before we sleep. This makes the GPU throttle up much more quickly |
---|
| 1790 | + * (good for benchmarks and user experience, e.g. window animations), |
---|
| 1791 | + * but at a cost of spending more power processing the workload |
---|
| 1792 | + * (bad for battery). |
---|
| 1793 | + */ |
---|
| 1794 | + if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq)) |
---|
| 1795 | + intel_rps_boost(rq); |
---|
1318 | 1796 | |
---|
1319 | | - if (flags & I915_WAIT_LOCKED && |
---|
1320 | | - __i915_wait_request_check_and_reset(rq)) |
---|
1321 | | - continue; |
---|
| 1797 | + wait.tsk = current; |
---|
| 1798 | + if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake)) |
---|
| 1799 | + goto out; |
---|
1322 | 1800 | |
---|
1323 | | - if (signal_pending_state(state, current)) { |
---|
1324 | | - timeout = -ERESTARTSYS; |
---|
1325 | | - goto complete; |
---|
1326 | | - } |
---|
1327 | | - |
---|
1328 | | - if (!timeout) { |
---|
1329 | | - timeout = -ETIME; |
---|
1330 | | - goto complete; |
---|
1331 | | - } |
---|
1332 | | - |
---|
1333 | | - timeout = io_schedule_timeout(timeout); |
---|
1334 | | - } while (1); |
---|
1335 | | - |
---|
1336 | | - GEM_BUG_ON(!intel_wait_has_seqno(&wait)); |
---|
1337 | | - GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); |
---|
1338 | | - |
---|
1339 | | - /* Optimistic short spin before touching IRQs */ |
---|
1340 | | - if (__i915_spin_request(rq, wait.seqno, state, 5)) |
---|
1341 | | - goto complete; |
---|
1342 | | - |
---|
1343 | | - set_current_state(state); |
---|
1344 | | - if (intel_engine_add_wait(rq->engine, &wait)) |
---|
1345 | | - /* |
---|
1346 | | - * In order to check that we haven't missed the interrupt |
---|
1347 | | - * as we enabled it, we need to kick ourselves to do a |
---|
1348 | | - * coherent check on the seqno before we sleep. |
---|
1349 | | - */ |
---|
1350 | | - goto wakeup; |
---|
1351 | | - |
---|
1352 | | - if (flags & I915_WAIT_LOCKED) |
---|
1353 | | - __i915_wait_request_check_and_reset(rq); |
---|
| 1801 | + /* |
---|
| 1802 | + * Flush the submission tasklet, but only if it may help this request. |
---|
| 1803 | + * |
---|
| 1804 | + * We sometimes experience some latency between the HW interrupts and |
---|
| 1805 | + * tasklet execution (mostly due to ksoftirqd latency, but it can also |
---|
| 1806 | + * be due to lazy CS events), so let's run the tasklet manually if there |
---|
| 1807 | + * is a chance it may submit this request. If the request is not ready |
---|
| 1808 | + * to run, as it is waiting for other fences to be signaled, flushing |
---|
| 1809 | + * the tasklet is busy work without any advantage for this client. |
---|
| 1810 | + * |
---|
| 1811 | + * If the HW is being lazy, this is the last chance before we go to |
---|
| 1812 | + * sleep to catch any pending events. We will check periodically in |
---|
| 1813 | + * the heartbeat to flush the submission tasklets as a last resort |
---|
| 1814 | + * for unhappy HW. |
---|
| 1815 | + */ |
---|
| 1816 | + if (i915_request_is_ready(rq)) |
---|
| 1817 | + intel_engine_flush_submission(rq->engine); |
---|
1354 | 1818 | |
---|
1355 | 1819 | for (;;) { |
---|
| 1820 | + set_current_state(state); |
---|
| 1821 | + |
---|
| 1822 | + if (dma_fence_is_signaled(&rq->fence)) |
---|
| 1823 | + break; |
---|
| 1824 | + |
---|
1356 | 1825 | if (signal_pending_state(state, current)) { |
---|
1357 | 1826 | timeout = -ERESTARTSYS; |
---|
1358 | 1827 | break; |
---|
.. | .. |
---|
1364 | 1833 | } |
---|
1365 | 1834 | |
---|
1366 | 1835 | timeout = io_schedule_timeout(timeout); |
---|
1367 | | - |
---|
1368 | | - if (intel_wait_complete(&wait) && |
---|
1369 | | - intel_wait_check_request(&wait, rq)) |
---|
1370 | | - break; |
---|
1371 | | - |
---|
1372 | | - set_current_state(state); |
---|
1373 | | - |
---|
1374 | | -wakeup: |
---|
1375 | | - /* |
---|
1376 | | - * Carefully check if the request is complete, giving time |
---|
1377 | | - * for the seqno to be visible following the interrupt. |
---|
1378 | | - * We also have to check in case we are kicked by the GPU |
---|
1379 | | - * reset in order to drop the struct_mutex. |
---|
1380 | | - */ |
---|
1381 | | - if (__i915_request_irq_complete(rq)) |
---|
1382 | | - break; |
---|
1383 | | - |
---|
1384 | | - /* |
---|
1385 | | - * If the GPU is hung, and we hold the lock, reset the GPU |
---|
1386 | | - * and then check for completion. On a full reset, the engine's |
---|
1387 | | - * HW seqno will be advanced passed us and we are complete. |
---|
1388 | | - * If we do a partial reset, we have to wait for the GPU to |
---|
1389 | | - * resume and update the breadcrumb. |
---|
1390 | | - * |
---|
1391 | | - * If we don't hold the mutex, we can just wait for the worker |
---|
1392 | | - * to come along and update the breadcrumb (either directly |
---|
1393 | | - * itself, or indirectly by recovering the GPU). |
---|
1394 | | - */ |
---|
1395 | | - if (flags & I915_WAIT_LOCKED && |
---|
1396 | | - __i915_wait_request_check_and_reset(rq)) |
---|
1397 | | - continue; |
---|
1398 | | - |
---|
1399 | | - /* Only spin if we know the GPU is processing this request */ |
---|
1400 | | - if (__i915_spin_request(rq, wait.seqno, state, 2)) |
---|
1401 | | - break; |
---|
1402 | | - |
---|
1403 | | - if (!intel_wait_check_request(&wait, rq)) { |
---|
1404 | | - intel_engine_remove_wait(rq->engine, &wait); |
---|
1405 | | - goto restart; |
---|
1406 | | - } |
---|
1407 | 1836 | } |
---|
1408 | | - |
---|
1409 | | - intel_engine_remove_wait(rq->engine, &wait); |
---|
1410 | | -complete: |
---|
1411 | 1837 | __set_current_state(TASK_RUNNING); |
---|
1412 | | - if (flags & I915_WAIT_LOCKED) |
---|
1413 | | - remove_wait_queue(errq, &reset); |
---|
1414 | | - remove_wait_queue(&rq->execute, &exec); |
---|
| 1838 | + |
---|
| 1839 | + if (READ_ONCE(wait.tsk)) |
---|
| 1840 | + dma_fence_remove_callback(&rq->fence, &wait.cb); |
---|
| 1841 | + GEM_BUG_ON(!list_empty(&wait.cb.node)); |
---|
| 1842 | + |
---|
| 1843 | +out: |
---|
| 1844 | + mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_); |
---|
1415 | 1845 | trace_i915_request_wait_end(rq); |
---|
1416 | | - |
---|
1417 | 1846 | return timeout; |
---|
1418 | | -} |
---|
1419 | | - |
---|
1420 | | -static void ring_retire_requests(struct intel_ring *ring) |
---|
1421 | | -{ |
---|
1422 | | - struct i915_request *request, *next; |
---|
1423 | | - |
---|
1424 | | - list_for_each_entry_safe(request, next, |
---|
1425 | | - &ring->request_list, ring_link) { |
---|
1426 | | - if (!i915_request_completed(request)) |
---|
1427 | | - break; |
---|
1428 | | - |
---|
1429 | | - i915_request_retire(request); |
---|
1430 | | - } |
---|
1431 | | -} |
---|
1432 | | - |
---|
1433 | | -void i915_retire_requests(struct drm_i915_private *i915) |
---|
1434 | | -{ |
---|
1435 | | - struct intel_ring *ring, *tmp; |
---|
1436 | | - |
---|
1437 | | - lockdep_assert_held(&i915->drm.struct_mutex); |
---|
1438 | | - |
---|
1439 | | - if (!i915->gt.active_requests) |
---|
1440 | | - return; |
---|
1441 | | - |
---|
1442 | | - list_for_each_entry_safe(ring, tmp, &i915->gt.active_rings, active_link) |
---|
1443 | | - ring_retire_requests(ring); |
---|
1444 | 1847 | } |
---|
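
The wait implemented above is the "hybrid" strategy its comments describe: spin optimistically for a bounded time in case the request completes almost immediately, and only then pay for arming a wakeup and sleeping. The same shape, reduced to a userspace toy with a pthread condition variable standing in for the dma-fence callback (all names are hypothetical and the 20us budget is just an example figure):

```c
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

/* A toy fence: initialise with PTHREAD_MUTEX_INITIALIZER / PTHREAD_COND_INITIALIZER. */
struct toy_fence {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool signaled;
};

static void toy_fence_signal(struct toy_fence *f)
{
	pthread_mutex_lock(&f->lock);
	f->signaled = true;
	pthread_cond_broadcast(&f->cond);	/* plays the role of request_wait_wake() */
	pthread_mutex_unlock(&f->lock);
}

static bool toy_fence_is_signaled(struct toy_fence *f)
{
	bool v;

	pthread_mutex_lock(&f->lock);
	v = f->signaled;
	pthread_mutex_unlock(&f->lock);
	return v;
}

/* Optimistic spin for ~spin_us microseconds before committing to a sleep. */
static bool toy_spin(struct toy_fence *f, unsigned int spin_us)
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		if (toy_fence_is_signaled(f))
			return true;
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while ((now.tv_sec - start.tv_sec) * 1000000000L +
		 (now.tv_nsec - start.tv_nsec) < (long)spin_us * 1000L);

	return false;
}

static void toy_fence_wait(struct toy_fence *f)
{
	if (toy_spin(f, 20))		/* cheaper than arming a wakeup if we win */
		return;

	pthread_mutex_lock(&f->lock);
	while (!f->signaled)
		pthread_cond_wait(&f->cond, &f->lock);
	pthread_mutex_unlock(&f->lock);
}
```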
1445 | 1848 | |
---|
1446 | 1849 | #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) |
---|
1447 | 1850 | #include "selftests/mock_request.c" |
---|
1448 | 1851 | #include "selftests/i915_request.c" |
---|
1449 | 1852 | #endif |
---|
| 1853 | + |
---|
| 1854 | +static void i915_global_request_shrink(void) |
---|
| 1855 | +{ |
---|
| 1856 | + kmem_cache_shrink(global.slab_execute_cbs); |
---|
| 1857 | + kmem_cache_shrink(global.slab_requests); |
---|
| 1858 | +} |
---|
| 1859 | + |
---|
| 1860 | +static void i915_global_request_exit(void) |
---|
| 1861 | +{ |
---|
| 1862 | + kmem_cache_destroy(global.slab_execute_cbs); |
---|
| 1863 | + kmem_cache_destroy(global.slab_requests); |
---|
| 1864 | +} |
---|
| 1865 | + |
---|
| 1866 | +static struct i915_global_request global = { { |
---|
| 1867 | + .shrink = i915_global_request_shrink, |
---|
| 1868 | + .exit = i915_global_request_exit, |
---|
| 1869 | +} }; |
---|
| 1870 | + |
---|
| 1871 | +int __init i915_global_request_init(void) |
---|
| 1872 | +{ |
---|
| 1873 | + global.slab_requests = |
---|
| 1874 | + kmem_cache_create("i915_request", |
---|
| 1875 | + sizeof(struct i915_request), |
---|
| 1876 | + __alignof__(struct i915_request), |
---|
| 1877 | + SLAB_HWCACHE_ALIGN | |
---|
| 1878 | + SLAB_RECLAIM_ACCOUNT | |
---|
| 1879 | + SLAB_TYPESAFE_BY_RCU, |
---|
| 1880 | + __i915_request_ctor); |
---|
| 1881 | + if (!global.slab_requests) |
---|
| 1882 | + return -ENOMEM; |
---|
| 1883 | + |
---|
| 1884 | + global.slab_execute_cbs = KMEM_CACHE(execute_cb, |
---|
| 1885 | + SLAB_HWCACHE_ALIGN | |
---|
| 1886 | + SLAB_RECLAIM_ACCOUNT | |
---|
| 1887 | + SLAB_TYPESAFE_BY_RCU); |
---|
| 1888 | + if (!global.slab_execute_cbs) |
---|
| 1889 | + goto err_requests; |
---|
| 1890 | + |
---|
| 1891 | + i915_global_register(&global.base); |
---|
| 1892 | + return 0; |
---|
| 1893 | + |
---|
| 1894 | +err_requests: |
---|
| 1895 | + kmem_cache_destroy(global.slab_requests); |
---|
| 1896 | + return -ENOMEM; |
---|
| 1897 | +} |
---|