/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things to the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs. This
 * pairing rule is sketched in pseudo-code at the end of this comment.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
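 * A rough pseudo-code sketch of the pairing rule above (illustration only;
 * execlists_dequeue() below is the authoritative implementation):
 *
 *	while ((rq = peek(queue))) {
 *		if (last && !can_merge_ctx(rq->hw_context, last->hw_context)) {
 *			if (port == last_port)
 *				break;             (both ELSP slots are claimed)
 *			port_assign(port++, last); (close off the first slot)
 *		}
 *		submit(rq);                        (coalesce onto the open slot)
 *		last = rq;
 *	}
 *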
 */
#include <linux/interrupt.h>

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "i915_gem_render_state.h"
#include "i915_vgpu.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine,
					    struct intel_context *ce);
static void execlists_init_reg_state(u32 *reg_state,
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring);

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *last,
				int prio)
{
	return (intel_engine_has_preemption(engine) &&
		__execlists_need_preempt(prio, rq_prio(last)) &&
		!i915_request_completed(last));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static void
intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
				   struct intel_engine_cs *engine,
				   struct intel_context *ce)
{
	u64 desc;

	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
	BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));

	desc = ctx->desc_template;				/* bits  0-11 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 12));

	desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
								/* bits 12-31 */
	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));

	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
	 */
	if (INTEL_GEN(ctx->i915) >= 11) {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
								/* bits 37-47 */

		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
								/* bits 48-53 */

		/* TODO: decide what to do with SW counter (bits 55-60) */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
								/* bits 61-63 */
	} else {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */
	}

	ce->lrc_desc = desc;
}
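
/*
 * Illustration only, compiled out: recovering the hw_id from a cached
 * pre-Gen11 descriptor, i.e. the inverse of the bits 32-52 encoding done
 * above. The helper name is invented for this sketch; the driver does not
 * need it because ctx->hw_id is tracked directly.
 */
#if 0
static u32 example_gen8_lrc_desc_to_hw_id(u64 desc)
{
	return (desc >> GEN8_CTX_ID_SHIFT) & (BIT(GEN8_CTX_ID_WIDTH) - 1);
}
#endif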

static void unwind_wa_tail(struct i915_request *rq)
{
	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
	assert_ring_tail_valid(rq->ring, rq->tail);
}

static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn, *active = NULL;
	struct list_head *uninitialized_var(pl);
	int prio = I915_PRIORITY_INVALID | I915_PRIORITY_NEWCLIENT;

	lockdep_assert_held(&engine->timeline.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->timeline.requests,
					 link) {
		if (i915_request_completed(rq))
			break;

		__i915_request_unsubmit(rq);
		unwind_wa_tail(rq);

		GEM_BUG_ON(rq->hw_context->active);

		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
		if (rq_prio(rq) != prio) {
			prio = rq_prio(rq);
			pl = i915_sched_lookup_priolist(engine, prio);
		}
		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));

		list_add(&rq->sched.link, pl);

		active = rq;
	}

	/*
	 * The active request is now effectively the start of a new client
	 * stream, so give it the equivalent small priority bump to prevent
	 * it being gazumped a second time by another peer.
	 */
	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
		prio |= I915_PRIORITY_NEWCLIENT;
		list_move_tail(&active->sched.link,
			       i915_sched_lookup_priolist(engine, prio));
	}
}

void
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	__unwind_incomplete_requests(engine);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

inline void
execlists_user_begin(struct intel_engine_execlists *execlists,
		     const struct execlist_port *port)
{
	execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
}

inline void
execlists_user_end(struct intel_engine_execlists *execlists)
{
	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
}

static inline void
execlists_context_schedule_in(struct i915_request *rq)
{
	GEM_BUG_ON(rq->hw_context->active);

	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(rq->engine);
	rq->hw_context->active = rq->engine;
}

static inline void
execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
{
	rq->hw_context->active = NULL;
	intel_engine_context_out(rq->engine);
	execlists_context_status_change(rq, status);
	trace_i915_request_out(rq);
}

static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

static u64 execlists_update_context(struct i915_request *rq)
{
	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
	struct intel_context *ce = rq->hw_context;
	u32 *reg_state = ce->lrc_reg_state;

	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);

	/*
	 * True 32b PPGTT with dynamic page allocation: update PDP
	 * registers and point the unallocated PDPs to scratch page.
	 * PML4 is allocated during ppgtt init, so this is not needed
	 * in 48-bit mode.
	 */
	if (!i915_vm_is_48bit(&ppgtt->vm))
		execlists_update_context_pdps(ppgtt, reg_state);

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, the empirical
	 * evidence (esp. on Braswell) suggests that the WC write into memory
	 * may not be visible to the HW prior to the completion of the UC
	 * register write and that we may begin execution from the context
	 * before its image is complete leading to invalid PD chasing.
	 */
	wmb();
	return ce->lrc_desc;
}

static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	unsigned int n;

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!engine->i915->gt.awake);

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq;
		unsigned int count;
		u64 desc;

		rq = port_unpack(&port[n], &count);
		if (rq) {
			GEM_BUG_ON(count > !n);
			if (!count++)
				execlists_context_schedule_in(rq);
			port_set(&port[n], port_pack(rq, count));
			desc = execlists_update_context(rq);
			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));

			GEM_TRACE("%s in[%d]:  ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
				  engine->name, n,
				  port[n].context_id, count,
				  rq->global_seqno,
				  rq->fence.context, rq->fence.seqno,
				  intel_engine_get_seqno(engine),
				  rq_prio(rq));
		} else {
			GEM_BUG_ON(!n);
			desc = 0;
		}

		write_desc(execlists, desc, n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ce->gem_context));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static void port_assign(struct execlist_port *port, struct i915_request *rq)
{
	GEM_BUG_ON(rq == port_request(port));

	if (port_isset(port))
		i915_request_put(port_request(port));

	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
}

static void inject_preempt_context(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct intel_context *ce =
		to_intel_context(engine->i915->preempt_context, engine);
	unsigned int n;

	GEM_BUG_ON(execlists->preempt_complete_status !=
		   upper_32_bits(ce->lrc_desc));

	/*
	 * Switch to our empty preempt context so
	 * the state of the GPU is known (idle).
	 */
	GEM_TRACE("%s\n", engine->name);
	for (n = execlists_num_ports(execlists); --n; )
		write_desc(execlists, 0, n);

	write_desc(execlists, ce->lrc_desc, n);

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
}

static void complete_preempt_context(struct intel_engine_execlists *execlists)
{
	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));

	if (inject_preempt_hang(execlists))
		return;

	execlists_cancel_port_requests(execlists);
	__unwind_incomplete_requests(container_of(execlists,
						  struct intel_engine_cs,
						  execlists));
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const struct execlist_port * const last_port =
		&execlists->port[execlists->port_mask];
	struct i915_request *last = port_request(port);
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	if (last) {
		/*
		 * Don't resubmit or switch until all outstanding
		 * preemptions (lite-restore) are seen. Then we
		 * know the next preemption status we see corresponds
		 * to this ELSP update.
		 */
		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));
		GEM_BUG_ON(!port_count(&port[0]));

		/*
		 * If we write to ELSP a second time before the HW has had
		 * a chance to respond to the previous write, we can confuse
		 * the HW and hit "undefined behaviour". After writing to ELSP,
		 * we must then wait until we see a context-switch event from
		 * the HW to indicate that it has had a chance to respond.
		 */
		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
			return;

		if (need_preempt(engine, last, execlists->queue_priority)) {
			inject_preempt_context(engine);
			return;
		}

		/*
		 * In theory, we could coalesce more requests onto
		 * the second port (the first port is active, with
		 * no preemptions pending). However, that means we
		 * then have to deal with the possible lite-restore
		 * of the second port (as we submit the ELSP, there
		 * may be a context-switch) but also we may complete
		 * the resubmission before the context-switch. Ergo,
		 * coalescing onto the second port will cause a
		 * preemption event, but we cannot predict whether
		 * that will affect port[0] or port[1].
		 *
		 * If the second port is already active, we can wait
		 * until the next context-switch before contemplating
		 * new requests. The GPU will be busy and we should be
		 * able to resubmit the new ELSP before it idles,
		 * avoiding pipeline bubbles (momentary pauses where
		 * the driver is unable to keep up the supply of new
		 * work). However, we have to double check that the
		 * priorities of the ports haven't been switched.
		 */
		if (port_count(&port[1]))
			return;

		/*
		 * WaIdleLiteRestore:bdw,skl
		 * Apply the wa NOOPs to prevent
		 * ring:HEAD == rq:TAIL as we resubmit the
		 * request. See gen8_emit_breadcrumb() for
		 * where we prepare the padding after the
		 * end of the request.
		 */
		last->tail = last->wa_tail;
	}

	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		struct i915_request *rq, *rn;
		int i;

		priolist_for_each_request_consume(rq, rn, p, i) {
			/*
			 * Can we combine this request with the current port?
			 * It has to be the same context/ringbuffer and not
			 * have any exceptions (e.g. GVT saying never to
			 * combine contexts).
			 *
			 * If we can combine the requests, we can execute both
			 * by updating the RING_TAIL to point to the end of the
			 * second request, and so we never need to tell the
			 * hardware about the first.
			 */
			if (last &&
			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
				/*
				 * If we are on the second port and cannot
				 * combine this request with the last, then we
				 * are done.
				 */
				if (port == last_port)
					goto done;

				/*
				 * If GVT overrides us we only ever submit
				 * port[0], leaving port[1] empty. Note that we
				 * also have to be careful that we don't queue
				 * the same context (even though a different
				 * request) to the second port.
				 */
				if (ctx_single_port_submission(last->hw_context) ||
				    ctx_single_port_submission(rq->hw_context))
					goto done;

				GEM_BUG_ON(last->hw_context == rq->hw_context);

				if (submit)
					port_assign(port, last);
				port++;

				GEM_BUG_ON(port_isset(port));
			}

			list_del_init(&rq->sched.link);

			__i915_request_submit(rq);
			trace_i915_request_in(rq, port_index(port, execlists));

			last = rq;
			submit = true;
		}

		rb_erase_cached(&p->node, &execlists->queue);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

done:
	/*
	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
	 *
	 * We choose queue_priority such that if we add a request of greater
	 * priority than this, we kick the submission tasklet to decide on
	 * the right order of submitting the requests to hardware. We must
	 * also be prepared to reorder requests as they are in-flight on the
	 * HW. We derive the queue_priority then as the first "hole" in
	 * the HW submission ports and if there are no available slots,
	 * the priority of the lowest executing request, i.e. last.
	 *
	 * When we do receive a higher priority request ready to run from the
	 * user, see queue_request(), the queue_priority is bumped to that
	 * request triggering preemption on the next dequeue (or subsequent
	 * interrupt for secondary ports).
	 */
	execlists->queue_priority =
		port != execlists->port ? rq_prio(last) : INT_MIN;

	if (submit) {
		port_assign(port, last);
		execlists_submit_ports(engine);
	}

	/* We must always keep the beast fed if we have work piled up */
	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
		   !port_isset(execlists->port));

	/* Re-evaluate the executing context setup after each preemptive kick */
	if (last)
		execlists_user_begin(execlists, execlists->port);

	/* If the engine is now idle, so should be the flag; and vice versa. */
	GEM_BUG_ON(execlists_is_active(&engine->execlists,
				       EXECLISTS_ACTIVE_USER) ==
		   !port_isset(engine->execlists.port));
}

void
execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
{
	struct execlist_port *port = execlists->port;
	unsigned int num_ports = execlists_num_ports(execlists);

	while (num_ports-- && port_isset(port)) {
		struct i915_request *rq = port_request(port);

		GEM_TRACE("%s:port%u global=%d (fence %llx:%d), (current %d)\n",
			  rq->engine->name,
			  (unsigned int)(port - execlists->port),
			  rq->global_seqno,
			  rq->fence.context, rq->fence.seqno,
			  intel_engine_get_seqno(rq->engine));

		GEM_BUG_ON(!execlists->active);
		execlists_context_schedule_out(rq,
					       i915_request_completed(rq) ?
					       INTEL_CONTEXT_SCHEDULE_OUT :
					       INTEL_CONTEXT_SCHEDULE_PREEMPTED);

		i915_request_put(rq);

		memset(port, 0, sizeof(*port));
		port++;
	}

	execlists_clear_all_active(execlists);
}

static void reset_csb_pointers(struct intel_engine_execlists *execlists)
{
	/*
	 * After a reset, the HW starts writing into CSB entry [0]. We
	 * therefore have to set our HEAD pointer back one entry so that
	 * the *first* entry we check is entry 0. To complicate this further,
	 * as we don't wait for the first interrupt after reset, we have to
	 * fake the HW write to point back to the last entry so that our
	 * inline comparison of our cached head position against the last HW
	 * write works even before the first interrupt.
	 */
	execlists->csb_head = execlists->csb_write_reset;
	WRITE_ONCE(*execlists->csb_write, execlists->csb_write_reset);
}

static void nop_submission_tasklet(unsigned long data)
{
	/* The driver is wedged; don't process any more events. */
}

static void execlists_cancel_requests(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request *rq, *rn;
	struct rb_node *rb;
	unsigned long flags;

	GEM_TRACE("%s current %d\n",
		  engine->name, intel_engine_get_seqno(engine));

	/*
	 * Before we call engine->cancel_requests(), we should have exclusive
	 * access to the submission state. This is arranged for us by the
	 * caller disabling the interrupt generation, the tasklet and other
	 * threads that may then access the same state, giving us a free hand
	 * to reset state. However, we still need to let lockdep be aware that
	 * we know this state may be accessed in hardirq context, so we
	 * disable the irq around this manipulation and we want to keep
	 * the spinlock focused on its duties and not accidentally conflate
	 * coverage to the submission's irq state. (Similarly, although we
	 * shouldn't need to disable irq around the manipulation of the
	 * submission's irq state, we also wish to remind ourselves that
	 * it is irq state.)
	 */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	/* Cancel the requests on the HW and clear the ELSP tracker. */
	execlists_cancel_port_requests(execlists);
	execlists_user_end(execlists);

	/* Mark all executing requests as skipped. */
	list_for_each_entry(rq, &engine->timeline.requests, link) {
		GEM_BUG_ON(!rq->global_seqno);
		if (!i915_request_completed(rq))
			dma_fence_set_error(&rq->fence, -EIO);
	}

	/* Flush the queued requests to the timeline list (for retiring). */
	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		int i;

		priolist_for_each_request_consume(rq, rn, p, i) {
			list_del_init(&rq->sched.link);

			dma_fence_set_error(&rq->fence, -EIO);
			__i915_request_submit(rq);
		}

		rb_erase_cached(&p->node, &execlists->queue);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

	/* Remaining _unready_ requests will be nop'ed when submitted */

	execlists->queue_priority = INT_MIN;
	execlists->queue = RB_ROOT_CACHED;
	GEM_BUG_ON(port_isset(execlists->port));

	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
	execlists->tasklet.func = nop_submission_tasklet;

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static inline bool
reset_in_progress(const struct intel_engine_execlists *execlists)
{
	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
}

static void process_csb(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
	const u32 * const buf = execlists->csb_status;
	u8 head, tail;

	/*
	 * Note that csb_write, csb_status may be either in HWSP or mmio.
	 * When reading from the csb_write mmio register, we have to be
	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
	 * the low 4bits. As it happens we know the next 4bits are always
	 * zero and so we can simply mask off the low u8 of the register
	 * and treat it identically to reading from the HWSP (without having
	 * to use explicit shifting and masking, and probably bifurcating
	 * the code to handle the legacy mmio read).
	 */
	head = execlists->csb_head;
	tail = READ_ONCE(*execlists->csb_write);
	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
	if (unlikely(head == tail))
		return;

	/*
	 * Hopefully paired with a wmb() in HW!
	 *
	 * We must complete the read of the write pointer before any reads
	 * from the CSB, so that we do not see stale values. Without an rmb
	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
	 * we perform the READ_ONCE(*csb_write).
	 */
	rmb();

	do {
		struct i915_request *rq;
		unsigned int status;
		unsigned int count;

		if (++head == GEN8_CSB_ENTRIES)
			head = 0;

		/*
		 * We are flying near dragons again.
		 *
		 * We hold a reference to the request in execlist_port[]
		 * but no more than that. We are operating in softirq
		 * context and so cannot hold any mutex or sleep. That
		 * prevents us stopping the requests we are processing
		 * in port[] from being retired simultaneously (the
		 * breadcrumb will be complete before we see the
		 * context-switch). As we only hold the reference to the
		 * request, any pointer chasing underneath the request
		 * is subject to a potential use-after-free. Thus we
		 * store all of the bookkeeping within port[] as
		 * required, and avoid using unguarded pointers beneath
		 * request itself. The same applies to the atomic
		 * status notifier.
		 */

		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
			  engine->name, head,
			  buf[2 * head + 0], buf[2 * head + 1],
			  execlists->active);

		status = buf[2 * head];
		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
			      GEN8_CTX_STATUS_PREEMPTED))
			execlists_set_active(execlists,
					     EXECLISTS_ACTIVE_HWACK);
		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
			execlists_clear_active(execlists,
					       EXECLISTS_ACTIVE_HWACK);

		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
			continue;

		/* We should never get a COMPLETED | IDLE_ACTIVE! */
		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);

		if (status & GEN8_CTX_STATUS_COMPLETE &&
		    buf[2*head + 1] == execlists->preempt_complete_status) {
			GEM_TRACE("%s preempt-idle\n", engine->name);
			complete_preempt_context(execlists);
			continue;
		}

		if (status & GEN8_CTX_STATUS_PREEMPTED &&
		    execlists_is_active(execlists,
					EXECLISTS_ACTIVE_PREEMPT))
			continue;

		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));

		rq = port_unpack(port, &count);
		GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
			  engine->name,
			  port->context_id, count,
			  rq ? rq->global_seqno : 0,
			  rq ? rq->fence.context : 0,
			  rq ? rq->fence.seqno : 0,
			  intel_engine_get_seqno(engine),
			  rq ? rq_prio(rq) : 0);

		/* Check the context/desc id for this event matches */
		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);

		GEM_BUG_ON(count == 0);
		if (--count == 0) {
			/*
			 * On the final event corresponding to the
			 * submission of this context, we expect either
			 * an element-switch event or a completion
			 * event (and on completion, the active-idle
			 * marker). No more preemptions, lite-restore
			 * or otherwise.
			 */
			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
			GEM_BUG_ON(port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
			GEM_BUG_ON(!port_isset(&port[1]) &&
				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));

			/*
			 * We rely on the hardware being strongly
			 * ordered, that the breadcrumb write is
			 * coherent (visible from the CPU) before the
			 * user interrupt and CSB is processed.
			 */
			GEM_BUG_ON(!i915_request_completed(rq));

			execlists_context_schedule_out(rq,
						       INTEL_CONTEXT_SCHEDULE_OUT);
			i915_request_put(rq);

			GEM_TRACE("%s completed ctx=%d\n",
				  engine->name, port->context_id);

			port = execlists_port_complete(execlists, port);
			if (port_isset(port))
				execlists_user_begin(execlists, port);
			else
				execlists_user_end(execlists);
		} else {
			port_set(port, port_pack(rq, count));
		}
	} while (head != tail);

	execlists->csb_head = head;
}

static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
{
	lockdep_assert_held(&engine->timeline.lock);

	process_csb(engine);
	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
		execlists_dequeue(engine);
}

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
static void execlists_submission_tasklet(unsigned long data)
{
	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
	unsigned long flags;

	GEM_TRACE("%s awake?=%d, active=%x\n",
		  engine->name,
		  engine->i915->gt.awake,
		  engine->execlists.active);

	spin_lock_irqsave(&engine->timeline.lock, flags);
	__execlists_submission_tasklet(engine);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static void queue_request(struct intel_engine_cs *engine,
			  struct i915_sched_node *node,
			  int prio)
{
	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
}

static void __submit_queue_imm(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	if (reset_in_progress(execlists))
		return; /* defer until we restart the engine following reset */

	if (execlists->tasklet.func == execlists_submission_tasklet)
		__execlists_submission_tasklet(engine);
	else
		tasklet_hi_schedule(&execlists->tasklet);
}

static void submit_queue(struct intel_engine_cs *engine, int prio)
{
	if (prio > engine->execlists.queue_priority) {
		engine->execlists.queue_priority = prio;
		__submit_queue_imm(engine);
	}
}

static void execlists_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->timeline.lock, flags);

	queue_request(engine, &request->sched, rq_prio(request));

	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
	GEM_BUG_ON(list_empty(&request->sched.link));

	submit_queue(engine, rq_prio(request));

	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

static void execlists_context_destroy(struct intel_context *ce)
{
	GEM_BUG_ON(ce->pin_count);

	if (!ce->state)
		return;

	intel_ring_free(ce->ring);

	GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
	i915_gem_object_put(ce->state->obj);
}

static void execlists_context_unpin(struct intel_context *ce)
{
	struct intel_engine_cs *engine;

	/*
	 * The tasklet may still be using a pointer to our state, via an
	 * old request. However, since we know we only unpin the context
	 * on retirement of the following request, we know that the last
	 * request referencing us will have had a completion CS interrupt.
	 * If we see that it is still active, it means that the tasklet hasn't
	 * had the chance to run yet; let it run before we teardown the
	 * reference it may use.
	 */
	engine = READ_ONCE(ce->active);
	if (unlikely(engine)) {
		unsigned long flags;

		spin_lock_irqsave(&engine->timeline.lock, flags);
		process_csb(engine);
		spin_unlock_irqrestore(&engine->timeline.lock, flags);

		GEM_BUG_ON(READ_ONCE(ce->active));
	}

	i915_gem_context_unpin_hw_id(ce->gem_context);

	intel_ring_unpin(ce->ring);

	ce->state->obj->pin_global--;
	i915_gem_object_unpin_map(ce->state->obj);
	i915_vma_unpin(ce->state);

	i915_gem_context_put(ce->gem_context);
}

static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
{
	unsigned int flags;
	int err;

	/*
	 * Clear this page out of any CPU caches for coherent swap-in/out.
	 * We only want to do this on the first bind so that we do not stall
	 * on an active context (which by nature is already on the GPU).
	 */
	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
		err = i915_gem_object_set_to_wc_domain(vma->obj, true);
		if (err)
			return err;
	}

	flags = PIN_GLOBAL | PIN_HIGH;
	flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);

	return i915_vma_pin(vma, 0, 0, flags);
}

static struct intel_context *
__execlists_context_pin(struct intel_engine_cs *engine,
			struct i915_gem_context *ctx,
			struct intel_context *ce)
{
	void *vaddr;
	int ret;