/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif
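
/*
 * This file is included by kernel/locking/qspinlock.c (which defines
 * _GEN_PV_LOCK_SLOWPATH and re-includes itself) so that the functions below
 * are generated in place of the native slowpath implementations.
 */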

#include <linux/hash.h>
#include <linux/bootmem.h>
#include <linux/debug_locks.h>

/*
 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
 * of spinning them.
 *
 * This relies on the architecture to provide two paravirt hypercalls:
 *
 *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
 *   pv_kick(cpu)             -- wakes a suspended vcpu
 *
 * Using these we implement __pv_queued_spin_lock_slowpath() and
 * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
 * native_queued_spin_unlock().
 */
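
/*
 * As an illustration (not part of this file): x86 KVM guests implement these
 * two operations with kvm_wait() and kvm_kick_cpu() and call
 * __pv_init_lock_hash() from kvm_spinlock_init(); see arch/x86/kernel/kvm.c.
 */

/*
 * _Q_SLOW_VAL in the locked byte tells the unlocking CPU that the lock is
 * hashed on behalf of the queue head vCPU, which may be (or be about to go)
 * sleeping in pv_wait(). The unlock slowpath must then look the lock up in
 * the hash table (pv_unhash()) and pv_kick() that vCPU; see
 * __pv_queued_spin_unlock_slowpath() below.
 */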

#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)

/*
 * Queue Node Adaptive Spinning
 *
 * A queue node vCPU will stop spinning if the vCPU in the previous node is
 * not running. The one lock stealing attempt allowed at slowpath entry
 * mitigates the slight slowdown that this aggressive wait-early mechanism
 * causes for non-overcommitted guests.
 *
 * The status of the previous node will be checked at a fixed interval
 * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
 * pound on the cacheline of the previous node too heavily.
 */
#define PV_PREV_CHECK_MASK	0xff

/*
 * Queue node uses: vcpu_running & vcpu_halted.
 * Queue head uses: vcpu_running & vcpu_hashed.
 */
enum vcpu_state {
	vcpu_running = 0,
	vcpu_halted,		/* Used only in pv_wait_node */
	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
};

struct pv_node {
	struct mcs_spinlock	mcs;
	struct mcs_spinlock	__res[3];

	int			cpu;
	u8			state;
};
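
/*
 * The PV fields piggyback on the generic MCS node: the slowpath passes plain
 * struct mcs_spinlock pointers around and the functions below cast them to
 * struct pv_node. The BUILD_BUG_ON() in pv_init_node() sanity-checks the
 * size assumption behind this overlay.
 */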

/*
 * Include queued spinlock statistics code
 */
#include "qspinlock_stat.h"

/*
 * By replacing the regular queued_spin_trylock() with the function below,
 * it will be called once when a lock waiter enters the PV slowpath before
 * being queued. Allowing one lock stealing attempt here when the pending
 * bit is off reduces the performance impact of lock waiter preemption
 * without the drawback of lock starvation.
 */
#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l)
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
	    (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
		qstat_inc(qstat_pv_lock_stealing, true);
		return true;
	}

	return false;
}

/*
 * The pending bit is used by the queue head vCPU to indicate that it
 * is actively spinning on the lock and no lock stealing is allowed.
 */
#if _Q_PENDING_BITS == 8
static __always_inline void set_pending(struct qspinlock *lock)
{
	WRITE_ONCE(lock->pending, 1);
}

static __always_inline void clear_pending(struct qspinlock *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/*
 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
 * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
 * lock just to be sure that it will get it.
 */
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	return !READ_ONCE(lock->locked) &&
	       (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
{
	atomic_or(_Q_PENDING_VAL, &lock->val);
}

static __always_inline void clear_pending(struct qspinlock *lock)
{
	atomic_andnot(_Q_PENDING_VAL, &lock->val);
}

static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	int val = atomic_read(&lock->val);

	for (;;) {
		int old, new;

		if (val  & _Q_LOCKED_MASK)
			break;

		/*
		 * Try to clear pending bit & set locked bit
		 */
		old = val;
		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
		val = atomic_cmpxchg_acquire(&lock->val, old, new);

		if (val == old)
			return 1;
	}
	return 0;
}
#endif /* _Q_PENDING_BITS == 8 */

/*
 * Lock and MCS node addresses hash table for fast lookup
 *
 * Hashing is done on a per-cacheline basis to minimize the need to access
 * more than one cacheline.
 *
 * Dynamically allocate a hash table big enough to hold at least 4X the
 * number of possible cpus in the system. Allocation is done on page
 * granularity. So the minimum number of hash buckets should be at least
 * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 *
 * Since we should not be holding locks from NMI context (very rare indeed) the
 * max load factor is 0.75, which is around the point where open addressing
 * breaks down.
 *
 */
struct pv_hash_entry {
	struct qspinlock *lock;
	struct pv_node   *node;
};

#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
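
/*
 * For example, with 64-byte cachelines and 4K pages on a 64-bit kernel,
 * sizeof(struct pv_hash_entry) is 16 bytes, giving PV_HE_PER_LINE = 4 and
 * PV_HE_MIN = 256 -- the minimum bucket count quoted above.
 */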

static struct pv_hash_entry *pv_lock_hash;
static unsigned int pv_lock_hash_bits __read_mostly;

/*
 * Allocate memory for the PV qspinlock hash buckets
 *
 * This function should be called from the paravirt spinlock initialization
 * routine.
 */
void __init __pv_init_lock_hash(void)
{
	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);

	if (pv_hash_size < PV_HE_MIN)
		pv_hash_size = PV_HE_MIN;

	/*
	 * Allocate space from bootmem which should be page-size aligned
	 * and hence cacheline aligned.
	 */
	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
					       sizeof(struct pv_hash_entry),
					       pv_hash_size, 0,
					       HASH_EARLY | HASH_ZERO,
					       &pv_lock_hash_bits, NULL,
					       pv_hash_size, pv_hash_size);
}

#define for_each_hash_entry(he, offset, hash)						\
	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
	     offset < (1 << pv_lock_hash_bits);						\
	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
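
/*
 * Probing starts at the first entry of the cacheline selected by the hash
 * (hash &= ~(PV_HE_PER_LINE - 1)) and then walks the table linearly, wrapping
 * around at the end, so a lock/node pair usually stays within one cacheline.
 */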

static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	int hopcnt = 0;

	for_each_hash_entry(he, offset, hash) {
		hopcnt++;
		if (!cmpxchg(&he->lock, NULL, lock)) {
			WRITE_ONCE(he->node, node);
			qstat_hop(hopcnt);
			return &he->lock;
		}
	}
	/*
	 * Hard assume there is a free entry for us.
	 *
	 * This is guaranteed by ensuring every blocked lock only ever consumes
	 * a single entry, and since we only have 4 nesting levels per CPU
	 * and allocated 4*nr_possible_cpus(), this must be so.
	 *
	 * The single entry is guaranteed by having the lock owner unhash
	 * before it releases.
	 */
	BUG();
}

static struct pv_node *pv_unhash(struct qspinlock *lock)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	struct pv_node *node;

	for_each_hash_entry(he, offset, hash) {
		if (READ_ONCE(he->lock) == lock) {
			node = READ_ONCE(he->node);
			WRITE_ONCE(he->lock, NULL);
			return node;
		}
	}
	/*
	 * Hard assume we'll find an entry.
	 *
	 * This guarantees a limited lookup time and is itself guaranteed by
	 * having the lock owner do the unhash -- IFF the unlock sees the
	 * SLOW flag, there MUST be a hash entry.
	 */
	BUG();
}

/*
 * Return true when it is time to check the previous node and that node is
 * not in a running state.
 */
static inline bool
pv_wait_early(struct pv_node *prev, int loop)
{
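	/*
	 * Only look at the previous node once every PV_PREV_CHECK_MASK + 1
	 * iterations of the caller's spin loop to limit traffic on its
	 * cacheline.
	 */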
	if ((loop & PV_PREV_CHECK_MASK) != 0)
		return false;

	return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
}

/*
 * Initialize the PV part of the mcs_spinlock node.
 */
static void pv_init_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));

	pn->cpu = smp_processor_id();
	pn->state = vcpu_running;
}

/*
 * Wait for node->locked to become true, halt the vcpu after a short spin.
 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in the hash table on
 * its behalf.
 */
static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct pv_node *pp = (struct pv_node *)prev;
	int loop;
	bool wait_early;

	for (;;) {
		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
			if (READ_ONCE(node->locked))
				return;
			if (pv_wait_early(pp, loop)) {
				wait_early = true;
				break;
			}
			cpu_relax();
		}

		/*
		 * Order pn->state vs pn->locked thusly:
		 *
		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
		 *     MB			      MB
		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
		 *
		 * Matches the cmpxchg() from pv_kick_node().
		 */
		smp_store_mb(pn->state, vcpu_halted);

		if (!READ_ONCE(node->locked)) {
			qstat_inc(qstat_pv_wait_node, true);
			qstat_inc(qstat_pv_wait_early, wait_early);
			pv_wait(&pn->state, vcpu_halted);
		}

		/*
		 * If pv_kick_node() changed us to vcpu_hashed, retain that
		 * value so that pv_wait_head_or_lock() knows to not also try
		 * to hash this lock.
		 */
		cmpxchg(&pn->state, vcpu_halted, vcpu_running);

		/*
		 * If the locked flag is still not set after wakeup, it is a
		 * spurious wakeup and the vCPU should wait again. However,
		 * there is a pretty high overhead for CPU halting and kicking.
		 * So it is better to spin for a while in the hope that the
		 * MCS lock will be released soon.
		 */
		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
	}

	/*
	 * By now our node->locked should be 1 and our caller will not actually
	 * spin-wait for it. We do however rely on our caller to do a
	 * load-acquire for us.
	 */
}

/*
 * Called after setting next->locked = 1 when we're the lock owner.
 *
 * Instead of waking the waiters stuck in pv_wait_node(), advance their state
 * such that they're waiting in pv_wait_head_or_lock(); this avoids a
 * wake/sleep cycle.
 */
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	/*
	 * If the vCPU is indeed halted, advance its state to match that of
	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
	 * observe its next->locked value and advance itself.
	 *
	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
	 *
	 * The write to next->locked in arch_mcs_spin_unlock_contended()
	 * must be ordered before the read of pn->state in the cmpxchg()
	 * below for the code to work correctly. To guarantee full ordering
	 * irrespective of the success or failure of the cmpxchg(),
	 * a relaxed version with explicit barrier is used. The control
	 * dependency will order the reading of pn->state before any
	 * subsequent writes.
	 */
	smp_mb__before_atomic();
	if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
	    != vcpu_halted)
		return;

	/*
	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
	 *
	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
	 * the hash table later on at unlock time, no atomic instruction is
	 * needed.
	 */
	WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
	(void)pv_hash(lock, pn);
}

/*
 * Wait for l->locked to become clear and acquire the lock;
 * halt the vcpu after a short spin.
 * __pv_queued_spin_unlock() will wake us.
 *
 * The current value of the lock will be returned for additional processing.
 */
static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct qspinlock **lp = NULL;
	int waitcnt = 0;
	int loop;

	/*
	 * If pv_kick_node() already advanced our state, we don't need to
	 * insert ourselves into the hash table anymore. Any non-NULL value
	 * of lp works here; it only marks the lock as already hashed so
	 * that the pv_hash() block in the wait loop below is skipped.
	 */
	if (READ_ONCE(pn->state) == vcpu_hashed)
		lp = (struct qspinlock **)1;

	/*
	 * Tracking # of slowpath locking operations
	 */
	qstat_inc(qstat_pv_lock_slowpath, true);

	for (;; waitcnt++) {
		/*
		 * Set correct vCPU state to be used by queue node wait-early
		 * mechanism.
		 */
		WRITE_ONCE(pn->state, vcpu_running);

		/*
		 * Set the pending bit in the active lock spinning loop to
		 * disable lock stealing before attempting to acquire the lock.
		 */
		set_pending(lock);
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (trylock_clear_pending(lock))
				goto gotlock;
			cpu_relax();
		}
		clear_pending(lock);

		if (!lp) { /* ONCE */
			lp = pv_hash(lock, pn);

			/*
			 * We must hash before setting _Q_SLOW_VAL, such that
			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
			 * we'll be sure to be able to observe our hash entry.
			 *
			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
			 *       MB                           RMB
			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
			 *
			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
			 */
			if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
				/*
				 * The lock was free and now we own the lock.
				 * Change the lock value back to _Q_LOCKED_VAL
				 * and unhash the table.
				 */
				WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
				WRITE_ONCE(*lp, NULL);
				goto gotlock;
			}
		}
		WRITE_ONCE(pn->state, vcpu_hashed);
		qstat_inc(qstat_pv_wait_head, true);
		qstat_inc(qstat_pv_wait_again, waitcnt);
		pv_wait(&lock->locked, _Q_SLOW_VAL);

		/*
		 * Because of lock stealing, the queue head vCPU may not be
		 * able to acquire the lock before it has to wait again.
		 */
	}

	/*
	 * The cmpxchg() or xchg() call before coming here provides the
	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
	 * here is to indicate to the compiler that the value will always
	 * be nonzero to enable better code optimization.
	 */
gotlock:
	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}

/*
 * PV versions of the unlock fastpath and slowpath functions to be used
 * instead of queued_spin_unlock().
 */
__visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
	struct pv_node *node;

	if (unlikely(locked != _Q_SLOW_VAL)) {
		WARN(!debug_locks_silent,
		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
		     (unsigned long)lock, atomic_read(&lock->val));
		return;
	}

	/*
	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
	 * so we need a barrier to order the read of the node data in
	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
	 *
	 * Matches the xchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
	 */
	smp_rmb();

	/*
	 * Since the above failed to release, this must be the SLOW path.
	 * Therefore start by looking up the blocked node and unhashing it.
	 */
	node = pv_unhash(lock);

	/*
	 * Now that we have a reference to the (likely) blocked pv_node,
	 * release the lock.
	 */
	smp_store_release(&lock->locked, 0);

	/*
	 * At this point the memory pointed at by lock can be freed/reused,
	 * however we can still use the pv_node to kick the CPU.
	 * The other vCPU may not really be halted, but kicking an active
	 * vCPU is harmless other than the additional latency in completing
	 * the unlock.
	 */
	qstat_inc(qstat_pv_kick_unlock, true);
	pv_kick(node->cpu);
}

/*
 * Include the architecture specific callee-save thunk of the
 * __pv_queued_spin_unlock(). This thunk is put together with
 * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
 * function close to each other sharing consecutive instruction cachelines.
 * Alternatively, an architecture specific version of __pv_queued_spin_unlock()
 * can be defined.
 */
#include <asm/qspinlock_paravirt.h>

#ifndef __pv_queued_spin_unlock
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	u8 locked;

	/*
	 * We must not unlock if SLOW, because in that case we must first
	 * unhash. Otherwise it would be possible to have multiple @lock
	 * entries, which would be BAD.
	 */
	locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
	if (likely(locked == _Q_LOCKED_VAL))
		return;

	__pv_queued_spin_unlock_slowpath(lock, locked);
}
#endif /* __pv_queued_spin_unlock */