/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

static struct blkcg_policy blkcg_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

/*
 * To implement hierarchical throttling, throtl_grps form a tree and bios
 * are dispatched upwards level by level until they reach the top and get
 * issued.  When dispatching bios from the children and local group at each
 * level, if the bios are dispatched into a single bio_list, there's a risk
 * of a local or child group which can queue many bios at once filling up
 * the list starving others.
 *
 * To avoid such starvation, dispatched bios are queued separately
 * according to where they came from.  When they are again dispatched to
 * the parent, they're popped in round-robin order so that no single source
 * hogs the dispatch window.
 *
 * throtl_qnode is used to keep the queued bios separated by their sources.
 * Bios are queued to throtl_qnode which in turn is queued to
 * throtl_service_queue and then dispatched in round-robin order.
 *
 * It's also used to track the reference counts on blkg's.  A qnode always
 * belongs to a throtl_grp and gets queued on itself or the parent, so
 * incrementing the reference of the associated throtl_grp when a qnode is
 * queued and decrementing when dequeued is enough to keep the whole blkg
 * tree pinned while bios are in flight.
 */
struct throtl_qnode {
	struct list_head	node;		/* service_queue->queued[] */
	struct bio_list		bios;		/* queued bios */
	struct throtl_grp	*tg;		/* tg this qnode belongs to */
};
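/*
 * For example (group names here are illustrative, not from the source): if
 * child groups A and B both dispatch into parent P, A's bios sit on
 * A->qnode_on_parent[rw] and B's on B->qnode_on_parent[rw], while bios
 * issued directly against P use P->qnode_on_self[rw].  Each queued qnode
 * holds one blkg reference on its tg, so the subtree stays pinned while any
 * of those bios are in flight.
 */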

struct throtl_service_queue {
	struct throtl_service_queue *parent_sq;	/* the parent service_queue */

	/*
	 * Bios queued directly to this service_queue or dispatched from
	 * children throtl_grp's.
	 */
	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
	unsigned int		nr_queued[2];	/* number of queued bios */

	/*
	 * RB tree of active children throtl_grp's, which are sorted by
	 * their ->disptime.
	 */
	struct rb_root		pending_tree;	/* RB tree of active tgs */
	struct rb_node		*first_pending;	/* first node in the tree */
	unsigned int		nr_pending;	/* # queued in the tree */
	unsigned long		first_pending_disptime;	/* disptime of the first tg */
	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
};

enum tg_state_flags {
	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
};

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_queue member */
	struct rb_node rb_node;

	/* throtl_data this group belongs to */
	struct throtl_data *td;

	/* this group's service queue */
	struct throtl_service_queue service_queue;

	/*
	 * qnode_on_self is used when bios are directly queued to this
	 * throtl_grp so that local bios compete fairly with bios
	 * dispatched from children.  qnode_on_parent is used when bios are
	 * dispatched from this throtl_grp into its parent and will compete
	 * with the sibling qnode_on_parents and the parent's
	 * qnode_on_self.
	 */
	struct throtl_qnode qnode_on_self[2];
	struct throtl_qnode qnode_on_parent[2];

	/*
	 * Dispatch time in jiffies. This is the estimated time when group
	 * will unthrottle and is ready to dispatch more bio. It is used as
	 * key to sort active groups in service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* are there any throtl rules between this group and td? */
	bool has_rules[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_service_queue service_queue;

	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* Work for dispatching throttled bios */
	struct work_struct dispatch_work;
};

static void throtl_pending_timer_fn(unsigned long arg);

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

/**
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
 * embedded in throtl_data, %NULL is returned.
 */
static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
{
	if (sq && sq->parent_sq)
		return container_of(sq, struct throtl_grp, service_queue);
	else
		return NULL;
}

/**
 * sq_to_td - return throtl_data the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * A service_queue can be embedded in either a throtl_grp or throtl_data.
 * Determine the associated throtl_data accordingly and return it.
 */
static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
{
	struct throtl_grp *tg = sq_to_tg(sq);

	if (tg)
		return tg->td;
	else
		return container_of(sq, struct throtl_data, service_queue);
}

/**
 * throtl_log - log debug message via blktrace
 * @sq: the service_queue being reported
 * @fmt: printf format string
 * @args: printf args
 *
 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
 * throtl_grp; otherwise, just "throtl".
 */
#define throtl_log(sq, fmt, args...)	do {				\
	struct throtl_grp *__tg = sq_to_tg((sq));			\
	struct throtl_data *__td = sq_to_td((sq));			\
									\
	(void)__td;							\
	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
		break;							\
	if ((__tg)) {							\
		char __pbuf[128];					\
									\
		blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));	\
		blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
	} else {							\
		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
	}								\
} while (0)
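/*
 * For example, a call like throtl_log(&tg->service_queue, "bio queued") from
 * a group whose cgroup path is "/grp_a" would emit the blktrace message
 * "throtl /grp_a bio queued", while the same call against the top-level
 * throtl_data service_queue emits just "throtl bio queued".  (The path and
 * message text here are illustrative.)
 */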

static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
	INIT_LIST_HEAD(&qn->node);
	bio_list_init(&qn->bios);
	qn->tg = tg;
}

/**
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @queued: the service_queue->queued[] list @qn belongs to
 *
 * Add @bio to @qn and put @qn on @queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated.  See the
 * comment on top of throtl_qnode definition for details.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
				 struct list_head *queued)
{
	bio_list_add(&qn->bios, bio);
	if (list_empty(&qn->node)) {
		list_add_tail(&qn->node, queued);
		blkg_get(tg_to_blkg(qn->tg));
	}
}

/**
 * throtl_peek_queued - peek the first bio on a qnode list
 * @queued: the qnode list to peek
 */
static struct bio *throtl_peek_queued(struct list_head *queued)
{
	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	bio = bio_list_peek(&qn->bios);
	WARN_ON_ONCE(!bio);
	return bio;
}

/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @queued: the qnode list to pop a bio from
 * @tg_to_put: optional out argument for throtl_grp to put
 *
 * Pop the first bio from the qnode list @queued.  After popping, the first
 * qnode is removed from @queued if empty or moved to the end of @queued so
 * that the popping order is round-robin.
 *
 * When the first qnode is removed, its associated throtl_grp should be put
 * too.  If @tg_to_put is NULL, this function automatically puts it;
 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
 * responsible for putting it.
 */
static struct bio *throtl_pop_queued(struct list_head *queued,
				     struct throtl_grp **tg_to_put)
{
	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	bio = bio_list_pop(&qn->bios);
	WARN_ON_ONCE(!bio);

	if (bio_list_empty(&qn->bios)) {
		list_del_init(&qn->node);
		if (tg_to_put)
			*tg_to_put = qn->tg;
		else
			blkg_put(tg_to_blkg(qn->tg));
	} else {
		list_move_tail(&qn->node, queued);
	}

	return bio;
}
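/*
 * Round-robin example (qnode names are illustrative): with @queued holding
 * [qA, qB], qA containing bios a1,a2 and qB containing b1, successive calls
 * pop a1 (qA rotates to the tail), then b1 (qB becomes empty and is
 * removed), then a2 (qA becomes empty and is removed as well).
 */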

/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
	INIT_LIST_HEAD(&sq->queued[0]);
	INIT_LIST_HEAD(&sq->queued[1]);
	sq->pending_tree = RB_ROOT;
	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
		    (unsigned long)sq);
}

static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
{
	struct throtl_grp *tg;
	int rw;

	tg = kzalloc_node(sizeof(*tg), gfp, node);
	if (!tg)
		return NULL;

	throtl_service_queue_init(&tg->service_queue);

	for (rw = READ; rw <= WRITE; rw++) {
		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
	}

	RB_CLEAR_NODE(&tg->rb_node);
	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	return &tg->pd;
}

static void throtl_pd_init(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkcg_gq *blkg = tg_to_blkg(tg);
	struct throtl_data *td = blkg->q->td;
	struct throtl_service_queue *sq = &tg->service_queue;

	/*
	 * If on the default hierarchy, we switch to properly hierarchical
	 * behavior where limits on a given throtl_grp are applied to the
	 * whole subtree rather than just the group itself.  e.g. If 16M
	 * read_bps limit is set on the root group, the whole system can't
	 * exceed 16M for the device.
	 *
	 * If not on the default hierarchy, the broken flat hierarchy
	 * behavior is retained where all throtl_grps are treated as if
	 * they're all separate root groups right below throtl_data.
	 * Limits of a group don't interact with limits of other groups
	 * regardless of the position of the group in the hierarchy.
	 */
	sq->parent_sq = &td->service_queue;
	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
	tg->td = td;
}

/*
 * Set has_rules[] if @tg or any of its parents have limits configured.
 * This doesn't require walking up to the top of the hierarchy as the
 * parent's has_rules[] is guaranteed to be correct.
 */
static void tg_update_has_rules(struct throtl_grp *tg)
{
	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
	int rw;

	for (rw = READ; rw <= WRITE; rw++)
		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
				    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
}
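/*
 * For instance (illustrative): if a parent group has a write bps limit and a
 * child has no limits of its own, the child still ends up with
 * has_rules[WRITE] set because parent_tg->has_rules[WRITE] is true, so its
 * bios are not allowed to bypass blk-throttle.
 */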

static void throtl_pd_online(struct blkg_policy_data *pd)
{
	/*
	 * We don't want new groups to escape the limits of their ancestors.
	 * Update has_rules[] after a new group is brought online.
	 */
	tg_update_has_rules(pd_to_tg(pd));
}

static void throtl_pd_free(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);

	del_timer_sync(&tg->service_queue.pending_timer);
	kfree(tg);
}

static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
	/* Service tree is empty */
	if (!parent_sq->nr_pending)
		return NULL;

	if (!parent_sq->first_pending)
		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);

	if (parent_sq->first_pending)
		return rb_entry_tg(parent_sq->first_pending);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n,
			    struct throtl_service_queue *parent_sq)
{
	if (parent_sq->first_pending == n)
		parent_sq->first_pending = NULL;
	rb_erase_init(n, &parent_sq->pending_tree);
	--parent_sq->nr_pending;
}

static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(parent_sq);
	if (!tg)
		return;

	parent_sq->first_pending_disptime = tg->disptime;
}

static void tg_service_queue_add(struct throtl_grp *tg)
{
	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
	struct rb_node **node = &parent_sq->pending_tree.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		parent_sq->first_pending = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
}

static void __throtl_enqueue_tg(struct throtl_grp *tg)
{
	tg_service_queue_add(tg);
	tg->flags |= THROTL_TG_PENDING;
	tg->service_queue.parent_sq->nr_pending++;
}

static void throtl_enqueue_tg(struct throtl_grp *tg)
{
	if (!(tg->flags & THROTL_TG_PENDING))
		__throtl_enqueue_tg(tg);
}

static void __throtl_dequeue_tg(struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
	tg->flags &= ~THROTL_TG_PENDING;
}

static void throtl_dequeue_tg(struct throtl_grp *tg)
{
	if (tg->flags & THROTL_TG_PENDING)
		__throtl_dequeue_tg(tg);
}

/* Call with queue lock held */
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
					  unsigned long expires)
{
	mod_timer(&sq->pending_timer, expires);
	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
		   expires - jiffies, jiffies);
}

/**
 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
 * @sq: the service_queue to schedule dispatch for
 * @force: force scheduling
 *
 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
 * dispatch time of the first pending child.  Returns %true if either timer
 * is armed or there's no pending child left.  %false if the current
 * dispatch window is still open and the caller should continue
 * dispatching.
 *
 * If @force is %true, the dispatch timer is always scheduled and this
 * function is guaranteed to return %true.  This is to be used when the
 * caller can't dispatch itself and needs to invoke pending_timer
 * unconditionally.  Note that forced scheduling is likely to induce short
 * delay before dispatch starts even if @sq->first_pending_disptime is not
 * in the future and thus shouldn't be used in hot paths.
 */
static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
					  bool force)
{
	/* any pending children left? */
	if (!sq->nr_pending)
		return true;

	update_min_dispatch_time(sq);

	/* is the next dispatch time in the future? */
	if (force || time_after(sq->first_pending_disptime, jiffies)) {
		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
		return true;
	}

	/* tell the caller to continue dispatching */
	return false;
}
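/*
 * For example (illustrative timings): if the earliest pending child's
 * disptime is 5 jiffies in the future, the pending_timer is armed for that
 * time and %true is returned; if that disptime has already passed and @force
 * is %false, %false is returned and the caller keeps dispatching inline
 * instead of waiting for the timer.
 */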

static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
		bool rw, unsigned long start)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;

	/*
	 * Previous slice has expired. We must have trimmed it after last
	 * bio dispatch. That means since start of last slice, we never used
	 * that bandwidth. Do try to make use of that bandwidth while giving
	 * credit.
	 */
	if (time_after_eq(start, tg->slice_start[rw]))
		tg->slice_start[rw] = start;

	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log(&tg->service_queue,
		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log(&tg->service_queue,
		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
					unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log(&tg->service_queue,
		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return false;

	return true;
}

/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially cgroup limit was very low resulting in high
	 * slice_end, but later limit was bumped up and bio was dispatched
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
	 * is bad because it does not allow new slice to start.
	 */

	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
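	/*
	 * Worked example (illustrative numbers): with HZ=1000, throtl_slice=100,
	 * bps=1048576 (1MiB/s), iops=100 and nr_slices=2, bytes_trim is
	 * 1048576 * 100 * 2 / 1000 = 209715 bytes and io_trim is
	 * 100 * 100 * 2 / 1000 = 20 ios worth of quota taken back.
	 */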

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log(&tg->service_queue,
		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
				  unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
	 * will allow dispatch after 1 second and after that slice should
	 * have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;
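	/*
	 * Worked example (illustrative numbers): with HZ=1000, iops=100 and
	 * jiffy_elapsed_rnd=100 (one 100ms slice), io_allowed is
	 * 100 * 100 / 1000 = 10, i.e. up to 10 bios may go out in this slice.
	 */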

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return false;
}

static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
				 unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;
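	/*
	 * Worked example (illustrative numbers): with HZ=1000, bps=1048576
	 * (1MiB/s) and jiffy_elapsed_rnd=100 (one 100ms slice), bytes_allowed
	 * is 1048576 * 100 / 1000 = 104857 bytes, roughly 102KiB per slice.
	 */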

	if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return false;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns approx number
 * of jiffies to wait before this bio is within IO rate and can be dispatched
 */
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
			    unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
 	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->service_queue.nr_queued[rw] &&
	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return true;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now. New slice is started only for empty throttle group.
	 * If there is queued bio, that means there should be an active
	 * slice and it should be extended instead.
	 */
	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
		throtl_start_new_slice(tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return true;
	}

	max_wait = max(bps_wait, iops_wait);
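	/*
	 * E.g. (illustrative) if the bps limit allows dispatch in 3 jiffies
	 * but the iops limit needs 7, the group must wait the larger of the
	 * two, so max_wait is 7 jiffies.
	 */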

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(tg, rw, jiffies + max_wait);

	return false;
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
	tg->io_disp[rw]++;

	/*
	 * BIO_THROTTLED is used to prevent the same bio from being throttled
	 * more than once as a throttled bio will go through blk-throtl the
	 * second time when it eventually gets issued.  Set it when a bio
	 * is being charged to a tg.
	 */
	if (!bio_flagged(bio, BIO_THROTTLED))
		bio_set_flag(bio, BIO_THROTTLED);
}

/**
 * throtl_add_bio_tg - add a bio to the specified throtl_grp
 * @bio: bio to add
 * @qn: qnode to use
 * @tg: the target throtl_grp
 *
 * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
 * tg->qnode_on_self[] is used.
 */
static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
			      struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool rw = bio_data_dir(bio);

	if (!qn)
		qn = &tg->qnode_on_self[rw];

	/*
	 * If @tg doesn't currently have any bios queued in the same
	 * direction, queueing @bio can change when @tg should be
	 * dispatched.  Mark that @tg was empty.  This is automatically
	 * cleared on the next tg_update_disptime().
	 */
	if (!sq->nr_queued[rw])
		tg->flags |= THROTL_TG_WAS_EMPTY;

	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);

	sq->nr_queued[rw]++;
	throtl_enqueue_tg(tg);
}

static void tg_update_disptime(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = throtl_peek_queued(&sq->queued[READ])))
		tg_may_dispatch(tg, bio, &read_wait);

	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
		tg_may_dispatch(tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(tg);

	/* see throtl_add_bio_tg() */
	tg->flags &= ~THROTL_TG_WAS_EMPTY;
}

static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
					struct throtl_grp *parent_tg, bool rw)
{
	if (throtl_slice_used(parent_tg, rw)) {
		throtl_start_new_slice_with_credit(parent_tg, rw,
				child_tg->slice_start[rw]);
	}

}

static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct throtl_service_queue *parent_sq = sq->parent_sq;
	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
	struct throtl_grp *tg_to_put = NULL;
	struct bio *bio;

	/*
	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
	 * from @tg may put its reference and @parent_sq might end up
	 * getting released prematurely.  Remember the tg to put and put it
	 * after @bio is transferred to @parent_sq.
	 */
	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
	sq->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);

	/*
	 * If our parent is another tg, we just need to transfer @bio to
	 * the parent using throtl_add_bio_tg().  If our parent is
	 * @td->service_queue, @bio is ready to be issued.  Put it on its
	 * bio_lists[] and decrease total number queued.  The caller is
	 * responsible for issuing these bios.
	 */
	if (parent_tg) {
		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
		start_parent_slice_with_credit(tg, parent_tg, rw);
	} else {
		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
				     &parent_sq->queued[rw]);
		BUG_ON(tg->td->nr_queued[rw] <= 0);
		tg->td->nr_queued[rw]--;
	}

	throtl_trim_slice(tg, rw);

	if (tg_to_put)
		blkg_put(tg_to_blkg(tg_to_put));
}

static int throtl_dispatch_tg(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */
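	/*
	 * With the default throtl_grp_quantum of 8, max_nr_reads is 8*3/4 = 6
	 * and max_nr_writes is 8 - 6 = 2, so at most 6 reads and 2 writes are
	 * moved out of this group per call.
	 */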

	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio));
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio));
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
{
	unsigned int nr_disp = 0;

	while (1) {
		struct throtl_grp *tg = throtl_rb_first(parent_sq);
		struct throtl_service_queue *sq;

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		sq = &tg->service_queue;
		throtl_dequeue_tg(tg);

		nr_disp += throtl_dispatch_tg(tg);

		if (sq->nr_queued[0] || sq->nr_queued[1])
			tg_update_disptime(tg);

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

/**
 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
 * @arg: the throtl_service_queue being serviced
 *
 * This timer is armed when a child throtl_grp with active bios becomes
 * pending and queued on the service_queue's pending_tree and expires when
 * the first child throtl_grp should be dispatched.  This function
 * dispatches bio's from the children throtl_grps to the parent
 * service_queue.
 *
 * If the parent's parent is another throtl_grp, dispatching is propagated
 * by either arming its pending_timer or repeating dispatch directly.  If
 * the top-level service_tree is reached, throtl_data->dispatch_work is
 * kicked so that the ready bio's are issued.
 */
static void throtl_pending_timer_fn(unsigned long arg)
{
	struct throtl_service_queue *sq = (void *)arg;
	struct throtl_grp *tg = sq_to_tg(sq);
	struct throtl_data *td = sq_to_td(sq);
	struct request_queue *q = td->queue;
	struct throtl_service_queue *parent_sq;
	bool dispatched;
	int ret;

	spin_lock_irq(q->queue_lock);
again:
	parent_sq = sq->parent_sq;
	dispatched = false;

	while (true) {
		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
			   sq->nr_queued[READ], sq->nr_queued[WRITE]);

		ret = throtl_select_dispatch(sq);
		if (ret) {
			throtl_log(sq, "bios disp=%u", ret);
			dispatched = true;
		}

		if (throtl_schedule_next_dispatch(sq, false))
			break;

		/* this dispatch window is still open, relax and repeat */
		spin_unlock_irq(q->queue_lock);
		cpu_relax();
		spin_lock_irq(q->queue_lock);
	}

	if (!dispatched)
		goto out_unlock;

	if (parent_sq) {
		/* @parent_sq is another throtl_grp, propagate dispatch */
		if (tg->flags & THROTL_TG_WAS_EMPTY) {
			tg_update_disptime(tg);
			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
				/* window is already open, repeat dispatching */
				sq = parent_sq;
				tg = sq_to_tg(sq);
				goto again;
			}
		}
	} else {
		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
out_unlock:
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bio's reach the bio_lists[]
 * of throtl_data->service_queue.  Those bio's are ready and issued by this
 * function.
 */
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					      dispatch_work);
	struct throtl_service_queue *td_sq = &td->service_queue;
	struct request_queue *q = td->queue;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;
	int rw;

	bio_list_init(&bio_list_on_stack);

	spin_lock_irq(q->queue_lock);
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
			bio_list_add(&bio_list_on_stack, bio);
	spin_unlock_irq(q->queue_lock);

	if (!bio_list_empty(&bio_list_on_stack)) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static void tg_conf_updated(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	throtl_log(&tg->service_queue,
		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
		   tg->bps[READ], tg->bps[WRITE],
		   tg->iops[READ], tg->iops[WRITE]);

	/*
	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
	 * considered to have rules if either the tg itself or any of its
	 * ancestors has rules.  This identifies groups without any
	 * restrictions in the whole hierarchy and allows them to bypass
	 * blk-throttle.
	 */
	blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
		tg_update_has_rules(blkg_to_tg(blkg));

	/*
	 * We're already holding queue_lock and know @tg is valid.  Let's
	 * apply the new config directly.
	 *
	 * Restart the slices for both READ and WRITES. It might happen
	 * that a group's limits are dropped suddenly and we don't want to
	 * account recently dispatched IO with new low rate.
	 */
	throtl_start_new_slice(tg, 0);
	throtl_start_new_slice(tg, 1);

	if (tg->flags & THROTL_TG_PENDING) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(sq->parent_sq, true);
	}
}

static ssize_t tg_set_conf(struct kernfs_open_file *of,
			   char *buf, size_t nbytes, loff_t off, bool is_u64)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	int ret;
	u64 v;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &v) != 1)
		goto out_finish;
	if (!v)
		v = -1;

	tg = blkg_to_tg(ctx.blkg);

	if (is_u64)
		*(u64 *)((void *)tg + of_cft(of)->private) = v;
	else
		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;

	tg_conf_updated(tg);
	ret = 0;
out_finish:
	blkg_conf_finish(&ctx);