/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

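/* Map a cgroup to the blkio_cgroup embedded in its blkio subsystem state. */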
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

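/* Resolve the blkio_cgroup a bio is charged to, falling back to %current's. */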
struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

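/* Tell the policy identified by @plid that @blkg's weight has changed. */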
static inline void blkio_update_group_weight(struct blkio_group *blkg,
					     int plid, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
							blkg, weight);
	}
}

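/* Propagate a read or write bps limit update to the policy owning @blkg. */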
static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
					  u64 bps, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								blkg, bps);
	}
}

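/* Propagate a read or write iops limit update to the policy owning @blkg. */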
static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   int plid, unsigned int iops,
					   int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								blkg, iops);
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the queue_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_policy_type *pol,
					    struct blkio_group *curr_blkg)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	if (blkio_blkg_waiting(&pd->stats))
		return;
	if (blkg == curr_blkg)
		return;
	pd->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&pd->stats);
}

/* This should be called with the queue_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	blkio_clear_blkg_empty(stats);
}

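/* Mark the start of an idling period for @blkg under policy @pol. */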
void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
					struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);
	BUG_ON(blkio_blkg_idling(stats));

	stats->start_idle_time = sched_clock();
	blkio_mark_blkg_idling(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

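/* End the current idling period, if any, and account the elapsed idle time. */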
void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (blkio_blkg_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		blkio_clear_blkg_idling(stats);
	}
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

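/* Take one sample of the queued request count for average queue size stats. */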
void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
					 struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_sum(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	blkio_update_group_wait_time(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

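/* Record when @blkg ran out of queued requests, unless it is already empty. */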
void blkiocg_set_start_empty_time(struct blkio_group *blkg,
				  struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (blkg_rwstat_sum(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

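/* Account @dequeue dequeue events in @blkg's debug statistics. */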
void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  struct blkio_policy_type *pol,
				  unsigned long dequeue)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_stat_add(&pd->stats.dequeue, dequeue);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_policy_type *pol,
					struct blkio_group *curr_blkg) { }
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
#endif

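/* A request was queued for @blkg; update queued, empty and group wait stats. */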
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_policy_type *pol,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_rwstat_add(&stats->queued, rw, 1);
	blkio_end_empty_time(stats);
	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

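/* A request was removed from @blkg's queue; decrement the queued counter. */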
void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_rwstat_add(&stats->queued, rw, -1);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

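/* Charge @time of used timeslice (and debug-only @unaccounted_time) to @blkg. */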
void blkiocg_update_timeslice_used(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   unsigned long time,
				   unsigned long unaccounted_time)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_stat_add(&stats->time, time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add(&stats->unaccounted_time, unaccounted_time);
#endif
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   uint64_t bytes, bool direction, bool sync)
{
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (pd->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(pd->stats_cpu);

	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

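/* A request completed; account its service time and wait time to @blkg. */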
void blkiocg_update_completion_stats(struct blkio_group *blkg,
				     struct blkio_policy_type *pol,
				     uint64_t start_time,
				     uint64_t io_start_time, bool direction,
				     bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	unsigned long long now = sched_clock();
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, rw,
				io_start_time - start_time);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/*  Merged stats are per cpu.  */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_rwstat_add(&stats->merged, rw, 1);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

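/*
 * Look up the blkg for the @blkcg - @q pair, creating and linking a new one
 * if it doesn't exist yet.  Expects queue_lock to be held and to be called
 * under rcu_read_lock().
 */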
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

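/*
 * Unlink @blkg from its request_queue and blkcg and drop the reference taken
 * at creation time.  Both queue_lock and blkcg->lock must be held.
 */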
static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

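/* Clear the per-cpu stats of @blkg for policy @plid, if they are allocated. */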
static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
		blkg_stat_reset(&sc->sectors);
	}
}

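/* cgroup file write handler that clears the resettable stats of every blkg. */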
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			blkg_rwstat_reset(&stats->merged);
			blkg_rwstat_reset(&stats->service_time);
			blkg_rwstat_reset(&stats->wait_time);
			blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
			blkg_stat_reset(&stats->unaccounted_time);
			blkg_stat_reset(&stats->avg_queue_size_sum);
			blkg_stat_reset(&stats->avg_queue_size_samples);
			blkg_stat_reset(&stats->dequeue);
			blkg_stat_reset(&stats->group_wait_time);
			blkg_stat_reset(&stats->idle_time);
			blkg_stat_reset(&stats->empty_time);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
static void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
			      u64 (*prfill)(struct seq_file *,
					    struct blkg_policy_data *, int),
			      int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
static u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			     u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
static u64 __blkg_prfill_rwstat(struct seq_file *sf,
				struct blkg_policy_data *pd,
				const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
			    int off)
{
	return __blkg_prfill_u64(sf, pd,
				 blkg_stat_read((void *)&pd->stats + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
static int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
			    struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
static int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}

static u64 blkg_prfill_cpu_stat(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	u64 v = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		v += blkg_stat_read((void *)sc + off);
	}

	return __blkg_prfill_u64(sf, pd, v);
}

static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print per-cpu blkg_stat specified by BLKCG_STAT_PRIV() */
static int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
				struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}

/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
				  struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 samples = blkg_stat_read(&pd->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&pd->stats.avg_queue_size_sum);
		do_div(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}

/* print avg_queue_size */
static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
				      struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_avg_queue_size,
			  BLKIO_POLICY_PROP, 0, false);
	return 0;
}
#endif	/* CONFIG_DEBUG_BLK_CGROUP */

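/* Parse a per-device rule written to a blkio cftype and apply it for @plid. */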
static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
				      int fileid, struct blkio_cgroup *blkcg)
{
	struct gendisk *disk = NULL;
	struct blkio_group *blkg = NULL;
	struct blkg_policy_data *pd;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent from inputting too many things */
		if (i == 3)