/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
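
/*
 * Usage sketch (illustrative only, not copied from a specific caller):
 * code that needs the cgroup owning a bio, e.g. to look up the blkg to
 * charge, typically does so under rcu so the returned css stays valid:
 *
 *	rcu_read_lock();
 *	blkcg = bio_blkio_cgroup(bio);
 *	blkg = blkg_lookup(blkcg, q);
 *	...
 *	rcu_read_unlock();
 */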

static inline void blkio_update_group_weight(struct blkio_group *blkg,
					     int plid, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
							blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
					  u64 bps, int rw)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (rw == READ && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								blkg, bps);

		if (rw == WRITE && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								blkg, bps);
	}
}

static inline void blkio_update_group_iops(struct blkio_group *blkg, int plid,
					   u64 iops, int rw)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (rw == READ && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								blkg, iops);

		if (rw == WRITE && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								blkg, iops);
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the queue_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_policy_type *pol,
					    struct blkio_group *curr_blkg)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	if (blkio_blkg_waiting(&pd->stats))
		return;
	if (blkg == curr_blkg)
		return;
	pd->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&pd->stats);
}

/* This should be called with the queue_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
					struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);
	BUG_ON(blkio_blkg_idling(stats));

	stats->start_idle_time = sched_clock();
	blkio_mark_blkg_idling(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (blkio_blkg_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		blkio_clear_blkg_idling(stats);
	}
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
					 struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_sum(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	blkio_update_group_wait_time(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg,
				  struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (blkg_rwstat_sum(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  struct blkio_policy_type *pol,
				  unsigned long dequeue)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_stat_add(&pd->stats.dequeue, dequeue);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_policy_type *pol,
					struct blkio_group *curr_blkg) { }
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_policy_type *pol,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_rwstat_add(&stats->queued, rw, 1);
	blkio_end_empty_time(stats);
	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_rwstat_add(&stats->queued, rw, -1);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   unsigned long time,
				   unsigned long unaccounted_time)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_stat_add(&stats->time, time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add(&stats->unaccounted_time, unaccounted_time);
#endif
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   uint64_t bytes, bool direction, bool sync)
{
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (pd->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(pd->stats_cpu);

	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
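
/*
 * Call-pattern sketch (assumed, not copied from a particular policy): a
 * policy accounting a dispatched request would typically do something
 * like the following while holding the rcu read lock or the queue lock:
 *
 *	rcu_read_lock();
 *	blkiocg_update_dispatch_stats(blkg, pol, blk_rq_bytes(rq),
 *				      rq_data_dir(rq), rq_is_sync(rq));
 *	rcu_read_unlock();
 */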

void blkiocg_update_completion_stats(struct blkio_group *blkg,
				     struct blkio_policy_type *pol,
				     uint64_t start_time,
				     uint64_t io_start_time, bool direction,
				     bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	unsigned long long now = sched_clock();
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, rw,
				io_start_time - start_time);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/*  Merged stats are per cpu.  */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);

	lockdep_assert_held(blkg->q->queue_lock);

	blkg_rwstat_add(&stats->merged, rw, 1);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
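
/*
 * Lookup/create call pattern (sketch): callers are expected to hold both
 * the rcu read lock and the queue lock, as blkg_conf_prep() below does:
 *
 *	rcu_read_lock();
 *	spin_lock_irq(q->queue_lock);
 *	blkg = blkg_lookup_create(blkcg, q, false);
 *	if (IS_ERR(blkg))
 *		... handle -EBUSY / -EINVAL / -ENOMEM ...
 *	spin_unlock_irq(q->queue_lock);
 *	rcu_read_unlock();
 */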

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
		blkg_stat_reset(&sc->sectors);
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			blkg_rwstat_reset(&stats->merged);
			blkg_rwstat_reset(&stats->service_time);
			blkg_rwstat_reset(&stats->wait_time);
			blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
			blkg_stat_reset(&stats->unaccounted_time);
			blkg_stat_reset(&stats->avg_queue_size_sum);
			blkg_stat_reset(&stats->avg_queue_size_samples);
			blkg_stat_reset(&stats->dequeue);
			blkg_stat_reset(&stats->group_wait_time);
			blkg_stat_reset(&stats->idle_time);
			blkg_stat_reset(&stats->empty_time);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
		       int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
			    int off)
{
	return __blkg_prfill_u64(sf, pd,
				 blkg_stat_read((void *)&pd->stats + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
		     struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
		       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);

static u64 blkg_prfill_cpu_stat(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	u64 v = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		v += blkg_stat_read((void *)sc + off);
	}

	return __blkg_prfill_u64(sf, pd, v);
}

static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print per-cpu blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
			 struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_cpu_stat);

/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			   struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_cpu_rwstat);
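
/*
 * Wiring sketch (hypothetical policy, assuming the BLKCG_STAT_PRIV()
 * helper that pairs with BLKCG_STAT_POL()/BLKCG_STAT_OFF() above): a
 * policy can point a cftype read handler at these print helpers and
 * encode the policy id and stat offset in ->private, roughly:
 *
 *	{
 *		.name = "example.io_service_bytes",
 *		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
 *				offsetof(struct blkio_group_stats_cpu,
 *					 service_bytes)),
 *		.read_seq_string = blkcg_print_cpu_rwstat,
 *	},
 */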

#ifdef CONFIG_DEBUG_BLK_CGROUP
static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 samples = blkg_stat_read(&pd->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&pd->stats.avg_queue_size_sum);
		do_div(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}

/* print avg_queue_size */
static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
				      struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_avg_queue_size,
			  BLKIO_POLICY_PROP, 0, false);
	return 0;
}
#endif	/* CONFIG_DEBUG_BLK_CGROUP */

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);