/*
 *	An async IO implementation for Linux
 *	Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *	Implements an efficient asynchronous io interface.
 *
 *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
 *
 *	See ../COPYING for licensing terms.
 */
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>

#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/anon_inodes.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>

#include <asm/kmap_types.h>
#include <asm/uaccess.h>

#include "internal.h"

#define AIO_RING_MAGIC			0xa10a10a1
#define AIO_RING_COMPAT_FEATURES	1
#define AIO_RING_INCOMPAT_FEATURES	0
struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */


	struct io_event		io_events[0];
}; /* 128 bytes + ring size */

#define AIO_RING_PAGES	8

struct kioctx {
	atomic_t		users;
	atomic_t		dead;

	/* This needs improving */
	unsigned long		user_id;
	struct hlist_node	list;

	/*
	 * This is what userspace passed to io_setup(), it's not used for
	 * anything but counting against the global max_reqs quota.
	 *
	 * The real limit is nr_events - 1, which will be larger (see
	 * aio_setup_ring())
	 */
	unsigned		max_reqs;

	/* Size of ringbuffer, in units of struct io_event */
	unsigned		nr_events;

	unsigned long		mmap_base;
	unsigned long		mmap_size;

	struct page		**ring_pages;
	long			nr_pages;

	struct rcu_head		rcu_head;
	struct work_struct	rcu_work;

	struct {
		atomic_t	reqs_active;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t	ctx_lock;
		struct list_head active_reqs;	/* used for cancellation */
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex	ring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned	tail;
		spinlock_t	completion_lock;
	} ____cacheline_aligned_in_smp;

	struct page		*internal_pages[AIO_RING_PAGES];
	struct file		*aio_ring_file;
};

/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
unsigned long aio_nr;		/* current system wide number of aio requests */
unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/

static struct kmem_cache	*kiocb_cachep;
static struct kmem_cache	*kioctx_cachep;

/* aio_setup
 *	Creates the slab caches used by the aio routines, panic on
 *	failure as this is done early during the boot sequence.
 */
static int __init aio_setup(void)
{
	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC);

	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));

	return 0;
}
__initcall(aio_setup);

static void aio_free_ring(struct kioctx *ctx)
{
	int i;
	struct file *aio_ring_file = ctx->aio_ring_file;

	for (i = 0; i < ctx->nr_pages; i++) {
		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
				page_count(ctx->ring_pages[i]));
		put_page(ctx->ring_pages[i]);
	}

	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
		kfree(ctx->ring_pages);

	if (aio_ring_file) {
		truncate_setsize(aio_ring_file->f_inode, 0);
		pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
			current->pid, aio_ring_file->f_inode->i_nlink,
			aio_ring_file->f_path.dentry->d_count,
			d_unhashed(aio_ring_file->f_path.dentry),
			atomic_read(&aio_ring_file->f_inode->i_count));
		fput(aio_ring_file);
		ctx->aio_ring_file = NULL;
	}
}

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

static const struct file_operations aio_ring_fops = {
	.mmap = aio_ring_mmap,
};

static int aio_set_page_dirty(struct page *page)
{
	return 0;
}

static int aio_migratepage(struct address_space *mapping, struct page *new,
			struct page *old, enum migrate_mode mode)
{
	struct kioctx *ctx = mapping->private_data;
	unsigned long flags;
	unsigned idx = old->index;
	int rc;

	/* Writeback must be complete */
	BUG_ON(PageWriteback(old));
	put_page(old);

	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
	if (rc != MIGRATEPAGE_SUCCESS) {
		get_page(old);
		return rc;
	}

	get_page(new);

	spin_lock_irqsave(&ctx->completion_lock, flags);
	migrate_page_copy(new, old);
	ctx->ring_pages[idx] = new;
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	return rc;
}

static const struct address_space_operations aio_ctx_aops = {
	.set_page_dirty = aio_set_page_dirty,
	.migratepage	= aio_migratepage,
};

static int aio_setup_ring(struct kioctx *ctx)
{
	struct aio_ring *ring;
	unsigned nr_events = ctx->max_reqs;
	struct mm_struct *mm = current->mm;
	unsigned long size, populate;
	int nr_pages;
	int i;
	struct file *file;

	/* Compensate for the ring buffer's head/tail overlap entry */
	nr_events += 2;	/* 1 is required, 2 for good luck */

	size = sizeof(struct aio_ring);
	size += sizeof(struct io_event) * nr_events;

	nr_pages = PFN_UP(size);
	if (nr_pages < 0)
		return -EINVAL;

	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
	if (IS_ERR(file)) {
		ctx->aio_ring_file = NULL;
		return -EAGAIN;
	}

	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
	file->f_inode->i_mapping->private_data = ctx;
	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;

	for (i = 0; i < nr_pages; i++) {
		struct page *page;
		page = find_or_create_page(file->f_inode->i_mapping,
					   i, GFP_HIGHUSER | __GFP_ZERO);
		if (!page)
			break;
		pr_debug("pid(%d) page[%d]->count=%d\n",
			 current->pid, i, page_count(page));
		SetPageUptodate(page);
		SetPageDirty(page);
		unlock_page(page);
	}
	ctx->aio_ring_file = file;
	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
			/ sizeof(struct io_event);

	ctx->ring_pages = ctx->internal_pages;
	if (nr_pages > AIO_RING_PAGES) {
		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
					  GFP_KERNEL);
		if (!ctx->ring_pages)
			return -ENOMEM;
	}

	ctx->mmap_size = nr_pages * PAGE_SIZE;
	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);

	down_write(&mm->mmap_sem);
	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
				       PROT_READ | PROT_WRITE,
				       MAP_SHARED | MAP_POPULATE, 0, &populate);
	if (IS_ERR((void *)ctx->mmap_base)) {
		up_write(&mm->mmap_sem);
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -EAGAIN;
	}
	up_write(&mm->mmap_sem);

	mm_populate(ctx->mmap_base, populate);

	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
				       1, 0, ctx->ring_pages, NULL);
	for (i = 0; i < ctx->nr_pages; i++)
		put_page(ctx->ring_pages[i]);

	if (unlikely(ctx->nr_pages != nr_pages)) {
		aio_free_ring(ctx);
		return -EAGAIN;
	}

	ctx->user_id = ctx->mmap_base;
	ctx->nr_events = nr_events; /* trusted copy */

	ring = kmap_atomic(ctx->ring_pages[0]);
	ring->nr = nr_events;	/* user copy */
	ring->id = ctx->user_id;
	ring->head = ring->tail = 0;
	ring->magic = AIO_RING_MAGIC;
	ring->compat_features = AIO_RING_COMPAT_FEATURES;
	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
	ring->header_length = sizeof(struct aio_ring);
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);

	return 0;
}

#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
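
/*
 * Worked example (a sketch, assuming 4096-byte pages and 4-byte unsigned):
 * sizeof(struct io_event) is 32 bytes and sizeof(struct aio_ring) is 32
 * bytes, so AIO_EVENTS_PER_PAGE is 128, AIO_EVENTS_FIRST_PAGE is 127 and
 * AIO_EVENTS_OFFSET is 1.  The first ring page holds the header plus 127
 * events; every later page holds 128.  Event i therefore lives at page
 * (i + AIO_EVENTS_OFFSET) / AIO_EVENTS_PER_PAGE, slot
 * (i + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE, which is the indexing
 * aio_complete() and aio_read_events_ring() use below.
 */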

void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
{
	struct kioctx *ctx = req->ki_ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->ctx_lock, flags);

	if (!req->ki_list.next)
		list_add(&req->ki_list, &ctx->active_reqs);

	req->ki_cancel = cancel;

	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
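
/*
 * Hypothetical driver-side sketch (not code from this file) of how an
 * aio-aware ->aio_read()/->aio_write() implementation might use
 * kiocb_set_cancel_fn().  kiocb_cancel() below takes a kiocb reference and
 * pre-fills the io_event before invoking the callback; drivers of this era
 * (e.g. the USB gadget code) dropped that reference themselves:
 *
 *	static int my_cancel(struct kiocb *iocb, struct io_event *event)
 *	{
 *		// unwind the in-flight request here
 *		event->res = -EAGAIN;		// reported to the canceller
 *		aio_put_req(iocb);		// drop kiocb_cancel()'s ref
 *		return 0;
 *	}
 *
 *	// in the submission path, once the request can be torn down safely:
 *	kiocb_set_cancel_fn(iocb, my_cancel);
 */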

static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
			struct io_event *res)
{
	kiocb_cancel_fn *old, *cancel;
	int ret = -EINVAL;

	/*
	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
	 * actually has a cancel function, hence the cmpxchg()
	 */

	cancel = ACCESS_ONCE(kiocb->ki_cancel);
	do {
		if (!cancel || cancel == KIOCB_CANCELLED)
			return ret;

		old = cancel;
		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
	} while (cancel != old);

	atomic_inc(&kiocb->ki_users);
	spin_unlock_irq(&ctx->ctx_lock);

	memset(res, 0, sizeof(*res));
	res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
	res->data = kiocb->ki_user_data;
	ret = cancel(kiocb, res);

	spin_lock_irq(&ctx->ctx_lock);

	return ret;
}

static void free_ioctx_rcu(struct rcu_head *head)
{
	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
	kmem_cache_free(kioctx_cachep, ctx);
}

/*
 * When this function runs, the kioctx has been removed from the "hash table"
 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
 * now it's safe to cancel any that need to be.
 */
static void free_ioctx(struct kioctx *ctx)
{
	struct aio_ring *ring;
	struct io_event res;
	struct kiocb *req;
	unsigned head, avail;

	spin_lock_irq(&ctx->ctx_lock);

	while (!list_empty(&ctx->active_reqs)) {
		req = list_first_entry(&ctx->active_reqs,
				       struct kiocb, ki_list);

		list_del_init(&req->ki_list);
		kiocb_cancel(ctx, req, &res);
	}

	spin_unlock_irq(&ctx->ctx_lock);

	ring = kmap_atomic(ctx->ring_pages[0]);
	head = ring->head;
	kunmap_atomic(ring);

	while (atomic_read(&ctx->reqs_active) > 0) {
		wait_event(ctx->wait,
				head != ctx->tail ||
				atomic_read(&ctx->reqs_active) <= 0);

		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;

		atomic_sub(avail, &ctx->reqs_active);
		head += avail;
		head %= ctx->nr_events;
	}

	WARN_ON(atomic_read(&ctx->reqs_active) < 0);

	aio_free_ring(ctx);

	pr_debug("freeing %p\n", ctx);

	/*
	 * Here the call_rcu() is between the wait_event() for reqs_active to
	 * hit 0, and freeing the ioctx.
	 *
	 * aio_complete() decrements reqs_active, but it has to touch the ioctx
	 * after to issue a wakeup so we use rcu.
	 */
	call_rcu(&ctx->rcu_head, free_ioctx_rcu);
}

static void put_ioctx(struct kioctx *ctx)
{
	if (unlikely(atomic_dec_and_test(&ctx->users)))
		free_ioctx(ctx);
}

/* ioctx_alloc
 *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
 */
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx;
	int err = -ENOMEM;

	/* Prevent overflows */
	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}

	if (!nr_events || (unsigned long)nr_events > aio_max_nr)
		return ERR_PTR(-EAGAIN);

	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	ctx->max_reqs = nr_events;

	atomic_set(&ctx->users, 2);
	atomic_set(&ctx->dead, 0);
	spin_lock_init(&ctx->ctx_lock);
	spin_lock_init(&ctx->completion_lock);
	mutex_init(&ctx->ring_lock);
	init_waitqueue_head(&ctx->wait);

	INIT_LIST_HEAD(&ctx->active_reqs);

	if (aio_setup_ring(ctx) < 0)
		goto out_freectx;

	/* limit the number of system wide aios */
	spin_lock(&aio_nr_lock);
	if (aio_nr + nr_events > aio_max_nr ||
	    aio_nr + nr_events < aio_nr) {
		spin_unlock(&aio_nr_lock);
		goto out_cleanup;
	}
	aio_nr += ctx->max_reqs;
	spin_unlock(&aio_nr_lock);

	/* now link into global list. */
	spin_lock(&mm->ioctx_lock);
	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
	spin_unlock(&mm->ioctx_lock);

	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
		 ctx, ctx->user_id, mm, ctx->nr_events);
	return ctx;

out_cleanup:
	err = -EAGAIN;
	aio_free_ring(ctx);
out_freectx:
	if (ctx->aio_ring_file)
		fput(ctx->aio_ring_file);
	kmem_cache_free(kioctx_cachep, ctx);
	pr_debug("error allocating ioctx %d\n", err);
	return ERR_PTR(err);
}

static void kill_ioctx_work(struct work_struct *work)
{
	struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);

	wake_up_all(&ctx->wait);
	put_ioctx(ctx);
}

static void kill_ioctx_rcu(struct rcu_head *head)
{
	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);

	INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
	schedule_work(&ctx->rcu_work);
}

/* kill_ioctx
 *	Cancels all outstanding aio requests on an aio context.  Used
 *	when the processes owning a context have all exited to encourage
 *	the rapid destruction of the kioctx.
 */
static void kill_ioctx(struct kioctx *ctx)
{
	if (!atomic_xchg(&ctx->dead, 1)) {
		hlist_del_rcu(&ctx->list);

		/*
		 * It'd be more correct to do this in free_ioctx(), after all
		 * the outstanding kiocbs have finished - but by then io_destroy
		 * has already returned, so io_setup() could potentially return
		 * -EAGAIN with no ioctxs actually in use (as far as userspace
		 *  could tell).
		 */
		spin_lock(&aio_nr_lock);
		BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
		aio_nr -= ctx->max_reqs;
		spin_unlock(&aio_nr_lock);

		if (ctx->mmap_size)
			vm_munmap(ctx->mmap_base, ctx->mmap_size);

		/* Between hlist_del_rcu() and dropping the initial ref */
		call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
	}
}

/* wait_on_sync_kiocb:
 *	Waits on the given sync kiocb to complete.
 */
ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
	while (atomic_read(&iocb->ki_users)) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&iocb->ki_users))
			break;
		io_schedule();
	}
	__set_current_state(TASK_RUNNING);
	return iocb->ki_user_data;
}
EXPORT_SYMBOL(wait_on_sync_kiocb);

/*
 * exit_aio: called when the last user of mm goes away.  At this point, there is
 * no way for any new requests to be submitted or any of the io_* syscalls to be
 * called on the context.
 *
 * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
 * them.
 */
void exit_aio(struct mm_struct *mm)
{
	struct kioctx *ctx;
	struct hlist_node *n;

	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
		if (1 != atomic_read(&ctx->users))
			printk(KERN_DEBUG
				"exit_aio:ioctx still alive: %d %d %d\n",
				atomic_read(&ctx->users),
				atomic_read(&ctx->dead),
				atomic_read(&ctx->reqs_active));
		/*
		 * We don't need to bother with munmap() here -
		 * exit_mmap(mm) is coming and it'll unmap everything.
		 * Since aio_free_ring() uses non-zero ->mmap_size
		 * as indicator that it needs to unmap the area,
		 * just set it to 0; aio_free_ring() is the only
		 * place that uses ->mmap_size, so it's safe.
		 */
		ctx->mmap_size = 0;

		kill_ioctx(ctx);
	}
}

/* aio_get_req
 *	Allocate a slot for an aio request.  Increments the ki_users count
 * of the kioctx so that the kioctx stays around until all requests are
 * complete.  Returns NULL if no requests are free.
 *
 * Returns with kiocb->ki_users set to 2.  The io submit code path holds
 * an extra reference while submitting the i/o.
 * This prevents races between the aio code path referencing the
 * req (after submitting it) and aio_complete() freeing the req.
 */
static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
	struct kiocb *req;

	if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
		return NULL;

	if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
		goto out_put;

	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
	if (unlikely(!req))
		goto out_put;

	atomic_set(&req->ki_users, 2);
	req->ki_ctx = ctx;

	return req;
out_put:
	atomic_dec(&ctx->reqs_active);
	return NULL;
}

static void kiocb_free(struct kiocb *req)
{
	if (req->ki_filp)
		fput(req->ki_filp);
	if (req->ki_eventfd != NULL)
		eventfd_ctx_put(req->ki_eventfd);
	if (req->ki_dtor)
		req->ki_dtor(req);
	if (req->ki_iovec != &req->ki_inline_vec)
		kfree(req->ki_iovec);
	kmem_cache_free(kiocb_cachep, req);
}

void aio_put_req(struct kiocb *req)
{
	if (atomic_dec_and_test(&req->ki_users))
		kiocb_free(req);
}
EXPORT_SYMBOL(aio_put_req);

static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx, *ret = NULL;

	rcu_read_lock();

	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
		if (ctx->user_id == ctx_id) {
			atomic_inc(&ctx->users);
			ret = ctx;
			break;
		}
	}

	rcu_read_unlock();
	return ret;
}

/* aio_complete
 *	Called when the io request on the given iocb is complete.
 */
void aio_complete(struct kiocb *iocb, long res, long res2)
{
	struct kioctx	*ctx = iocb->ki_ctx;
	struct aio_ring	*ring;
	struct io_event	*ev_page, *event;
	unsigned long	flags;
	unsigned tail, pos;

	/*
	 * Special case handling for sync iocbs:
	 *  - events go directly into the iocb for fast handling
	 *  - the sync task with the iocb in its stack holds the single iocb
	 *    ref, no other paths have a way to get another ref
	 *  - the sync task helpfully left a reference to itself in the iocb
	 */
	if (is_sync_kiocb(iocb)) {
		BUG_ON(atomic_read(&iocb->ki_users) != 1);
		iocb->ki_user_data = res;
		atomic_set(&iocb->ki_users, 0);
		wake_up_process(iocb->ki_obj.tsk);
		return;
	}

	/*
	 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
	 * need to issue a wakeup after decrementing reqs_active.
	 */
	rcu_read_lock();

	if (iocb->ki_list.next) {
		unsigned long flags;

		spin_lock_irqsave(&ctx->ctx_lock, flags);
		list_del(&iocb->ki_list);
		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
	}

	/*
	 * cancelled requests don't get events, userland was given one
	 * when the event got cancelled.
	 */
	if (unlikely(xchg(&iocb->ki_cancel,
			  KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
		atomic_dec(&ctx->reqs_active);
		/* Still need the wake_up in case free_ioctx is waiting */
		goto put_rq;
	}

	/*
	 * Add a completion event to the ring buffer. Must be done holding
	 * ctx->completion_lock to prevent other code from messing with the tail
	 * pointer since we might be called from irq context.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);

	tail = ctx->tail;
	pos = tail + AIO_EVENTS_OFFSET;

	if (++tail >= ctx->nr_events)
		tail = 0;

	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
	event = ev_page + pos % AIO_EVENTS_PER_PAGE;

	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
	event->data = iocb->ki_user_data;
	event->res = res;
	event->res2 = res2;

	kunmap_atomic(ev_page);
	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);

	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
		 res, res2);

	/* after flagging the request as done, we
	 * must never even look at it again
	 */
	smp_wmb();	/* make event visible before updating tail */

	ctx->tail = tail;

	ring = kmap_atomic(ctx->ring_pages[0]);
	ring->tail = tail;
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);

	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	pr_debug("added to ring %p at [%u]\n", iocb, tail);

	/*
	 * Check if the user asked us to deliver the result through an
	 * eventfd. The eventfd_signal() function is safe to be called
	 * from IRQ context.
	 */
	if (iocb->ki_eventfd != NULL)
		eventfd_signal(iocb->ki_eventfd, 1);

put_rq:
	/* everything turned out well, dispose of the aiocb. */
	aio_put_req(iocb);

	/*
	 * We have to order our ring_info tail store above and test
	 * of the wait list below outside the wait lock.  This is
	 * like in wake_up_bit() where clearing a bit has to be
	 * ordered with the unlocked test.
	 */
	smp_mb();

	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);

	rcu_read_unlock();
}
EXPORT_SYMBOL(aio_complete);
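
/*
 * Usage sketch (illustrative, not code from this file): a driver whose
 * ->aio_read()/->aio_write() returned -EIOCBQUEUED calls aio_complete()
 * exactly once from its completion path, often in interrupt context:
 *
 *	// hypothetical completion handler:
 *	aio_complete(iocb, bytes_done, 0);
 *
 * The two longs become event->res and event->res2 in the ring entry that
 * io_getevents() later hands to userspace.
 */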

/* aio_read_events
 *	Pull an event off of the ioctx's event ring.  Returns the number of
 *	events fetched
 */
static long aio_read_events_ring(struct kioctx *ctx,
				 struct io_event __user *event, long nr)
{
	struct aio_ring *ring;
	unsigned head, pos;
	long ret = 0;
	int copy_ret;

	mutex_lock(&ctx->ring_lock);

	ring = kmap_atomic(ctx->ring_pages[0]);
	head = ring->head;
	kunmap_atomic(ring);

	pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);

	if (head == ctx->tail)
		goto out;

	while (ret < nr) {
		long avail;
		struct io_event *ev;
		struct page *page;

		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
		if (head == ctx->tail)
			break;

		avail = min(avail, nr - ret);
		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));

		pos = head + AIO_EVENTS_OFFSET;
		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
		pos %= AIO_EVENTS_PER_PAGE;

		ev = kmap(page);
		copy_ret = copy_to_user(event + ret, ev + pos,
					sizeof(*ev) * avail);
		kunmap(page);

		if (unlikely(copy_ret)) {
			ret = -EFAULT;
			goto out;
		}

		ret += avail;
		head += avail;
		head %= ctx->nr_events;
	}

	ring = kmap_atomic(ctx->ring_pages[0]);
	ring->head = head;
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);

	pr_debug("%li  h%u t%u\n", ret, head, ctx->tail);

	atomic_sub(ret, &ctx->reqs_active);
out:
	mutex_unlock(&ctx->ring_lock);

	return ret;
}
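
/*
 * Aside (not from this file): the ring is mapped into the submitter's
 * address space and ring->compat_features advertises the layout, so
 * userspace such as libaio can reap completions without a syscall by
 * reading io_events[] and advancing ring->head itself.  That is why the
 * code above re-reads ring->head from the mapped page under ring_lock
 * instead of trusting a kernel-side cached value.
 */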

static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
			    struct io_event __user *event, long *i)
{
	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);

	if (ret > 0)
		*i += ret;

	if (unlikely(atomic_read(&ctx->dead)))
		ret = -EINVAL;

	if (!*i)
		*i = ret;

	return ret < 0 || *i >= min_nr;
}

static long read_events(struct kioctx *ctx, long min_nr, long nr,
			struct io_event __user *event,
			struct timespec __user *timeout)
{
	ktime_t until = { .tv64 = KTIME_MAX };
	long ret = 0;

	if (timeout) {
		struct timespec	ts;

		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
			return -EFAULT;

		until = timespec_to_ktime(ts);
	}

	/*
	 * Note that aio_read_events() is being called as the conditional - i.e.
	 * we're calling it after prepare_to_wait() has set task state to
	 * TASK_INTERRUPTIBLE.
	 *
	 * But aio_read_events() can block, and if it blocks it's going to flip
	 * the task state back to TASK_RUNNING.
	 *
	 * This should be ok, provided it doesn't flip the state back to
	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
	 * will only happen if the mutex_lock() call blocks, and we then find
	 * the ringbuffer empty. So in practice we should be ok, but it's
	 * something to be aware of when touching this code.
	 */
	wait_event_interruptible_hrtimeout(ctx->wait,
			aio_read_events(ctx, min_nr, nr, event, &ret), until);

	if (!ret && signal_pending(current))
		ret = -EINTR;

	return ret;
}

/* sys_io_setup:
 *	Create an aio_context capable of receiving at least nr_events.
 *	ctxp must not point to an aio_context that already exists, and
 *	must be initialized to 0 prior to the call.  On successful
 *	creation of the aio_context, *ctxp is filled in with the resulting 
 *	handle.  May fail with -EINVAL if *ctxp is not initialized,
 *	if the specified nr_events exceeds internal limits.  May fail 
 *	with -EAGAIN if the specified nr_events exceeds the user's limit 
 *	of available events.  May fail with -ENOMEM if insufficient kernel
 *	resources are available.  May fail with -EFAULT if an invalid
 *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
 *	implemented.
 */
SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
{
	struct kioctx *ioctx = NULL;
	unsigned long ctx;
	long ret;

	ret = get_user(ctx, ctxp);
	if (unlikely(ret))
		goto out;

	ret = -EINVAL;
	if (unlikely(ctx || nr_events == 0)) {
		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
		         ctx, nr_events);
		goto out;
	}

	ioctx = ioctx_alloc(nr_events);
	ret = PTR_ERR(ioctx);
	if (!IS_ERR(ioctx)) {
		ret = put_user(ioctx->user_id, ctxp);
		if (ret)
			kill_ioctx(ioctx);
		put_ioctx(ioctx);
	}

out:
	return ret;
}
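
/*
 * Userspace sketch (illustrative; glibc has no wrapper, so a raw
 * syscall(2) invocation is assumed):
 *
 *	#include <linux/aio_abi.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	aio_context_t ctx = 0;			// must be zeroed first
 *	if (syscall(SYS_io_setup, 128, &ctx) < 0)
 *		perror("io_setup");
 *	// ... submit with SYS_io_submit, reap with SYS_io_getevents ...
 *	syscall(SYS_io_destroy, ctx);
 */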

/* sys_io_destroy:
 *	Destroy the aio_context specified.  May cancel any outstanding
 *	AIOs and block on completion.  Will fail with -ENOSYS if not
 *	implemented.  May fail with -EINVAL if the context pointed to
 *	is invalid.
 */
SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
	struct kioctx *ioctx = lookup_ioctx(ctx);
	if (likely(NULL != ioctx)) {
		kill_ioctx(ioctx);
		put_ioctx(ioctx);
		return 0;
	}
	pr_debug("EINVAL: io_destroy: invalid context id\n");
	return -EINVAL;
}

static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
{
	struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];

	BUG_ON(ret <= 0);

	while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
		ssize_t this = min((ssize_t)iov->iov_len, ret);
		iov->iov_base += this;
		iov->iov_len -= this;
		iocb->ki_left -= this;
		ret -= this;
		if (iov->iov_len == 0) {
			iocb->ki_cur_seg++;
			iov++;
		}
	}

	/* the caller should not have done more io than what fit in
	 * the remaining iovecs */
	BUG_ON(ret > 0 && iocb->ki_left == 0);
}

typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
			    unsigned long, loff_t);

static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret = 0;

	/* This matches the pread()/pwrite() logic */
	if (iocb->ki_pos < 0)
		return -EINVAL;

	if (rw == WRITE)
		file_start_write(file);
	do {
		ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
			    iocb->ki_nr_segs - iocb->ki_cur_seg,
			    iocb->ki_pos);
		if (ret > 0)
			aio_advance_iovec(iocb, ret);

	/* retry all partial writes.  retry partial reads as long as it's a
	 * regular file. */
	} while (ret > 0 && iocb->ki_left > 0 &&
		 (rw == WRITE ||
		  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
	if (rw == WRITE)
		file_end_write(file);

	/* This means we must have transferred all that we could */
	/* No need to retry anymore */
	if ((ret == 0) || (iocb->ki_left == 0))
		ret = iocb->ki_nbytes - iocb->ki_left;

	/* If we managed to write some out we return that, rather than
	 * the eventual error. */
	if (rw == WRITE
	    && ret < 0 && ret != -EIOCBQUEUED
	    && iocb->ki_nbytes - iocb->ki_left)
		ret = iocb->ki_nbytes - iocb->ki_left;

	return ret;
}

static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
{
	ssize_t ret;

	kiocb->ki_nr_segs = kiocb->ki_nbytes;

#ifdef CONFIG_COMPAT
	if (compat)
		ret = compat_rw_copy_check_uvector(rw,
				(struct compat_iovec __user *)kiocb->ki_buf,
				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
				&kiocb->ki_iovec);
	else
#endif
		ret = rw_copy_check_uvector(rw,
				(struct iovec __user *)kiocb->ki_buf,
				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
				&kiocb->ki_iovec);
	if (ret < 0)
		return ret;

	/* ki_nbytes now reflect bytes instead of segs */
	kiocb->ki_nbytes = ret;
	return 0;
}

static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
{
	if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
		return -EFAULT;

	kiocb->ki_iovec = &kiocb->ki_inline_vec;
	kiocb->ki_iovec->iov_base = kiocb->ki_buf;
	kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
	kiocb->ki_nr_segs = 1;
	return 0;
}
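
/*
 * Note on access_ok(!rw, ...) above: VERIFY_READ is 0 and VERIFY_WRITE is 1,
 * mirroring READ (0) and WRITE (1), so !rw requests VERIFY_WRITE for a read
 * (the kernel stores into the user buffer) and VERIFY_READ for a write (the
 * kernel only fetches from it).
 */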

/*
 * aio_run_iocb:
 *	Performs the initial checks and aio retry method
 *	setup for the kiocb at the time of io submission.
 */
static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
{
	struct file *file = req->ki_filp;
	ssize_t ret;
	int rw;
	fmode_t mode;
	aio_rw_op *rw_op;

	switch (req->ki_opcode) {
	case IOCB_CMD_PREAD:
	case IOCB_CMD_PREADV:
		mode	= FMODE_READ;
		rw	= READ;
		rw_op	= file->f_op->aio_read;
		goto rw_common;

	case IOCB_CMD_PWRITE:
	case IOCB_CMD_PWRITEV:
		mode	= FMODE_WRITE;
		rw	= WRITE;
		rw_op	= file->f_op->aio_write;
		goto rw_common;
rw_common:
		if (unlikely(!(file->f_mode & mode)))
			return -EBADF;

		if (!rw_op)
			return -EINVAL;

		ret = (req->ki_opcode == IOCB_CMD_PREADV ||
		       req->ki_opcode == IOCB_CMD_PWRITEV)
			? aio_setup_vectored_rw(rw, req, compat)
			: aio_setup_single_vector(rw, req);
		if (ret)
			return ret;

		ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
		if (ret < 0)
			return ret;

		req->ki_nbytes = ret;
		req->ki_left = ret;

		ret = aio_rw_vect_retry(req, rw, rw_op);
		break;

	case IOCB_CMD_FDSYNC:
		if (!file->f_op->aio_fsync)
			return -EINVAL;

		ret = file->f_op->aio_fsync(req, 1);
		break;

	case IOCB_CMD_FSYNC:
		if (!file->f_op->aio_fsync)
			return -EINVAL;

		ret = file->f_op->aio_fsync(req, 0);
		break;

	default:
		pr_debug("EINVAL: no operation provided\n");
		return -EINVAL;
	}
Kent Overstreet's avatar
Kent Overstreet committed
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
	if (ret != -EIOCBQUEUED) {