direct.c 28.9 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
10
 * (multiple copies of the same instance running on separate hosts)
Linus Torvalds's avatar
Linus Torvalds committed
11
 * implement their own cache coherency protocol that subsumes file
12
13
14
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
Linus Torvalds's avatar
Linus Torvalds committed
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
37
 * 04 May 2005	support O_DIRECT with aio  --cel
Linus Torvalds's avatar
Linus Torvalds committed
38
39
40
41
42
43
44
45
46
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
47
#include <linux/slab.h>
48
#include <linux/task_io_accounting_ops.h>
49
#include <linux/module.h>
Linus Torvalds's avatar
Linus Torvalds committed
50
51
52
53
54
55

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/uaccess.h>
Arun Sharma's avatar
Arun Sharma committed
56
#include <linux/atomic.h>
Linus Torvalds's avatar
Linus Torvalds committed
57

58
#include "internal.h"
59
#include "iostat.h"
60
#include "pnfs.h"
61

Linus Torvalds's avatar
Linus Torvalds committed
62
63
#define NFSDBG_FACILITY		NFSDBG_VFS

64
static struct kmem_cache *nfs_direct_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
65
66
67
68

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
69
70
71
72
struct nfs_direct_mirror {
	ssize_t count;		/* bytes successfully transferred via this mirror */
};

Linus Torvalds's avatar
Linus Torvalds committed
73
74
struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct nfs_lock_context *l_ctx;		/* Lock context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	struct inode *		inode;		/* target file of i/o */

	/* completion state */
	atomic_t		io_count;	/* i/os we're waiting for */
	spinlock_t		lock;		/* protect completion state */

	/* per-mirror byte accounting; dreq->count is the minimum over all
	 * mirrors (see nfs_direct_good_bytes()) */
	struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
	int			mirror_count;

	ssize_t			count,		/* bytes actually processed */
				bytes_left,	/* bytes left to be sent */
				io_start,	/* start of IO */
				error;		/* any reported error */
	struct completion	completion;	/* wait for i/o completion */

	/* commit state */
	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
	struct work_struct	work;
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
};

105
106
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
107
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
108
static void nfs_direct_write_schedule_work(struct work_struct *work);
109
110
111
112
113
114
115
116
117
118
119

/* Take one outstanding-I/O reference on @dreq. */
static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

/*
 * Drop one outstanding-I/O reference on @dreq.  Returns non-zero when
 * this was the last one, i.e. all scheduled I/O has completed.
 */
static inline int put_dreq(struct nfs_direct_req *dreq)
{
	return atomic_dec_and_test(&dreq->io_count);
}

120
121
122
123
124
125
/* Mark the request so its writes get rescheduled (resent via the MDS). */
void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
{
	dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
}
EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);

126
127
128
129
130
131
132
133
/*
 * Fold the good bytes reported in @hdr into the per-mirror accounting,
 * then set dreq->count to the byte count that every mirror agrees on.
 * Callers in this file invoke this with dreq->lock held.
 */
static void
nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
{
	int i;
	ssize_t count;

	WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);

	/* extend this mirror's count if @hdr reaches past what we've seen */
	count = dreq->mirrors[hdr->pgio_mirror_idx].count;
	if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
		count = hdr->io_start + hdr->good_bytes - dreq->io_start;
		dreq->mirrors[hdr->pgio_mirror_idx].count = count;
	}

	/* update the dreq->count by finding the minimum agreed count from all
	 * mirrors */
	count = dreq->mirrors[0].count;

	for (i = 1; i < dreq->mirror_count; i++)
		count = min(count, dreq->mirrors[i].count);

	dreq->count = count;
}

150
151
152
153
/*
 * nfs_direct_select_verf - select the right verifier
 * @dreq - direct request possibly spanning multiple servers
 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
154
 * @commit_idx - commit bucket index for the DS
155
156
157
158
159
160
 *
 * returns the correct verifier to use given the role of the server
 */
static struct nfs_writeverf *
nfs_direct_select_verf(struct nfs_direct_req *dreq,
		       struct nfs_client *ds_clp,
		       int commit_idx)
{
	/* default: the MDS (or non-pNFS) verifier stored in the dreq */
	struct nfs_writeverf *verfp = &dreq->verf;

#ifdef CONFIG_NFS_V4_1
	if (ds_clp) {
		/* pNFS is in use, use the DS verf */
		if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
			verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
		else
			/* bad bucket index; fall back to the MDS verf */
			WARN_ON_ONCE(1);
	}
#endif
	return verfp;
}


/*
 * nfs_direct_set_hdr_verf - set the write/commit verifier
 * @dreq - direct request possibly spanning multiple servers
 * @hdr - pageio header to validate against previously seen verfs
 *
 * Set the server's (MDS or DS) "seen" verifier
 */
static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
				    struct nfs_pgio_header *hdr)
{
	struct nfs_writeverf *verfp;

	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
	/* slot must still be unset (committed < 0 means "not seen yet") */
	WARN_ON_ONCE(verfp->committed >= 0);
	memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
	/* after the copy the slot must carry a valid committed value */
	WARN_ON_ONCE(verfp->committed < 0);
}

/*
 * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
 * @dreq - direct request possibly spanning multiple servers
 * @hdr - pageio header to validate against previously seen verf
 *
 * set the server's "seen" verf if not initialized.
 * returns result of comparison between @hdr->verf and the "seen"
 * verf of the server used by @hdr (DS or MDS)
 */
static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
					  struct nfs_pgio_header *hdr)
{
	struct nfs_writeverf *verfp;

	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
	if (verfp->committed < 0) {
		/* first reply from this server: record its verifier */
		nfs_direct_set_hdr_verf(dreq, hdr);
		return 0;
	}
	/* non-zero means the server rebooted between writes */
	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
}

/*
 * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
 * @dreq - direct request possibly spanning multiple servers
 * @data - commit data to validate against previously seen verf
 *
 * returns result of comparison between @data->verf and the verf of
 * the server used by @data (DS or MDS)
 */
static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
					   struct nfs_commit_data *data)
{
	struct nfs_writeverf *verfp;

	verfp = nfs_direct_select_verf(dreq, data->ds_clp,
					 data->ds_commit_index);

	/* verifier not set so always fail */
	if (verfp->committed < 0)
		return 1;

	/* non-zero mismatch forces the caller to resend the writes */
	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
}

Linus Torvalds's avatar
Linus Torvalds committed
241
/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @iocb: target I/O control block
 * @iter: iov_iter describing the I/O buffer
 * @pos: offset in file to begin the operation
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O. However, for most direct IO, we
 * shunt off direct read and write requests before the VFS gets them,
 * so this method is only ever called for swap.
 */
ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	/* we only support swap file calling nfs_direct_IO */
	if (!IS_SWAPFILE(inode))
		return 0;

#ifndef CONFIG_NFS_SWAP
	dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
			iocb->ki_filp, (long long) pos, iter->nr_segs);

	return -EINVAL;
#else
	/* swap I/O is always a single page */
	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);

	if (iov_iter_rw(iter) == READ)
		return nfs_file_direct_read(iocb, iter, pos);
	return nfs_file_direct_write(iocb, iter, pos);
#endif /* CONFIG_NFS_SWAP */
}

275
/* Drop the page-cache reference held on each of the @npages pages. */
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	struct page **cur = pages;
	struct page **end = pages + npages;

	while (cur < end)
		page_cache_release(*cur++);
}

282
283
284
/*
 * Initialize a commit-info structure so commit processing uses the
 * dreq's private commit lists instead of the inode's.
 */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	cinfo->lock = &dreq->inode->i_lock;
	cinfo->mds = &dreq->mds_cinfo;
	cinfo->ds = &dreq->ds_cinfo;
	cinfo->dreq = dreq;
	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

292
293
294
295
296
297
298
299
300
301
302
303
static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
					     struct nfs_pageio_descriptor *pgio,
					     struct nfs_page *req)
{
	int mirror_count = 1;

	if (pgio->pg_ops->pg_get_mirror_count)
		mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);

	dreq->mirror_count = mirror_count;
}

304
/*
 * Allocate and initialize a zeroed direct-I/O request.  Returns NULL
 * on allocation failure.  The request starts with two kref references:
 * one for the caller and one dropped by nfs_direct_complete() when all
 * I/O has finished.
 */
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	kref_get(&dreq->kref);		/* second ref, see comment above */
	init_completion(&dreq->completion);
	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
	dreq->verf.committed = NFS_INVALID_STABLE_HOW;	/* not set yet */
	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
	dreq->mirror_count = 1;
	spin_lock_init(&dreq->lock);

	return dreq;
}

324
/*
 * kref release callback: drop the held lock/open contexts and return
 * the request to the slab cache.
 */
static void nfs_direct_req_free(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

336
337
338
339
340
/* Drop a kref reference; frees the request when the last one goes. */
static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	kref_put(&dreq->kref, nfs_direct_req_free);
}

341
342
343
344
345
346
/* Report how many bytes of the request remain to be scheduled. */
ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
{
	return dreq->bytes_left;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

347
348
349
350
351
/*
 * Collects and returns the final error value/byte-count.
 *
 * Returns -EIOCBQUEUED for async (aio) requests, a negative value if
 * the wait was interrupted or an I/O error was recorded, otherwise the
 * number of bytes processed.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	/* killable: a fatal signal aborts the wait with a negative result */
	result = wait_for_completion_killable(&dreq->completion);

	if (!result)
		result = dreq->error;
	if (!result)
		result = dreq->count;

out:
	return (ssize_t) result;
}

369
/*
370
371
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
372
 */
373
/*
 * Finish off a direct request: update i_size for extending async
 * writes, invalidate cached pages after a write, end the inode's
 * direct-I/O accounting, signal the iocb or the synchronous waiter,
 * and drop the I/O-side kref reference.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
{
	struct inode *inode = dreq->inode;

	if (dreq->iocb && write) {
		loff_t pos = dreq->iocb->ki_pos + dreq->count;

		/* extend the file size if the write went past EOF */
		spin_lock(&inode->i_lock);
		if (i_size_read(inode) < pos)
			i_size_write(inode, pos);
		spin_unlock(&inode->i_lock);
	}

	if (write)
		nfs_zap_mapping(inode, inode->i_mapping);

	inode_dio_done(inode);

	if (dreq->iocb) {
		/* async completion: report error if any, else byte count */
		long res = (long) dreq->error;
		if (!res)
			res = (long) dreq->count;
		dreq->iocb->ki_complete(dreq->iocb, res, 0);
	}

	/* wake any synchronous waiter in nfs_direct_wait() */
	complete_all(&dreq->completion);

	nfs_direct_req_release(dreq);
}

Trond Myklebust's avatar
Trond Myklebust committed
403
/* Log completion of one direct-read page request and release it. */
static void nfs_direct_readpage_release(struct nfs_page *req)
{
	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
		req->wb_context->dentry->d_inode->i_sb->s_id,
		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
		req->wb_bytes,
		(long long)req_offset(req));
	nfs_release_request(req);
}

413
/*
 * Per-RPC completion for direct reads: record error/good-byte state,
 * dirty the user pages that received data, release the page requests,
 * and finish the dreq when this was the last outstanding I/O.
 */
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;

	/* the I/O will be resent; don't touch completion state */
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
		goto out_put;

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
		dreq->error = hdr->error;
	else
		nfs_direct_good_bytes(dreq, hdr);

	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
		struct page *page = req->wb_page;

		/* only pages that actually received data get dirtied */
		if (!PageCompound(page) && bytes < hdr->good_bytes)
			set_page_dirty(page);
		bytes += req->wb_bytes;
		nfs_list_remove_request(req);
		nfs_direct_readpage_release(req);
	}
out_put:
	if (put_dreq(dreq))
		nfs_direct_complete(dreq, false);
	hdr->release(hdr);
}

445
/* Error cleanup for reads: drain and release the unsent page requests. */
static void nfs_read_sync_pgio_error(struct list_head *head)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
}

456
457
458
459
460
461
/* One extra dreq I/O reference per issued pgio header. */
static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	get_dreq(hdr->dreq);
}

/* Completion callbacks wired into the read pageio descriptor. */
static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.error_cleanup = nfs_read_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_read_completion,
};

467
/*
468
469
470
471
472
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
 * bail and stop sending more reads.  Read length accounting is
 * handled automatically by nfs_direct_read_result().  Otherwise, if
 * no requests have been sent, just return an error.
Linus Torvalds's avatar
Linus Torvalds committed
473
474
 */

475
476
477
478
479
480
481
482
483
static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);

	nfs_pageio_init_read(&desc, dreq->inode, false,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;
	atomic_inc(&inode->i_dio_count);

	/* pin up to rsize bytes of the user buffer per iteration and queue
	 * one nfs_page per pinned page */
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec, 
						  rsize, &pgbase);
		if (result < 0)
			break;
	
		bytes = result;
		iov_iter_advance(iter, bytes);
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_release_request(req);
				break;
			}
			/* only the first page can have a non-zero offset */
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		/* queued requests hold their own page references */
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_done(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	/* drop the scheduling reference taken above */
	if (put_dreq(dreq))
		nfs_direct_complete(dreq, false);
	return 0;
}

551
552
553
/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
554
 * @iter: vector of user buffers into which to read data
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
 * @pos: byte offset in file where reading starts
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
571
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
				loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	ssize_t result = -EINVAL;
	size_t count = iov_iter_count(iter);

	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
		file, count, (long long) pos);

	/* zero-length reads succeed trivially */
	result = 0;
	if (!count)
		goto out;

	mutex_lock(&inode->i_mutex);
	/* flush any dirty cached pages before reading around the cache */
	result = nfs_sync_mapping(mapping);
	if (result)
		goto out_unlock;

	task_io_account_read(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out_unlock;

	dreq->inode = inode;
	dreq->bytes_left = count;
	dreq->io_start = pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	/* record the iocb only for async requests */
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	NFS_I(inode)->read_io += count;
	result = nfs_direct_read_schedule_iovec(dreq, iter, pos);

	mutex_unlock(&inode->i_mutex);

	if (!result) {
		/* synchronous: wait for all scheduled reads to finish */
		result = nfs_direct_wait(dreq);
		if (result > 0)
			iocb->ki_pos = pos + result;
	}

	nfs_direct_req_release(dreq);
	return result;

out_release:
	nfs_direct_req_release(dreq);
out_unlock:
	mutex_unlock(&inode->i_mutex);
out:
	return result;
}

637
638
639
640
641
642
643
644
645
646
647
648
649
650
/*
 * Move all requests awaiting commit (both MDS and, for pNFS, the DS
 * buckets) onto @list under the cinfo lock.
 */
static void
nfs_direct_write_scan_commit_list(struct inode *inode,
				  struct list_head *list,
				  struct nfs_commit_info *cinfo)
{
	spin_lock(cinfo->lock);
#ifdef CONFIG_NFS_V4_1
	if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
		NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
#endif
	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
	spin_unlock(cinfo->lock);
}

651
652
/*
 * Resend every request on the commit lists as a stable (FLUSH_STABLE)
 * write.  The byte accounting is reset first because the resent writes
 * will report their good bytes again.
 */
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req, *tmp;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;
	LIST_HEAD(failed);
	int i;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	/* start the byte counts over for the resent writes */
	dreq->count = 0;
	for (i = 0; i < dreq->mirror_count; i++)
		dreq->mirrors[i].count = 0;
	get_dreq(dreq);

	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	req = nfs_list_entry(reqs.next);
	nfs_direct_setup_mirroring(dreq, &desc, req);

	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
		if (!nfs_pageio_add_request(&desc, req)) {
			/* couldn't queue it: park on @failed and flag EIO */
			nfs_list_remove_request(req);
			nfs_list_add_request(req, &failed);
			spin_lock(cinfo.lock);
			dreq->flags = 0;
			dreq->error = -EIO;
			spin_unlock(cinfo.lock);
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&failed)) {
		req = nfs_list_entry(failed.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, dreq->inode);
}

698
/*
 * COMMIT RPC completion: on failure or on a write-verifier mismatch,
 * flag the dreq so its writes get resent; otherwise release the now
 * durable requests.  Finishes the dreq when the last COMMIT returns.
 */
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	if (status < 0) {
		dprintk("NFS: %5u commit failed with error %d.\n",
			data->task.tk_pid, status);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	} else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
			/* Note the rewrite will go through mds */
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
		nfs_direct_write_complete(dreq, data->inode);
}

731
732
733
734
735
736
737
738
/* Commit error-cleanup hook; direct I/O holds no flush lock to clear. */
static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
{
	/* There is no lock to clear */
}

/* Commit callbacks used via nfs_init_cinfo_from_dreq(). */
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
	.completion = nfs_direct_commit_complete,
	.error_cleanup = nfs_direct_error_cleanup,
};

/*
 * Gather the dreq's to-commit requests and send COMMIT RPCs for them;
 * on -ENOMEM fall back to resending the writes as stable.
 */
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	int res;
	struct nfs_commit_info cinfo;
	LIST_HEAD(mds_list);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
	if (res < 0) /* res == -ENOMEM */
		nfs_direct_write_reschedule(dreq);
}
Linus Torvalds's avatar
Linus Torvalds committed
753

754
/*
 * Deferred write completion (runs from the workqueue): either issue a
 * COMMIT, resend the writes, or finish the request, depending on the
 * flags recorded by the write/commit completion handlers.
 */
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
	int flags = dreq->flags;

	/* consume the flags before acting on them */
	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
			break;
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_direct_complete(dreq, true);
	}
}
Linus Torvalds's avatar
Linus Torvalds committed
771

772
/* Defer write completion to process context via the dreq's work item. */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}
776
777
778
779
780

/*
 * Per-RPC completion for direct writes.  Under dreq->lock, account the
 * good bytes and run the verifier state machine: an unstable reply
 * either records the server's verifier (first reply) or is compared
 * against it; a mismatch forces a full resend.  Requests needing a
 * COMMIT are moved to the commit lists before being unlocked.
 */
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_commit_info cinfo;
	bool request_commit = false;
	struct nfs_page *req = nfs_list_entry(hdr->pages.next);

	/* the I/O will be resent; don't touch completion state */
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
		goto out_put;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	spin_lock(&dreq->lock);

	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
		dreq->flags = 0;
		dreq->error = hdr->error;
	}
	if (dreq->error == 0) {
		nfs_direct_good_bytes(dreq, hdr);
		if (nfs_write_need_commit(hdr)) {
			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
				request_commit = true;
			else if (dreq->flags == 0) {
				/* first unstable reply: remember its verf */
				nfs_direct_set_hdr_verf(dreq, hdr);
				request_commit = true;
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
				request_commit = true;
				/* verf mismatch: server rebooted, resend */
				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
					dreq->flags =
						NFS_ODIRECT_RESCHED_WRITES;
			}
		}
	}
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {

		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		if (request_commit) {
			/* extra ref keeps the request alive on the commit list */
			kref_get(&req->wb_kref);
			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
				hdr->ds_commit_idx);
		}
		nfs_unlock_and_release_request(req);
	}

out_put:
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, hdr->inode);
	hdr->release(hdr);
}

832
833
834
835
836
837
838
/* Error cleanup for writes: unlock and release the unsent requests. */
static void nfs_write_sync_pgio_error(struct list_head *head)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}
}

843
/* Completion callbacks wired into the write pageio descriptor. */
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
	.error_cleanup = nfs_write_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_write_completion,
};

849
850
851
852
853
854
855
856
857
858
859
860

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
 * bail and stop sending more writes.  Write length accounting is
 * handled automatically by nfs_direct_write_result().  Otherwise, if
 * no requests have been sent, just return an error.
 */
861
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
862
					       struct iov_iter *iter,
863
					       loff_t pos)
864
{
865
	struct nfs_pageio_descriptor desc;
866
	struct inode *inode = dreq->inode;
867
868
	ssize_t result = 0;
	size_t requested_bytes = 0;
869
	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
870

871
	nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
872
873
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
874
	get_dreq(dreq);
875
	atomic_inc(&inode->i_dio_count);
876

877
878
879
880
881
882
883
884
885
	NFS_I(inode)->write_io += iov_iter_count(iter);
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec, 
						  wsize, &pgbase);
886
887
		if (result < 0)
			break;
888
889
890
891
892
893
894
895

		bytes = result;
		iov_iter_advance(iter, bytes);
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

896
			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
897
898
899
900
901
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
902
903
904

			nfs_direct_setup_mirroring(dreq, &desc, req);

905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
			nfs_lock_request(req);
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
922
923
			break;
	}
924
	nfs_pageio_complete(&desc);
925

926
927
928
929
930
	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
931
		inode_dio_done(inode);
932
933
934
935
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

936
937
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, dreq->inode);
938
	return 0;
939
940
}

Linus Torvalds's avatar
Linus Torvalds committed
941
942
943
/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
944
 * @iter: vector of user buffers from which to write data
945
 * @pos: byte offset in file where writing starts
Linus Torvalds's avatar
Linus Torvalds committed
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
962
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
Martin K. Petersen's avatar
Martin K. Petersen committed
963
				loff_t pos)
Linus Torvalds's avatar
Linus Torvalds committed
964
{
965
	ssize_t result = -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
966
967
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
968
969
970
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
971
	loff_t end;
972
	size_t count = iov_iter_count(iter);
973
974
	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;

975
976
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

977
978
	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
		file, count, (long long) pos);
979

980
	result = generic_write_checks(file, &pos, &count);
981
	if (result)
Linus Torvalds's avatar
Linus Torvalds committed
982
		goto out;
983

984
	result = -EINVAL;
985
	if ((ssize_t) count < 0)
Linus Torvalds's avatar
Linus Torvalds committed
986
		goto out;
987
	result = 0;
Linus Torvalds's avatar
Linus Torvalds committed
988
989
	if (!count)
		goto out;
990

991
992
	mutex_lock(&inode->i_mutex);

993
994
	result = nfs_sync_mapping(mapping);
	if (result)
995
996
997
998
999
1000
1001
1002
		goto out_unlock;

	if (mapping->nrpages) {
		result = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_CACHE_SHIFT, end);
		if (result)
			goto out_unlock;
	}
Linus Torvalds's avatar
Linus Torvalds committed
1003

1004
1005
	task_io_account_write(count);

1006
1007
1008
	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (!dreq)
1009
		goto out_unlock;
1010

1011
1012
	dreq->inode = inode;
	dreq->bytes_left = count;
1013
	dreq->io_start = pos;
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

1024
	result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
1025
1026
1027
1028
1029
1030
1031
1032

	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      pos >> PAGE_CACHE_SHIFT, end);
	}

	mutex_unlock(&inode->i_mutex);

1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
	if (!result) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			struct inode *inode = mapping->host;

			iocb->ki_pos = pos + result;
			spin_lock(&inode->i_lock);
			if (i_size_read(inode) < iocb->ki_pos)
				i_size_write(inode, iocb->ki_pos);
			spin_unlock(&inode->i_lock);
		}
1044
	}
1045
1046
1047
	nfs_direct_req_release(dreq);
	return result;

1048
1049
out_release:
	nfs_direct_req_release(dreq);
1050
1051
out_unlock:
	mutex_unlock(&inode->i_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
1052
out:
1053
	return result;
Linus Torvalds's avatar
Linus Torvalds committed
1054
1055
}

1056
1057
1058
1059
/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
David Howells's avatar
David Howells committed
1060
int __init nfs_init_directcache(void)
Linus Torvalds's avatar
Linus Torvalds committed
1061
1062
1063
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
1064
1065
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
1066
						NULL);
Linus Torvalds's avatar
Linus Torvalds committed
1067
1068
1069
1070
1071
1072
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

1073
/**
David Howells's avatar
David Howells committed
1074
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
1075
1076
 *
 */
1077
void nfs_destroy_directcache(void)
Linus Torvalds's avatar
Linus Torvalds committed
1078
{
1079
	kmem_cache_destroy(nfs_direct_cachep);
Linus Torvalds's avatar
Linus Torvalds committed
1080
}