/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

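/*
 * Arguments passed through iget5_locked() so btrfs_iget() can find or
 * allocate an inode by its location key and root.
 */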
struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 outstanding_extents;
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

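/* map S_IF* inode mode bits to the on-disk BTRFS_FT_* dir item types */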
#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type);

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

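/*
 * Initialize the security bits of a newly created inode: the POSIX ACLs
 * and security xattrs inherited from the parent directory.
 */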
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(inode);
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

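	/*
	 * copy the file data into the inline item: compressed data is
	 * written out of the compressed pages one page at a time,
	 * uncompressed data straight from the page cache
	 */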
	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

	return ret;
fail:
	return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, root->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > root->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans, root);
	return ret;
}

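/*
 * A single range produced by compress_file_range(), queued on an
 * async_cow and written back later by submit_compressed_extents().
 */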
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

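/*
 * One chunk of delalloc writeback handed off to the helper threads; the
 * extents list collects the async_extents built during compression.
 */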
struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

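/* decide whether writes to this inode should go through compression */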
static inline int inode_need_compress(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* force compress */
	if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(root->fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->force_compress)
		return 1;
	return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = SZ_128K;
	unsigned long max_uncompressed = SZ_128K;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;
	int redirty = 0;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < SZ_16K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/* we want to make sure that the amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize,  num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		redirty = 1;
		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
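	/*
	 * an inline extent only describes data at the very start of the
	 * file, so only writes beginning at offset 0 can use one
	 */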
	if (start == 0) {
		/* let's try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
						    0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DEFRAG;
			unsigned long page_error_op;

			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode, start, end, NULL,
						     clear_flags, PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_cow, start, num_bytes,
					total_compressed, pages, nr_pages_ret,
					compress_type);

			if (start + num_bytes < end) {
				start += num_bytes;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

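/* drop the page references a compressed async_extent is still holding */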
static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
					 async_extent->start +
					 async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

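		/*
		 * reserve disk space for the compressed size; on ENOSPC we
		 * redirty the pages and retry the range as uncompressed IO
		 */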
		ret = btrfs_reserve_extent(root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fall back to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_free_reserve;
		}
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = async_extent->ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->generation = -1;

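		/*
		 * insert the new mapping; on -EEXIST drop whatever is
		 * cached for the range and try again
		 */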
		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		if (ret)
			goto out_free_reserve;

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);
		if (ret) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

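/*
 * Look for an existing mapping near @start so new allocations can be
 * placed close to the extents the file already has on disk.
 */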
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	if (btrfs_is_free_space_inode(inode)) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize,  num_bytes);
	disk_num_bytes = num_bytes;

	/* if this is a small write inside eof, kick off defrag */
	if (num_bytes < SZ_64K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
					    NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode, start, end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);

			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

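	/*
	 * allocate disk extents for the whole range; the allocator may
	 * return less than we ask for, so each iteration reserves one
	 * extent and creates the matching extent map and ordered extent
	 */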
	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_reserve;
		}
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		em->generation = -1;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}
		if (ret)
			goto out_reserve;

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			if (ret)
				goto out_drop_extent_cache;
		}

		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		if (disk_num_bytes < cur_alloc_size)