/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

#define MPAGE_DA_EXTENT_TAIL 0x01

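/*
 * Compute the metadata checksum of the on-disk inode, substituting zeroes
 * for the i_checksum_lo (and, when present, i_checksum_hi) fields so the
 * stored checksum does not feed into the result.
 */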
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
			      struct ext4_inode_info *ei)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;
	__u16 dummy_csum = 0;
	int offset = offsetof(struct ext4_inode, i_checksum_lo);
	unsigned int csum_size = sizeof(dummy_csum);

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
	csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
	offset += csum_size;
	csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
			   EXT4_GOOD_OLD_INODE_SIZE - offset);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		offset = offsetof(struct ext4_inode, i_checksum_hi);
		csum = ext4_chksum(sbi, csum, (__u8 *)raw +
				   EXT4_GOOD_OLD_INODE_SIZE,
				   offset - EXT4_GOOD_OLD_INODE_SIZE);
		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
			csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
					   csum_size);
			offset += csum_size;
			csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
					   EXT4_INODE_SIZE(inode->i_sb) -
					   offset);
		}
	}

	return csum;
}

static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
				  struct ext4_inode_info *ei)
{
	__u32 provided, calculated;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_metadata_csum(inode->i_sb))
		return 1;

	provided = le16_to_cpu(raw->i_checksum_lo);
	calculated = ext4_inode_csum(inode, raw, ei);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
	else
		calculated &= 0xFFFF;

	return provided == calculated;
}

static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
				struct ext4_inode_info *ei)
{
	__u32 csum;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_metadata_csum(inode->i_sb))
		return;

	csum = ext4_inode_csum(inode, raw, ei);
	raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}

static inline int ext4_begin_ordered_truncate(struct inode *inode,
					      loff_t new_size)
{
	trace_ext4_begin_ordered_truncate(inode, new_size);
	/*
	 * If jinode is zero, then we never opened the file for
	 * writing, so there's no need to call
	 * jbd2_journal_begin_ordered_truncate() since there's no
	 * outstanding writes we need to flush.
	 */
	if (!EXT4_I(inode)->jinode)
		return 0;
	return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
						   EXT4_I(inode)->jinode,
						   new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
				  int pextents);

/*
 * Test whether an inode is a fast symlink.
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks = EXT4_I(inode)->i_file_acl ?
		EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;

	if (ext4_has_inline_data(inode))
		return 0;

	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
				 int nblocks)
{
	int ret;

	/*
	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
	 * moment, get_block can be called only for blocks inside i_size since
	 * page cache has been already dropped and writes are blocked by
	 * i_mutex. So we can safely drop the i_data_sem here.
	 */
	BUG_ON(EXT4_JOURNAL(inode) == NULL);
	jbd_debug(2, "restarting handle %p\n", handle);
	up_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_journal_restart(handle, nblocks);
	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_discard_preallocations(inode);

	return ret;
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_evict_inode(struct inode *inode)
{
	handle_t *handle;
	int err;

	trace_ext4_evict_inode(inode);

	if (inode->i_nlink) {
		/*
		 * When journalling data dirty buffers are tracked only in the
		 * journal. So although mm thinks everything is clean and
		 * ready for reaping the inode might still have some pages to
		 * write in the running transaction or waiting to be
		 * checkpointed. Thus calling jbd2_journal_invalidatepage()
		 * (via truncate_inode_pages()) to discard these buffers can
		 * cause data loss. Also even if we did not discard these
		 * buffers, we would have no way to find them after the inode
		 * is reaped and thus user could see stale data if he tries to
		 * read them before the transaction is checkpointed. So be
		 * careful and force everything to disk here... We use
		 * ei->i_datasync_tid to store the newest transaction
		 * containing inode's data.
		 *
		 * Note that directories do not have this problem because they
		 * don't use page cache.
		 */
		if (inode->i_ino != EXT4_JOURNAL_INO &&
		    ext4_should_journal_data(inode) &&
		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;

			jbd2_complete_transaction(journal, commit_tid);
			filemap_write_and_wait(&inode->i_data);
		}
		truncate_inode_pages_final(&inode->i_data);

		goto no_delete;
	}

	if (is_bad_inode(inode))
		goto no_delete;
	dquot_initialize(inode);

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages_final(&inode->i_data);

	/*
	 * Protect us against freezing - iput() caller didn't have to have any
	 * protection against it
	 */
	sb_start_intwrite(inode->i_sb);
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
				    ext4_blocks_for_truncate(inode)+3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		sb_end_intwrite(inode->i_sb);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks)
		ext4_truncate(inode);

	/*
	 * ext4_ext_truncate() doesn't reserve any slop when it
	 * restarts journal transactions; therefore there may not be
	 * enough credits left in the handle to remove the inode from
	 * the orphan list and set the dtime field.
	 */
	if (!ext4_handle_has_enough_credits(handle, 3)) {
		err = ext4_journal_extend(handle, 3);
		if (err > 0)
			err = ext4_journal_restart(handle, 3);
		if (err != 0) {
			ext4_warning(inode->i_sb,
				     "couldn't extend journal (err %d)", err);
		stop_handle:
			ext4_journal_stop(handle);
			ext4_orphan_del(NULL, inode);
			sb_end_intwrite(inode->i_sb);
			goto no_delete;
		}
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime	= get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	sb_end_intwrite(inode->i_sb);
	return;
no_delete:
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
					int used, int quota_claim)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	spin_lock(&ei->i_block_reservation_lock);
	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
	if (unlikely(used > ei->i_reserved_data_blocks)) {
		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
			 "with only %d reserved data blocks",
			 __func__, inode->i_ino, used,
			 ei->i_reserved_data_blocks);
		WARN_ON(1);
		used = ei->i_reserved_data_blocks;
	}

	/* Update per-inode reservations */
	ei->i_reserved_data_blocks -= used;
	percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	/* Update quota subsystem for data blocks */
	if (quota_claim)
		dquot_claim_block(inode, EXT4_C2B(sbi, used));
	else {
		/*
		 * We did fallocate with an offset that is already delayed
		 * allocated. So on delayed allocated writeback we should
		 * not re-claim the quota for fallocated blocks.
		 */
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
	}

	/*
	 * If we have done all the pending block allocations and if
	 * there aren't any writers on the inode, we can discard the
	 * inode's preallocations.
	 */
	if ((ei->i_reserved_data_blocks == 0) &&
	    (atomic_read(&inode->i_writecount) == 0))
		ext4_discard_preallocations(inode);
}

static int __check_block_validity(struct inode *inode, const char *func,
				unsigned int line,
				struct ext4_map_blocks *map)
{
	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
				   map->m_len)) {
		ext4_error_inode(inode, func, line, map->m_pblk,
				 "lblock %lu mapped to illegal pblock "
				 "(length %d)", (unsigned long) map->m_lblk,
				 map->m_len);
		return -EFSCORRUPTED;
	}
	return 0;
}

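/*
 * Zero out @len blocks starting at physical block @pblk.  For encrypted
 * inodes the zeroing goes through fscrypt so the on-disk blocks hold
 * encrypted zeroes; otherwise the blocks are zeroed directly on the device.
 */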
int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
		       ext4_lblk_t len)
{
	int ret;

	if (ext4_encrypted_inode(inode))
		return fscrypt_zeroout_range(inode, lblk, pblk, len);

	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
	if (ret > 0)
		ret = 0;

	return ret;
}

#define check_block_validity(inode, map)	\
	__check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
				       struct inode *inode,
				       struct ext4_map_blocks *es_map,
				       struct ext4_map_blocks *map,
				       int flags)
{
	int retval;

	map->m_flags = 0;
	/*
	 * There is a race window that the result is not the same.
	 * e.g. xfstests #223 when dioread_nolock enables.  The reason
	 * is that we lookup a block mapping in extent status tree with
	 * out taking i_data_sem.  So at the time the unwritten extent
	 * could be converted.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/*
	 * We don't check m_len because the extent will be collapsed in the
	 * status tree, so the m_len might not be equal.
	 */
	if (es_map->m_lblk != map->m_lblk ||
	    es_map->m_flags != map->m_flags ||
	    es_map->m_pblk != map->m_pblk) {
		printk("ES cache assertion failed for inode: %lu "
		       "es_cached ex [%d/%d/%llu/%x] != "
		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
		       inode->i_ino, es_map->m_lblk, es_map->m_len,
		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
		       map->m_len, map->m_pblk, map->m_flags,
		       retval, flags);
	}
}
#endif /* ES_AGGRESSIVE_TEST */

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
 * and store the allocated blocks in the result buffer head and mark it
 * mapped.
 *
 * If the file type is extents based, it will call ext4_ext_map_blocks();
 * otherwise it will call ext4_ind_map_blocks() to handle indirect-mapping
 * based files.
 *
 * On success, it returns the number of blocks being mapped or allocated.  If
 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
 * is marked as unwritten. If create == 1, it will mark @map as mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated), in
 * that case, @map is returned as unmapped but we still do fill map->m_len to
 * indicate the length of a hole starting at map->m_lblk.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	int retval;
	int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
	struct ext4_map_blocks orig_map;

	memcpy(&orig_map, map, sizeof(*map));
#endif

	map->m_flags = 0;
	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);

	/*
	 * ext4_map_blocks returns an int, and m_len is an unsigned int
	 */
	if (unlikely(map->m_len > INT_MAX))
		map->m_len = INT_MAX;

	/* We can handle the block number less than EXT_MAX_BLOCKS */
	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
		return -EFSCORRUPTED;

	/* Lookup extent status tree firstly */
	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
			map->m_pblk = ext4_es_pblock(&es) +
					map->m_lblk - es.es_lblk;
			map->m_flags |= ext4_es_is_written(&es) ?
					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
			map->m_pblk = 0;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
			retval = 0;
		} else {
			BUG_ON(1);
		}
#ifdef ES_AGGRESSIVE_TEST
		ext4_map_blocks_es_recheck(handle, inode, map,
					   &orig_map, flags);
#endif
		goto found;
	}

	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	}
	if (retval > 0) {
		unsigned int status;

		if (unlikely(retval != map->m_len)) {
			ext4_warning(inode->i_sb,
				     "ES len assertion failed for inode "
				     "%lu: retval %d != map->m_len %d",
				     inode->i_ino, retval, map->m_len);
			WARN_ON(1);
		}

		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
		    !(status & EXTENT_STATUS_WRITTEN) &&
		    ext4_find_delalloc_range(inode, map->m_lblk,
					     map->m_lblk + map->m_len - 1))
			status |= EXTENT_STATUS_DELAYED;
		ret = ext4_es_insert_extent(inode, map->m_lblk,
					    map->m_len, map->m_pblk, status);
		if (ret < 0)
			retval = ret;
	}
	up_read((&EXT4_I(inode)->i_data_sem));

found:
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}

	/* If it is only a block(s) look up */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
		return retval;

	/*
	 * Return if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated
	 * ext4_ext_get_block() returns the create = 0
	 * with buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
		/*
		 * If we need to convert extent to unwritten
		 * we continue and do the actual work in
		 * ext4_ext_map_blocks()
		 */
		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
			return retval;

	/*
	 * Here we clear m_flags because after allocating a new extent,
	 * it will be set again.
	 */
	map->m_flags &= ~EXT4_MAP_FLAGS;

	/*
	 * New block allocation and/or writing to an unwritten extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_block()
	 * with create == 1 flag.
	 */
	down_write(&EXT4_I(inode)->i_data_sem);

	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing.  Force the migrate
			 * to fail by clearing migrate flags
			 */
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
		}

		/*
		 * Update reserved blocks/metadata blocks after successful
		 * block allocation which had been deferred till now. We don't
		 * support fallocate for non extent files. So we can update
		 * reserve space here.
		 */
		if ((retval > 0) &&
			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
			ext4_da_update_reserve_space(inode, retval, 1);
	}

	if (retval > 0) {
		unsigned int status;

		if (unlikely(retval != map->m_len)) {
			ext4_warning(inode->i_sb,
				     "ES len assertion failed for inode "
				     "%lu: retval %d != map->m_len %d",
				     inode->i_ino, retval, map->m_len);
			WARN_ON(1);
		}

		/*
		 * We have to zeroout blocks before inserting them into extent
		 * status tree. Otherwise someone could look them up there and
		 * use them before they are really zeroed.
		 */
		if (flags & EXT4_GET_BLOCKS_ZERO &&
		    map->m_flags & EXT4_MAP_MAPPED &&
		    map->m_flags & EXT4_MAP_NEW) {
			ret = ext4_issue_zeroout(inode, map->m_lblk,
						 map->m_pblk, map->m_len);
			if (ret) {
				retval = ret;
				goto out_sem;
			}
		}

		/*
		 * If the extent has been zeroed out, we don't need to update
		 * extent status tree.
		 */
		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
			if (ext4_es_is_written(&es))
				goto out_sem;
		}
		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
		    !(status & EXTENT_STATUS_WRITTEN) &&
		    ext4_find_delalloc_range(inode, map->m_lblk,
					     map->m_lblk + map->m_len - 1))
			status |= EXTENT_STATUS_DELAYED;
		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
					    map->m_pblk, status);
		if (ret < 0) {
			retval = ret;
			goto out_sem;
		}
	}

out_sem:
	up_write((&EXT4_I(inode)->i_data_sem));
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;

		/*
		 * Inodes with freshly allocated blocks where contents will be
		 * visible after transaction commit must be on transaction's
		 * ordered data list.
		 */
		if (map->m_flags & EXT4_MAP_NEW &&
		    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
		    !IS_NOQUOTA(inode) &&
		    ext4_should_order_data(inode)) {
			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
				ret = ext4_jbd2_inode_add_wait(handle, inode);
			else
				ret = ext4_jbd2_inode_add_write(handle, inode);
			if (ret)
				return ret;
		}
	}
	return retval;
}

/*
 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
 * we have to be careful as someone else may be manipulating b_state as well.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
	unsigned long old_state;
	unsigned long new_state;

	flags &= EXT4_MAP_FLAGS;

	/* Dummy buffer_head? Set non-atomically. */
	if (!bh->b_page) {
		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
		return;
	}
	/*
	 * Someone else may be modifying b_state. Be careful! This is ugly but
	 * once we get rid of using bh as a container for mapping information
	 * to pass to / from get_block functions, this can go away.
	 */
	do {
		old_state = READ_ONCE(bh->b_state);
		new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
	} while (unlikely(
		 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
}

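/*
 * Core get_block helper: translate a single buffer_head request into an
 * ext4_map_blocks() call and copy the resulting mapping back into the
 * buffer_head.  Inodes with inline data are rejected with -ERANGE.
 */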
static int _ext4_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh, int flags)
{
	struct ext4_map_blocks map;
	int ret = 0;

	if (ext4_has_inline_data(inode))
		return -ERANGE;

	map.m_lblk = iblock;
	map.m_len = bh->b_size >> inode->i_blkbits;

	ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
			      flags);
	if (ret > 0) {
		map_bh(bh, inode->i_sb, map.m_pblk);
		ext4_update_bh_state(bh, map.m_flags);
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
		ret = 0;
	}
	return ret;
}

int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh, int create)
{
	return _ext4_get_block(inode, iblock, bh,
			       create ? EXT4_GET_BLOCKS_CREATE : 0);
}

/*
 * Get block function used when preparing for buffered write if we require
 * creating an unwritten extent if blocks haven't been allocated.  The extent
 * will be converted to written after the IO is complete.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	return _ext4_get_block(inode, iblock, bh_result,
			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

/*
 * Get blocks function for the cases that need to start a transaction -
 * generally different cases of direct IO and DAX IO. It also handles retries
 * in case of ENOSPC.
 */
static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
				struct buffer_head *bh_result, int flags)
{
	int dio_credits;
	handle_t *handle;
	int retries = 0;
	int ret;

	/* Trim mapping request to maximum we can map at once for DIO */
	if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
		bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
	dio_credits = ext4_chunk_trans_blocks(inode,
				      bh_result->b_size >> inode->i_blkbits);
retry:
	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ret = _ext4_get_block(inode, iblock, bh_result, flags);
	ext4_journal_stop(handle);

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	return ret;
}

/* Get block function for DIO reads and writes to inodes without extents */
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
		       struct buffer_head *bh, int create)
{
	/* We don't expect handle for direct IO */
	WARN_ON_ONCE(ext4_journal_current_handle());

	if (!create)
		return _ext4_get_block(inode, iblock, bh, 0);
	return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
}

/*
 * Get block function for AIO DIO writes when we create unwritten extent if
 * blocks are not allocated yet. The extent will be converted to written
 * after IO is complete.
 */
static int ext4_dio_get_block_unwritten_async(struct inode *inode,
		sector_t iblock, struct buffer_head *bh_result,	int create)
{
	int ret;

	/* We don't expect handle for direct IO */
	WARN_ON_ONCE(ext4_journal_current_handle());

	ret = ext4_get_block_trans(inode, iblock, bh_result,
				   EXT4_GET_BLOCKS_IO_CREATE_EXT);

	/*
	 * When doing DIO using unwritten extents, we need io_end to convert
	 * unwritten extents to written on IO completion. We allocate io_end
	 * once we spot unwritten extent and store it in b_private. Generic
	 * DIO code keeps b_private set and furthermore passes the value to
	 * our completion callback in 'private' argument.
	 */
	if (!ret && buffer_unwritten(bh_result)) {
		if (!bh_result->b_private) {
			ext4_io_end_t *io_end;

			io_end = ext4_init_io_end(inode, GFP_KERNEL);
			if (!io_end)
				return -ENOMEM;
			bh_result->b_private = io_end;
			ext4_set_io_unwritten_flag(inode, io_end);
		}
		set_buffer_defer_completion(bh_result);
	}

	return ret;
}

/*
 * Get block function for non-AIO DIO writes when we create unwritten extent if
 * blocks are not allocated yet. The extent will be converted to written
 * after IO is complete from ext4_ext_direct_IO() function.
 */
static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
		sector_t iblock, struct buffer_head *bh_result,	int create)
{
	int ret;

	/* We don't expect handle for direct IO */
	WARN_ON_ONCE(ext4_journal_current_handle());

	ret = ext4_get_block_trans(inode, iblock, bh_result,
				   EXT4_GET_BLOCKS_IO_CREATE_EXT);

	/*
	 * Mark inode as having pending DIO writes to unwritten extents.
	 * ext4_ext_direct_IO() checks this flag and converts extents to
	 * written.
	 */
	if (!ret && buffer_unwritten(bh_result))
		ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);

	return ret;
}

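/*
 * Get block function for DIO writes that overwrite blocks which are already
 * allocated and written, so no block allocation, extent conversion or
 * journal handle is needed.
 */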
static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create)
{
	int ret;

	ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	/* We don't expect handle for direct IO */
	WARN_ON_ONCE(ext4_journal_current_handle());

	ret = _ext4_get_block(inode, iblock, bh_result, 0);
	/*
	 * Blocks should have been preallocated! ext4_file_write_iter() checks
	 * that.
	 */
	WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));

	return ret;
}


/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
				ext4_lblk_t block, int map_flags)
{
	struct ext4_map_blocks map;
	struct buffer_head *bh;
	int create = map_flags & EXT4_GET_BLOCKS_CREATE;
	int err;

	J_ASSERT(handle != NULL || create == 0);

	map.m_lblk = block;
	map.m_len = 1;
	err = ext4_map_blocks(handle, inode, &map, map_flags);

	if (err == 0)
		return create ? ERR_PTR(-ENOSPC) : NULL;
	if (err < 0)
		return ERR_PTR(err);

	bh = sb_getblk(inode->i_sb, map.m_pblk);
	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);
	if (map.m_flags & EXT4_MAP_NEW) {
		J_ASSERT(create != 0);
		J_ASSERT(handle != NULL);

		/*
		 * Now that we do not always journal data, we should
		 * keep in mind whether this should always journal the
		 * new buffer as metadata.  For now, regular file
		 * writes use ext4_get_block instead, so it's not a
		 * problem.
		 */
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext4_journal_get_create_access(handle, bh);
		if (unlikely(err)) {
			unlock_buffer(bh);
			goto errout;
		}
		if (!buffer_uptodate(bh)) {
			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
			set_buffer_uptodate(bh);
		}
		unlock_buffer(bh);
		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (unlikely(err))
			goto errout;
	} else
		BUFFER_TRACE(bh, "not a new buffer");
	return bh;
errout:
	brelse(bh);
	return ERR_PTR(err);
}

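/*
 * Like ext4_getblk(), but additionally reads the block from disk if it is
 * not already uptodate.  Returns an ERR_PTR on failure.
 */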
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
			       ext4_lblk_t block, int map_flags)
{
	struct buffer_head *bh;

	bh = ext4_getblk(handle, inode, block, map_flags);
	if (IS_ERR(bh))
		return bh;
	if (!bh || buffer_uptodate(bh))
		return bh;
	ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	return ERR_PTR(-EIO);
}

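/*
 * Walk the buffers attached to a page, calling @fn on each buffer that
 * overlaps the byte range [from, to).  Buffers outside that range only set
 * *partial when they are not uptodate.  The first error returned by @fn is
 * propagated to the caller.
 */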
int ext4_walk_page_buffers(handle_t *handle,
			   struct buffer_head *head,
			   unsigned from,
			   unsigned to,
			   int *partial,
			   int (*fn)(handle_t *handle,
				     struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next) {
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage().  In that case, we
 * *know* that ext4_writepage() has generated enough buffer credits to do the
 * whole page.  So we won't block on the journal in that case, which is good,
 * because the caller may be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
int do_journal_get_write_access(handle_t *handle,
				struct buffer_head *bh)
{
	int dirty = buffer_dirty(bh);
	int ret;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	/*
	 * __block_write_begin() could have dirtied some buffers. Clean
	 * the dirty bit as jbd2_journal_get_write_access() could complain
	 * otherwise about fs integrity issues. Setting of the dirty bit
	 * by __block_write_begin() isn't a real problem here as we clear
	 * the bit before releasing a page lock and thus writeback cannot
	 * ever write the buffer.
	 */
	if (dirty)
		clear_buffer_dirty(bh);
	BUFFER_TRACE(bh, "get write access");
	ret = ext4_journal_get_write_access(handle, bh);
	if (!ret && dirty)
		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
	return ret;
}

#ifdef CONFIG_EXT4_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
				  get_block_t *get_block)
{
	unsigned from = pos & (PAGE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize = inode->i_sb->s_blocksize;
	unsigned bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
	bool decrypt = false;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_SIZE);
	BUG_ON(to > PAGE_SIZE);
	BUG_ON(from > to);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);
	bbits = ilog2(blocksize);
	block = (sector_t)page->index << (PAGE_SHIFT - bbits);

	for (bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start = block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh->b_bdev,
							  bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				if (block_end > to || block_start < from)
					zero_user_segments(page, to, block_end,
							   block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		    (block_start < from || block_end > to)) {
			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
			*wait_bh++ = bh;
			decrypt = ext4_encrypted_inode(inode) &&
				S_ISREG(inode->i_mode);
		}
	}
	/*
	 * If we issued read requests, let them complete.
	 */
	while (wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	else if (decrypt)
		err = fscrypt_decrypt_page(page);
	return err;
}
#endif

static int ext4_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks;