Commit dec214d0 authored by Tahsin Erdogan's avatar Tahsin Erdogan Committed by Theodore Ts'o
Browse files

ext4: xattr inode deduplication



Ext4 now supports xattr values that are up to 64k in size (vfs limit).
Large xattr values are stored in external inodes each one holding a
single value. Once written the data blocks of these inodes are immutable.

The real world use cases are expected to have a lot of value duplication
such as inherited acls etc. To reduce data duplication on disk, this patch
implements a deduplicator that allows sharing of xattr inodes.

The deduplication is based on an in-memory hash lookup that is a best
effort sharing scheme. When a xattr inode is read from disk (i.e.
getxattr() call), its crc32c hash is added to a hash table. Before
creating a new xattr inode for a value being set, the hash table is
checked to see if an existing inode holds an identical value. If such an
inode is found, the ref count on that inode is incremented. On value
removal the ref count is decremented and if it reaches zero the inode is
deleted.

The quota charging for such inodes is manually managed. Every reference
holder is charged the full size as if there was no sharing happening.
This is consistent with how xattr blocks are also charged.

[ Fixed up journal credits calculation to handle inline data and the
  rare case where an shared xattr block can get freed when two thread
  race on breaking the xattr block sharing. --tytso ]
Signed-off-by: default avatarTahsin Erdogan <tahsin@google.com>
Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 30a7eb97
......@@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (error)
return error;
retry:
credits = ext4_xattr_set_credits(inode, acl_size);
error = ext4_xattr_set_credits(inode, acl_size, &credits);
if (error)
return error;
handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
......
......@@ -1517,6 +1517,7 @@ struct ext4_sb_info {
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
struct mb_cache *s_ea_block_cache;
struct mb_cache *s_ea_inode_cache;
spinlock_t s_es_lock ____cacheline_aligned_in_smp;
/* Ratelimit ext4 messages. */
......@@ -2100,7 +2101,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
}
#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
static inline bool ext4_is_quota_file(struct inode *inode)
{
return IS_NOQUOTA(inode) &&
!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
}
/*
* This structure is stuffed into the struct file's private_data field
......@@ -2493,7 +2498,6 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
......@@ -2720,19 +2724,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
return ext4_has_feature_gdt_csum(sb) ||
EXT4_SB(sb)->s_chksum_driver != NULL;
}
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
!EXT4_SB(sb)->s_chksum_driver);
return (EXT4_SB(sb)->s_chksum_driver != NULL);
return ext4_has_feature_metadata_csum(sb) &&
(EXT4_SB(sb)->s_chksum_driver != NULL);
}
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
}
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
......
......@@ -139,6 +139,8 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
/*
* Test whether an inode is a fast symlink.
......@@ -4843,8 +4845,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
brelse(iloc.bh);
ext4_set_inode_flags(inode);
if (ei->i_flags & EXT4_EA_INODE_FL)
if (ei->i_flags & EXT4_EA_INODE_FL) {
ext4_xattr_inode_set_class(inode);
inode_lock(inode);
inode->i_flags |= S_NOQUOTA;
inode_unlock(inode);
}
unlock_new_inode(inode);
return inode;
......@@ -5503,7 +5512,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
*
* Also account for superblock, inode, quota and xattr blocks
*/
int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
......
......@@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
if (sbi->s_ea_inode_cache) {
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
sbi->s_ea_inode_cache = NULL;
}
if (sbi->s_ea_block_cache) {
ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
sbi->s_ea_block_cache = NULL;
......@@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
if (res)
return res;
retry:
credits = ext4_xattr_set_credits(inode, len);
res = ext4_xattr_set_credits(inode, len, &credits);
if (res)
return res;
handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
......@@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Load the checksum driver */
if (ext4_has_feature_metadata_csum(sb)) {
if (ext4_has_feature_metadata_csum(sb) ||
ext4_has_feature_ea_inode(sb)) {
sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
......@@ -3467,7 +3475,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Precompute checksum seed for all metadata */
if (ext4_has_feature_csum_seed(sb))
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
else if (ext4_has_metadata_csum(sb))
else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
......@@ -3597,6 +3605,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"The Hurd can't support 64-bit file systems");
goto failed_mount;
}
/*
* ea_inode feature uses l_i_version field which is not
* available in HURD_COMPAT mode.
*/
if (ext4_has_feature_ea_inode(sb)) {
ext4_msg(sb, KERN_ERR,
"ea_inode feature is not supported for Hurd");
goto failed_mount;
}
}
if (IS_EXT2_SB(sb)) {
......@@ -4067,6 +4085,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq;
}
if (ext4_has_feature_ea_inode(sb)) {
sbi->s_ea_inode_cache = ext4_xattr_create_cache();
if (!sbi->s_ea_inode_cache) {
ext4_msg(sb, KERN_ERR,
"Failed to create ea_inode_cache");
goto failed_mount_wq;
}
}
if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
(blocksize != PAGE_SIZE)) {
ext4_msg(sb, KERN_ERR,
......@@ -4296,6 +4323,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
if (sbi->s_ea_inode_cache) {
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
sbi->s_ea_inode_cache = NULL;
}
if (sbi->s_ea_block_cache) {
ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
sbi->s_ea_block_cache = NULL;
......
This diff is collapsed.
......@@ -69,19 +69,6 @@ struct ext4_xattr_entry {
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
/*
* Link EA inode back to parent one using i_mtime field.
* Extra integer type conversion added to ignore higher
* bits in i_mtime.tv_sec which might be set by ext4_get()
*/
#define EXT4_XATTR_INODE_SET_PARENT(inode, inum) \
do { \
(inode)->i_mtime.tv_sec = inum; \
} while(0)
#define EXT4_XATTR_INODE_GET_PARENT(inode) \
((__u32)(inode)->i_mtime.tv_sec)
/*
* The minimum size of EA value when you start storing it in an external inode
* size of block - size of header - size of 1 entry - 4 null bytes
......@@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
int *credits);
extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
struct ext4_xattr_inode_array **array,
int extra_credits);
......
......@@ -13,10 +13,11 @@
* mb_cache_entry_delete()).
*
* Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
* They use hash of a block contents as a key and block number as a value.
* That's why keys need not be unique (different xattr blocks may end up having
* the same hash). However block number always uniquely identifies a cache
* entry.
* Ext4 also uses it for deduplication of xattr values stored in inodes.
* They use hash of data as a key and provide a value that may represent a
* block or inode number. That's why keys need not be unique (hash of different
* data may be the same). However user provided value always uniquely
* identifies a cache entry.
*
* We provide functions for creation and removal of entries, search by key,
* and a special "delete entry with given key-value pair" operation. Fixed
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment