Commit 8e2e0047 authored by Bob Peterson's avatar Bob Peterson Committed by Steven Whitehouse
Browse files

GFS2: Reduce file fragmentation



This patch reduces GFS2 file fragmentation by pre-reserving blocks. The
resulting improved on disk layout greatly speeds up operations in cases
which would have resulted in interlaced allocation of blocks previously.
A typical example of this is 10 parallel dd processes, each writing to a
file in a common dirctory.

The implementation uses an rbtree of reservations attached to each
resource group (and each inode).
Signed-off-by: default avatarBob Peterson <rpeterso@redhat.com>
Signed-off-by: default avatarSteven Whitehouse <swhiteho@redhat.com>
parent 294f2ad5
......@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (error)
goto out_rlist;
if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
gfs2_rs_deltree(ip->i_res);
error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
RES_INDIRECT + RES_STATFS + RES_QUOTA,
revokes);
......
......@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
return ret;
atomic_set(&ip->i_res->rs_sizehint,
PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
......@@ -571,22 +574,15 @@ static int gfs2_open(struct inode *inode, struct file *file)
static int gfs2_release(struct inode *inode, struct file *file)
{
struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
struct gfs2_file *fp;
struct gfs2_inode *ip = GFS2_I(inode);
fp = file->private_data;
kfree(file->private_data);
file->private_data = NULL;
if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
if ((file->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1))
gfs2_rs_delete(ip);
if (gfs2_assert_warn(sdp, fp))
return -EIO;
kfree(fp);
return 0;
}
......@@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
size_t writesize = iov_length(iov, nr_segs);
struct dentry *dentry = file->f_dentry;
struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
struct gfs2_sbd *sdp;
int ret;
sdp = GFS2_SB(file->f_mapping->host);
ret = gfs2_rs_alloc(ip);
if (ret)
return ret;
atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
if (file->f_flags & O_APPEND) {
struct gfs2_holder gh;
......@@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
if (unlikely(error))
goto out_uninit;
atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
while (len > 0) {
if (len < bytes)
bytes = len;
......@@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
offset += bytes;
continue;
}
error = gfs2_rindex_update(sdp);
if (error)
goto out_unlock;
error = gfs2_quota_lock_check(ip);
if (error)
goto out_unlock;
......
......@@ -84,6 +84,7 @@ struct gfs2_rgrpd {
u32 rd_data; /* num of data blocks in rgrp */
u32 rd_bitbytes; /* number of bytes in data bitmaps */
u32 rd_free;
u32 rd_reserved; /* number of blocks reserved */
u32 rd_free_clone;
u32 rd_dinodes;
u64 rd_igeneration;
......@@ -96,6 +97,9 @@ struct gfs2_rgrpd {
#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
spinlock_t rd_rsspin; /* protects reservation related vars */
struct rb_root rd_rstree; /* multi-block reservation tree */
u32 rd_rs_cnt; /* count of current reservations */
};
enum gfs2_state_bits {
......@@ -233,6 +237,38 @@ struct gfs2_holder {
unsigned long gh_ip;
};
/* Resource group multi-block reservation, in order of appearance:
Step 1. Function prepares to write, allocates a mb, sets the size hint.
Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
Step 4. Bits are assigned from the rgrp based on either the reservation
or wherever it can.
*/
struct gfs2_blkreserv {
/* components used during write (step 1): */
atomic_t rs_sizehint; /* hint of the write size */
/* components used during inplace_reserve (step 2): */
u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
/* components used during get_local_rgrp (step 3): */
struct gfs2_rgrpd *rs_rgd; /* pointer to the gfs2_rgrpd */
struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
struct rb_node rs_node; /* link to other block reservations */
/* components used during block searches and assignments (step 4): */
struct gfs2_bitmap *rs_bi; /* bitmap for the current allocation */
u32 rs_biblk; /* start block relative to the bi */
u32 rs_free; /* how many blocks are still free */
/* ancillary quota stuff */
struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
unsigned int rs_qa_qd_num;
};
enum {
GLF_LOCK = 1,
GLF_DEMOTE = 3,
......@@ -290,16 +326,6 @@ struct gfs2_glock {
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
struct gfs2_blkreserv {
u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
/* ancillary quota stuff */
struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
unsigned int rs_qa_qd_num;
};
enum {
GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
......@@ -307,7 +333,6 @@ enum {
GIF_SW_PAGED = 3,
};
struct gfs2_inode {
struct inode i_inode;
u64 i_no_addr;
......@@ -318,7 +343,7 @@ struct gfs2_inode {
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_blkreserv *i_res; /* resource group block reservation */
struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
......
......@@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
int error;
munge_mode_uid_gid(dip, &mode, &uid, &gid);
error = gfs2_rindex_update(sdp);
if (error)
return error;
error = gfs2_quota_lock(dip, uid, gid);
if (error)
......@@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
struct buffer_head *dibh;
int error;
error = gfs2_rindex_update(sdp);
if (error)
return error;
error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
goto fail;
......@@ -596,7 +603,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
gfs2_trans_end(sdp);
fail_ipreserv:
gfs2_inplace_release(dip);
if (alloc_required)
gfs2_inplace_release(dip);
fail_quota_locks:
gfs2_quota_unlock(dip);
......@@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
struct inode *inode = NULL;
struct gfs2_inode *dip = GFS2_I(dir);
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
int error;
......@@ -657,6 +665,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
/* We need a reservation to allocate the new dinode block. The
directory ip temporarily points to the reservation, but this is
being done to get a set of contiguous blocks for the new dinode.
Since this is a create, we don't have a sizehint yet, so it will
have to use the minimum reservation size. */
error = gfs2_rs_alloc(dip);
if (error)
return error;
......@@ -694,24 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
goto fail_gunlock2;
error = gfs2_inode_refresh(GFS2_I(inode));
ip = GFS2_I(inode);
error = gfs2_inode_refresh(ip);
if (error)
goto fail_gunlock2;
/* the new inode needs a reservation so it can allocate xattrs. */
error = gfs2_rs_alloc(GFS2_I(inode));
if (error)
goto fail_gunlock2;
/* The newly created inode needs a reservation so it can allocate
xattrs. At the same time, we want new blocks allocated to the new
dinode to be as contiguous as possible. Since we allocated the
dinode block under the directory's reservation, we transfer
ownership of that reservation to the new inode. The directory
doesn't need a reservation unless it needs a new allocation. */
ip->i_res = dip->i_res;
dip->i_res = NULL;
error = gfs2_acl_create(dip, inode);
if (error)
goto fail_gunlock2;
error = gfs2_security_init(dip, GFS2_I(inode), name);
error = gfs2_security_init(dip, ip, name);
if (error)
goto fail_gunlock2;
error = link_dinode(dip, name, GFS2_I(inode));
error = link_dinode(dip, name, ip);
if (error)
goto fail_gunlock2;
......@@ -738,6 +756,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
iput(inode);
}
fail:
gfs2_rs_delete(dip);
if (bh)
brelse(bh);
return error;
......
This diff is collapsed.
......@@ -13,6 +13,14 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
/* Since each block in the file system is represented by two bits in the
* bitmap, one 64-bit word in the bitmap will represent 32 blocks.
* By reserving 32 blocks at a time, we can optimize / shortcut how we search
* through the bitmaps by looking a word at a time.
*/
#define RGRP_RSRV_MINBYTES 8
#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
struct gfs2_rgrpd;
struct gfs2_sbd;
struct gfs2_holder;
......@@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
extern void gfs2_inplace_release(struct gfs2_inode *ip);
......@@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
extern int gfs2_rs_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
extern void gfs2_rs_delete(struct gfs2_inode *ip);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
......@@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
extern int gfs2_fitrim(struct file *filp, void __user *argp);
/* This is how to tell if a reservation is "inplace" reserved: */
/* This is how to tell if a multi-block reservation is "inplace" reserved: */
static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
{
if (ip->i_res && ip->i_res->rs_requested)
......@@ -70,4 +81,22 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
return 0;
}
/* This is how to tell if a multi-block reservation is in the rgrp tree: */
static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
{
if (rs && rs->rs_bi)
return 1;
return 0;
}
static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
{
return (bi->bi_start * GFS2_NBBY) + blk;
}
static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
{
return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
}
#endif /* __RGRP_DOT_H__ */
......@@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
return -EIO;
}
error = gfs2_rindex_update(sdp);
if (error)
return error;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
return error;
......@@ -1550,6 +1554,9 @@ static void gfs2_evict_inode(struct inode *inode)
out_unlock:
/* Error path for case 1 */
if (gfs2_rs_active(ip->i_res))
gfs2_rs_deltree(ip->i_res);
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
gfs2_glock_dq(&ip->i_iopen_gh);
gfs2_holder_uninit(&ip->i_iopen_gh);
......
......@@ -14,6 +14,7 @@
#include <linux/ktime.h>
#include "incore.h"
#include "glock.h"
#include "rgrp.h"
#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
#define glock_trace_name(x) __print_symbolic(x, \
......@@ -31,6 +32,17 @@
{ GFS2_BLKST_DINODE, "dinode" }, \
{ GFS2_BLKST_UNLINKED, "unlinked" })
#define TRACE_RS_DELETE 0
#define TRACE_RS_TREEDEL 1
#define TRACE_RS_INSERT 2
#define TRACE_RS_CLAIM 3
#define rs_func_name(x) __print_symbolic(x, \
{ 0, "del " }, \
{ 1, "tdel" }, \
{ 2, "ins " }, \
{ 3, "clm " })
#define show_glock_flags(flags) __print_flags(flags, "", \
{(1UL << GLF_LOCK), "l" }, \
{(1UL << GLF_DEMOTE), "D" }, \
......@@ -470,6 +482,7 @@ TRACE_EVENT(gfs2_block_alloc,
__field( u8, block_state )
__field( u64, rd_addr )
__field( u32, rd_free_clone )
__field( u32, rd_reserved )
),
TP_fast_assign(
......@@ -480,16 +493,58 @@ TRACE_EVENT(gfs2_block_alloc,
__entry->block_state = block_state;
__entry->rd_addr = rgd->rd_addr;
__entry->rd_free_clone = rgd->rd_free_clone;
__entry->rd_reserved = rgd->rd_reserved;
),
TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->inum,
(unsigned long long)__entry->start,
(unsigned long)__entry->len,
block_state_name(__entry->block_state),
(unsigned long long)__entry->rd_addr,
__entry->rd_free_clone)
__entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
);
/* Keep track of multi-block reservations as they are allocated/freed */
TRACE_EVENT(gfs2_rs,
TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
u8 func),
TP_ARGS(ip, rs, func),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( u64, rd_addr )
__field( u32, rd_free_clone )
__field( u32, rd_reserved )
__field( u64, inum )
__field( u64, start )
__field( u32, free )
__field( u8, func )
),
TP_fast_assign(
__entry->dev = rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
__entry->rd_addr = rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
__entry->rd_free_clone = rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
__entry->rd_reserved = rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
__entry->inum = ip ? ip->i_no_addr : 0;
__entry->start = gfs2_rs_startblk(rs);
__entry->free = rs->rs_free;
__entry->func = func;
),
TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s "
"f:%lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->inum,
(unsigned long long)__entry->start,
(unsigned long long)__entry->rd_addr,
(unsigned long)__entry->rd_free_clone,
(unsigned long)__entry->rd_reserved,
rs_func_name(__entry->func), (unsigned long)__entry->free)
);
#endif /* _TRACE_GFS2_H */
......
......@@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
{
int error;
error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
if (error)
return error;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
goto out_alloc;
......@@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
struct buffer_head *dibh;
int error;
error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
if (error)
return error;
error = gfs2_quota_lock_check(ip);
if (error)
return error;
......@@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
{
int error;
error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
if (error)
return error;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
return error;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment