Commit 0a2b2a84 authored by Josef Bacik's avatar Josef Bacik Committed by Chris Mason
Browse files

Btrfs: throttle delayed refs better

On one of our gluster clusters we noticed some pretty big lag spikes.  This
turned out to be because our transaction commit was taking like 3 minutes to
complete.  This is because we have like 30 gigs of metadata, so our global
reserve would end up being the max which is like 512 mb.  So our throttling code
would allow a ridiculous amount of delayed refs to build up and then they'd all
get run at transaction commit time, and for a cold mounted file system that
could take up to 3 minutes to run.  So fix the throttling to be based on both
the size of the global reserve and how long it takes us to run delayed refs.
This patch tracks the time it takes to run delayed refs and then only allows 1
seconds worth of outstanding delayed refs at a time.  This way it will auto-tune
itself from cold cache up to when everything is in memory and it no longer has
to go to disk.  This makes our transaction commits take much less time to run.
Signed-off-by: default avatarJosef Bacik <>
Signed-off-by: default avatarChris Mason <>
parent d7df2c79
......@@ -1360,6 +1360,7 @@ struct btrfs_fs_info {
u64 generation;
u64 last_trans_committed;
u64 avg_delayed_ref_runtime;
* this is updated to the current trans every time a full commit
......@@ -3172,6 +3173,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
......@@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb,
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
......@@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
struct btrfs_fs_info *fs_info = root->fs_info;
ktime_t start = ktime_get();
int ret;
unsigned long count = 0;
unsigned long actual_count = 0;
int must_insert_reserved = 0;
delayed_refs = &trans->transaction->delayed_refs;
......@@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
} else {
ref->in_tree = 0;
rb_erase(&ref->rb_node, &locked_ref->ref_root);
......@@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* We don't want to include ref heads since we can have empty ref heads
* and those will drastically skew our runtime down since we just do
* accounting, no actual extent tree updates.
if (actual_count > 0) {
u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
u64 avg;
* We weigh the current average higher than our current runtime
* to avoid large swings in the average.
avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
avg = div64_u64(avg, 4);
fs_info->avg_delayed_ref_runtime = avg;
return 0;
......@@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
struct btrfs_block_rsv *global_rsv;
......@@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
return ret;
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
u64 num_entries =
u64 avg_runtime;
avg_runtime = fs_info->avg_delayed_ref_runtime;
if (num_entries * avg_runtime >= NSEC_PER_SEC)
return 1;
return btrfs_check_space_for_delayed_refs(trans, root);
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
......@@ -645,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
if (root->fs_info->global_block_rsv.space_info->full &&
btrfs_should_throttle_delayed_refs(trans, root))
btrfs_check_space_for_delayed_refs(trans, root))
return 1;
return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
......@@ -710,7 +710,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates = 0;
if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
cur = max_t(unsigned long, cur, 1);
cur = max_t(unsigned long, cur, 32);
trans->delayed_ref_updates = 0;
btrfs_run_delayed_refs(trans, root, cur);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment