diff --git a/.bcachefs_revision b/.bcachefs_revision index 485f92e1a..ec5195e12 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -22fa8fc32e6aafb8bd76c6b746868dbdbc6a934d +ec2ddb95112b8967753591b16e2e439eee76c5b1 diff --git a/include/linux/sched.h b/include/linux/sched.h index 99d6a47af..153bd73d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,6 +65,8 @@ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ +#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ +#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/include/linux/swap.h b/include/linux/swap.h new file mode 100644 index 000000000..818642223 --- /dev/null +++ b/include/linux/swap.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SWAP_H +#define _LINUX_SWAP_H + +static inline void mm_account_reclaimed_pages(unsigned long pages) {} + +#endif /* _LINUX_SWAP_H */ diff --git a/include/linux/time64.h b/include/linux/time64.h index cd6cc1c19..0cef3f8c6 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -44,6 +44,20 @@ static inline struct timespec timespec_trunc(struct timespec t, unsigned gran) return t; } +static inline void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + #define ns_to_timespec64 ns_to_timespec #define timespec64_to_ns timespec_to_ns #define timespec64_trunc timespec_trunc diff --git a/include/linux/types.h b/include/linux/types.h index 004d5eb0d..5ee5ebc60 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -37,6 +37,8 @@ typedef unsigned gfp_t; #define __GFP_NOWARN 0 #define __GFP_NORETRY 0 #define __GFP_NOFAIL 0 +#define __GFP_ACCOUNT 0 +#define __GFP_RECLAIMABLE 0 #define __GFP_ZERO 1 #define GFP_KERNEL 2 diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 87f1be9d4..1def61875 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -137,7 +137,7 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, return NULL; acl = allocate_dropping_locks(trans, ret, - posix_acl_alloc(count, _gfp)); + posix_acl_alloc(count, GFP_KERNEL)); if (!acl) return ERR_PTR(-ENOMEM); if (ret) { @@ -427,7 +427,8 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, if (ret) goto err; - ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); + ret = allocate_dropping_locks_errcode(trans, + __posix_acl_chmod(&acl, GFP_KERNEL, mode)); if (ret) goto err; diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index b54ce7f88..51a01423a 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1969,8 +1969,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static void 
bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) @@ -1980,18 +1980,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (discard_in_flight_add(ca, bucket, false)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); -put_ioref: percpu_ref_put(&ca->io_ref); +put_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -2133,26 +2133,26 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); -put_ioref: percpu_ref_put(&ca->io_ref); +put_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) @@ -2298,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c) return 0; } +/* device removal */ + +int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + + /* + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ + ret = bch2_dev_remove_stripes(c, ca) ?: + bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_dev_usage_remove(c, ca->dev_idx); + bch_err_msg(c, ret, "removing dev alloc info"); + return ret; +} + /* Bucket IO clocks: */ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, @@ -2433,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; + lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) clear_bit(ca->dev_idx, c->rw_devs[i].d); + c->rw_devs_change_count++; + /* * Capacity is calculated based off of devices in allocation groups: */ @@ -2468,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* device goes rw: */ void 
bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; + lockdep_assert_held(&c->state_lock); - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) if (ca->mi.data_allowed & (1 << i)) set_bit(ca->dev_idx, c->rw_devs[i].d); + + c->rw_devs_change_count++; } void bch2_dev_allocator_background_exit(struct bch_dev *ca) diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index fd790b03f..577f823a0 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -338,6 +338,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); +int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 084b03b8c..d0e0b5689 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -600,6 +600,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, enum bch_watermark watermark, enum bch_data_type data_type, struct closure *cl, + bool nowait, struct bch_dev_usage *usage) { struct bch_fs *c = trans->c; @@ -609,7 +610,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bucket_alloc_state s = { .btree_bitmap = data_type == BCH_DATA_btree, }; - bool waiting = false; + bool waiting = nowait; again: bch2_dev_usage_read_fast(ca, usage); avail = dev_buckets_free(ca, *usage, watermark); @@ -685,7 +686,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, &usage))); + data_type, cl, false, &usage))); return ob; } @@ -748,7 +749,6 @@ static int add_new_bucket(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - unsigned flags, struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; @@ -775,7 +775,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - unsigned flags, + enum bch_write_flags flags, enum bch_data_type data_type, enum bch_watermark watermark, struct closure *cl) @@ -801,7 +801,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -815,7 +816,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, if (add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob)) { + have_cache, ob)) { ret = 0; break; } @@ -841,7 +842,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl) { struct bch_fs *c = trans->c; @@ -883,7 +884,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); out_put_head: bch2_ec_stripe_head_put(c, 
h); return ret; @@ -922,7 +923,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - bool ec, unsigned flags) + bool ec) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; @@ -934,7 +935,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c, have_cache, ec, ob)) ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); else ob_push(c, &ptrs_skip, ob); } @@ -950,8 +951,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, bool ec, - enum bch_watermark watermark, - unsigned flags) + enum bch_watermark watermark) { int i, ret = 0; @@ -983,7 +983,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); if (ret) break; } @@ -1003,7 +1003,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *_cl) { struct bch_fs *c = trans->c; @@ -1024,13 +1024,13 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, erasure_code, flags); + have_cache, erasure_code); if (ret) return ret; ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, erasure_code, watermark, flags); + have_cache, erasure_code, watermark); if (ret) return ret; @@ -1071,7 +1071,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl) { int ret; @@ -1373,7 +1373,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned nr_replicas_required, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl, struct write_point **wp_ret) { @@ -1389,8 +1389,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; - BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); - BUG_ON(!nr_replicas || !nr_replicas_required); retry: ptrs.nr = 0; @@ -1495,11 +1493,12 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, try_decrease_writepoints(trans, write_points_nr)) goto retry; - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + + if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) - return cl - ? 
-BCH_ERR_bucket_alloc_blocked
-			: -BCH_ERR_ENOSPC_bucket_alloc;
+		ret = -BCH_ERR_bucket_alloc_blocked;
 
 	return ret;
 }
@@ -1730,13 +1729,6 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
 		nr[c->open_buckets[i].data_type]++;
 
-	printbuf_tabstops_reset(out);
-	printbuf_tabstop_push(out, 12);
-	printbuf_tabstop_push(out, 16);
-	printbuf_tabstop_push(out, 16);
-	printbuf_tabstop_push(out, 16);
-	printbuf_tabstop_push(out, 16);
-
 	bch2_dev_usage_to_text(out, ca, &stats);
 
 	prt_newline(out);
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 386d231ce..1a16fd5bd 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -155,9 +155,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
 	return ret;
 }
 
+enum bch_write_flags;
 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
 		      struct dev_stripe_state *, struct bch_devs_mask *,
-		      unsigned, unsigned *, bool *, unsigned,
+		      unsigned, unsigned *, bool *, enum bch_write_flags,
 		      enum bch_data_type, enum bch_watermark, struct closure *);
 
@@ -167,7 +168,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
 			       struct bch_devs_list *,
 			       unsigned, unsigned,
 			       enum bch_watermark,
-			       unsigned,
+			       enum bch_write_flags,
 			       struct closure *,
 			       struct write_point **);
 
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index d4da6343e..f1862c3f9 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -3,12 +3,14 @@
 #include "bbpos.h"
 #include "alloc_background.h"
 #include "backpointers.h"
+#include "bbpos.h"
 #include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
 #include "checksum.h"
+#include "disk_accounting.h"
 #include "error.h"
 
 #include <linux/mm.h>
@@ -750,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
 	s64 mem_may_pin = mem_may_pin_bytes(c);
 	int ret = 0;
 
+	bch2_btree_cache_unpin(c);
+
 	btree_interior_mask |= btree_leaf_mask;
 
-	c->btree_cache.pinned_nodes_leaf_mask		= btree_leaf_mask;
-	c->btree_cache.pinned_nodes_interior_mask	= btree_interior_mask;
+	c->btree_cache.pinned_nodes_mask[0]		= btree_leaf_mask;
+	c->btree_cache.pinned_nodes_mask[1]		= btree_interior_mask;
 	c->btree_cache.pinned_nodes_start		= start;
 	c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
 
@@ -775,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
 					BBPOS(btree, b->key.k.p);
 				break;
 			}
+			bch2_node_pin(c, b);
 			0;
 		}));
 	}
@@ -782,12 +787,80 @@
 	return ret;
 }
 
+struct progress_indicator_state {
+	unsigned long		next_print;
+	u64			nodes_seen;
+	u64			nodes_total;
+	struct btree		*last_node;
+};
+
+static inline void progress_init(struct progress_indicator_state *s,
+				 struct bch_fs *c,
+				 u64 btree_id_mask)
+{
+	memset(s, 0, sizeof(*s));
+
+	s->next_print = jiffies + HZ * 10;
+
+	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+		if (!(btree_id_mask & BIT_ULL(i)))
+			continue;
+
+		struct disk_accounting_pos acc = {
+			.type		= BCH_DISK_ACCOUNTING_btree,
+			.btree.id	= i,
+		};
+
+		u64 v;
+		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+		s->nodes_total += div64_ul(v, btree_sectors(c));
+	}
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+	bool ret = time_after_eq(jiffies, s->next_print);
+
+	if (ret)
+		s->next_print = jiffies + HZ * 10;
+	return
ret; +} + +static void progress_update_iter(struct btree_trans *trans, + struct progress_indicator_state *s, + struct btree_iter *iter, + const char *msg) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(btree_iter_path(trans, iter))->b; + + s->nodes_seen += b != s->last_node; + s->last_node = b; + + if (progress_update_p(s)) { + struct printbuf buf = PRINTBUF; + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", + msg, percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } +} + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; + struct progress_indicator_state progress; int ret = 0; + progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); + for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { @@ -805,6 +878,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); @@ -865,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); - c->btree_cache.pinned_nodes_leaf_mask = 0; - c->btree_cache.pinned_nodes_interior_mask = 0; + bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; @@ -920,19 +993,24 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { + struct bch_fs *c = trans->c; struct bkey_buf last_flushed; + struct progress_indicator_state progress; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); + progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_one_backpointer(trans, start, end, - bkey_s_c_to_backpointer(k), - &last_flushed)); - - bch2_bkey_buf_exit(&last_flushed, trans->c); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + check_one_backpointer(trans, start, end, + bkey_s_c_to_backpointer(k), + &last_flushed); + })); + + bch2_bkey_buf_exit(&last_flushed, c); return ret; } @@ -977,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); - c->btree_cache.pinned_nodes_leaf_mask = 0; - c->btree_cache.pinned_nodes_interior_mask = 0; + bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index d43bbdbac..c711d4c27 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -542,7 +542,7 @@ struct bch_dev { * gc_gens_lock, for device resize - holding any is sufficient for * access: Or rcu_read_lock(), but only for dev_ptr_stale(): */ - struct bucket_array __rcu *buckets_gc; + GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; @@ -871,6 +871,7 @@ struct bch_fs { /* ALLOCATION */ struct bch_devs_mask rw_devs[BCH_DATA_NR]; + unsigned long 
rw_devs_change_count;
 
 	u64			capacity; /* sectors */
 	u64			reserved; /* sectors */
@@ -1045,8 +1046,6 @@ struct bch_fs {
 	 * for signaling to the toplevel code which pass we want to run now.
 	 */
 	enum bch_recovery_pass	curr_recovery_pass;
-	/* bitmap of explicitly enabled recovery passes: */
-	u64			recovery_passes_explicit;
 	/* bitmask of recovery passes that we actually ran */
 	u64			recovery_passes_complete;
 	/* never rewinds version of curr_recovery_pass */
@@ -1195,12 +1194,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
 static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
 {
 	struct timespec64 t;
+	s64 sec;
 	s32 rem;
 
 	time += c->sb.time_base_lo;
 
-	t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
-	t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+	sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+
+	set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
+
 	return t;
 }
 
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 662f0f79b..7b951b273 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -15,11 +15,12 @@
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
+#include <linux/swap.h>
 
 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
 do {						 \
 	if (shrinker_counter)			 \
-		bc->not_freed_##counter++;	 \
+		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++;	\
 } while (0)
 
 const char * const bch2_btree_node_flags[] = {
@@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = {
 
 void bch2_recalc_btree_reserve(struct bch_fs *c)
 {
-	unsigned i, reserve = 16;
+	unsigned reserve = 16;
 
 	if (!c->btree_roots_known[0].b)
 		reserve += 8;
 
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
+	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
 
 		if (r->b)
 			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
 	}
 
-	c->btree_cache.reserve = reserve;
+	c->btree_cache.nr_reserve = reserve;
 }
 
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
 {
-	return max_t(int, 0, bc->used - bc->reserve);
+	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+	size_t can_free = list->nr;
+	if (!list->idx)
+		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+	return can_free;
 }
 
 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
 	struct btree_cache *bc = &c->btree_cache;
 
+	BUG_ON(btree_node_hashed(b));
+
+	/*
+	 * This should really be done in slub/vmalloc, but we're using the
+	 * kmalloc_large() path, so we're working around a slub bug by doing
+	 * this here:
+	 */
+	if (b->data)
+		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
+	if (b->aux_data)
+		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
+
 	EBUG_ON(btree_node_write_in_flight(b));
 
 	clear_btree_node_just_written(b);
@@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 #endif
 	b->aux_data = NULL;
 
-	bc->used--;
+	bc->nr_freeable--;
 
 	btree_node_to_freedlist(bc, b);
 }
@@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
 	BUG_ON(b->data || b->aux_data);
 
+	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
 	b->data = kvmalloc(btree_buf_bytes(b), gfp);
 	if (!b->data)
 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
@@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 
 	bch2_btree_lock_init(&b->c, 0);
 
-
bc->used++; + bc->nr_freeable++; list_add(&b->list, &bc->freeable); return b; } @@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) six_unlock_intent(&b->c.lock); } +static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) +{ + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = bc->pinned_nodes_mask[!!b->c.level]; + + return ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); +} + +void bch2_node_pin(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + mutex_lock(&bc->lock); + BUG_ON(!__btree_node_pinned(bc, b)); + if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { + set_btree_node_pinned(b); + list_move(&b->list, &bc->live[1].list); + bc->live[0].nr--; + bc->live[1].nr++; + } + mutex_unlock(&bc->lock); +} + +void bch2_btree_cache_unpin(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *n; + + mutex_lock(&bc->lock); + c->btree_cache.pinned_nodes_mask[0] = 0; + c->btree_cache.pinned_nodes_mask[1] = 0; + + list_for_each_entry_safe(b, n, &bc->live[1].list, list) { + clear_btree_node_pinned(b); + list_move(&b->list, &bc->live[0].list); + bc->live[0].nr++; + bc->live[1].nr--; + } + + mutex_unlock(&bc->lock); +} + /* Btree in memory cache - hash table */ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { + lockdep_assert_held(&bc->lock); int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); BUG_ON(ret); @@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) b->hash_val = 0; if (b->c.btree_id < BTREE_ID_NR) - --bc->used_by_btree[b->c.btree_id]; + --bc->nr_by_btree[b->c.btree_id]; + + bc->live[btree_node_pinned(b)].nr--; + bc->nr_freeable++; + list_move(&b->list, &bc->freeable); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); - if (!ret && b->c.btree_id < BTREE_ID_NR) - bc->used_by_btree[b->c.btree_id]++; - return ret; + if (ret) + return ret; + + if (b->c.btree_id < BTREE_ID_NR) + bc->nr_by_btree[b->c.btree_id]++; + + bool p = __btree_node_pinned(bc, b); + mod_bit(BTREE_NODE_pinned, &b->flags, p); + + list_move_tail(&b->list, &bc->live[p].list); + bc->live[p].nr++; + + bc->nr_freeable--; + return 0; } int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, unsigned level, enum btree_id id) { - int ret; - b->c.level = level; b->c.btree_id = id; mutex_lock(&bc->lock); - ret = __bch2_btree_node_hash_insert(bc, b); - if (!ret) - list_add_tail(&b->list, &bc->live); + int ret = __bch2_btree_node_hash_insert(bc, b); mutex_unlock(&bc->lock); return ret; @@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b int ret = 0; lockdep_assert_held(&bc->lock); - - struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); - - u64 mask = b->c.level - ? 
bc->pinned_nodes_interior_mask
-		: bc->pinned_nodes_leaf_mask;
-
-	if ((mask & BIT_ULL(b->c.btree_id)) &&
-	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
-	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
-
 wait_on_io:
 	if (b->flags & ((1U << BTREE_NODE_dirty)|
 			(1U << BTREE_NODE_read_in_flight)|
@@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 					   struct shrink_control *sc)
 {
-	struct bch_fs *c = shrink->private_data;
-	struct btree_cache *bc = &c->btree_cache;
+	struct btree_cache_list *list = shrink->private_data;
+	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
 	struct btree *b, *t;
 	unsigned long nr = sc->nr_to_scan;
 	unsigned long can_free = 0;
@@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 	unsigned long touched = 0;
 	unsigned i, flags;
 	unsigned long ret = SHRINK_STOP;
-	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
-		bc->used * 3 / 4;
+	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
 
 	if (bch2_btree_shrinker_disabled)
 		return SHRINK_STOP;
@@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 	 * succeed, so that inserting keys into the btree can always succeed and
 	 * IO can always make forward progress:
 	 */
-	can_free = btree_cache_can_free(bc);
+	can_free = btree_cache_can_free(list);
 	nr = min_t(unsigned long, nr, can_free);
 
 	i = 0;
@@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 			six_unlock_write(&b->c.lock);
 			six_unlock_intent(&b->c.lock);
 			freed++;
-			bc->freed++;
+			bc->nr_freed++;
 		}
 	}
 restart:
-	list_for_each_entry_safe(b, t, &bc->live, list) {
+	list_for_each_entry_safe(b, t, &list->list, list) {
 		touched++;
 
 		if (btree_node_accessed(b)) {
 			clear_btree_node_accessed(b);
-			bc->not_freed_access_bit++;
+			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
+			--touched;
 		} else if (!btree_node_reclaim(c, b, true)) {
+			bch2_btree_node_hash_remove(bc, b);
+
 			freed++;
 			btree_node_data_free(c, b);
-			bc->freed++;
+			bc->nr_freed++;
 
-			bch2_btree_node_hash_remove(bc, b);
 			six_unlock_write(&b->c.lock);
 			six_unlock_intent(&b->c.lock);
 
@@ -450,7 +517,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 			   !btree_node_will_make_reachable(b) &&
 			   !btree_node_write_blocked(b) &&
 			   six_trylock_read(&b->c.lock)) {
-			list_move(&bc->live, &b->list);
+			list_move(&list->list, &b->list);
 			mutex_unlock(&bc->lock);
 			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
 			six_unlock_read(&b->c.lock);
@@ -464,8 +531,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 			break;
 	}
 out_rotate:
-	if (&t->list != &bc->live)
-		list_move_tail(&bc->live, &t->list);
+	if (&t->list != &list->list)
+		list_move_tail(&list->list, &t->list);
 out:
 	mutex_unlock(&bc->lock);
 out_nounlock:
@@ -478,44 +545,45 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc)
 {
-	struct bch_fs *c = shrink->private_data;
-	struct btree_cache *bc = &c->btree_cache;
+	struct btree_cache_list *list = shrink->private_data;
 
 	if (bch2_btree_shrinker_disabled)
 		return 0;
 
-	return btree_cache_can_free(bc);
+	return btree_cache_can_free(list);
 }
 
 void bch2_fs_btree_cache_exit(struct bch_fs *c)
 {
 	struct btree_cache *bc =
&c->btree_cache; - struct btree *b; - unsigned i, flags; + struct btree *b, *t; + unsigned long flags; - shrinker_free(bc->shrink); + shrinker_free(bc->live[1].shrink); + shrinker_free(bc->live[0].shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); mutex_lock(&bc->lock); if (c->verify_data) - list_move(&c->verify_data->list, &bc->live); + list_move(&c->verify_data->list, &bc->live[0].list); kvfree(c->verify_ondisk); - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) - list_add(&r->b->list, &bc->live); + list_add(&r->b->list, &bc->live[0].list); } - list_splice(&bc->freeable, &bc->live); - - while (!list_empty(&bc->live)) { - b = list_first_entry(&bc->live, struct btree, list); + list_for_each_entry_safe(b, t, &bc->live[1].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->live[0].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->freeable, list) { BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); @@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) } BUG_ON(!bch2_journal_error(&c->journal) && - atomic_read(&c->btree_cache.dirty)); + atomic_long_read(&c->btree_cache.nr_dirty)); list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); - while (!list_empty(&bc->freed_nonpcpu)) { - b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) { list_del(&b->list); six_lock_exit(&b->c.lock); kfree(b); @@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) mutex_unlock(&bc->lock); memalloc_nofs_restore(flags); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) + BUG_ON(bc->nr_by_btree[i]); + BUG_ON(bc->live[0].nr); + BUG_ON(bc->live[1].nr); + BUG_ON(bc->nr_freeable); + if (bc->table_init_done) rhashtable_destroy(&bc->table); } @@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); - for (i = 0; i < bc->reserve; i++) + for (i = 0; i < bc->nr_reserve; i++) if (!__bch2_btree_node_mem_alloc(c)) goto err; - list_splice_init(&bc->live, &bc->freeable); + list_splice_init(&bc->live[0].list, &bc->freeable); mutex_init(&c->verify_lock); shrink = shrinker_alloc(0, "%s-btree_cache", c->name); if (!shrink) goto err; - bc->shrink = shrink; + bc->live[0].shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; + shrink->seeks = 2; + shrink->private_data = &bc->live[0]; + shrinker_register(shrink); + + shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); + if (!shrink) + goto err; + bc->live[1].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 4; - shrink->private_data = c; + shrink->seeks = 8; + shrink->private_data = &bc->live[1]; shrinker_register(shrink); return 0; @@ -582,7 +665,10 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) void bch2_fs_btree_cache_init_early(struct btree_cache *bc) { mutex_init(&bc->lock); - INIT_LIST_HEAD(&bc->live); + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { + bc->live[i].idx = i; + INIT_LIST_HEAD(&bc->live[i].list); + } INIT_LIST_HEAD(&bc->freeable); INIT_LIST_HEAD(&bc->freed_pcpu); INIT_LIST_HEAD(&bc->freed_nonpcpu); @@ -644,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b; - 
list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b, false)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_reclaim(c, b, false)) + return b; while (1) { - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; /* * Rare case: all nodes were intent-locked. @@ -716,14 +804,15 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea mutex_unlock(&bc->lock); - if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + if (memalloc_flags_do(PF_MEMALLOC_NORECLAIM, + btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))) { bch2_trans_unlock(trans); if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) goto err; } mutex_lock(&bc->lock); - bc->used++; + bc->nr_freeable++; got_mem: mutex_unlock(&bc->lock); @@ -1264,8 +1353,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) BUG_ON(btree_node_dirty(b)); mutex_lock(&bc->lock); - btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); + btree_node_data_free(c, b); mutex_unlock(&bc->lock); out: six_unlock_write(&b->c.lock); @@ -1337,13 +1426,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc } static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, - const char *label, unsigned nr) + const char *label, size_t nr) { prt_printf(out, "%s\t", label); prt_human_readable_u64(out, nr * c->opts.btree_node_size); - prt_printf(out, " (%u)\n", nr); + prt_printf(out, " (%zu)\n", nr); } +static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { +#define x(n) #n, + BCH_BTREE_CACHE_NOT_FREED_REASONS() +#undef x + NULL +}; + void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); @@ -1351,24 +1447,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_btree_cache_line(out, c, "total:", bc->used); - prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty)); + prt_btree_cache_line(out, c, "live:", bc->live[0].nr); + prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); + prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); prt_newline(out); - for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++) - prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) + prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]); prt_newline(out); - prt_printf(out, "freed:\t%u\n", bc->freed); + prt_printf(out, "freed:\t%zu\n", bc->nr_freed); prt_printf(out, "not freed:\n"); - prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty); - prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight); - prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight); - prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent); - prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write); - prt_printf(out, " access bit\t%u\n", 
bc->not_freed_access_bit); - prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict); - prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked); - prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) + prt_printf(out, " %s\t%llu\n", + bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); } diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index f82064007..367acd217 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); +void bch2_node_pin(struct bch_fs *, struct btree *); +void bch2_btree_cache_unpin(struct bch_fs *); + void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index eb3002c4e..b5e0692f0 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -549,9 +549,8 @@ int bch2_check_topology(struct bch_fs *c) six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { - bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); + bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); r->b = NULL; @@ -753,10 +752,8 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); - for_each_member_device(c, ca) { - kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); - ca->buckets_gc = NULL; - } + for_each_member_device(c, ca) + genradix_free(&ca->buckets_gc); } static int bch2_gc_start(struct bch_fs *c) @@ -910,20 +907,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c) int ret = 0; for_each_member_device(c, ca) { - struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!buckets) { + ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); + if (ret) { bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_gc_alloc_start; break; } - - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = ca->mi.nbuckets; - buckets->nbuckets_minus_first = - buckets->nbuckets - buckets->first_bucket; - rcu_assign_pointer(ca->buckets_gc, buckets); } bch_err_fn(c, ret); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 56ea9a77c..cb48a9477 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1666,7 +1666,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_btree_pos_to_text(&buf, c, b); bch_err_ratelimited(c, "%s", buf.buf); - if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) bch2_fatal_error(c); @@ -1749,10 +1749,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); + bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); ret = -BCH_ERR_btree_node_read_error; @@ -2031,7 +2029,7 @@ void 
__bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
 do_write:
 	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
 
-	atomic_dec(&c->btree_cache.dirty);
+	atomic_long_dec(&c->btree_cache.nr_dirty);
 
 	BUG_ON(btree_node_fake(b));
 	BUG_ON((b->will_make_reachable != 0) != !b->written);
 
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 63d76f5c6..9b01ca3de 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -18,13 +18,13 @@ struct btree_node_read_all;
 static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
 {
 	if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
-		atomic_inc(&c->btree_cache.dirty);
+		atomic_long_inc(&c->btree_cache.nr_dirty);
 }
 
 static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
 {
 	if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
-		atomic_dec(&c->btree_cache.dirty);
+		atomic_long_dec(&c->btree_cache.nr_dirty);
 }
 
 static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 6d87e5774..aec89e001 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -6,6 +6,8 @@
 #include "btree_types.h"
 #include "trace.h"
 
+#include <linux/sched/mm.h>
+
 void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
 void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
 void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
@@ -529,6 +531,12 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *);
 
 void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
 
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin, which on second and further calls
+ * frees all memory allocated in this transaction
+ */
 static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 {
 	size = roundup(size, 8);
@@ -865,29 +873,33 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 	(_do) ?: bch2_trans_relock(_trans);				\
 })
 
-#define allocate_dropping_locks_errcode(_trans, _do)			\
-({									\
-	gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;				\
-	int _ret = _do;							\
-									\
-	if (bch2_err_matches(_ret, ENOMEM)) {				\
-		_gfp = GFP_KERNEL;					\
-		_ret = drop_locks_do(_trans, _do);			\
-	}								\
-	_ret;								\
+#define memalloc_flags_do(_flags, _do)					\
+({									\
+	unsigned _saved_flags = memalloc_flags_save(_flags);		\
+	typeof(_do) _ret = _do;						\
+	memalloc_flags_restore(_saved_flags);				\
+	_ret;								\
 })
 
-#define allocate_dropping_locks(_trans, _ret, _do)			\
-({									\
-	gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;				\
-	typeof(_do) _p = _do;						\
-									\
-	_ret = 0;							\
-	if (unlikely(!_p)) {						\
-		_gfp = GFP_KERNEL;					\
-		_ret = drop_locks_do(_trans, ((_p = _do), 0));		\
-	}								\
-	_p;								\
+#define allocate_dropping_locks_errcode(_trans, _do)			\
+({									\
+	int _ret = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
+									\
+	if (bch2_err_matches(_ret, ENOMEM)) {				\
+		_ret = drop_locks_do(_trans, _do);			\
+	}								\
+	_ret;								\
+})
+
+#define allocate_dropping_locks(_trans, _ret, _do)			\
+({									\
+	typeof(_do) _p = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
+									\
+	_ret = 0;							\
+	if (unlikely(!_p)) {						\
+		_ret = drop_locks_do(_trans, ((_p = _do), 0));		\
+	}								\
+	_p;								\
 })
 
 #define bch2_trans_run(_c, _do)						\
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
index 74933490a..c1657182c 100644
--- a/libbcachefs/btree_journal_iter.c
+++ b/libbcachefs/btree_journal_iter.c
@@ -530,6 +530,8 @@ static void
__journal_keys_sort(struct journal_keys *keys) { sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); + cond_resched(); + struct journal_key *dst = keys->data; darray_for_each(*keys, src) { diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 2e49ca711..4b2423b09 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -116,8 +116,10 @@ static void bkey_cached_free(struct btree_key_cache *bc, this_cpu_inc(*bc->nr_pending); } -static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) +static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s) { + gfp_t gfp = GFP_KERNEL|__GFP_ACCOUNT|__GFP_RECLAIMABLE; + struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); if (unlikely(!ck)) return NULL; @@ -145,7 +147,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k goto lock; ck = allocate_dropping_locks(trans, ret, - __bkey_cached_alloc(key_u64s, _gfp)); + __bkey_cached_alloc(key_u64s)); if (ret) { if (ck) kfree(ck->k); @@ -239,7 +241,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); struct bkey_i *new_k = allocate_dropping_locks(trans, ret, - kmalloc(key_u64s * sizeof(u64), _gfp)); + kmalloc(key_u64s * sizeof(u64), GFP_KERNEL)); if (unlikely(!new_k)) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(ck->key.btree_id), key_u64s); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 0df07929c..4568a41fe 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -138,6 +138,31 @@ struct btree { struct list_head list; }; +#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ + x(lock_intent) \ + x(lock_write) \ + x(dirty) \ + x(read_in_flight) \ + x(write_in_flight) \ + x(noevict) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(access_bit) + +enum bch_btree_cache_not_freed_reasons { +#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n, + BCH_BTREE_CACHE_NOT_FREED_REASONS() +#undef x + BCH_BTREE_CACHE_NOT_FREED_REASONS_NR, +}; + +struct btree_cache_list { + unsigned idx; + struct shrinker *shrink; + struct list_head list; + size_t nr; +}; + struct btree_cache { struct rhashtable table; bool table_init_done; @@ -155,28 +180,19 @@ struct btree_cache { * should never grow past ~2-3 nodes in practice. 
*/ struct mutex lock; - struct list_head live; struct list_head freeable; struct list_head freed_pcpu; struct list_head freed_nonpcpu; + struct btree_cache_list live[2]; - /* Number of elements in live + freeable lists */ - unsigned used; - unsigned reserve; - unsigned freed; - unsigned not_freed_lock_intent; - unsigned not_freed_lock_write; - unsigned not_freed_dirty; - unsigned not_freed_read_in_flight; - unsigned not_freed_write_in_flight; - unsigned not_freed_noevict; - unsigned not_freed_write_blocked; - unsigned not_freed_will_make_reachable; - unsigned not_freed_access_bit; - atomic_t dirty; - struct shrinker *shrink; + size_t nr_freeable; + size_t nr_reserve; + size_t nr_by_btree[BTREE_ID_NR]; + atomic_long_t nr_dirty; - unsigned used_by_btree[BTREE_ID_NR]; + /* shrinker stats */ + size_t nr_freed; + u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR]; /* * If we need to allocate memory for a new btree node and that @@ -189,8 +205,8 @@ struct btree_cache { struct bbpos pinned_nodes_start; struct bbpos pinned_nodes_end; - u64 pinned_nodes_leaf_mask; - u64 pinned_nodes_interior_mask; + /* btree id mask: 0 for leaves, 1 for interior */ + u64 pinned_nodes_mask[2]; }; struct btree_node_iter { @@ -582,7 +598,8 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ - x(never_write) + x(never_write) \ + x(pinned) enum btree_flags { /* First bits for btree node write type */ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 1433aefb4..190bc1e81 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -16,6 +16,7 @@ #include "clock.h" #include "error.h" #include "extents.h" +#include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" @@ -145,7 +146,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) printbuf_exit(&buf); return ret; topology_repair: - if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { bch2_inconsistent_error(c); ret = -BCH_ERR_btree_need_topology_repair; @@ -250,8 +251,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); + + mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + __btree_node_free(trans, b); + six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); @@ -283,7 +289,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, clear_btree_node_need_write(b); mutex_lock(&c->btree_cache.lock); - list_del_init(&b->list); bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); @@ -1899,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * six_unlock_intent(&n->c.lock); mutex_lock(&c->btree_cache.lock); - list_add_tail(&b->list, &c->btree_cache.live); + list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); mutex_unlock(&c->btree_cache.lock); bch2_trans_verify_locks(trans); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index a2274429e..aef58043f 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -75,6 +75,15 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev *ca, struct bch_dev_usage *usage) 
{ + if (out->nr_tabstops < 5) { + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + } + prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { @@ -100,12 +109,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (!ca) { - if (fsck_err(trans, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, + trans, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; return 0; } @@ -476,7 +486,7 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, return ret; err: bch2_dump_trans_updates(trans); - ret = -EIO; + ret = -BCH_ERR_bucket_ref_update; goto out; } @@ -562,8 +572,8 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { - if (insert) - ret = -EIO; + if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) + ret = -BCH_ERR_trigger_pointer; goto err; } @@ -592,7 +602,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_trigger_pointer; goto err_unlock; } @@ -637,7 +647,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); - ret = -EIO; + ret = -BCH_ERR_trigger_stripe_pointer; goto err; } @@ -676,7 +686,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, (u64) p.ec.idx, buf.buf); printbuf_exit(&buf); bch2_inconsistent_error(c); - return -EIO; + return -BCH_ERR_trigger_stripe_pointer; } m->block_sectors[p.ec.block] += sectors; @@ -740,7 +750,7 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } else if (!p.has_ec) { *replicas_sectors += disk_sectors; - acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; + replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) @@ -876,7 +886,7 @@ int bch2_trigger_extent(struct btree_trans *trans, need_rebalance_delta -= s != 0; need_rebalance_sectors_delta -= s; - s = bch2_bkey_sectors_need_rebalance(c, old); + s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; need_rebalance_sectors_delta += s; @@ -956,7 +966,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_type_str(a->v.data_type), bch2_data_type_str(type), bch2_data_type_str(type)); - ret = -EIO; + ret = -BCH_ERR_metadata_bucket_inconsistency; goto err; } @@ -1012,7 +1022,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * bucket_unlock(g); err_unlock: percpu_up_read(&c->mark_lock); - return -EIO; + return -BCH_ERR_metadata_bucket_inconsistency; } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 
edbdffd50..e2cb7b24b 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b) TASK_UNINTERRUPTIBLE); } -static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) -{ - return rcu_dereference_check(ca->buckets_gc, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->state_lock) || - lockdep_is_held(&ca->bucket_lock)); -} - static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - struct bucket_array *buckets = gc_bucket_array(ca); - - if (b - buckets->first_bucket >= buckets->nbuckets_minus_first) - return NULL; - return buckets->b + b; + return genradix_ptr(&ca->buckets_gc, b); } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index a19460a1b..28bd09a25 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -19,14 +19,6 @@ struct bucket { u32 stripe_sectors; } __aligned(sizeof(long)); -struct bucket_array { - struct rcu_head rcu; - u16 first_bucket; - size_t nbuckets; - size_t nbuckets_minus_first; - struct bucket b[] __counted_by(nbuckets); -}; - struct bucket_gens { struct rcu_head rcu; u16 first_bucket; diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index e7208bf19..ce8fc677b 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -100,13 +100,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct scatterlist *sg, size_t len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - int ret; skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); - ret = crypto_skcipher_encrypt(req); + int ret = crypto_skcipher_encrypt(req); if (ret) pr_err("got error %i from crypto_skcipher_encrypt()", ret); @@ -118,38 +117,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, void *buf, size_t len) { if (!is_vmalloc_addr(buf)) { - struct scatterlist sg; - - sg_init_table(&sg, 1); - sg_set_page(&sg, - is_vmalloc_addr(buf) - ? 
vmalloc_to_page(buf) - : virt_to_page(buf), - len, offset_in_page(buf)); + struct scatterlist sg = {}; + + sg_mark_end(&sg); + sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf)); return do_encrypt_sg(tfm, nonce, &sg, len); } else { - unsigned pages = buf_pages(buf, len); - struct scatterlist *sg; - size_t orig_len = len; - int ret, i; - - sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); - if (!sg) - return -BCH_ERR_ENOMEM_do_encrypt; + DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; + size_t sgl_len = 0; + int ret; - sg_init_table(sg, pages); + darray_init(&sgl); - for (i = 0; i < pages; i++) { + while (len) { unsigned offset = offset_in_page(buf); - unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); + struct scatterlist sg = { + .page_link = (unsigned long) vmalloc_to_page(buf), + .offset = offset, + .length = min(len, PAGE_SIZE - offset), + }; - sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); - buf += pg_len; - len -= pg_len; + if (darray_push(&sgl, sg)) { + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); + if (ret) + goto err; + + nonce = nonce_add(nonce, sgl_len); + sgl_len = 0; + sgl.nr = 0; + BUG_ON(darray_push(&sgl, sg)); + } + + buf += sg.length; + len -= sg.length; + sgl_len += sg.length; } - ret = do_encrypt_sg(tfm, nonce, sg, orig_len); - kfree(sg); + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); +err: + darray_exit(&sgl); return ret; } } @@ -325,39 +333,42 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, { struct bio_vec bv; struct bvec_iter iter; - struct scatterlist sgl[16], *sg = sgl; - size_t bytes = 0; + DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; + size_t sgl_len = 0; int ret = 0; if (!bch2_csum_type_is_encryption(type)) return 0; - sg_init_table(sgl, ARRAY_SIZE(sgl)); + darray_init(&sgl); bio_for_each_segment(bv, bio, iter) { - if (sg == sgl + ARRAY_SIZE(sgl)) { - sg_mark_end(sg - 1); - - ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + struct scatterlist sg = { + .page_link = (unsigned long) bv.bv_page, + .offset = bv.bv_offset, + .length = bv.bv_len, + }; + + if (darray_push(&sgl, sg)) { + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); if (ret) - return ret; + goto err; - nonce = nonce_add(nonce, bytes); - bytes = 0; + nonce = nonce_add(nonce, sgl_len); + sgl_len = 0; + sgl.nr = 0; - sg_init_table(sgl, ARRAY_SIZE(sgl)); - sg = sgl; + BUG_ON(darray_push(&sgl, sg)); } - sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); - bytes += bv.bv_len; - } - - if (sg != sgl) { - sg_mark_end(sg - 1); - return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + sgl_len += sg.length; } + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); +err: + darray_exit(&sgl); return ret; } diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 65176d51b..757b9884e 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -337,6 +337,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, printbuf_exit(&buf); bch2_fatal_error(c); + ret = -EIO; goto out; } @@ -570,7 +571,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, while (data_opts.kill_ptrs) { unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); data_opts.kill_ptrs ^= 1U << drop; } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 
141a4c631..6d8d5e6f5 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -18,6 +18,7 @@ #include "ec.h" #include "error.h" #include "io_read.h" +#include "io_write.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" @@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, bch2_prt_csum_type(out, s.csum_type); prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + if (s.disk_label) { + prt_str(out, " label"); + bch2_disk_path_to_text(out, c, s.disk_label - 1); + } + for (unsigned i = 0; i < s.nr_blocks; i++) { const struct bch_extent_ptr *ptr = sp->ptrs + i; if ((void *) ptr >= bkey_val_end(k)) break; + prt_char(out, ' '); bch2_extent_ptr_to_text(out, c, ptr); if (s.csum_type < BCH_CSUM_NR && @@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->stripe, s.k->p.offset, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } } else { @@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bucket.inode, bucket.offset, a->gen, a->stripe, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } } @@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); if (unlikely(!ca)) { - if (!(flags & BTREE_TRIGGER_overwrite)) - ret = -EIO; + if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_mark_stripe; goto err; } @@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err_unlock; } @@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } +static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) +{ + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->disk_label = s->disk_label; + m->blocks_nonempty = 0; + + for (unsigned i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); +} + int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, memset(m, 0, sizeof(*m)); } else { - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - m->blocks_nonempty += 
!!stripe_blockcount_get(new_s, i); + stripe_to_mem(m, new_s); if (!old_s) bch2_stripes_heap_insert(c, m, idx); @@ -816,13 +829,15 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, } /* recovery read path: */ -int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) +int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct ec_stripe_buf *buf; + struct ec_stripe_buf *buf = NULL; struct closure cl; struct bch_stripe *v; unsigned i, offset; + const char *msg = NULL; int ret = 0; closure_init_stack(&cl); @@ -835,32 +850,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { - bch_err_ratelimited(c, - "error doing reconstruct read: error %i looking up stripe", ret); - kfree(buf); - return -EIO; + msg = "stripe not found"; + goto err; } v = &bkey_i_to_stripe(&buf->key)->v; if (!bch2_ptr_matches_stripe(v, rbio->pick)) { - bch_err_ratelimited(c, - "error doing reconstruct read: pointer doesn't match stripe"); - ret = -EIO; + msg = "pointer doesn't match stripe"; goto err; } offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { - bch_err_ratelimited(c, - "error doing reconstruct read: read is bigger than stripe"); - ret = -EIO; + msg = "read is bigger than stripe"; goto err; } ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); - if (ret) + if (ret) { + msg = "-ENOMEM"; goto err; + } for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, REQ_OP_READ, i, &cl); @@ -868,9 +879,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { - bch_err_ratelimited(c, - "error doing reconstruct read: unable to read enough blocks"); - ret = -EIO; + msg = "unable to read enough blocks"; goto err; } @@ -882,20 +891,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); -err: +out: ec_stripe_buf_exit(buf); kfree(buf); return ret; +err: + struct printbuf msgbuf = PRINTBUF; + bch2_bkey_val_to_text(&msgbuf, c, orig_k); + bch_err_ratelimited(c, + "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); + printbuf_exit(&msgbuf); + ret = -BCH_ERR_stripe_reconstruct; + goto out; } /* stripe bucket accounting: */ -static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx) { ec_stripes_heap n, *h = &c->ec_stripes_heap; if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), GFP_KERNEL)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; mutex_lock(&c->ec_stripes_heap_lock); @@ -909,11 +926,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) free_heap(&n); } - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) + if (!genradix_ptr_alloc(&c->stripes, idx, GFP_KERNEL)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; if (c->gc_pos.phase != GC_PHASE_not_running && - !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) + !genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; return 0; @@ -923,7 +940,7 @@ static int 
ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { return allocate_dropping_locks_errcode(trans, - __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); + __ec_stripe_mem_alloc(trans->c, iter->pos.offset)); } /* @@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bkey_reassemble(n, k); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); @@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } -static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) +static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s = h->s; + lockdep_assert_held(&h->lock); + BUG_ON(!s->allocated && !s->err); h->s = NULL; @@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_new_put(c, s, STRIPE_REF_io); } +static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) +{ + h->s->err = err; + ec_stripe_new_set_pending(c, h); +} + void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) { struct ec_stripe_new *s = ob->ec; @@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i *k, unsigned nr_data, unsigned nr_parity, - unsigned stripe_size) + unsigned stripe_size, + unsigned disk_label) { struct bkey_i_stripe *s = bkey_stripe_init(k); unsigned u64s; @@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); s->v.csum_type = BCH_CSUM_crc32c; - s->v.pad = 0; + s->v.disk_label = disk_label; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { BUG_ON(1 << s->v.csum_granularity_bits >= @@ -1685,40 +1711,30 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->nr_parity = h->redundancy; ec_stripe_key_init(c, &s->new_stripe.key, - s->nr_data, s->nr_parity, h->blocksize); + s->nr_data, s->nr_parity, + h->blocksize, h->disk_label); h->s = s; + h->nr_created++; return 0; } -static struct ec_stripe_head * -ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, - unsigned algo, unsigned redundancy, - enum bch_watermark watermark) +static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { - struct ec_stripe_head *h; - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return NULL; - - mutex_init(&h->lock); - BUG_ON(!mutex_trylock(&h->lock)); - - h->target = target; - h->algo = algo; - h->redundancy = redundancy; - h->watermark = watermark; - rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, target); + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? 
group_to_target(h->disk_label - 1) + : 0); + unsigned nr_devs = dev_mask_nr(&h->devs); for_each_member_device_rcu(c, ca, &h->devs) if (!ca->mi.durability) __clear_bit(ca->dev_idx, h->devs.d); + unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); h->blocksize = pick_blocksize(c, &h->devs); + h->nr_active_devs = 0; for_each_member_device_rcu(c, ca, &h->devs) if (ca->mi.bucket_size == h->blocksize) h->nr_active_devs++; @@ -1729,9 +1745,47 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, * If we only have redundancy + 1 devices, we're better off with just * replication: */ - if (h->nr_active_devs < h->redundancy + 2) - bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", - h->nr_active_devs, h->redundancy + 2); + h->insufficient_devs = h->nr_active_devs < h->redundancy + 2; + + if (h->insufficient_devs) { + const char *err; + + if (nr_devs < h->redundancy + 2) + err = NULL; + else if (nr_devs_with_durability < h->redundancy + 2) + err = "cannot use durability=0 devices"; + else + err = "mismatched bucket sizes"; + + if (err) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s", + h->nr_active_devs, h->redundancy + 2, err); + } + + if (h->s && !h->s->allocated) + ec_stripe_new_cancel(c, h, -EINTR); + + h->rw_devs_change_count = c->rw_devs_change_count; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label, + unsigned algo, unsigned redundancy, + enum bch_watermark watermark) +{ + struct ec_stripe_head *h; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + BUG_ON(!mutex_trylock(&h->lock)); + + h->disk_label = disk_label; + h->algo = algo; + h->redundancy = redundancy; + h->watermark = watermark; list_add(&h->list, &c->ec_stripe_head_list); return h; @@ -1743,14 +1797,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->nr_data) == h->s->nr_data) - ec_stripe_set_pending(c, h); + ec_stripe_new_set_pending(c, h); mutex_unlock(&h->lock); } static struct ec_stripe_head * __bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + unsigned disk_label, unsigned algo, unsigned redundancy, enum bch_watermark watermark) @@ -1768,27 +1822,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); - goto found; + goto err; } list_for_each_entry(h, &c->ec_stripe_head_list, list) - if (h->target == target && + if (h->disk_label == disk_label && h->algo == algo && h->redundancy == redundancy && h->watermark == watermark) { ret = bch2_trans_mutex_lock(trans, &h->lock); - if (ret) + if (ret) { h = ERR_PTR(ret); + goto err; + } goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); + h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); found: - if (!IS_ERR_OR_NULL(h) && - h->nr_active_devs < h->redundancy + 2) { + if (h->rw_devs_change_count != c->rw_devs_change_count) + ec_stripe_head_devs_update(c, h); + + if (h->insufficient_devs) { mutex_unlock(&h->lock); h = NULL; } +err: mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1796,38 +1855,39 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, enum bch_watermark watermark, struct closure *cl) { + struct ec_stripe_new *s = h->s; struct 
bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; - struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; int ret = 0; - BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); - BUG_ON(v->nr_redundant != h->s->nr_parity); + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); + BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { __clear_bit(v->ptrs[i].dev, devs.d); - if (i < h->s->nr_data) + if (i < s->nr_data) nr_have_data++; else nr_have_parity++; } - BUG_ON(nr_have_data > h->s->nr_data); - BUG_ON(nr_have_parity > h->s->nr_parity); + BUG_ON(nr_have_data > s->nr_data); + BUG_ON(nr_have_parity > s->nr_parity); buckets.nr = 0; - if (nr_have_parity < h->s->nr_parity) { + if (nr_have_parity < s->nr_parity) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->parity_stripe, &devs, - h->s->nr_parity, + s->nr_parity, &nr_have_parity, &have_cache, 0, BCH_DATA_parity, @@ -1835,14 +1895,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data + h->s->nr_parity, - h->s->nr_data); - BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data + s->nr_parity, + s->nr_data); + BUG_ON(j >= s->nr_data + s->nr_parity); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -1850,11 +1910,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ } buckets.nr = 0; - if (nr_have_data < h->s->nr_data) { + if (nr_have_data < s->nr_data) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->block_stripe, &devs, - h->s->nr_data, + s->nr_data, &nr_have_data, &have_cache, 0, BCH_DATA_user, @@ -1862,13 +1922,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data, 0); - BUG_ON(j >= h->s->nr_data); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data, 0); + BUG_ON(j >= s->nr_data); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -1878,7 +1938,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ return 0; } -/* XXX: doesn't obey target: */ static s64 get_existing_stripe(struct bch_fs *c, struct ec_stripe_head *head) { @@ -1901,7 +1960,8 @@ static s64 get_existing_stripe(struct bch_fs *c, m = genradix_ptr(&c->stripes, stripe_idx); - if (m->algorithm == head->algo && + if (m->disk_label == head->disk_label && + m->algorithm == head->algo && m->nr_redundant == head->redundancy && m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant && @@ -1914,72 +1974,75 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } -static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, 
struct ec_stripe_head *h) +static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) { - struct bch_fs *c = trans->c; - struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; - struct bch_stripe *existing_v; + struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; unsigned i; - s64 idx; - int ret; - - /* - * If we can't allocate a new stripe, and there's no stripes with empty - * blocks for us to reuse, that means we have to wait on copygc: - */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; - ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, h->s); - return ret; - } - - existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; - - BUG_ON(existing_v->nr_redundant != h->s->nr_parity); - h->s->nr_data = existing_v->nr_blocks - + BUG_ON(existing_v->nr_redundant != s->nr_parity); + s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); return ret; } - BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); /* * Free buckets we initially allocated - they might conflict with * blocks from the stripe we're reusing: */ - for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { - bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); - h->s->blocks[i] = 0; + for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); + s->blocks[i] = 0; } - memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); - memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); + memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); for (i = 0; i < existing_v->nr_blocks; i++) { if (stripe_blockcount_get(existing_v, i)) { - __set_bit(i, h->s->blocks_gotten); - __set_bit(i, h->s->blocks_allocated); + __set_bit(i, s->blocks_gotten); + __set_bit(i, s->blocks_allocated); } - ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); } - bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); - h->s->have_existing_stripe = true; + bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); + s->have_existing_stripe = true; return 0; } +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +{ + struct bch_fs *c = trans->c; + s64 idx; + int ret; + + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ + idx = get_existing_stripe(c, h); + if (idx < 0) + return -BCH_ERR_stripe_alloc_blocked; + + ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); + bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, + "reading stripe key: %s", bch2_err_str(ret)); + if (ret) { + bch2_stripe_close(c, h->s); + return ret; + } + + return 
init_new_stripe_from_existing(c, h->s); +} + static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) { struct bch_fs *c = trans->c; @@ -2046,9 +2109,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, struct bch_fs *c = trans->c; struct ec_stripe_head *h; bool waiting = false; + unsigned disk_label = 0; + struct target t = target_decode(target); int ret; - h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); + if (t.type == TARGET_GROUP) { + if (t.group > U8_MAX) { + bch_err(c, "cannot create a stripe when disk_label > U8_MAX"); + return NULL; + } + disk_label = t.group + 1; /* 0 == no label */ + } + + h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); if (IS_ERR_OR_NULL(h)) return h; @@ -2126,6 +2199,79 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, return ERR_PTR(ret); } +/* device removal */ + +static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); + return -BCH_ERR_invalidate_stripe_to_dev; + } + + struct btree_iter iter; + struct bkey_i_stripe *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + s64 sectors = 0; + for (unsigned i = 0; i < s->v.nr_blocks; i++) + sectors -= stripe_blockcount_get(&s->v, i); + + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + if (ret) + goto err; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == k_a.k->p.inode) { + if (stripe_blockcount_get(&s->v, ptr - &ptrs.start->ptr)) { + bch_err(trans->c, "trying to invalidate device in stripe when stripe block not empty"); + ret = -BCH_ERR_invalidate_stripe_to_dev; + goto err; + } + ptr->dev = BCH_SB_MEMBER_INVALID; + } + + sectors = -sectors; + + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_remove_stripes(struct bch_fs *c, struct bch_dev *ca) +{ + return bch2_trans_run(c, + for_each_btree_key_upto_commit(trans, iter, + BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), + BTREE_ITER_intent, k, + NULL, NULL, 0, ({ + bch2_invalidate_stripe_to_dev(trans, k); + }))); +} + +/* startup/shutdown */ + static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; @@ -2151,8 +2297,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) } goto unlock; found: - h->s->err = -BCH_ERR_erofs_no_writes; - ec_stripe_set_pending(c, h); + ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); unlock: mutex_unlock(&h->lock); } @@ -2193,21 +2338,13 @@ int bch2_stripes_read(struct bch_fs *c) if (k.k->type != KEY_TYPE_stripe) continue; - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + ret = __ec_stripe_mem_alloc(c, k.k->p.offset); if (ret) 
break; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + stripe_to_mem(m, bkey_s_c_to_stripe(k).v); bch2_stripes_heap_insert(c, m, k.k->p.offset); 0; @@ -2252,6 +2389,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) prt_printf(out, " %u", s->blocks[i]); prt_newline(out); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key)); + prt_newline(out); } void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) @@ -2261,9 +2400,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "target %u algo %u redundancy %u %s:\n", - h->target, h->algo, h->redundancy, - bch2_watermarks[h->watermark]); + prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", + h->disk_label, h->algo, h->redundancy, + bch2_watermarks[h->watermark], + h->nr_created); if (h->s) bch2_new_stripe_to_text(out, c, h->s); diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 90962b3c0..a2e2d79d1 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -97,7 +97,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe const struct bch_extent_ptr *data_ptr, unsigned sectors) { - return data_ptr->dev == stripe_ptr->dev && + return (data_ptr->dev == stripe_ptr->dev || + data_ptr->dev == BCH_SB_MEMBER_INVALID || + stripe_ptr->dev == BCH_SB_MEMBER_INVALID) && data_ptr->gen == stripe_ptr->gen && data_ptr->offset >= stripe_ptr->offset && data_ptr->offset < stripe_ptr->offset + sectors; @@ -186,10 +188,15 @@ struct ec_stripe_head { struct list_head list; struct mutex lock; - unsigned target; + unsigned disk_label; unsigned algo; unsigned redundancy; enum bch_watermark watermark; + bool insufficient_devs; + + unsigned long rw_devs_change_count; + + u64 nr_created; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -202,7 +209,7 @@ struct ec_stripe_head { struct ec_stripe_new *s; }; -int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *); +int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); @@ -247,6 +254,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } +int bch2_dev_remove_stripes(struct bch_fs *, struct bch_dev *); + void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); void bch2_fs_ec_flush(struct bch_fs *); diff --git a/libbcachefs/ec_format.h b/libbcachefs/ec_format.h index 44ce88ba0..64ef52e00 100644 --- a/libbcachefs/ec_format.h +++ b/libbcachefs/ec_format.h @@ -11,7 +11,14 @@ struct bch_stripe { __u8 csum_granularity_bits; __u8 csum_type; - __u8 pad; + + /* + * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2 + * + * we can manage with this because this only needs to point to a + * disk label, not a target: + */ + __u8 disk_label; struct bch_extent_ptr ptrs[]; } __packed __aligned(8); diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index 1df03dccf..8d1e70e83 100644 --- 
a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -16,6 +16,7 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; u8 blocks_nonempty; + u8 disk_label; }; struct gc_stripe { diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index ab5a7adec..60b7875ad 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -119,8 +119,8 @@ x(EEXIST, EEXIST_str_hash_set) \ x(EEXIST, EEXIST_discard_in_flight_add) \ x(EEXIST, EEXIST_subvolume_create) \ - x(0, open_buckets_empty) \ - x(0, freelist_empty) \ + x(ENOSPC, open_buckets_empty) \ + x(ENOSPC, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ x(0, transaction_restart) \ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ @@ -244,6 +244,16 @@ x(EIO, btree_node_read_error) \ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ + x(EIO, bucket_ref_update) \ + x(EIO, trigger_pointer) \ + x(EIO, trigger_stripe_pointer) \ + x(EIO, metadata_bucket_inconsistency) \ + x(EIO, mark_stripe) \ + x(EIO, stripe_reconstruct) \ + x(EIO, key_type_error) \ + x(EIO, no_device_to_read_from) \ + x(EIO, missing_indirect_extent) \ + x(EIO, invalidate_stripe_to_dev) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ @@ -257,7 +267,6 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) \ x(0, invalid_snapshot_node) \ x(0, option_needs_open_fs) diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index e317df364..5467d0f92 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -115,7 +115,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, int ret = 0; if (k.k->type == KEY_TYPE_error) - return -EIO; + return -BCH_ERR_key_type_error; rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -133,7 +133,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, * read: */ if (!ret && !p.ptr.cached) - ret = -EIO; + ret = -BCH_ERR_no_device_to_read_from; struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); @@ -146,16 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ? 
f->idx : f->idx + 1; - if (!p.idx && !ca) + if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) p.idx++; if (!p.idx && p.has_ec && bch2_force_reconstruct_read) p.idx++; - if (!p.idx && !bch2_dev_is_readable(ca)) - p.idx++; - - if (p.idx >= (unsigned) p.has_ec + 1) + if (p.idx > (unsigned) p.has_ec) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -781,14 +778,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, /* * Returns pointer to the next entry after the one being dropped: */ -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; - union bch_extent_entry *ret = entry; bool drop_crc = true; + if (k.k->type == KEY_TYPE_stripe) { + ptr->dev = BCH_SB_MEMBER_INVALID; + return; + } + EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); @@ -811,21 +811,28 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, break; if ((extent_entry_is_crc(entry) && drop_crc) || - extent_entry_is_stripe_ptr(entry)) { - ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_is_stripe_ptr(entry)) extent_entry_drop(k, entry); - } } - - return ret; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) { + if (k.k->type != KEY_TYPE_stripe) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == ptr->dev && p.has_ec) { + ptr->dev = BCH_SB_MEMBER_INVALID; + return; + } + } + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; - union bch_extent_entry *ret = - bch2_bkey_drop_ptr_noerror(k, ptr); + + bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached @@ -837,14 +844,10 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, !bch2_bkey_dirty_devs(k.s_c).nr) { k.k->type = KEY_TYPE_error; set_bkey_val_u64s(k.k, 0); - ret = NULL; } else if (!bch2_bkey_nr_ptrs(k.s_c)) { k.k->type = KEY_TYPE_deleted; set_bkey_val_u64s(k.k, 0); - ret = NULL; } - - return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -854,10 +857,7 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); - - if (ptr) - bch2_bkey_drop_ptr_noerror(k, ptr); + bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); } const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) @@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && + + /* + * This checks that the two pointers point + * to the same region on disk - adjusting + * for the difference in where the extents + * start, since one may have been trimmed: + */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && + + /* + * This additionally checks that the + * extents overlap on disk, since the + * previous check may trigger spuriously 
+ * when one extent is immediately partially + * overwritten with another extent (so that + * on disk they are adjacent) and + * compression is in use: + */ + ((p1.ptr.offset >= p2.ptr.offset && + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || + (p2.ptr.offset >= p1.ptr.offset && + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 8e1ba46fb..ed5001dd6 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -611,9 +611,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -void bch2_bkey_drop_device(struct bkey_s, unsigned); -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); - const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) @@ -649,26 +646,38 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, - struct bch_extent_ptr *); -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, - struct bch_extent_ptr *); +void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); +void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); -#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); +void bch2_bkey_drop_device(struct bkey_s, unsigned); + +#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \ do { \ - struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ + __label__ _again; \ + struct bkey_ptrs _ptrs; \ +_again: \ + _ptrs = bch2_bkey_ptrs(_k); \ \ - struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \ - \ - while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ + bkey_for_each_ptr(_ptrs, _ptr) \ if (_cond) { \ - _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ - _ptrs = bch2_bkey_ptrs(_k); \ - continue; \ + bch2_bkey_drop_ptr_noerror(_k, _ptr); \ + goto _again; \ } \ +} while (0) + +#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ +do { \ + __label__ _again; \ + struct bkey_ptrs _ptrs; \ +_again: \ + _ptrs = bch2_bkey_ptrs(_k); \ \ - (_ptr)++; \ - } \ + bkey_for_each_ptr(_ptrs, _ptr) \ + if (_cond) { \ + bch2_bkey_drop_ptr(_k, _ptr); \ + goto _again; \ + } \ } while (0) bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 508d029ac..7e10a9ddc 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, + BTREE_ITER_intent|BTREE_ITER_with_updates); if (ret) goto err; @@ -163,7 +164,7 @@ int bch2_create_trans(struct btree_trans *trans, name, dir_target, &dir_offset, - STR_HASH_must_create); + STR_HASH_must_create|BTREE_ITER_with_updates); if (ret) goto err; diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index f5cff824b..99fef9342 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -791,8 +791,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int 
__bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len, - bool inode_locked) + loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -816,15 +815,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); - /* - * If we're not using the inode lock, we need to lock all the folios for - * atomiticity of writes vs. other writes: - */ - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { - ret = -BCH_ERR_need_inode_lock; - goto out; - } - f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -921,10 +911,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) { - BUG_ON(!inode_locked); + if (end > inode->v.i_size) i_size_write(&inode->v, end); - } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -968,68 +956,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos; - bool inode_locked = false; - ssize_t written = 0, written2 = 0, ret = 0; - - /* - * We don't take the inode lock unless i_size will be changing. Folio - * locks provide exclusion with other writes, and the pagecache add lock - * provides exclusion with truncate and hole punching. - * - * There is one nasty corner case where atomicity would be broken - * without great care: when copying data from userspace to the page - * cache, we do that with faults disable - a page fault would recurse - * back into the filesystem, taking filesystem locks again, and - * deadlock; so it's done with faults disabled, and we fault in the user - * buffer when we aren't holding locks. - * - * If we do part of the write, but we then race and in the userspace - * buffer have been evicted and are no longer resident, then we have to - * drop our folio locks to re-fault them in, breaking write atomicity. - * - * To fix this, we restart the write from the start, if we weren't - * holding the inode lock. - * - * There is another wrinkle after that; if we restart the write from the - * start, and then get an unrecoverable error, we _cannot_ claim to - * userspace that we did not write data we actually did - so we must - * track (written2) the most we ever wrote. - */ - - if ((iocb->ki_flags & IOCB_APPEND) || - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { - inode_lock(&inode->v); - inode_locked = true; - } - - ret = generic_write_checks(iocb, iter); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs_flags(file, !inode_locked ? 
IOCB_NOWAIT : 0); - if (ret) { - if (!inode_locked) { - inode_lock(&inode->v); - inode_locked = true; - ret = file_remove_privs_flags(file, 0); - } - if (ret) - goto unlock; - } - - ret = file_update_time(file); - if (ret) - goto unlock; - - pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; bch2_pagecache_add_get(inode); - if (!inode_locked && - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) - goto get_inode_lock; - do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1054,17 +986,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) } } - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) - goto get_inode_lock; - if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); - if (ret == -BCH_ERR_need_inode_lock) - goto get_inode_lock; + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; @@ -1085,46 +1012,50 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) } pos += ret; written += ret; - written2 = max(written, written2); - - if (ret != bytes && !inode_locked) - goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - - if (0) { -get_inode_lock: - bch2_pagecache_add_put(inode); - inode_lock(&inode->v); - inode_locked = true; - bch2_pagecache_add_get(inode); - - iov_iter_revert(iter, written); - pos -= written; - written = 0; - ret = 0; - } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); -unlock: - if (inode_locked) - inode_unlock(&inode->v); - iocb->ki_pos += written; + bch2_pagecache_add_put(inode); - ret = max(written, written2) ?: ret; - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; + return written ? written : ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t ret = iocb->ki_flags & IOCB_DIRECT - ? 
bch2_direct_write(iocb, iter) - : bch2_buffered_write(iocb, iter); + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: return bch2_err_class(ret); } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 7a9c164cb..12c1873ff 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -273,14 +273,6 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, } } -#define memalloc_flags_do(_flags, _do) \ -({ \ - unsigned _saved_flags = memalloc_flags_save(_flags); \ - typeof(_do) _ret = _do; \ - memalloc_noreclaim_restore(_saved_flags); \ - _ret; \ -}) - static struct inode *bch2_alloc_inode(struct super_block *sb) { BUG(); @@ -380,6 +372,8 @@ __bch2_create(struct mnt_idmap *idmap, subvol_inum inum; struct bch_subvolume subvol; u64 journal_seq = 0; + kuid_t kuid; + kgid_t kgid; int ret; /* @@ -406,13 +400,15 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); + kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); + kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), current_fsuid()), - from_kgid(i_user_ns(&dir->v), current_fsgid()), + from_kuid(i_user_ns(&dir->v), kuid), + from_kgid(i_user_ns(&dir->v), kgid), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -727,15 +723,16 @@ static int bch2_rename2(struct mnt_idmap *idmap, struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; + bool whiteout = !!(flags & RENAME_WHITEOUT); int ret; - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) return -EINVAL; if (mode == BCH_RENAME_OVERWRITE) { @@ -776,18 +773,48 @@ static int bch2_rename2(struct mnt_idmap *idmap, if (ret) goto err; } +retry: + bch2_trans_begin(trans); - ret = commit_do(trans, NULL, NULL, 0, - bch2_rename_trans(trans, - inode_inum(src_dir), &src_dir_u, - inode_inum(dst_dir), &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode)); + ret = bch2_rename_trans(trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode); if (unlikely(ret)) + goto err_tx_restart; + + if (whiteout) { + whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); + ret = PTR_ERR_OR_ZERO(whiteout_inode_u); + if (unlikely(ret)) + goto err_tx_restart; + bch2_inode_init_early(c, whiteout_inode_u); + + ret = bch2_create_trans(trans, + inode_inum(src_dir), &src_dir_u, + whiteout_inode_u, + &src_dentry->d_name, + from_kuid(i_user_ns(&src_dir->v), current_fsuid()), + from_kgid(i_user_ns(&src_dir->v), current_fsgid()), + S_IFCHR|WHITEOUT_MODE, 0, + NULL, NULL, (subvol_inum) { 0 }, 0) ?: + bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_tx_restart; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (unlikely(ret)) { +err_tx_restart: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; goto err; + } BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); BUG_ON(dst_inode && @@ -835,11 +862,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; + kuid_t kuid; + kgid_t kgid; - if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); - if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); + } + if (ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); + } if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; @@ -854,11 +887,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID - ? attr->ia_gid + ? 
kgid : inode->v.i_gid; - if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) + if (!in_group_or_capable(idmap, &inode->v, + make_vfsgid(idmap, i_user_ns(&inode->v), gid))) mode &= ~S_ISGID; bi->bi_mode = mode; } @@ -874,17 +907,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; + kuid_t kuid; + kgid_t kgid; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; - if (attr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + if (attr->ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); + } - if (attr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (attr->ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); + } ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -940,13 +979,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); stat->dev = inode->v.i_sb->s_dev; stat->ino = inode->v.i_ino; stat->mode = inode->v.i_mode; stat->nlink = inode->v.i_nlink; - stat->uid = inode->v.i_uid; - stat->gid = inode->v.i_gid; + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); stat->atime = inode_get_atime(&inode->v); @@ -1865,30 +1906,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; - enum bch_opt_id i; struct printbuf buf = PRINTBUF; - int ret = 0; - - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); - if ((opt->flags & OPT_HIDDEN) || - !(opt->flags & OPT_MOUNT)) - continue; + bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, + OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); + seq_puts(seq, buf.buf); - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - printbuf_reset(&buf); - bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, - OPT_SHOW_MOUNT_STYLE); - seq_putc(seq, ','); - seq_puts(seq, buf.buf); - } - - if (buf.allocation_failure) - ret = -ENOMEM; + int ret = buf.allocation_failure ? 
-ENOMEM : 0; printbuf_exit(&buf); return ret; } @@ -2209,7 +2233,7 @@ static struct file_system_type bcache_fs_type = { .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("bcachefs"); diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index ce27ba1f0..b2f50e74b 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -777,7 +777,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, orig_k->k->k.size, reflink_offset); bch2_inconsistent_error(trans->c); - ret = -EIO; + ret = -BCH_ERR_missing_indirect_extent; goto err; } @@ -869,9 +869,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto hole; if (pick_ret < 0) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + bch_err_inum_offset_ratelimited(c, read_pos.inode, read_pos.offset << 9, - "no device to read from"); + "no device to read from: %s\n %s", + bch2_err_str(pick_ret), + buf.buf); + printbuf_exit(&buf); goto err; } @@ -1086,7 +1092,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(trans, rbio)) { + if (bch2_ec_read_extent(trans, rbio, k)) { bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 1d4761d15..d3b5be7fd 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -1447,9 +1447,7 @@ static void __bch2_write(struct bch_write_op *op) op->nr_replicas_required, op->watermark, op->flags, - (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ? 
NULL : &op->cl, &wp)); + &op->cl, &wp)); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; @@ -1592,6 +1590,9 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + op->flags |= BCH_WRITE_ALLOC_NOWAIT; + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 32b886feb..30460bce0 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { struct bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, + .e.nr_devs = 0, .e.nr_required = 1, }; @@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c, goto err; darray_for_each(i->ptrs, ptr) - replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; + replicas_entry_add_dev(&replicas.e, ptr->dev); bch2_replicas_entry_sort(&replicas.e); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 70b998d9f..ace291f17 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j) static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_cache *bc = &c->btree_cache; bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; size_t min_nr, min_key_cache, nr_flushed; @@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; - if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr; + if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live) min_nr = 1; min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); @@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - atomic_read(&c->btree_cache.dirty), - c->btree_cache.used, + atomic_long_read(&bc->nr_dirty), btree_cache_live, atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 0770aebef..232be8a44 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -432,6 +432,9 @@ void bch2_opt_to_text(struct printbuf *out, else prt_str(out, opt->choices[v]); break; + case BCH_OPT_BITFIELD: + prt_bitflags(out, opt->choices, v); + break; case BCH_OPT_FN: opt->fn.to_text(out, c, sb, v); break; @@ -440,6 +443,32 @@ void bch2_opt_to_text(struct printbuf *out, } } +void bch2_opts_to_text(struct printbuf *out, + struct bch_opts opts, + struct bch_fs *c, struct bch_sb *sb, + unsigned show_mask, unsigned hide_mask, + unsigned flags) +{ + bool first = true; + + for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + + if ((opt->flags & hide_mask) || !(opt->flags & show_mask)) + continue; + + u64 v = bch2_opt_get_by_id(&opts, i); + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + if (!first) + prt_char(out, ','); + first = false; + + bch2_opt_to_text(out, c, sb, opt, v, flags); + } +} + int 
bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) { int ret = 0; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 3d83bcdca..cb2e244a2 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -373,6 +373,16 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Exit recovery immediately prior to journal replay")\ + x(recovery_passes, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_BITFIELD(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Recovery passes to run explicitly") \ + x(recovery_passes_exclude, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_BITFIELD(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Recovery passes to exclude") \ x(recovery_pass_last, u8, \ OPT_FS|OPT_MOUNT, \ OPT_STR_NOLIMIT(bch2_recovery_passes), \ @@ -595,6 +605,10 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *, void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, const struct bch_option *, u64, unsigned); +void bch2_opts_to_text(struct printbuf *, + struct bch_opts, + struct bch_fs *, struct bch_sb *, + unsigned, unsigned, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); diff --git a/libbcachefs/rcu_pending.c b/libbcachefs/rcu_pending.c index 19a646606..40a20192e 100644 --- a/libbcachefs/rcu_pending.c +++ b/libbcachefs/rcu_pending.c @@ -219,9 +219,9 @@ static noinline void __process_finished_items(struct rcu_pending *pending, BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0); void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); - kvfree(ptr); - bool free_head = ((unsigned long) obj->func) & 1UL; + + kvfree(ptr); if (free_head) kfree(obj); } diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index cf81e5128..2d299a37c 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -13,6 +13,7 @@ #include "errcode.h" #include "error.h" #include "inode.h" +#include "io_write.h" #include "move.h" #include "rebalance.h" #include "subvolume.h" @@ -156,6 +157,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); data_opts->target = r->target; + data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { /* @@ -263,6 +265,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); data_opts->target = target; + data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; return data_opts->rewrite_ptrs != 0; } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 36de1c6fe..be1e7ca43 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -97,7 +97,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, @@ -525,17 +525,17 @@ static int read_btree_roots(struct bch_fs *c) "error reading btree root %s l=%u: %s", bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) { - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); - 
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); r->error = 0; - } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { + } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { bch_info(c, "will run btree node scan"); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); } ret = 0; @@ -706,14 +706,14 @@ int bch2_fs_recovery(struct bch_fs *c) if (check_version_upgrade(c)) write_sb = true; - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c index 73339a0a3..735b8adc8 100644 --- a/libbcachefs/recovery_passes.c +++ b/libbcachefs/recovery_passes.c @@ -40,7 +40,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit) + if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) return bch2_fs_read_write_early(c); return 0; } @@ -97,14 +97,14 @@ u64 bch2_recovery_passes_from_stable(u64 v) int bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->recovery_passes_explicit & BIT_ULL(pass)) + if (c->opts.recovery_passes & BIT_ULL(pass)) return 0; bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - c->recovery_passes_explicit |= BIT_ULL(pass); + c->opts.recovery_passes |= BIT_ULL(pass); if (c->curr_recovery_pass >= pass) { c->curr_recovery_pass = pass; @@ -161,7 +161,9 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa { struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (c->recovery_passes_explicit & BIT_ULL(pass)) + if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) + return false; + if (c->opts.recovery_passes & BIT_ULL(pass)) return true; if ((p->when & PASS_FSCK) && c->opts.fsck) return true; diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 12b1d28b7..5ceda186f 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -82,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for 
(unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_member_exists(sb, r->devs[i])) { + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } @@ -122,7 +123,7 @@ static void extent_to_replicas(struct bkey_s_c k, continue; if (!p.has_ec) - r->devs[r->nr_devs++] = p.ptr.dev; + replicas_entry_add_dev(r, p.ptr.dev); else r->nr_required = 0; } @@ -139,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k, for (ptr = s.v->ptrs; ptr < s.v->ptrs + s.v->nr_blocks; ptr++) - r->devs[r->nr_devs++] = ptr->dev; + replicas_entry_add_dev(r, ptr->dev); } void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, @@ -180,7 +181,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, e->nr_required = 1; darray_for_each(devs, i) - e->devs[e->nr_devs++] = *i; + replicas_entry_add_dev(e, *i); bch2_replicas_entry_sort(e); } @@ -795,11 +796,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); - nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed; + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; } rcu_read_unlock(); - if (nr_failed == e->nr_devs) + if (nr_online + nr_failed == e->nr_devs) continue; if (nr_online < e->nr_required) diff --git a/libbcachefs/replicas_format.h b/libbcachefs/replicas_format.h index b97208195..b7eff904a 100644 --- a/libbcachefs/replicas_format.h +++ b/libbcachefs/replicas_format.h @@ -5,7 +5,7 @@ struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[]; + __u8 devs[] __counted_by(nr_devs); } __packed; struct bch_sb_field_replicas_v0 { @@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[]; + __u8 devs[] __counted_by(nr_devs); } __packed; struct bch_sb_field_replicas { @@ -28,4 +28,9 @@ struct bch_sb_field_replicas { #define replicas_entry_bytes(_i) \ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) +#define replicas_entry_add_dev(e, d) ({ \ + (e)->nr_devs++; \ + (e)->devs[(e)->nr_devs - 1] = (d); \ +}) + #endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index 317602017..f0c14702f 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -288,10 +288,10 @@ enum bch_fsck_flags { x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, 0) \ - x(accounting_key_replicas_nr_devs_0, 278, 0) \ - x(accounting_key_replicas_nr_required_bad, 279, 0) \ - x(accounting_key_replicas_devs_unsorted, 280, 0) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c index b4ea6490d..02bcde3c1 100644 --- a/libbcachefs/sb-members.c +++ b/libbcachefs/sb-members.c @@ -11,7 +11,8 @@ void bch2_dev_missing(struct bch_fs *c, unsigned dev) { - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); + if (dev != BCH_SB_MEMBER_INVALID) + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); } void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) @@ -473,3 +474,51 @@ unsigned bch2_sb_nr_devices(const struct bch_sb *sb) nr += bch2_member_exists((struct bch_sb *) sb, i); return nr; } + +int bch2_sb_member_alloc(struct bch_fs *c) +{ + unsigned dev_idx = c->sb.nr_devices; + struct bch_sb_field_members_v2 *mi; + unsigned nr_devices; + unsigned u64s; + int best = -1; + u64 best_last_mount = 0; + + if (dev_idx < BCH_SB_MEMBERS_MAX) + goto have_slot; + + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { + /* eventually BCH_SB_MEMBERS_MAX will be raised */ + if (dev_idx == BCH_SB_MEMBER_INVALID) + continue; + + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + if (bch2_member_alive(&m)) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); + if (best < 0 || last_mount < best_last_mount) { + best = dev_idx; + best_last_mount = last_mount; + } + } + if (best >= 0) { + dev_idx = best; + goto have_slot; + } + + return -BCH_ERR_ENOSPC_sb_members; +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + + mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) + return -BCH_ERR_ENOSPC_sb_members; + + c->disk_sb.sb->nr_devices = nr_devices; + return dev_idx; +} diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index f307f2857..762083b56 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -198,29 +198,37 @@ static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) lockdep_is_held(&c->state_lock)); } -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev) { return c && dev < c->sb.nr_devices ? 
rcu_dereference(c->devs[dev]) : NULL; } +void bch2_dev_missing(struct bch_fs *, unsigned); + +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); + if (unlikely(!ca)) + bch2_dev_missing(c, dev); + return ca; +} + static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (ca) bch2_dev_get(ca); rcu_read_unlock(); return ca; } -void bch2_dev_missing(struct bch_fs *, unsigned); - static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); - if (!ca) + if (unlikely(!ca)) bch2_dev_missing(c, dev); return ca; } @@ -354,4 +362,6 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); +int bch2_sb_member_alloc(struct bch_fs *); + #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/libbcachefs/sb-members_format.h b/libbcachefs/sb-members_format.h index e2630548c..d727d2dfd 100644 --- a/libbcachefs/sb-members_format.h +++ b/libbcachefs/sb-members_format.h @@ -8,6 +8,11 @@ */ #define BCH_SB_MEMBERS_MAX 64 +/* + * Sentinel value - indicates a device that does not exist + */ +#define BCH_SB_MEMBER_INVALID 255 + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index c8c266cb5..215eed4cc 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -270,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index d86d5dae5..77597fd71 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -524,7 +524,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; /* XXX this is wrong, we need a 96 or 128 bit integer type */ - c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), + c->sb.time_base_lo = div64_u64(le64_to_cpu(src->time_base_lo), c->sb.nsec_per_time_unit); c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index d8adf465f..873e4be7e 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -370,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c) test_bit(BCH_FS_clean_shutdown, &c->flags) && c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); - BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); @@ -1592,33 +1592,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, /* Device add/removal: */ -static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - struct bpos start = POS(ca->dev_idx, 0); - struct bpos end = POS(ca->dev_idx, U64_MAX); - int ret; - - /* - * We clear the LRU 
and need_discard btrees first so that we don't race - * with bch2_do_invalidates() and bch2_do_discards() - */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_dev_usage_remove(c, ca->dev_idx); - bch_err_msg(c, ret, "removing dev alloc info"); - return ret; -} - int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; @@ -1730,9 +1703,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; struct bch_dev *ca = NULL; - struct bch_sb_field_members_v2 *mi; - struct bch_member dev_mi; - unsigned dev_idx, nr_devices, u64s; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; int ret; @@ -1742,7 +1712,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); + struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); @@ -1780,55 +1750,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; if (dynamic_fault("bcachefs:add:no_slot")) - goto no_slot; - - if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) { - dev_idx = c->sb.nr_devices; - goto have_slot; - } - - int best = -1; - u64 best_last_mount = 0; - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - if (bch2_member_alive(&m)) - continue; - - u64 last_mount = le64_to_cpu(m.last_mount); - if (best < 0 || last_mount < best_last_mount) { - best = dev_idx; - best_last_mount = last_mount; - } - } - if (best >= 0) { - dev_idx = best; - goto have_slot; - } -no_slot: - ret = -BCH_ERR_ENOSPC_sb_members; - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - -have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - - mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + - le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + goto err_unlock; - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) { - ret = -BCH_ERR_ENOSPC_sb_members; + ret = bch2_sb_member_alloc(c); + if (ret < 0) { bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + unsigned dev_idx = ret; /* success: */ - *m = dev_mi; - m->last_mount = cpu_to_le64(ktime_get_real_seconds()); - c->disk_sb.sb->nr_devices = nr_devices; + dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); + *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 4a373581b..03e59f86f 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = { static size_t bch2_btree_cache_size(struct bch_fs *c) { + struct 
btree_cache *bc = &c->btree_cache; size_t ret = 0; struct btree *b; - mutex_lock(&c->btree_cache.lock); - list_for_each_entry(b, &c->btree_cache.live, list) + mutex_lock(&bc->lock); + list_for_each_entry(b, &bc->live[0].list, list) ret += btree_buf_bytes(b); - - mutex_unlock(&c->btree_cache.lock); + list_for_each_entry(b, &bc->live[1].list, list) + ret += btree_buf_bytes(b); + list_for_each_entry(b, &bc->freeable, list) + ret += btree_buf_bytes(b); + mutex_unlock(&bc->lock); return ret; } @@ -287,7 +291,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c prt_tab_rjust(out); prt_human_readable_u64(out, nr_extents - ? div_u64(sectors_uncompressed << 9, nr_extents) + ? div64_u64(sectors_uncompressed << 9, nr_extents) : 0); prt_tab_rjust(out); prt_newline(out); @@ -444,11 +448,12 @@ STORE(bch2_fs) return -EROFS; if (attr == &sysfs_trigger_btree_cache_shrink) { + struct btree_cache *bc = &c->btree_cache; struct shrink_control sc; sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc); } if (attr == &sysfs_trigger_btree_key_cache_shrink) { @@ -456,7 +461,7 @@ STORE(bch2_fs) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); } if (attr == &sysfs_trigger_gc) diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 2acdfa783..42f565c76 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res) *res = 1; while (p--) { - if (*res > div_u64(U64_MAX, n)) + if (*res > div64_u64(U64_MAX, n)) return -ERANGE; *res *= n; } @@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res) parse_or_ret(cp, parse_unit_suffix(cp, &b)); - if (v > div_u64(U64_MAX, b)) + if (v > div64_u64(U64_MAX, b)) return -ERANGE; v *= b; - if (f_n > div_u64(U64_MAX, b)) + if (f_n > div64_u64(U64_MAX, b)) return -ERANGE; - f_n = div_u64(f_n * b, f_d); + f_n = div64_u64(f_n * b, f_d); if (v + f_n < v) return -ERANGE; v += f_n; @@ -214,7 +214,7 @@ u64 bch2_read_flag_list(const char *opt, const char * const list[]) s = strim(d); - while ((p = strsep(&s, ","))) { + while ((p = strsep(&s, ",;"))) { int flag = match_string(list, -1, p); if (flag < 0) { @@ -360,7 +360,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = bch2_pick_time_units(ns); - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); + prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name); } static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) @@ -477,7 +477,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; u64 q = max(quantiles->entries[i].m, last_q); - prt_printf(out, "%llu ", div_u64(q, u->nsecs)); + prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); if (is_last) prt_newline(out); last_q = q; diff --git a/libbcachefs/xattr_format.h b/libbcachefs/xattr_format.h index e9f810539..c7916011e 100644 --- a/libbcachefs/xattr_format.h +++ b/libbcachefs/xattr_format.h @@ -13,7 +13,7 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[]; + __u8 x_name[] __counted_by(x_name_len); } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */
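
Note on the __counted_by() annotations added above (replicas_format.h, xattr_format.h): once a flexible array is annotated, fortified bounds checks compare each index against the *current* value of the counter field, which is consistent with the new replicas_entry_add_dev() helper incrementing nr_devs before storing into devs[]. Below is a minimal standalone userspace sketch of that ordering, assuming a compiler with counted_by support (the annotation degrades to a no-op elsewhere); struct and helper names mirror the diff but are stand-ins, not the kernel code:

/* counted_by_sketch.c: ordering requirement for __counted_by() arrays */
#include <stdio.h>
#include <stdlib.h>

#ifdef __has_attribute
#if __has_attribute(counted_by)
#define __counted_by(member) __attribute__((counted_by(member)))
#endif
#endif
#ifndef __counted_by
#define __counted_by(member)	/* older compilers: annotation is a no-op */
#endif

struct entry {
	unsigned char	data_type;
	unsigned char	nr_devs;
	unsigned char	nr_required;
	unsigned char	devs[] __counted_by(nr_devs);
};

/* Mirrors replicas_entry_add_dev(): bump the counter, then store */
static void entry_add_dev(struct entry *e, unsigned char dev)
{
	e->nr_devs++;			/* devs[nr_devs - 1] is now in bounds */
	e->devs[e->nr_devs - 1] = dev;	/* storing first would trip the bounds check */
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e) + 4);

	if (!e)
		return 1;
	entry_add_dev(e, 0);
	entry_add_dev(e, 2);
	printf("nr_devs=%u devs[1]=%u\n", e->nr_devs, e->devs[1]);
	free(e);
	return 0;
}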
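Note on the div_u64() -> div64_u64() conversions (super-io.c, sysfs.c, util.c): in the kernel, div_u64() takes a 32-bit divisor while div64_u64() takes a 64-bit one, so any call site where the divisor can exceed 2^32 - 1 (e.g. u->nsecs for the minute/hour time units) silently truncates it. A hedged userspace illustration with simplified stand-ins for the two helpers (the fake_* functions are not the kernel implementations):

/* div64_sketch.c: divisor width, not dividend width, picks the helper */
#include <stdint.h>
#include <stdio.h>

/* stand-in for div_u64(): the divisor is only 32 bits wide */
static uint64_t fake_div_u64(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;
}

/* stand-in for div64_u64(): full 64-bit divisor */
static uint64_t fake_div64_u64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t ns   = 7200ULL * 1000000000;	/* 2 hours in nanoseconds */
	uint64_t unit = 3600ULL * 1000000000;	/* 1 hour: exceeds UINT32_MAX */

	/* correct: prints 2 */
	printf("div64_u64: %llu\n", (unsigned long long)fake_div64_u64(ns, unit));
	/* divisor silently truncated mod 2^32: prints a bogus quotient */
	printf("div_u64:   %llu\n", (unsigned long long)fake_div_u64(ns, (uint32_t)unit));
	return 0;
}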
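Note on the new recovery_passes/recovery_passes_exclude mount options: they are declared OPT_BITFIELD and parsed through bch2_read_flag_list(), which after this change splits on ';' as well as ',' (presumably because mount option strings are themselves comma-separated, so a list value needs an alternate separator). A hypothetical userspace sketch of that parse loop; read_flag_list() and the abbreviated pass-name table are stand-ins, not the kernel code:

/* flaglist_sketch.c: hypothetical stand-in for bch2_read_flag_list() */
#define _DEFAULT_SOURCE		/* strsep() on glibc */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const char * const passes[] = {
	"scan_for_btree_nodes", "check_topology", "check_allocations", NULL,
};

static uint64_t read_flag_list(char *s, const char * const list[])
{
	uint64_t v = 0;
	char *p;

	while ((p = strsep(&s, ",;")))		/* both separators accepted */
		for (unsigned i = 0; list[i]; i++)
			if (!strcmp(p, list[i]))
				v |= 1ULL << i;
	return v;
}

int main(void)
{
	char opt[] = "check_topology;check_allocations";

	/* prints mask=0x6: bits for the two named passes */
	printf("mask=%#llx\n", (unsigned long long)read_flag_list(opt, passes));
	return 0;
}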