From da2013bf2cb964ab1d3e4eb562068b8c9e53c6d2 Mon Sep 17 00:00:00 2001
From: Joel Colledge
Date: Wed, 22 Jun 2022 11:31:20 +0200
Subject: [PATCH] drbd: allow application IO concurrently with resync requests
 sent to same peer

Since the previous commit ("drbd: respect peer writes held due to
conflicts when barrier received"), the receiver also waits for requests
that are held due to conflicts when a barrier is received. If these
conflicts are due to resync requests towards the same peer, then they
will only be resolved once the resync reply has been received. This
causes a deadlock.

Avoid this by allowing application IO to be submitted while there are
conflicting resync requests pending, but only when they relate to the
same peer.

Implement this by adding "sent" and "received" flags for the intervals.
Resync requests that we initiate pass through two conflict resolution
phases. The first phase waits until the request can be sent. This
involves waiting until there are no conflicting application writes from
other peers. The second phase occurs when we receive the reply; it
ensures that we do not submit conflicting writes.

This change has similarities to
4dc38cdeceea ("drbd: Break resync deadlock")
which was reverted because a different solution was required to avoid a
distributed deadlock with 3 nodes:
c0cd45acb1d2 ("drbd: Break distributed deadlock in request processing")
ddc742b4beb7 ("drbd: drop special-casing peer_device in al_begin_io_for_peer")

However, this change is distinctly different, and this time it is
valid. The key difference is that resync requests are no longer blocked
by an active activity log extent. This means that they only wait for
local disk writes to complete, and not for a peer ack, which is a
distributed event. It is no longer necessary to include a special case
for compatibility in receive_Data().

This approach removes the exclusivity of peer writes with resync
requests that we have sent and for which we are awaiting the reply. In
fact, the previous solution was incomplete, because it still held peer
writes back when they conflicted with resync requests that had been
sent and were pending a reply. This approach handles that scenario as
well.
---
 .../application-resync-synchronization.rst | 26 ++++--
 drbd/drbd_debugfs.c                        |  2 +
 drbd/drbd_int.h                            | 65 +++++++++++++-
 drbd/drbd_interval.h                       |  6 ++
 drbd/drbd_receiver.c                       | 88 +++++++++++--------
 drbd/drbd_req.c                            | 10 ++-
 drbd/drbd_sender.c                         | 31 +++++--
 7 files changed, 167 insertions(+), 61 deletions(-)

diff --git a/Documentation/application-resync-synchronization.rst b/Documentation/application-resync-synchronization.rst
index 83d134851..895312453 100644
--- a/Documentation/application-resync-synchronization.rst
+++ b/Documentation/application-resync-synchronization.rst
@@ -56,8 +56,12 @@ scheme:
 * Secondary: The application IO interval is locked while the data is being
   written to the backing disk.
 * Sync target: The ``P_RS_DATA_REQUEST`` interval is locked. The lock is taken
-  then the dagtag is recorded and the request is sent. The lock is released
-  when the reply has been received and the data written to the backing disk.
+  in two phases. Before sending the request, the interval is locked for
+  conflicts with other peers. Then the dagtag is recorded and the request is
+  sent. When the reply is received, the interval is additionally locked for
+  conflicts with ongoing IO, in particular writes from the same peer. The lock
+  is released when the reply has been received and the data written to the
+  backing disk.
 * Sync source: The ``P_RS_DATA_REQUEST`` interval is locked. The lock is taken
   when the dagtag for the request is reached. It is released when the
   ``P_RS_WRITE_ACK`` is received. This lock is a read lock; it is exclusive
@@ -94,12 +98,12 @@ Correctness of data
 
 We only consider the synchronization between application and resync IO here.
 
-The locking scheme prevents any writes to the resync request interval from when
-the request is initiated until the received data is written. After the lock is
-taken on the target, the dagtag is recorded and the request is sent to the
-source. The source then waits until it has reached this dagtag before reading.
-This ensures that the resync data is at least as new as the data on the target
-when the request was made.
+The locking scheme prevents any writes from other peers to the resync request
+interval from when the request is initiated until the received data is written.
+After the lock is taken on the target, the dagtag is recorded and the request
+is sent to the source. The source then waits until it has reached this dagtag
+before reading. This ensures that the resync data is at least as new as the
+data on the target when the request was made.
 
 Conflicting application writes that reach the target while the resync request
 is in progress are held until the resync data has been written. Hence they
@@ -108,6 +112,12 @@ this application write when it performed the resync read, the application write
 will overwrite the resync write on the target with identical data. This is
 harmless.
 
+Resync requests sent from the target are not exclusive with application writes
+from the same peer. However, since the resync and application data originate
+from the same node, they are transmitted in the correct order in the data
+stream. Application IO defers to received resync IO, ensuring that a resync
+write received before an application write is also submitted first.
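+
+As an illustration, the deferral rule can be sketched as follows. This is a
+simplified rendering of the ``drbd_should_defer_to_resync`` logic from
+``drbd_int.h``, not the literal implementation::
+
+  /* Should IO covering "interval" defer to resync request "i"? */
+  static bool defer_to_resync(struct drbd_interval *interval,
+                              struct drbd_interval *i)
+  {
+          if (!drbd_interval_is_resync(i))
+                  return false;
+          /* Reply received: only local disk writes remain, so always wait. */
+          if (test_bit(INTERVAL_RECEIVED, &i->flags))
+                  return true;
+          /* Reply outstanding: defer only if the IO is from a different
+           * peer; a peer orders its own data stream correctly. */
+          return !drbd_interval_same_peer(interval, i);
+  }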
+
 Correctness of bitmap bits
 --------------------------
 
diff --git a/drbd/drbd_debugfs.c b/drbd/drbd_debugfs.c
index 1f308378d..e6901da75 100644
--- a/drbd/drbd_debugfs.c
+++ b/drbd/drbd_debugfs.c
@@ -1111,6 +1111,8 @@ static void seq_printf_interval_tree(struct seq_file *m, struct rb_root *root)
 		char sep = ' ';
 		seq_printf(m, "%llus+%u %s", (unsigned long long) i->sector, i->size,
 			   drbd_interval_type_str(i));
+		seq_print_rq_state_bit(m, test_bit(INTERVAL_SENT, &i->flags), &sep, "sent");
+		seq_print_rq_state_bit(m, test_bit(INTERVAL_RECEIVED, &i->flags), &sep, "received");
 		seq_print_rq_state_bit(m, test_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &i->flags), &sep, "submit-conflict-queued");
 		seq_print_rq_state_bit(m, test_bit(INTERVAL_SUBMITTED, &i->flags), &sep, "submitted");
 		seq_print_rq_state_bit(m, test_bit(INTERVAL_BACKING_COMPLETED, &i->flags), &sep, "backing-completed");
diff --git a/drbd/drbd_int.h b/drbd/drbd_int.h
index 199726c52..02f209a49 100644
--- a/drbd/drbd_int.h
+++ b/drbd/drbd_int.h
@@ -1985,7 +1985,7 @@ extern void drbd_rs_controller_reset(struct drbd_peer_device *);
 extern void drbd_rs_all_in_flight_came_back(struct drbd_peer_device *, int);
 extern void drbd_check_peers(struct drbd_resource *resource);
 extern void drbd_check_peers_new_current_uuid(struct drbd_device *);
-extern void drbd_conflict_submit_resync_request(struct drbd_peer_request *peer_req);
+extern void drbd_conflict_send_resync_request(struct drbd_peer_request *peer_req);
 extern void drbd_ping_peer(struct drbd_connection *connection);
 extern struct drbd_peer_device *peer_device_by_node_id(struct drbd_device *, int);
 extern void repost_up_to_date_fn(struct timer_list *t);
@@ -2094,6 +2094,7 @@ extern void drbd_send_peer_ack_wf(struct work_struct *ws);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_peer_device *);
 extern void drbd_verify_skipped_block(struct drbd_peer_device *peer_device,
 		const sector_t sector, const unsigned int size);
+extern void drbd_conflict_submit_resync_request(struct drbd_peer_request *peer_req);
 extern void drbd_conflict_submit_peer_read(struct drbd_peer_request *peer_req);
 extern void drbd_conflict_submit_peer_write(struct drbd_peer_request *peer_req);
 extern int drbd_submit_peer_request(struct drbd_peer_request *);
@@ -2654,10 +2655,57 @@ static inline struct drbd_connection *first_connection(struct drbd_resource *res
 
 #define NODE_MASK(id) ((u64)1 << (id))
 
+/*
+ * drbd_interval_same_peer - determine whether "interval" is for the same peer as "i"
+ *
+ * "i" must be an interval corresponding to a drbd_peer_request.
+ */
+static inline bool drbd_interval_same_peer(struct drbd_interval *interval, struct drbd_interval *i)
+{
+	struct drbd_peer_request *interval_peer_req, *i_peer_req;
+
+	/* Ensure we only call "container_of" if it is actually a peer request. */
+	if (interval->type == INTERVAL_LOCAL_WRITE ||
+	    interval->type == INTERVAL_LOCAL_READ ||
+	    interval->type == INTERVAL_PEERS_IN_SYNC_LOCK)
+		return false;
+
+	interval_peer_req = container_of(interval, struct drbd_peer_request, i);
+	i_peer_req = container_of(i, struct drbd_peer_request, i);
+	return interval_peer_req->peer_device == i_peer_req->peer_device;
+}
+
+/*
+ * drbd_should_defer_to_resync - determine whether "interval" should defer to "i"
+ */
+static inline bool drbd_should_defer_to_resync(struct drbd_interval *interval, struct drbd_interval *i)
+{
+	if (!drbd_interval_is_resync(i))
+		return false;
+
+	/* Always defer to resync requests once the reply has been received.
+	 * These just need to wait for conflicting local I/O to complete. This
+	 * is necessary to ensure that a resync reply received before an
+	 * application write is also submitted first, so that the resync write
+	 * does not overwrite newer data. */
+	if (test_bit(INTERVAL_RECEIVED, &i->flags))
+		return true;
+
+	/* If we are still waiting for a reply from the peer, only defer to the
+	 * request if it is towards a different peer. The exclusivity between
+	 * resync requests and application writes from another peer is
+	 * necessary to avoid overwriting newer data with older in the resync.
+	 * When the data in both cases is coming from the same peer, this is
+	 * not necessary. The peer ensures that the data stream is correctly
+	 * ordered. */
+	return !drbd_interval_same_peer(interval, i);
+}
+
 #define CONFLICT_FLAG_WRITE			(1 << 0)
 #define CONFLICT_FLAG_DEFER_TO_RESYNC		(1 << 1)
-#define CONFLICT_FLAG_EXCLUSIVE_UNTIL_COMPLETED	(1 << 2)
-#define CONFLICT_FLAG_APPLICATION_ONLY		(1 << 3)
+#define CONFLICT_FLAG_IGNORE_SAME_PEER		(1 << 2)
+#define CONFLICT_FLAG_EXCLUSIVE_UNTIL_COMPLETED	(1 << 3)
+#define CONFLICT_FLAG_APPLICATION_ONLY		(1 << 4)
 
 /*
  * drbd_find_conflict - find conflicting interval, if any
@@ -2670,6 +2718,7 @@ static inline struct drbd_interval *drbd_find_conflict(struct drbd_device *devic
 	int size = interval->size;
 	bool write = flags & CONFLICT_FLAG_WRITE;
 	bool defer_to_resync = flags & CONFLICT_FLAG_DEFER_TO_RESYNC;
+	bool ignore_same_peer = flags & CONFLICT_FLAG_IGNORE_SAME_PEER;
 	bool exclusive_until_completed = flags & CONFLICT_FLAG_EXCLUSIVE_UNTIL_COMPLETED;
 	bool application_only = flags & CONFLICT_FLAG_APPLICATION_ONLY;
 
@@ -2693,7 +2742,15 @@ static inline struct drbd_interval *drbd_find_conflict(struct drbd_device *devic
 		/* Ignore, if not yet submitted, unless we should defer to a
 		 * resync request. */
 		if (!test_bit(INTERVAL_SUBMITTED, &i->flags) &&
-		    !(defer_to_resync && drbd_interval_is_resync(i)))
+		    !(defer_to_resync && drbd_should_defer_to_resync(interval, i)))
 			continue;
 
+		/* Ignore peer writes from the peer that this request relates
+		 * to, if requested. This is only used for determining whether
+		 * to send a request. It must not be used for determining
+		 * whether to submit a request, because that would allow
+		 * concurrent writes to the backing disk. */
+		if (ignore_same_peer && i->type == INTERVAL_PEER_WRITE && drbd_interval_same_peer(interval, i))
+			continue;
+
 		if (unlikely(application_only)) {
diff --git a/drbd/drbd_interval.h b/drbd/drbd_interval.h
index b98cdca5a..39b783089 100644
--- a/drbd/drbd_interval.h
+++ b/drbd/drbd_interval.h
@@ -20,6 +20,12 @@ enum drbd_interval_type {
 };
 
 enum drbd_interval_flags {
+	/* Whether this resync request has been sent to the peer yet. */
+	INTERVAL_SENT,
+
+	/* Whether the reply to this resync request has been received yet. */
+	INTERVAL_RECEIVED,
+
 	/* Whether this has been queued after conflict. */
 	INTERVAL_SUBMIT_CONFLICT_QUEUED,
 
diff --git a/drbd/drbd_receiver.c b/drbd/drbd_receiver.c
index ce60a85df..6be0ea4d7 100644
--- a/drbd/drbd_receiver.c
+++ b/drbd/drbd_receiver.c
@@ -2550,7 +2550,7 @@ static struct drbd_peer_request *find_resync_request(struct drbd_peer_device *pe
 	 * the requests. Hence looking the request up with a list traversal
 	 * should not be too slow.
 	 */
 	list_for_each_entry(p, list, w.list) {
-		if (p->i.sector == sector && test_bit(INTERVAL_SUBMITTED, &p->i.flags)) {
+		if (p->i.sector == sector && test_bit(INTERVAL_SENT, &p->i.flags)) {
 			peer_req = p;
 			break;
 		}
@@ -2566,6 +2566,46 @@ static struct drbd_peer_request *find_resync_request(struct drbd_peer_device *pe
 	return peer_req;
 }
 
+static void drbd_cleanup_after_failed_submit_resync_request(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+
+	drbd_err(device, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_del(&peer_req->w.list);
+	list_del(&peer_req->recv_order);
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	drbd_remove_peer_req_interval(peer_req);
+	drbd_free_peer_req(peer_req);
+	put_ldev(device);
+
+	change_cstate(connection, C_PROTOCOL_ERROR, CS_HARD);
+}
+
+void drbd_conflict_submit_resync_request(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	bool conflict;
+
+	spin_lock_irq(&device->interval_lock);
+	clear_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &peer_req->i.flags);
+	set_bit(INTERVAL_RECEIVED, &peer_req->i.flags);
+	conflict = drbd_find_conflict(device, &peer_req->i, CONFLICT_FLAG_WRITE);
+	if (!conflict)
+		set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags);
+	spin_unlock_irq(&device->interval_lock);
+
+	if (!conflict) {
+		int err = drbd_submit_peer_request(peer_req);
+		if (err)
+			drbd_cleanup_after_failed_submit_resync_request(peer_req);
+	}
+}
+
 static int recv_resync_read(struct drbd_peer_device *peer_device,
 			    struct drbd_peer_request_details *d) __releases(local)
 {
@@ -2611,9 +2651,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device,
 	size = peer_req->i.size;
 	drbd_set_all_out_of_sync(device, sector, size);
 
-	err = drbd_submit_peer_request(peer_req);
-	if (err)
-		goto out;
+	drbd_conflict_submit_resync_request(peer_req);
 
 	peer_req = NULL; /* since submitted, might be destroyed already */
 	for_each_peer_device_ref(peer_device, im, device) {
@@ -2622,16 +2660,6 @@ static int recv_resync_read(struct drbd_peer_device *peer_device,
 		drbd_send_out_of_sync(peer_device, sector, size);
 	}
 	return 0;
-out:
-	/* don't care for the reason here */
-	drbd_err(device, "submit failed, triggering re-connect\n");
-	spin_lock_irq(&connection->peer_reqs_lock);
-	list_del(&peer_req->w.list);
-	list_del(&peer_req->recv_order);
-	spin_unlock_irq(&connection->peer_reqs_lock);
-
-	drbd_free_peer_req(peer_req);
-	return err;
 }
 
 /* caller must hold interval_lock */
@@ -3187,7 +3215,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	struct drbd_peer_request *peer_req;
 	struct drbd_peer_request_details d;
 	int err, tp;
-	unsigned long conflict_flags = CONFLICT_FLAG_WRITE;
 	bool conflict = false;
 
 	peer_device = conn_peer_device(connection, pi->vnr);
@@ -3385,16 +3412,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 			goto out_del_list;
 		}
 	}
-	/* In protocol < 110 (compatibility with DRBD 8.4), we must not defer
-	 * to resync. The peer may have received a resync request from us in
-	 * the same "resync extent" as this write. In this case, it will block
-	 * until it has received the corresponding barrier ack, so we must
-	 * submit the write.
-	 */
-	if (connection->agreed_pro_version >= 110)
-		conflict_flags |= CONFLICT_FLAG_DEFER_TO_RESYNC;
-	conflict = connection->agreed_pro_version >= 110 &&
-		drbd_find_conflict(device, &peer_req->i, conflict_flags);
+	conflict = drbd_find_conflict(device, &peer_req->i,
+			CONFLICT_FLAG_WRITE | CONFLICT_FLAG_DEFER_TO_RESYNC);
 	drbd_insert_interval(&device->requests, &peer_req->i);
 	if (!conflict)
 		set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags);
@@ -4066,6 +4085,8 @@ static int receive_common_ov_reply(struct drbd_connection *connection, struct pa
 	if (err)
 		goto fail;
 
+	set_bit(INTERVAL_RECEIVED, &peer_req->i.flags);
+
 	drbd_alloc_page_chain(&peer_device->connection->transport,
 			&peer_req->page_chain, DIV_ROUND_UP(size, PAGE_SIZE), GFP_TRY);
 	if (!peer_req->page_chain.head) {
@@ -8587,20 +8608,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
 		spin_unlock_irq(&connection->peer_reqs_lock);
 
 		atomic_add(pi->size >> 9, &device->rs_sect_ev);
-		err = drbd_submit_peer_request(peer_req);
-
-		if (err) {
-			drbd_err(device, "discard submit failed, triggering re-connect\n");
-			spin_lock_irq(&connection->peer_reqs_lock);
-			list_del(&peer_req->w.list);
-			list_del(&peer_req->recv_order);
-			spin_unlock_irq(&connection->peer_reqs_lock);
-
-			drbd_remove_peer_req_interval(peer_req);
-			drbd_free_peer_req(peer_req);
-			peer_req = NULL;
-			put_ldev(device);
-		}
+		drbd_conflict_submit_resync_request(peer_req);
 
 		/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(). */
 	} else {
@@ -9701,6 +9709,8 @@ static int got_IsInSync(struct drbd_connection *connection, struct packet_info *
 
 	dec_rs_pending(peer_device);
 
+	set_bit(INTERVAL_RECEIVED, &peer_req->i.flags);
+
 	spin_lock_irq(&connection->peer_reqs_lock);
 	list_del(&peer_req->w.list);
 	spin_unlock_irq(&connection->peer_reqs_lock);
diff --git a/drbd/drbd_req.c b/drbd/drbd_req.c
index d503c39e7..cf8fbbfdd 100644
--- a/drbd/drbd_req.c
+++ b/drbd/drbd_req.c
@@ -462,6 +462,11 @@ void drbd_release_conflicts(struct drbd_device *device, struct drbd_interval *re
 		if (test_bit(INTERVAL_SUBMITTED, &i->flags))
 			continue;
 
+		/* If we are waiting for a reply from the peer, then there is
+		 * no need to process the conflict. */
+		if (test_bit(INTERVAL_SENT, &i->flags) && !test_bit(INTERVAL_RECEIVED, &i->flags))
+			continue;
+
 		dynamic_drbd_dbg(device, "%s %s request at %llus+%u after conflict with %llus+%u\n",
 				 test_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &i->flags) ?
"Already queued" : "Queue", @@ -2112,7 +2117,10 @@ void drbd_do_submit_conflict(struct work_struct *ws) list_for_each_entry_safe(peer_req, peer_req_tmp, &resync_writes, submit_list) { list_del_init(&peer_req->submit_list); - drbd_conflict_submit_resync_request(peer_req); + if (!test_bit(INTERVAL_SENT, &peer_req->i.flags)) + drbd_conflict_send_resync_request(peer_req); + else + drbd_conflict_submit_resync_request(peer_req); } list_for_each_entry_safe(peer_req, peer_req_tmp, &resync_reads, submit_list) { diff --git a/drbd/drbd_sender.c b/drbd/drbd_sender.c index 9dbaadd33..454cb6d5e 100644 --- a/drbd/drbd_sender.c +++ b/drbd/drbd_sender.c @@ -455,7 +455,7 @@ static void send_resync_request(struct drbd_peer_request *peer_req) change_cstate(peer_device->connection, C_DISCONNECTING, CS_HARD); } -void drbd_conflict_submit_resync_request(struct drbd_peer_request *peer_req) +void drbd_conflict_send_resync_request(struct drbd_peer_request *peer_req) { struct drbd_peer_device *peer_device = peer_req->peer_device; struct drbd_device *device = peer_device->device; @@ -463,11 +463,12 @@ void drbd_conflict_submit_resync_request(struct drbd_peer_request *peer_req) spin_lock_irq(&device->interval_lock); clear_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &peer_req->i.flags); - conflict = drbd_find_conflict(device, &peer_req->i, CONFLICT_FLAG_WRITE); + conflict = drbd_find_conflict(device, &peer_req->i, + CONFLICT_FLAG_WRITE | CONFLICT_FLAG_IGNORE_SAME_PEER); if (drbd_interval_empty(&peer_req->i)) drbd_insert_interval(&device->requests, &peer_req->i); if (!conflict) - set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags); + set_bit(INTERVAL_SENT, &peer_req->i.flags); spin_unlock_irq(&device->interval_lock); if (!conflict) @@ -562,7 +563,7 @@ static int w_e_send_csum(struct drbd_work *w, int cancel) list_add_tail(&peer_req->w.list, &connection->sync_ee); spin_unlock_irq(&connection->peer_reqs_lock); - drbd_conflict_submit_resync_request(peer_req); + drbd_conflict_send_resync_request(peer_req); return 0; out: @@ -656,7 +657,7 @@ static int make_one_resync_request(struct drbd_peer_device *peer_device, int dis list_add_tail(&peer_req->recv_order, &peer_device->resync_requests); spin_unlock_irq(&connection->peer_reqs_lock); - drbd_conflict_submit_resync_request(peer_req); + drbd_conflict_send_resync_request(peer_req); return 0; } @@ -1039,7 +1040,7 @@ static bool adjacent(sector_t sector1, int size, sector_t sector2) * | +---------------- w_e_send_csum * sync_ee | | * | v conflict - * | drbd_conflict_submit_resync_request -----+ + * | drbd_conflict_send_resync_request -------+ * | | ^ | * | | | ... * | | | | @@ -1055,7 +1056,13 @@ static bool adjacent(sector_t sector1, int size, sector_t sector2) * | v * +-- recv_resync_read * sync_ee | | - * | v + * | v conflict + * | drbd_conflict_submit_resync_request -----+ + * | | ^ | + * | | | ... + * | | | | + * | | | v + * | v +---- drbd_do_submit_conflict * | drbd_submit_peer_request * | | * | ... 
@@ -1279,7 +1286,7 @@ static void send_ov_request(struct drbd_peer_request *peer_req)
 	change_cstate(peer_device->connection, C_DISCONNECTING, CS_HARD);
 }
 
-static void drbd_conflict_submit_ov_request(struct drbd_peer_request *peer_req)
+static void drbd_conflict_send_ov_request(struct drbd_peer_request *peer_req)
 {
 	struct drbd_peer_device *peer_device = peer_req->peer_device;
 	struct drbd_device *device = peer_device->device;
@@ -1288,6 +1295,9 @@ static void drbd_conflict_submit_ov_request(struct drbd_peer_request *peer_req)
 	if (drbd_find_conflict(device, &peer_req->i, 0))
 		set_bit(INTERVAL_CONFLICT, &peer_req->i.flags);
 	drbd_insert_interval(&device->requests, &peer_req->i);
+	set_bit(INTERVAL_SENT, &peer_req->i.flags);
+	/* Mark as submitted now, since OV requests do not have a second
+	 * conflict resolution stage when the reply is received. */
 	set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags);
 	spin_unlock_irq(&device->interval_lock);
 
@@ -1401,7 +1411,7 @@ static int make_ov_request(struct drbd_peer_device *peer_device, int cancel)
 		list_add_tail(&peer_req->w.list, &connection->resync_request_ee);
 		spin_unlock_irq(&connection->peer_reqs_lock);
 
-		drbd_conflict_submit_ov_request(peer_req);
+		drbd_conflict_send_ov_request(peer_req);
 
 		sector += BM_SECT_PER_BIT;
 	}
@@ -1965,6 +1975,7 @@ static int drbd_rs_reply(struct drbd_peer_device *peer_device, struct drbd_peer_
 	 * But needed to be properly balanced with
 	 * the atomic_sub() in got_BlockAck. */
 	atomic_add(peer_req->i.size >> 9, &connection->rs_in_flight);
+	set_bit(INTERVAL_SENT, &peer_req->i.flags);
 	if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req)) {
 		err = drbd_send_rs_deallocated(peer_device, peer_req);
 	} else {
@@ -2125,6 +2136,8 @@ int w_e_end_ov_req(struct drbd_work *w, int cancel)
 	if (dagtag_result.err)
 		goto out;
 
+	set_bit(INTERVAL_SENT, &peer_req->i.flags);
+
 	digest_size = crypto_shash_digestsize(peer_device->connection->verify_tfm);
 	/* FIXME if this allocation fails, online verify will not terminate! */
 	digest = drbd_prepare_drequest_csum(peer_req, cmd, digest_size,
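For reference, the dispatch rule introduced in drbd_do_submit_conflict() can be
read as the following sketch; the helper name is illustrative and not part of
the patch:

        /* Re-dispatch a resync peer request after a conflict has cleared:
         * a request not yet sent re-enters the send phase, a request whose
         * reply is being processed re-enters the submit phase. */
        static void redispatch_resync_request(struct drbd_peer_request *peer_req)
        {
                if (!test_bit(INTERVAL_SENT, &peer_req->i.flags))
                        drbd_conflict_send_resync_request(peer_req);
                else
                        drbd_conflict_submit_resync_request(peer_req);
        }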