From e2c7a2c8e0174b395a63d3eace3005165ad2c943 Mon Sep 17 00:00:00 2001 From: Mrityunjay Raj Date: Mon, 22 Jun 2026 04:41:13 +0530 Subject: [PATCH 1/4] repository: add replace_pack to keep a subset of a pack's objects, refs #8572 #8514 Copies the kept objects into a new pack via store.defrag, repoints their chunk index entries, and deletes the old pack; an empty keep set drops the pack whole. --- src/borg/repository.py | 45 +++++++++++++++++++++++++-- src/borg/testsuite/repository_test.py | 42 +++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index f64fbb89be..edfc189ce7 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -695,8 +695,7 @@ def store_list(namespace): logger.error("Repository index is corrupted and must be repaired; skipping the pack check.") objs_errors = index_errors + pack_errors logger.info( - f"Checked {index_files} index files ({index_errors} errors) " - f"and {pack_files} packs ({pack_errors} errors)." + f"Checked {index_files} index files ({index_errors} errors) and {pack_files} packs ({pack_errors} errors)." ) if objs_errors == 0: logger.info(f"Finished {mode} repository check, no problems found.") @@ -811,6 +810,48 @@ def delete(self, id): raise self.ObjectNotFound(id, str(self._location)) logger.warning("ignoring deletion of %s in %s", bin_to_hex(id), bin_to_hex(entry.pack_id)) + def replace_pack(self, old_pack_id, keep_ids): + """Rewrite pack to keep only the objects in , then delete the old pack. + + Returns the new pack_id, or None when is empty (the pack is dropped, not replaced). + The kept objects are copied into the new pack via store.defrag and their chunk index entries are + updated to point at it. The caller must delete the index entries of the dropped objects -- + replace_pack only ever sees the survivors. + """ + self._lock_refresh() + old_key = "packs/" + bin_to_hex(old_pack_id) + if not keep_ids: # whole pack unused: drop it, nothing to copy forward + self.store_delete(old_key) + return None + + # look up each survivor's location, ordered by offset so defrag copies the old pack sequentially + survivors = [] + for keep_id in keep_ids: + entry = self.chunks[keep_id] + assert entry.pack_id == old_pack_id, f"{bin_to_hex(keep_id)} is not in pack {bin_to_hex(old_pack_id)}" + survivors.append((entry.obj_offset, keep_id, entry.obj_size)) + survivors.sort() + + # defrag concatenates the survivor ranges into a new pack named sha256(content), like PackWriter + sources = [(bin_to_hex(old_pack_id), offset, size) for offset, _, size in survivors] + new_pack_id = hex_to_bin(self.store.defrag(sources, algorithm="sha256", namespace="packs")) + + # point the kept objects at the new pack; each new offset is the running sum of the kept sizes + results = [] + offset = 0 + for _, keep_id, size in survivors: + results.append((keep_id, new_pack_id, offset, size)) + offset += size + self.chunks.update_pack_info(results) + + # we stored the new pack and repointed the index before this, so an interrupted call never + # leaves the index pointing at a pack we already deleted. if defrag reproduced the old pack + # (every object kept, in order) its name equals old_pack_id, and deleting it would drop what + # we just kept, so skip the delete in that case. + if new_pack_id != old_pack_id: + self.store_delete(old_key) + return new_pack_id + def break_lock(self): Lock(self.store).break_lock() diff --git a/src/borg/testsuite/repository_test.py b/src/borg/testsuite/repository_test.py index 5757e02a12..66a2c0c076 100644 --- a/src/borg/testsuite/repository_test.py +++ b/src/borg/testsuite/repository_test.py @@ -142,6 +142,48 @@ def test_consistency(repo_fixtures, request): assert pdchunk(repository.get(H(0))) == b"bar" +def test_replace_pack_copy_forward(repo_fixtures, request): + # Force several objects into one pack (N>1), keep a subset, and check the survivors still read + # back correctly while the removed object and its bytes are gone. + with get_repository_from_fixture(repo_fixtures, request) as repository: + repository._pack_writer.max_count = 3 # buffer 3 objects into one pack + chunk0 = fchunk(b"DATA0", chunk_id=H(0)) + chunk2 = fchunk(b"DATA2", chunk_id=H(2)) + repository.put(H(0), chunk0) + repository.put(H(1), fchunk(b"DATA1", chunk_id=H(1))) + repository.put(H(2), chunk2) # 3rd put reaches max_count and flushes the pack + old_pack_id = repository.chunks[H(0)].pack_id + assert repository.chunks[H(1)].pack_id == old_pack_id + assert repository.chunks[H(2)].pack_id == old_pack_id + + new_pack_id = repository.replace_pack(old_pack_id, [H(0), H(2)]) + + assert new_pack_id is not None and new_pack_id != old_pack_id + # survivors read back intact, proving the repointed pack_id and recomputed offsets + assert pdchunk(repository.get(H(0))) == b"DATA0" + assert pdchunk(repository.get(H(2))) == b"DATA2" + # the caller drops the removed object's index entry; it is no longer resolvable afterwards + del repository.chunks[H(1)] + assert repository.get(H(1), raise_missing=False) is None + # old pack gone, new pack holds only the two survivors' bytes + packs = {info.name: info.size for info in repository.store_list("packs")} + assert bin_to_hex(old_pack_id) not in packs + assert packs[bin_to_hex(new_pack_id)] == len(chunk0) + len(chunk2) + + +def test_replace_pack_drops_whole_pack(repo_fixtures, request): + # An empty keep list drops the pack outright, without writing a replacement. + with get_repository_from_fixture(repo_fixtures, request) as repository: + repository.put(H(0), fchunk(b"DATA", chunk_id=H(0))) # N=1: one object per pack + old_pack_id = repository.chunks[H(0)].pack_id + + assert repository.replace_pack(old_pack_id, []) is None + + del repository.chunks[H(0)] + assert repository.get(H(0), raise_missing=False) is None + assert bin_to_hex(old_pack_id) not in [info.name for info in repository.store_list("packs")] + + def test_list(repo_fixtures, request): with get_repository_from_fixture(repo_fixtures, request) as repository: for x in range(100): From 55dc65c1c4521545c84a8b584cb9d054b51b084e Mon Sep 17 00:00:00 2001 From: Mrityunjay Raj Date: Mon, 22 Jun 2026 05:15:53 +0530 Subject: [PATCH 2/4] repository: clarify replace_pack docs and cover the keep-all case Document the old_pack_id return and the caller's index-durability duty, reword the delete-old-last rationale, and test the reproduced-pack skip and survivor sort. --- src/borg/repository.py | 15 ++++++++++----- src/borg/testsuite/repository_test.py | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index edfc189ce7..109e96380a 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -813,10 +813,15 @@ def delete(self, id): def replace_pack(self, old_pack_id, keep_ids): """Rewrite pack to keep only the objects in , then delete the old pack. - Returns the new pack_id, or None when is empty (the pack is dropped, not replaced). + Returns the new pack_id, None when is empty (the pack is dropped, not replaced), + or when the kept objects reproduce the old pack exactly (nothing changes on disk). The kept objects are copied into the new pack via store.defrag and their chunk index entries are updated to point at it. The caller must delete the index entries of the dropped objects -- replace_pack only ever sees the survivors. + + Only the in-memory chunk index is updated here. The caller runs under an exclusive lock and owns + index durability: it must invalidate any cached on-disk index before calling, so an interrupted + run rebuilds the index from the actual packs, and write the index back when done, as compact does. """ self._lock_refresh() old_key = "packs/" + bin_to_hex(old_pack_id) @@ -844,10 +849,10 @@ def replace_pack(self, old_pack_id, keep_ids): offset += size self.chunks.update_pack_info(results) - # we stored the new pack and repointed the index before this, so an interrupted call never - # leaves the index pointing at a pack we already deleted. if defrag reproduced the old pack - # (every object kept, in order) its name equals old_pack_id, and deleting it would drop what - # we just kept, so skip the delete in that case. + # delete the old pack only after the new one is stored and the index repointed, so the kept + # bytes always live in at least one pack on the store and an interrupted run can lose nothing. + # when defrag reproduces the old pack exactly (every object kept, in order) the new name equals + # old_pack_id, and deleting it would drop what we just kept, so skip the delete in that case. if new_pack_id != old_pack_id: self.store_delete(old_key) return new_pack_id diff --git a/src/borg/testsuite/repository_test.py b/src/borg/testsuite/repository_test.py index 66a2c0c076..642d168b09 100644 --- a/src/borg/testsuite/repository_test.py +++ b/src/borg/testsuite/repository_test.py @@ -184,6 +184,24 @@ def test_replace_pack_drops_whole_pack(repo_fixtures, request): assert bin_to_hex(old_pack_id) not in [info.name for info in repository.store_list("packs")] +def test_replace_pack_keep_all_is_noop(repo_fixtures, request): + # Keeping every object in stored order reproduces the same pack, so its sha256 name is unchanged + # and the old pack must not be deleted. Passing the ids out of order must give the same result, + # since replace_pack sorts the survivors by offset before copying them forward. + with get_repository_from_fixture(repo_fixtures, request) as repository: + repository._pack_writer.max_count = 2 # buffer 2 objects into one pack + repository.put(H(0), fchunk(b"DATA0", chunk_id=H(0))) + repository.put(H(1), fchunk(b"DATA1", chunk_id=H(1))) # 2nd put reaches max_count and flushes + old_pack_id = repository.chunks[H(0)].pack_id + + new_pack_id = repository.replace_pack(old_pack_id, [H(1), H(0)]) # out of order on purpose + + assert new_pack_id == old_pack_id # reproduced pack keeps its name, the delete was skipped + assert pdchunk(repository.get(H(0))) == b"DATA0" + assert pdchunk(repository.get(H(1))) == b"DATA1" + assert bin_to_hex(old_pack_id) in [info.name for info in repository.store_list("packs")] + + def test_list(repo_fixtures, request): with get_repository_from_fixture(repo_fixtures, request) as repository: for x in range(100): From a56518e41fd8fa5caab904a03f5a5c0d9c90e190 Mon Sep 17 00:00:00 2001 From: Mrityunjay Raj Date: Mon, 22 Jun 2026 17:34:11 +0530 Subject: [PATCH 3/4] repository: compact_pack takes keep and drop ids, owns the index update Rename replace_pack to compact_pack; it now repoints kept and deletes dropped index entries, asserting keep + drop tile the whole pack. refs #8572 #8514 --- src/borg/repository.py | 85 ++++++++++++++++----------- src/borg/testsuite/repository_test.py | 72 ++++++++++++----------- 2 files changed, 90 insertions(+), 67 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 109e96380a..fa708a88b9 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -810,51 +810,68 @@ def delete(self, id): raise self.ObjectNotFound(id, str(self._location)) logger.warning("ignoring deletion of %s in %s", bin_to_hex(id), bin_to_hex(entry.pack_id)) - def replace_pack(self, old_pack_id, keep_ids): - """Rewrite pack to keep only the objects in , then delete the old pack. - - Returns the new pack_id, None when is empty (the pack is dropped, not replaced), - or when the kept objects reproduce the old pack exactly (nothing changes on disk). - The kept objects are copied into the new pack via store.defrag and their chunk index entries are - updated to point at it. The caller must delete the index entries of the dropped objects -- - replace_pack only ever sees the survivors. - - Only the in-memory chunk index is updated here. The caller runs under an exclusive lock and owns - index durability: it must invalidate any cached on-disk index before calling, so an interrupted - run rebuilds the index from the actual packs, and write the index back when done, as compact does. + def compact_pack(self, pack_id, keep_ids, drop_ids): + """Rewrite pack , keeping and dropping , then delete the old pack. + + keep_ids and drop_ids must together cover the whole pack (asserted: their ranges tile it with no + gap or overlap). Kept objects are copied into a new pack via store.defrag and repointed in the + chunk index; dropped objects' index entries are removed. + + Returns the new pack_id, None if nothing is kept (pack dropped), or unchanged if the + kept objects reproduce the old pack (same sha256 name, nothing to delete). + + Updates the in-memory chunk index only. The caller holds the exclusive lock and owns index + durability: invalidate the cached index before calling, write it back after, as compact does. """ self._lock_refresh() - old_key = "packs/" + bin_to_hex(old_pack_id) - if not keep_ids: # whole pack unused: drop it, nothing to copy forward - self.store_delete(old_key) - return None + pack_key = "packs/" + bin_to_hex(pack_id) - # look up each survivor's location, ordered by offset so defrag copies the old pack sequentially - survivors = [] + # collect every object's range, tagged with whether it is kept, ordered by offset. + located = [] # (obj_offset, obj_id, obj_size, keep) for keep_id in keep_ids: entry = self.chunks[keep_id] - assert entry.pack_id == old_pack_id, f"{bin_to_hex(keep_id)} is not in pack {bin_to_hex(old_pack_id)}" - survivors.append((entry.obj_offset, keep_id, entry.obj_size)) - survivors.sort() + assert entry.pack_id == pack_id, f"{bin_to_hex(keep_id)} is not in pack {bin_to_hex(pack_id)}" + located.append((entry.obj_offset, keep_id, entry.obj_size, True)) + for drop_id in drop_ids: + entry = self.chunks[drop_id] + assert entry.pack_id == pack_id, f"{bin_to_hex(drop_id)} is not in pack {bin_to_hex(pack_id)}" + located.append((entry.obj_offset, drop_id, entry.obj_size, False)) + located.sort() + + # keep + drop must tile the whole pack; pick out the survivors in the same pass. + survivors = [] # (obj_offset, obj_id, obj_size), offset-ordered + covered = 0 + for offset, obj_id, size, keep in located: + assert offset == covered, f"gap or overlap in pack {bin_to_hex(pack_id)} at offset {covered}" + covered += size + if keep: + survivors.append((offset, obj_id, size)) + assert covered == self.store.info(pack_key).size, f"pack {bin_to_hex(pack_id)} not fully covered" + + for drop_id in drop_ids: # remove dropped objects from the index; their bytes are not copied forward + del self.chunks[drop_id] + + if not survivors: # nothing kept: drop the pack, no replacement + self.store_delete(pack_key) + return None - # defrag concatenates the survivor ranges into a new pack named sha256(content), like PackWriter - sources = [(bin_to_hex(old_pack_id), offset, size) for offset, _, size in survivors] + # copy survivors into a new pack (named sha256 of its content) + sources = [(bin_to_hex(pack_id), offset, size) for offset, _, size in survivors] new_pack_id = hex_to_bin(self.store.defrag(sources, algorithm="sha256", namespace="packs")) - # point the kept objects at the new pack; each new offset is the running sum of the kept sizes - results = [] + # repoint survivors at the new pack; new offset is the running sum of kept sizes + new_locations = [] offset = 0 for _, keep_id, size in survivors: - results.append((keep_id, new_pack_id, offset, size)) + new_locations.append((keep_id, new_pack_id, offset, size)) offset += size - self.chunks.update_pack_info(results) - - # delete the old pack only after the new one is stored and the index repointed, so the kept - # bytes always live in at least one pack on the store and an interrupted run can lose nothing. - # when defrag reproduces the old pack exactly (every object kept, in order) the new name equals - # old_pack_id, and deleting it would drop what we just kept, so skip the delete in that case. - if new_pack_id != old_pack_id: - self.store_delete(old_key) + self.chunks.update_pack_info(new_locations) + + # delete the old pack last, after the new one is stored and indexed, so kept bytes are never the + # only copy. if every object was kept in order, defrag reproduced the pack (new_pack_id == pack_id) + # and deleting it would drop what we kept, so skip. + if new_pack_id != pack_id: + self.store_delete(pack_key) return new_pack_id def break_lock(self): diff --git a/src/borg/testsuite/repository_test.py b/src/borg/testsuite/repository_test.py index 642d168b09..7776fd33eb 100644 --- a/src/borg/testsuite/repository_test.py +++ b/src/borg/testsuite/repository_test.py @@ -142,61 +142,67 @@ def test_consistency(repo_fixtures, request): assert pdchunk(repository.get(H(0))) == b"bar" -def test_replace_pack_copy_forward(repo_fixtures, request): - # Force several objects into one pack (N>1), keep a subset, and check the survivors still read - # back correctly while the removed object and its bytes are gone. - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository._pack_writer.max_count = 3 # buffer 3 objects into one pack - chunk0 = fchunk(b"DATA0", chunk_id=H(0)) - chunk2 = fchunk(b"DATA2", chunk_id=H(2)) - repository.put(H(0), chunk0) - repository.put(H(1), fchunk(b"DATA1", chunk_id=H(1))) - repository.put(H(2), chunk2) # 3rd put reaches max_count and flushes the pack +def build_one_pack(repository, objects): + # Bundle several objects (N>1) into one pack: max_count == object count makes the last put flush the + # whole batch as a single pack. Caller reopens afterwards to read it back from disk. + with repository: + repository._pack_writer.max_count = len(objects) + for chunk_id, chunk in objects: + repository.put(chunk_id, chunk) + + +def test_compact_pack_copy_forward(repo_fixtures, request): + # Keep a subset of a multi-object pack: survivors must read back, the dropped object and its bytes gone. + chunk0 = fchunk(b"DATA0", chunk_id=H(0)) + chunk1 = fchunk(b"DATA1", chunk_id=H(1)) + chunk2 = fchunk(b"DATA2", chunk_id=H(2)) + repository = get_repository_from_fixture(repo_fixtures, request) + build_one_pack(repository, [(H(0), chunk0), (H(1), chunk1), (H(2), chunk2)]) + with reopen(repository) as repository: old_pack_id = repository.chunks[H(0)].pack_id assert repository.chunks[H(1)].pack_id == old_pack_id assert repository.chunks[H(2)].pack_id == old_pack_id - new_pack_id = repository.replace_pack(old_pack_id, [H(0), H(2)]) + new_pack_id = repository.compact_pack(old_pack_id, keep_ids=[H(0), H(2)], drop_ids=[H(1)]) assert new_pack_id is not None and new_pack_id != old_pack_id - # survivors read back intact, proving the repointed pack_id and recomputed offsets assert pdchunk(repository.get(H(0))) == b"DATA0" assert pdchunk(repository.get(H(2))) == b"DATA2" - # the caller drops the removed object's index entry; it is no longer resolvable afterwards - del repository.chunks[H(1)] - assert repository.get(H(1), raise_missing=False) is None - # old pack gone, new pack holds only the two survivors' bytes + assert repository.get(H(1), raise_missing=False) is None # compact_pack removed its index entry packs = {info.name: info.size for info in repository.store_list("packs")} assert bin_to_hex(old_pack_id) not in packs - assert packs[bin_to_hex(new_pack_id)] == len(chunk0) + len(chunk2) + assert packs[bin_to_hex(new_pack_id)] == len(chunk0) + len(chunk2) # only the survivors' bytes -def test_replace_pack_drops_whole_pack(repo_fixtures, request): - # An empty keep list drops the pack outright, without writing a replacement. - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository.put(H(0), fchunk(b"DATA", chunk_id=H(0))) # N=1: one object per pack +def test_compact_pack_drops_whole_pack(repo_fixtures, request): + # Dropping every object removes the pack and clears its index entries. N>1 pack, not the max_count default. + chunk0 = fchunk(b"DATA0", chunk_id=H(0)) + chunk1 = fchunk(b"DATA1", chunk_id=H(1)) + repository = get_repository_from_fixture(repo_fixtures, request) + build_one_pack(repository, [(H(0), chunk0), (H(1), chunk1)]) + with reopen(repository) as repository: old_pack_id = repository.chunks[H(0)].pack_id - assert repository.replace_pack(old_pack_id, []) is None + assert repository.compact_pack(old_pack_id, keep_ids=[], drop_ids=[H(0), H(1)]) is None - del repository.chunks[H(0)] assert repository.get(H(0), raise_missing=False) is None + assert repository.get(H(1), raise_missing=False) is None assert bin_to_hex(old_pack_id) not in [info.name for info in repository.store_list("packs")] -def test_replace_pack_keep_all_is_noop(repo_fixtures, request): - # Keeping every object in stored order reproduces the same pack, so its sha256 name is unchanged - # and the old pack must not be deleted. Passing the ids out of order must give the same result, - # since replace_pack sorts the survivors by offset before copying them forward. - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository._pack_writer.max_count = 2 # buffer 2 objects into one pack - repository.put(H(0), fchunk(b"DATA0", chunk_id=H(0))) - repository.put(H(1), fchunk(b"DATA1", chunk_id=H(1))) # 2nd put reaches max_count and flushes +def test_compact_pack_keep_all_is_noop(repo_fixtures, request): + # Keeping every object reproduces the same pack: same sha256 name, old pack not deleted. Ids passed + # out of order must give the same result, since compact_pack sorts survivors by offset. + chunk0 = fchunk(b"DATA0", chunk_id=H(0)) + chunk1 = fchunk(b"DATA1", chunk_id=H(1)) + repository = get_repository_from_fixture(repo_fixtures, request) + build_one_pack(repository, [(H(0), chunk0), (H(1), chunk1)]) + with reopen(repository) as repository: old_pack_id = repository.chunks[H(0)].pack_id - new_pack_id = repository.replace_pack(old_pack_id, [H(1), H(0)]) # out of order on purpose + new_pack_id = repository.compact_pack(old_pack_id, keep_ids=[H(1), H(0)], drop_ids=[]) # out of order - assert new_pack_id == old_pack_id # reproduced pack keeps its name, the delete was skipped + assert new_pack_id == old_pack_id assert pdchunk(repository.get(H(0))) == b"DATA0" assert pdchunk(repository.get(H(1))) == b"DATA1" assert bin_to_hex(old_pack_id) in [info.name for info in repository.store_list("packs")] From bf36090497a8509ceba8c2e97d5faab48dfeed78 Mon Sep 17 00:00:00 2001 From: Mrityunjay Raj Date: Mon, 22 Jun 2026 19:03:13 +0530 Subject: [PATCH 4/4] repository: compact_pack takes sets, unifies keep/drop loop, renames kept keep_ids/drop_ids are now keyword-only sets; one loop over their union replaces two; survivors renamed to kept; intersection asserted empty. --- src/borg/repository.py | 40 +++++++++++++-------------- src/borg/testsuite/repository_test.py | 23 ++++++++------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index fa708a88b9..512f99672f 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -810,12 +810,13 @@ def delete(self, id): raise self.ObjectNotFound(id, str(self._location)) logger.warning("ignoring deletion of %s in %s", bin_to_hex(id), bin_to_hex(entry.pack_id)) - def compact_pack(self, pack_id, keep_ids, drop_ids): + def compact_pack(self, pack_id, *, keep_ids: set, drop_ids: set): """Rewrite pack , keeping and dropping , then delete the old pack. - keep_ids and drop_ids must together cover the whole pack (asserted: their ranges tile it with no - gap or overlap). Kept objects are copied into a new pack via store.defrag and repointed in the - chunk index; dropped objects' index entries are removed. + keep_ids and drop_ids are sets of chunk ids that must together cover the whole pack (asserted: + their ranges tile it with no gap or overlap, and their intersection is empty). Kept objects are + copied into a new pack via store.defrag and repointed in the chunk index; dropped objects' index + entries are removed. Returns the new pack_id, None if nothing is kept (pack dropped), or unchanged if the kept objects reproduce the old pack (same sha256 name, nothing to delete). @@ -826,43 +827,42 @@ def compact_pack(self, pack_id, keep_ids, drop_ids): self._lock_refresh() pack_key = "packs/" + bin_to_hex(pack_id) + assert keep_ids & drop_ids == set(), "an id cannot appear in both keep_ids and drop_ids" + # collect every object's range, tagged with whether it is kept, ordered by offset. located = [] # (obj_offset, obj_id, obj_size, keep) - for keep_id in keep_ids: - entry = self.chunks[keep_id] - assert entry.pack_id == pack_id, f"{bin_to_hex(keep_id)} is not in pack {bin_to_hex(pack_id)}" - located.append((entry.obj_offset, keep_id, entry.obj_size, True)) - for drop_id in drop_ids: - entry = self.chunks[drop_id] - assert entry.pack_id == pack_id, f"{bin_to_hex(drop_id)} is not in pack {bin_to_hex(pack_id)}" - located.append((entry.obj_offset, drop_id, entry.obj_size, False)) + for obj_id in keep_ids | drop_ids: + keep = obj_id in keep_ids + entry = self.chunks[obj_id] + assert entry.pack_id == pack_id, f"{bin_to_hex(obj_id)} is not in pack {bin_to_hex(pack_id)}" + located.append((entry.obj_offset, obj_id, entry.obj_size, keep)) located.sort() - # keep + drop must tile the whole pack; pick out the survivors in the same pass. - survivors = [] # (obj_offset, obj_id, obj_size), offset-ordered + # keep + drop must tile the whole pack; collect the objects to keep in the same pass. + kept = [] # (obj_offset, obj_id, obj_size), offset-ordered covered = 0 for offset, obj_id, size, keep in located: assert offset == covered, f"gap or overlap in pack {bin_to_hex(pack_id)} at offset {covered}" covered += size if keep: - survivors.append((offset, obj_id, size)) + kept.append((offset, obj_id, size)) assert covered == self.store.info(pack_key).size, f"pack {bin_to_hex(pack_id)} not fully covered" for drop_id in drop_ids: # remove dropped objects from the index; their bytes are not copied forward del self.chunks[drop_id] - if not survivors: # nothing kept: drop the pack, no replacement + if not kept: # nothing kept: drop the pack, no replacement self.store_delete(pack_key) return None - # copy survivors into a new pack (named sha256 of its content) - sources = [(bin_to_hex(pack_id), offset, size) for offset, _, size in survivors] + # copy kept objects into a new pack (named sha256 of its content) + sources = [(bin_to_hex(pack_id), offset, size) for offset, _, size in kept] new_pack_id = hex_to_bin(self.store.defrag(sources, algorithm="sha256", namespace="packs")) - # repoint survivors at the new pack; new offset is the running sum of kept sizes + # repoint kept objects at the new pack; new offset is the running sum of kept sizes new_locations = [] offset = 0 - for _, keep_id, size in survivors: + for _, keep_id, size in kept: new_locations.append((keep_id, new_pack_id, offset, size)) offset += size self.chunks.update_pack_info(new_locations) diff --git a/src/borg/testsuite/repository_test.py b/src/borg/testsuite/repository_test.py index 7776fd33eb..74edb40fb8 100644 --- a/src/borg/testsuite/repository_test.py +++ b/src/borg/testsuite/repository_test.py @@ -143,12 +143,11 @@ def test_consistency(repo_fixtures, request): def build_one_pack(repository, objects): - # Bundle several objects (N>1) into one pack: max_count == object count makes the last put flush the - # whole batch as a single pack. Caller reopens afterwards to read it back from disk. with repository: - repository._pack_writer.max_count = len(objects) + repository._pack_writer.max_count = len(objects) + 1 # prevent per-put flush; one pack on flush() for chunk_id, chunk in objects: repository.put(chunk_id, chunk) + repository.flush() def test_compact_pack_copy_forward(repo_fixtures, request): @@ -158,12 +157,12 @@ def test_compact_pack_copy_forward(repo_fixtures, request): chunk2 = fchunk(b"DATA2", chunk_id=H(2)) repository = get_repository_from_fixture(repo_fixtures, request) build_one_pack(repository, [(H(0), chunk0), (H(1), chunk1), (H(2), chunk2)]) - with reopen(repository) as repository: + with repository: old_pack_id = repository.chunks[H(0)].pack_id assert repository.chunks[H(1)].pack_id == old_pack_id assert repository.chunks[H(2)].pack_id == old_pack_id - new_pack_id = repository.compact_pack(old_pack_id, keep_ids=[H(0), H(2)], drop_ids=[H(1)]) + new_pack_id = repository.compact_pack(old_pack_id, keep_ids={H(0), H(2)}, drop_ids={H(1)}) assert new_pack_id is not None and new_pack_id != old_pack_id assert pdchunk(repository.get(H(0))) == b"DATA0" @@ -171,19 +170,19 @@ def test_compact_pack_copy_forward(repo_fixtures, request): assert repository.get(H(1), raise_missing=False) is None # compact_pack removed its index entry packs = {info.name: info.size for info in repository.store_list("packs")} assert bin_to_hex(old_pack_id) not in packs - assert packs[bin_to_hex(new_pack_id)] == len(chunk0) + len(chunk2) # only the survivors' bytes + assert packs[bin_to_hex(new_pack_id)] == len(chunk0) + len(chunk2) # only the kept objects' bytes def test_compact_pack_drops_whole_pack(repo_fixtures, request): - # Dropping every object removes the pack and clears its index entries. N>1 pack, not the max_count default. + # Dropping every object removes the pack and clears its index entries. chunk0 = fchunk(b"DATA0", chunk_id=H(0)) chunk1 = fchunk(b"DATA1", chunk_id=H(1)) repository = get_repository_from_fixture(repo_fixtures, request) build_one_pack(repository, [(H(0), chunk0), (H(1), chunk1)]) - with reopen(repository) as repository: + with repository: old_pack_id = repository.chunks[H(0)].pack_id - assert repository.compact_pack(old_pack_id, keep_ids=[], drop_ids=[H(0), H(1)]) is None + assert repository.compact_pack(old_pack_id, keep_ids=set(), drop_ids={H(0), H(1)}) is None assert repository.get(H(0), raise_missing=False) is None assert repository.get(H(1), raise_missing=False) is None @@ -192,15 +191,15 @@ def test_compact_pack_drops_whole_pack(repo_fixtures, request): def test_compact_pack_keep_all_is_noop(repo_fixtures, request): # Keeping every object reproduces the same pack: same sha256 name, old pack not deleted. Ids passed - # out of order must give the same result, since compact_pack sorts survivors by offset. + # out of order must give the same result, since compact_pack sorts by offset. chunk0 = fchunk(b"DATA0", chunk_id=H(0)) chunk1 = fchunk(b"DATA1", chunk_id=H(1)) repository = get_repository_from_fixture(repo_fixtures, request) build_one_pack(repository, [(H(0), chunk0), (H(1), chunk1)]) - with reopen(repository) as repository: + with repository: old_pack_id = repository.chunks[H(0)].pack_id - new_pack_id = repository.compact_pack(old_pack_id, keep_ids=[H(1), H(0)], drop_ids=[]) # out of order + new_pack_id = repository.compact_pack(old_pack_id, keep_ids={H(1), H(0)}, drop_ids=set()) # out of order assert new_pack_id == old_pack_id assert pdchunk(repository.get(H(0))) == b"DATA0"