diff --git a/ChangeLog b/CHANGELOG.md
similarity index 97%
rename from ChangeLog
rename to CHANGELOG.md
index aefe0d6..6fc5119 100644
--- a/ChangeLog
+++ b/CHANGELOG.md
@@ -8,15 +8,22 @@ as of 2.0.0.
 
 ## [Unreleased]
 
+### Changed
+
+- Insert as few rsync URLs as possible in DB when a book selection is made (#220)
+
 ## [2.1.1] - 2024-01-17
 
 ### Added
+
 - `Publisher` ZIM metadata can now be customized at CLI (#210)
 
 ### Changed
+
 - `Publisher` ZIM metadata default value is changed to `openZIM` intead of `Kiwix` (#210)
 
 ### Fixed
+
 - Do not fail if temporary directory already exists (#207)
 - Typo in `Scraper` ZIM metadata (#212)
 - Adapt to hatchling v1.19.0 which mandates packages setting (#211)
@@ -35,11 +42,13 @@ as of 2.0.0.
 - Removed inline Javascript in HTML files (#145)
 
 ### Fixed
+
 - Support single quotes in author names (#162)
 - Migrated to another Gutenberg server (#187)
 - Removed useless file languages_06_2018 (#180)
 
 ### Removed
+
 - Removed Datatables JS code from repository, fetch online now (#116)
 - Dropped Python 2 support (#191)
 
diff --git a/src/gutenberg2zim/entrypoint.py b/src/gutenberg2zim/entrypoint.py
index ebda3e8..4ee145f 100755
--- a/src/gutenberg2zim/entrypoint.py
+++ b/src/gutenberg2zim/entrypoint.py
@@ -178,7 +178,7 @@ def f(x):
     logger.info(f"PARSING rdf-files in {rdf_path}")
     parse_and_fill(rdf_path=rdf_path, only_books=books)
     logger.info("Add possible url to db")
-    setup_urls(force=force)
+    setup_urls(force=force, books=books)
 
     if do_download:
         logger.info("DOWNLOADING ebooks from mirror using filters")
@@ -190,9 +190,9 @@ def f(x):
             only_books=books,
             force=force,
             s3_storage=s3_storage,
-            optimizer_version=optimizer_version
-            if not use_any_optimized_version
-            else None,
+            optimizer_version=(
+                optimizer_version if not use_any_optimized_version else None
+            ),
         )
     if one_lang_one_zim_folder:
         if languages == []:
diff --git a/src/gutenberg2zim/urls.py b/src/gutenberg2zim/urls.py
index deada12..94817e8 100644
--- a/src/gutenberg2zim/urls.py
+++ b/src/gutenberg2zim/urls.py
@@ -8,7 +8,6 @@
 
 
 class UrlBuilder:
-
     """
     Url builder for the files of a Gutenberg book.
     Example:
@@ -227,7 +226,7 @@ def build_html(files):
     return list(set(urls))
 
 
-def setup_urls(force):
+def setup_urls(force, books):
     file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}")
 
     if file_with_url.exists() and not force:
@@ -261,10 +260,34 @@ def setup_urls(force):
         qry.execute()
 
     logger.info("\tAppending urls in DB from rsync result")
-    # strip rsync file to only contain relative path
+    count_dir = count_old = count_added = count_processed = 0
     with open(file_with_url, errors="replace") as src:
         for line in src.readlines():
+            count_processed += 1
+            # show progress in debug mode, we expect about 5.4M lines as of early 2024
+            if count_processed % 100000 == 0:
+                logger.debug(f"\t{count_processed} rsync results processed")
+            # ignore all directory entries
+            if line.startswith("d"):
+                count_dir += 1
+                continue
+            # ignore all entries in an /old/ subfolder
+            if "/old/" in line:
+                count_old += 1
+                continue
+            # take into account the book selection which might have been passed ;
+            # this does not completely filter out useless URLs for books IDs 1 to 9
+            # but still makes the scraper way faster for all other selections
+            if books:
+                if not any(f"/{book}/" in line for book in books):
+                    continue
+            # strip rsync file to only contain relative path
             Url.create(url=line[start_rel_path_idx:].strip())  # type: ignore
+            count_added += 1
+    logger.info(
+        f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, "
+        f"{count_old} old stuff ignored, {count_processed} lines processed)"
+    )
 
 
 if __name__ == "__main__":