Skip to content

Commit

Permalink
Do not insert all RSYNC paths in database
Browse files (browse the repository at this point in the history)
  • Loading branch information
benoit74 committed Mar 5, 2024
1 parent 59ebd30 commit 525b854
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 8 deletions.
6 changes: 4 additions & 2 deletions src/gutenberg2zim/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,9 @@ def download_book(
]
bfso = bfs
bfs = bfs.filter(BookFormat.pattern << patterns)
pp([(bf.mime, bf.images, bf.pattern) for bf in bfs]) # noqa: T203
pp([(bf.mime, bf.images, bf.pattern) for bf in bfso]) # noqa: T203
if not bfs.count():
pp([(bf.mime, bf.images, bf.pattern) for bf in bfs]) # noqa: T203
pp([(bf.mime, bf.images, bf.pattern) for bf in bfso]) # noqa: T203
logger.error("html not found")
unsuccessful_formats.append(book_format)
continue
Expand All @@ -214,9 +214,11 @@ def download_book(
urls = [bf.downloaded_from]
else:
urld = get_urls(book)
logger.debug(f"urld: {urld}")

Check warning on line 217 in src/gutenberg2zim/download.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/download.py#L217

Added line #L217 was not covered by tests
urls = list(
reversed(urld.get(FORMAT_MATRIX.get(book_format))) # type: ignore
)
logger.debug(f"urls: {urls}")

Check warning on line 221 in src/gutenberg2zim/download.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/download.py#L221

Added line #L221 was not covered by tests

import copy

Expand Down
8 changes: 4 additions & 4 deletions src/gutenberg2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def f(x):
logger.info(f"PARSING rdf-files in {rdf_path}")
parse_and_fill(rdf_path=rdf_path, only_books=books)
logger.info("Add possible url to db")
setup_urls(force=force)
setup_urls(force=force, books=books)

Check warning on line 181 in src/gutenberg2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/entrypoint.py#L181

Added line #L181 was not covered by tests

if do_download:
logger.info("DOWNLOADING ebooks from mirror using filters")
Expand All @@ -190,9 +190,9 @@ def f(x):
only_books=books,
force=force,
s3_storage=s3_storage,
optimizer_version=optimizer_version
if not use_any_optimized_version
else None,
optimizer_version=(
optimizer_version if not use_any_optimized_version else None
),
)
if one_lang_one_zim_folder:
if languages == []:
Expand Down
36 changes: 34 additions & 2 deletions src/gutenberg2zim/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@


class UrlBuilder:

"""
Url builder for the files of a Gutenberg book.
Example:
Expand Down Expand Up @@ -82,6 +81,7 @@ def f(x):
if f(x) in FORMAT_MATRIX.values()
]
files = sort_by_mime_type(available_formats)
logger.debug(f"files: {files}")

Check warning on line 84 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L84

Added line #L84 was not covered by tests
return build_urls(files)


Expand All @@ -108,6 +108,7 @@ def build_urls(files):
for i in mapping:
if i in files:
possible_url = mapping[i](files[i])
logger.debug(f"possible_url: {possible_url}")

Check warning on line 111 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L111

Added line #L111 was not covered by tests
filtre = [
u
for u in possible_url
Expand Down Expand Up @@ -227,7 +228,7 @@ def build_html(files):
return list(set(urls))


def setup_urls(force):
def setup_urls(force, books):

Check warning on line 231 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L231

Added line #L231 was not covered by tests
file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}")

if file_with_url.exists() and not force:
Expand Down Expand Up @@ -262,9 +263,40 @@ def setup_urls(force):

logger.info("\tAppending urls in DB from rsync result")
# strip rsync file to only contain relative path
# ignore all directory entries
# ignore all entries in an /old/ subfolder
# take into account the book selection which might have been passed
# display statistics after operation
count_dir = 0
count_old = 0
count_added = 0
count_processed = 0

Check warning on line 273 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L270-L273

Added lines #L270 - L273 were not covered by tests
with open(file_with_url, errors="replace") as src:
if count_processed and count_processed % 100000 == 0:
# show progress in debug mode; about 5.4M lines are expected as of early 2024
logger.debug(f"\t{count_processed} lines processed")

Check warning on line 277 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L277

Added line #L277 was not covered by tests
for line in src.readlines():
count_processed += 1

Check warning on line 279 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L279

Added line #L279 was not covered by tests
if line.startswith("d"):
count_dir += 1
continue

Check warning on line 282 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L281-L282

Added lines #L281 - L282 were not covered by tests
if "/old/" in line:
count_old += 1
continue

Check warning on line 285 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L284-L285

Added lines #L284 - L285 were not covered by tests
if books:
found = False

Check warning on line 287 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L287

Added line #L287 was not covered by tests
for book in books:
if f"/{book}/" in line:
found = True
break

Check warning on line 291 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L290-L291

Added lines #L290 - L291 were not covered by tests
if not found:
continue

Check warning on line 293 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L293

Added line #L293 was not covered by tests
Url.create(url=line[start_rel_path_idx:].strip()) # type: ignore
count_added += 1
logger.info(

Check warning on line 296 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L295-L296

Added lines #L295 - L296 were not covered by tests
f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, "

Check notice on line 297 in src/gutenberg2zim/urls.py

View check run for this annotation

codefactor.io / CodeFactor

src/gutenberg2zim/urls.py#L231-L297

Complex Method
f"{count_old} old stuff ignored, {count_processed} lines processed)"
)


if __name__ == "__main__":
Expand Down

0 comments on commit 525b854

Please sign in to comment.