Commit

Merge branch 'hotfix/2407_generate_sitemap' into develop
RK206 committed Jan 30, 2025
2 parents d351309 + 8dc4a3c commit c1d3e5c
Showing 3 changed files with 61 additions and 31 deletions.
18 changes: 9 additions & 9 deletions doajtest/unit/test_bll_site_sitemap.py
@@ -125,11 +125,11 @@ def test_sitemap(self, name, kwargs):
         articles_expectations = [(a.id, a.last_updated) for a in articles]
 
         if prune:
-            self.localStore.store(self.container_id, "sitemap_doaj_20180101_0000_utf8.xml",
+            self.localStore.store(self.container_id, "sitemap_doaj_20180101_0000/_0_utf8.xml",
                                   source_stream=StringIO("test1"))
-            self.localStore.store(self.container_id, "sitemap_doaj_20180601_0000_utf8.xml",
+            self.localStore.store(self.container_id, "sitemap_doaj_20180601_0000/_0_utf8.xml",
                                   source_stream=StringIO("test2"))
-            self.localStore.store(self.container_id, "sitemap_doaj_20190101_0000_utf8.xml",
+            self.localStore.store(self.container_id, "sitemap_doaj_20190101_0000/_0_utf8.xml",
                                   source_stream=StringIO("test3"))
 
         ###########################################################
@@ -153,22 +153,22 @@ def test_sitemap(self, name, kwargs):
         filenames = self.localStore.list(self.container_id)
         if prune:
             assert len(filenames) == 2, "expected 2, received {}".format(len(filenames))
-            assert "sitemap_doaj_20180101_0000_utf8.xml" not in filenames
-            assert "sitemap_doaj_20180601_0000_utf8.xml" not in filenames
-            assert "sitemap_doaj_20190101_0000_utf8.xml" in filenames
+            assert "sitemap_doaj_20180101_0000" not in filenames
+            assert "sitemap_doaj_20180601_0000" not in filenames
+            assert "sitemap_doaj_20190101_0000" in filenames
         else:
             assert len(filenames) == 1, "expected 1, received {}".format(len(filenames))
 
         latest = None
         for fn in filenames:
-            if fn != "sitemap_doaj_20190101_0000_utf8.xml":
+            if fn != "sitemap_doaj_20190101_0000":
                 latest = fn
                 break
 
         NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
 
         file_date = '_'.join(latest.split('_')[2:])
-        index_file = os.path.join(latest, 'sitemap_index_doaj_' + file_date + '_utf8.xml')
+        index_file = os.path.join(latest, 'sitemap_index_utf8.xml')
 
         handle = self.localStore.get(self.container_id, index_file, encoding="utf-8")
 
@@ -184,7 +184,7 @@ def test_sitemap(self, name, kwargs):
         article_ids = []
 
         # check sitemap file
-        sitemap_file = os.path.join(latest, 'sitemap_doaj_' + file_date + '_0_utf8.xml')
+        sitemap_file = os.path.join(latest, '_0_utf8.xml')
         handle = self.localStore.get(self.container_id, sitemap_file, encoding="utf-8")
 
         tree = etree.parse(handle)
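Note on the layout this test now expects: each sitemap run gets its own directory named after the run prefix, with a fixed-name index file and numbered sitemap files inside it. A minimal sketch of the paths involved (the timestamp is illustrative, and the final assertions assume a POSIX path separator):

    import os

    # Illustrative run prefix; the real one is 'sitemap_doaj_' + run start time.
    run_prefix = "sitemap_doaj_20190101_0000"

    # The index file now has a fixed name inside the run directory...
    index_file = os.path.join(run_prefix, "sitemap_index_utf8.xml")
    # ...and each numbered sitemap file sits alongside it.
    sitemap_file = os.path.join(run_prefix, "_0_utf8.xml")

    assert index_file == "sitemap_doaj_20190101_0000/sitemap_index_utf8.xml"
    assert sitemap_file == "sitemap_doaj_20190101_0000/_0_utf8.xml"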
23 changes: 11 additions & 12 deletions portality/bll/services/site.py
@@ -28,7 +28,6 @@
 MAX_FILE_SIZE = (49 * 1024 * 1024)
 MAX_URL_COUNT = 49000
 
-
 class SitemapGenerator:
 
     def __init__(self, filename_prefix, temp_store, main_store, container_id):
@@ -61,7 +60,7 @@ def write_url_element(self, loc, lastmod=None):
         self.file.write(url_ele)
 
     def create_sitemap_file(self):
-        self.current_filename = f'{self.filename_prefix}_{self.file_idx}_utf8.xml'
+        self.current_filename = os.path.join(self.filename_prefix, f'_{self.file_idx}_utf8.xml')
         self.current_file_path = os.path.join(self.temp_store, self.current_filename)
         self.file = open(self.current_file_path, "w")
         self.file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
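For context, create_sitemap_file now opens a numbered file inside the run directory rather than a flat timestamped file. A minimal sketch of how the numbered files could accumulate, assuming the generator rolls over when the URL count reaches MAX_URL_COUNT (the committed rollover condition may also consider MAX_FILE_SIZE; this helper and its names are hypothetical):

    def write_url(generator, loc, lastmod=None):
        # Assumed rollover: close the current file and open the next numbered
        # one, producing <prefix>/_0_utf8.xml, <prefix>/_1_utf8.xml, ...
        if generator.get_url_count() >= MAX_URL_COUNT:
            generator.finalize_sitemap_file()
            generator.file_idx += 1   # assumed counter behind the _<idx> name
            generator.create_sitemap_file()
        generator.write_url_element(loc, lastmod=lastmod)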
@@ -87,7 +86,6 @@ def get_url_count(self):
     def get_sitemap_files(self):
         return self.sitemap_files
 
-
 class SiteService(object):
 
     @staticmethod
@@ -110,7 +108,7 @@ def sitemap(prune: bool = True):
         lastmod_date = dates.now_str(FMT_DATETIME_STD)
 
         filename_prefix = 'sitemap_doaj_' + run_start_time
-        cache_container_id = app.config.get("STORE_CACHE_CONTAINER")
+        container_id = app.config.get("STORE_CACHE_CONTAINER")
 
         total_static_pages = 0
         total_journals_count = 0
@@ -121,11 +119,11 @@ def sitemap(prune: bool = True):
         mainStore = StoreFactory.get("cache")
 
         # temporary directory
-        tmp_store_dir = tmpStore.path(cache_container_id, '', create_container=True)
+        tmp_store_dir = tmpStore.path(container_id, '', create_container=True)
         # Create the directories if they don't exist
         os.makedirs(tmp_store_dir, exist_ok=True)
+        os.makedirs(os.path.join(tmp_store_dir, filename_prefix), exist_ok=True)
 
-        sitemap_generator = SitemapGenerator(filename_prefix, tmp_store_dir, mainStore, cache_container_id)
+        sitemap_generator = SitemapGenerator(filename_prefix, tmp_store_dir, mainStore, container_id)
 
         # Generating URLs for static pages
         _entries = nav.get_nav_entries()
@@ -159,7 +157,7 @@ def sitemap(prune: bool = True):
         sitemap_generator.finalize_sitemap_file()
 
         # Create sitemap index file
-        sitemap_index_filename = f'sitemap_index_doaj_{run_start_time}_utf8.xml'
+        sitemap_index_filename = os.path.join(filename_prefix, f'sitemap_index_utf8.xml')
         sitemap_index_path = os.path.join(tmp_store_dir, sitemap_index_filename)
         with open(sitemap_index_path, "w") as f:
             f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
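The writes that follow fill this file with one <sitemap> entry per generated sitemap file. Going by the sitemaps.org 0.9 namespace the tests check, the finished index plausibly looks like the following; the host and dates are invented for illustration:

    <?xml version="1.0" encoding="UTF-8"?>
    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <sitemap>
        <loc>https://example.org/sitemap_doaj_20250130_0000/_0_utf8.xml</loc>
        <lastmod>2025-01-30T00:00:00Z</lastmod>
      </sitemap>
    </sitemapindex>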
@@ -185,8 +183,9 @@ def sitemap(prune: bool = True):
                     break
                 sitemap_count += 1
 
-        mainStore.store(cache_container_id, sitemap_index_filename, source_path=sitemap_index_path)
-        index_url = mainStore.url(cache_container_id, sitemap_index_filename)
+
+        mainStore.store(container_id, sitemap_index_filename, source_path=sitemap_index_path)
+        index_url = mainStore.url(container_id, sitemap_index_filename)
 
         action_register.append("Sitemap index written to store with url {x}".format(x=index_url))
 
@@ -205,8 +204,8 @@ def sort(filelist):
         def _filter(filename):
             return filename.startswith("sitemap_")
 
-        action_register += prune_container(mainStore, cache_container_id, sort, filter=_filter, keep=2)
-        action_register += prune_container(tmpStore, cache_container_id, sort, filter=_filter, keep=2)
+        action_register += prune_container(mainStore, container_id, sort, filter=_filter, keep=2, is_directory=True)
+        action_register += prune_container(tmpStore, container_id, sort, filter=_filter, keep=2)
 
         # Update the cache record to point to the new sitemap index and all sitemaps
         models.Cache.cache_sitemap(index_url)
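The two prune calls now diverge: mainStore may be S3-backed, where a run 'directory' is just a shared key prefix, so it is pruned with is_directory=True to drop whole runs at once, while the local tmpStore keeps the old file-by-file behaviour. A hypothetical illustration of what each store's listing might look like to prune_container (names invented):

    main_store_listing = [   # S3: keys carry the run prefix as a virtual directory
        "sitemap_doaj_20250129_0000/sitemap_index_utf8.xml",
        "sitemap_doaj_20250129_0000/_0_utf8.xml",
        "sitemap_doaj_20250130_0000/sitemap_index_utf8.xml",
        "sitemap_doaj_20250130_0000/_0_utf8.xml",
    ]
    tmp_store_listing = [    # local store: each run directory is a real directory entry
        "sitemap_doaj_20250129_0000",
        "sitemap_doaj_20250130_0000",
    ]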
51 changes: 41 additions & 10 deletions portality/store.py
@@ -76,6 +76,7 @@ class StoreS3(Store):
     ~~!FileStoreS3:Feature->S3:Technology~~
     """
     def __init__(self, scope):
+        self.dir = None
         self._cfg = app.config.get("STORE_S3_SCOPES", {}).get(scope)
         multipart_threshold = app.config.get("STORE_S3_MULTIPART_THRESHOLD", 5 * 1024**3)
@@ -211,9 +212,10 @@ def __init__(self, scope):
     def store(self, container_id, target_name, source_path=None, source_stream=None):
         cpath = os.path.join(self.dir, container_id)
-        if not os.path.exists(cpath):
-            os.makedirs(cpath)
         tpath = os.path.join(cpath, target_name)
+        directory = os.path.dirname(tpath)
+        if not os.path.exists(directory):
+            os.makedirs(directory)
 
         if source_path:
             shutil.copyfile(source_path, tpath)
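Creating the dirname of the final target path, rather than just the container directory, lets target names carry a run-directory component. A small usage sketch of the local store's behaviour (root, container, and names invented):

    import os

    storage_root = "/tmp/doaj-store"
    container_id = "doaj-data-cache"
    target_name = "sitemap_doaj_20250130_0000/_0_utf8.xml"

    tpath = os.path.join(storage_root, container_id, target_name)
    directory = os.path.dirname(tpath)     # ends with .../sitemap_doaj_20250130_0000
    os.makedirs(directory, exist_ok=True)  # nested run directory created on demand
    with open(tpath, "w") as f:
        f.write("<urlset/>")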
@@ -292,10 +294,11 @@ def list_container_ids(self):
         return [x for x in os.listdir(self.dir) if os.path.isdir(os.path.join(self.dir, x))]
 
 
-def prune_container(storage, container_id, sort, filter=None, keep=1, logger=None):
+def prune_container(storage, container_id, sort, filter=None, keep=1, logger=None, is_directory=False):
     logger = logger if logger is not None else lambda x: x
     action_register = []
 
+    dir_list = []
     filelist = storage.list(container_id)
     #action_register.append("Current cached files (before prune): " + ", ".join(filelist))
 
@@ -309,19 +312,47 @@ def prune_container(storage, container_id, sort, filter=None, keep=1, logger=None):
         filtered = filelist
     #action_register.append("Filtered cached files (before prune): " + ", ".join(filelist))
 
-    if len(filtered) <= keep:
-        # action_register.append("Fewer than {x} files in cache, no further action".format(x=keep))
-        return action_register
+    # Treat directories differently.
+    # S3 buckets do not have physical directories under the bucket; they are virtual.
+    # Collect the directory prefixes and delete all files belonging to them.
+    if is_directory:
+        for fn in filtered:
+            dir = os.path.dirname(fn)
+            if dir:
+                dir_list.append(dir)
+            else:
+                if storage.dir:
+                    if os.path.isdir(os.path.join(storage.dir, container_id, fn)):
+                        dir_list.append(fn)
+
+    dir_set = set(dir_list)
+
+    if is_directory:
+        if len(dir_set) <= keep:
+            return action_register
+    else:
+        if len(filtered) <= keep:
+            # action_register.append("Fewer than {x} files in cache, no further action".format(x=keep))
+            return action_register
 
-    filtered_sorted = sort(filtered)
-    #action_register.append("Considering files for retention in the following order: " + ", ".join(filtered_sorted))
+    if is_directory:
+        filtered_sorted = sort(dir_set)
+    else:
+        filtered_sorted = sort(filtered)
+    #action_register.append("Considering files for retention in the following order: " + ", ".join(filtered_sorted))
 
     remove = filtered_sorted[keep:]
     msg = "Removed old files: " + ", ".join(remove)
     action_register.append(msg)
     logger(msg)
 
-    for fn in remove:
-        storage.delete_file(container_id, fn)
+    if is_directory:
+        for fn in remove:
+            for file in filtered:
+                if file.startswith(fn):
+                    storage.delete_file(container_id, file)
+    else:
+        for fn in remove:
+            storage.delete_file(container_id, fn)
 
     return action_register
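A worked example of the new directory-aware path, runnable standalone with invented filenames; it mirrors the is_directory branch above without touching a real storage backend:

    import os

    keep = 2
    filtered = [
        "sitemap_doaj_20180101_0000/_0_utf8.xml",
        "sitemap_doaj_20180601_0000/_0_utf8.xml",
        "sitemap_doaj_20190101_0000/sitemap_index_utf8.xml",
        "sitemap_doaj_20190101_0000/_0_utf8.xml",
    ]

    # Group files by their virtual directory, as the is_directory branch does.
    dir_set = {os.path.dirname(fn) for fn in filtered if os.path.dirname(fn)}

    # Newest-first ordering stands in for the caller-supplied sort().
    filtered_sorted = sorted(dir_set, reverse=True)
    remove = filtered_sorted[keep:]   # ['sitemap_doaj_20180101_0000']

    # Every file under a pruned directory is deleted.
    doomed = [f for f in filtered if any(f.startswith(d) for d in remove)]
    print(doomed)                     # ['sitemap_doaj_20180101_0000/_0_utf8.xml']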
