Skip to content
This repository has been archived by the owner on May 8, 2024. It is now read-only.

Commit

Permalink
chore: merge branch 'pipeline-refactor' of github.com:welfare-state-a…
Browse files Browse the repository at this point in the history
…nalytics/riksdagen-corpus into pipeline-refactor
  • Loading branch information
ninpnin committed Feb 6, 2024
2 parents 4525787 + 3a0f185 commit ae88210
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 10 deletions.
9 changes: 7 additions & 2 deletions pyriksdagen/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,12 @@ def _alto_extract_paragraphs(altofile):

def convert_alto(filenames, files):
"""
Download protocol from betalab, convert it to the simple XML 'blocks' schema
Convert a document from ALTO to a list of paragraphs.
Args:
filenames: the names of the ALTO files of one document, as a list of str.
The script assumes zero-padded numbering right before the .xml extension.
files: ALTO XML files as a list of str in corresponding order to the filenames
"""
in_sync = True
paragraphs = []
Expand All @@ -134,7 +139,7 @@ def convert_alto(filenames, files):
page_number = int(re.findall("([0-9]{3,3}).xml", fname)[0])
paragraphs.append(page_number)
if in_sync and page_number != ix:
not_in_sync_warning = f"ALTO page number and page count not in sync ({package_id})"
not_in_sync_warning = f"ALTO page number and page count not in sync ({fname})"
warnings.warn(not_in_sync_warning)
in_sync = False
paragraphs += _alto_extract_paragraphs(altofile)
Expand Down
34 changes: 26 additions & 8 deletions scripts/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pandas as pd
import argparse

from pyriksdagen.download import dl_kb_blocks, LazyArchive, count_pages
from pyriksdagen.download import dl_kb_blocks, LazyArchive, count_pages, convert_alto
from pyriksdagen.export import dict_to_parlaclarin
from pyriksdagen.utils import infer_metadata

Expand All @@ -16,14 +16,26 @@
from pathlib import Path
import progressbar

def fetch_local_package(pgk_path, package):
    """
    Read the files of one locally stored package.

    Args:
        pgk_path: path of the directory that contains the package folders.
        package: name of the package folder inside ``pgk_path``.

    Returns:
        A tuple ``(filenames, files)``. ``filenames`` is the list of entry
        names in the package folder, sorted by name. ``files`` is a generator
        that lazily yields the text content of each entry, in the same order
        as ``filenames``.
    """
    package_dir = f"{pgk_path}/{package}"
    # os.listdir returns entries in arbitrary order; sort so that zero-padded
    # page files are handed on in ascending page order.
    filenames = sorted(os.listdir(package_dir))

    def files():
        for fname in filenames:
            # Decode explicitly instead of relying on the platform's locale
            # (assumes the stored pages are UTF-8 encoded XML).
            with open(f"{package_dir}/{fname}", "r", encoding="utf-8") as f:
                yield f.read()

    return filenames, files()

def main(args):
if args.protocol_ids is not None:
package_ids = args.protocol_ids
if args.local_alto is not None:
package_ids = args.local_alto
archive = None
else:
df = count_pages(args.start, args.end)
print(df)
package_ids = list(df["protocol_id"])
archive = LazyArchive()
if args.protocol_ids is not None:
package_ids = args.protocol_ids
else:
df = count_pages(args.start, args.end)
print(df)
package_ids = list(df["protocol_id"])
archive = LazyArchive()
for package_id in progressbar.progressbar(list(package_ids)):
data = infer_metadata(package_id)
print("metadata", data)
Expand All @@ -35,7 +47,11 @@ def main(args):
data["licence"] = "Licence: Attribution 4.0 International (CC BY 4.0)"
data["licence_url"] = "https://creativecommons.org/licenses/by/4.0/"

paragraphs = dl_kb_blocks(package_id, archive)
if archive:
paragraphs = dl_kb_blocks(package_id, archive)
else:
filenames, files = fetch_local_package(args.alto_path, package_id)
paragraphs = convert_alto(filenames, files)
print()
print(paragraphs[0])
data["edition"] = args.edition
Expand All @@ -51,5 +67,7 @@ def main(args):
parser.add_argument("--authority", type=str, default="SWERIK Project, 2023-2027")
parser.add_argument("--edition", type=str, required=True)
parser.add_argument("--protocol_ids", type=str, nargs="+", default=None)
parser.add_argument("--local-alto", type=str, nargs="+", default=None, help="Locally stored alto package (folder=protocol name, contents=pages.")
parser.add_argument("--alto-path", type=str, help="Path to `--local-alto` directories")
args = parser.parse_args()
main(args)

0 comments on commit ae88210

Please sign in to comment.