Skip to content

Commit 14ddc9a

Browse files
committed
removed "import_warc_direct_answers_parsers" from CLI
1 parent 9e62837 commit 14ddc9a

File tree

2 files changed

+0
-79
lines changed

2 files changed

+0
-79
lines changed

archive_query_log/cli/parsers.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -439,11 +439,6 @@ def warc_direct_answers_add(
439439
dir_okay=False, readable=True, resolve_path=True,
440440
allow_dash=False),
441441
default=Path("data") / "selected-services.yaml")
442-
@pass_config
443-
def warc_direct_answers_import(config: Config, services_path: Path) -> None:
444-
from archive_query_log.imports.yaml import import_warc_direct_answers_parsers
445-
WarcDirectAnswersParser.init(using=config.es.client)
446-
import_warc_direct_answers_parsers(config, services_path)
447442

448443

449444
@parsers.group()

archive_query_log/imports/yaml.py

Lines changed: 0 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from archive_query_log.parsers.url_query import add_url_query_parser
2020
from archive_query_log.parsers.warc_query import add_warc_query_parser
2121
from archive_query_log.parsers.warc_snippets import add_warc_snippets_parser
22-
from archive_query_log.parsers.warc_direct_answers import add_warc_direct_answers_parser
2322
from archive_query_log.parsers.xml import xpaths_from_css_selector, \
2423
text_xpath, merge_xpaths
2524
from archive_query_log.providers import add_provider
@@ -480,76 +479,3 @@ def import_warc_snippets_parsers(config: Config, services_path: Path) -> None:
480479
title_xpath=title_xpath,
481480
text_xpath=snippet_xpath,
482481
)
483-
484-
485-
def import_warc_direct_answers_parsers(config: Config, services_path: Path) -> None:
    """Register WARC direct answers parsers described in a services YAML file.

    Each service definition that has both a ``domains`` and a
    ``results_parsers`` entry is processed. Its ``html_selector`` results
    parsers are translated from CSS selectors to XPaths, and one parser is
    registered via ``add_warc_direct_answers_parser`` for every provider
    whose ``domains`` match the service, in the order the parsers are listed.
    Earlier parsers get a higher priority (``len(results_parsers) - index``).

    :param config: Configuration providing the Elasticsearch client
        (``config.es.client``); also forwarded to
        ``add_warc_direct_answers_parser``.
    :param services_path: Path to the YAML file holding the list of service
        definitions.
    """
    echo("Load providers from services file.")
    with services_path.open("r") as file:
        # ``safe_load`` returns ``None`` for an empty document; treat that as
        # "no services" instead of failing on ``len(None)`` below.
        services_list: Sequence[dict] = safe_load(file) or []
    echo(f"Found {len(services_list)} service definitions.")

    services: Iterable[dict] = services_list
    # noinspection PyTypeChecker
    services = tqdm(
        services,
        desc="Import parsers for providers",
        unit="provider",
    )
    for service in services:
        if "domains" not in service or "results_parsers" not in service:
            continue

        results_parsers = service["results_parsers"]
        num_results_parsers = len(results_parsers)

        # The XPaths depend only on the service definition, not on the
        # provider, so translate them once per service rather than once per
        # matching provider.
        parser_specs = []
        for k, results_parser in enumerate(results_parsers):
            if results_parser["type"] != "html_selector":
                continue
            results_xpath, url_xpath, direct_answer_xpath = \
                _direct_answers_xpaths(results_parser)
            parser_specs.append((
                # Priority is derived from the position among *all* results
                # parsers, including the skipped non-HTML ones.
                num_results_parsers - k,
                results_parser.get("url_pattern"),
                results_xpath,
                url_xpath,
                direct_answer_xpath,
            ))

        if not parser_specs:
            # Nothing to register, so there is no need to query providers.
            continue

        providers = (
            Provider.search(using=config.es.client)
            .query(Terms(domains=service["domains"]))
            .scan()
        )
        providers = safe_iter_scan(providers)
        for provider in providers:
            for (
                priority,
                url_pattern,
                results_xpath,
                url_xpath,
                direct_answer_xpath,
            ) in parser_specs:
                add_warc_direct_answers_parser(
                    config=config,
                    provider_id=provider.meta.id,
                    url_pattern_regex=url_pattern,
                    priority=priority,
                    parser_type="xpath",
                    xpath=results_xpath,
                    url_xpath=url_xpath,
                    text_xpath=direct_answer_xpath,
                )


def _direct_answers_xpaths(results_parser: dict) -> tuple:
    """Translate the CSS selectors of one results parser into merged XPaths.

    :param results_parser: A single ``html_selector`` results parser
        definition. ``results_selector`` is required, while ``url_selector``
        and ``direct_answer_selector`` are optional.
    :return: Tuple ``(results_xpath, url_xpath, direct_answer_xpath)``; the
        latter two are ``None`` when the respective selector is absent.
    """
    # Prefix with "//" so that results are matched anywhere in the document.
    results_xpath = merge_xpaths([
        "//" + result_xpath
        for result_xpath in xpaths_from_css_selector(
            results_parser["results_selector"])
    ])

    url_selector = results_parser.get("url_selector")
    if url_selector is not None:
        url_xpath = merge_xpaths([
            text_xpath(xpath, attribute="href")
            for xpath in xpaths_from_css_selector(url_selector)
        ])
    else:
        url_xpath = None

    direct_answer_selector = results_parser.get("direct_answer_selector")
    if direct_answer_selector is not None:
        direct_answer_xpath = merge_xpaths([
            text_xpath(xpath, text=True)
            for xpath in xpaths_from_css_selector(direct_answer_selector)
        ])
    else:
        direct_answer_xpath = None

    return results_xpath, url_xpath, direct_answer_xpath

0 commit comments

Comments
 (0)