|
19 | 19 | from archive_query_log.parsers.url_query import add_url_query_parser
|
20 | 20 | from archive_query_log.parsers.warc_query import add_warc_query_parser
|
21 | 21 | from archive_query_log.parsers.warc_snippets import add_warc_snippets_parser
|
22 |
| -from archive_query_log.parsers.warc_direct_answers import add_warc_direct_answers_parser |
23 | 22 | from archive_query_log.parsers.xml import xpaths_from_css_selector, \
|
24 | 23 | text_xpath, merge_xpaths
|
25 | 24 | from archive_query_log.providers import add_provider
|
@@ -480,76 +479,3 @@ def import_warc_snippets_parsers(config: Config, services_path: Path) -> None:
|
480 | 479 | title_xpath=title_xpath,
|
481 | 480 | text_xpath=snippet_xpath,
|
482 | 481 | )
|
483 |
| - |
484 |
| - |
485 |
def import_warc_direct_answers_parsers(
        config: Config,
        services_path: Path,
) -> None:
    """Import WARC direct answers parsers from a services definition file.

    The file at ``services_path`` is loaded with ``safe_load`` and is
    expected to yield a sequence of service dictionaries. Services that
    lack either the ``"domains"`` or the ``"results_parsers"`` key are
    skipped. For every remaining service, the providers whose domains
    match the service's domains are looked up, and for each such provider
    one parser is registered per results parser of type
    ``"html_selector"`` via ``add_warc_direct_answers_parser``.

    Parsers listed earlier in ``"results_parsers"`` receive a numerically
    larger ``priority`` (number of results parsers minus list index).

    :param config: Configuration; its ``es.client`` is used for the
        provider search and the object is passed on when adding parsers.
    :param services_path: Path of the services file to read.
    """
    echo("Load providers from services file.")
    with services_path.open("r") as file:
        services_list: Sequence[dict] = safe_load(file)
    echo(f"Found {len(services_list)} service definitions.")

    # noinspection PyTypeChecker
    services_progress: Iterable[dict] = tqdm(
        services_list,
        desc="Import parsers for providers",
        unit="provider",
    )
    for service in services_progress:
        # Only services with both domains and results parsers are usable.
        if not ("domains" in service and "results_parsers" in service):
            continue

        parser_configs = service["results_parsers"]
        parser_count = len(parser_configs)

        provider_search = Provider.search(using=config.es.client)
        provider_search = provider_search.query(
            Terms(domains=service["domains"])
        )
        for provider in safe_iter_scan(provider_search.scan()):
            for index, parser_config in enumerate(parser_configs):
                if parser_config["type"] != "html_selector":
                    continue

                results_selector = parser_config["results_selector"]
                url_selector = parser_config.get("url_selector")
                answer_selector = parser_config.get(
                    "direct_answer_selector"
                )

                # Anchor each result XPath at any depth of the document.
                results_xpath = merge_xpaths([
                    "//" + xpath
                    for xpath in xpaths_from_css_selector(results_selector)
                ])

                # The URL is read from the `href` attribute, if selected.
                url_xpath = None
                if url_selector is not None:
                    url_xpath = merge_xpaths([
                        text_xpath(xpath, attribute="href")
                        for xpath in xpaths_from_css_selector(url_selector)
                    ])

                # The direct answer is read from the text, if selected.
                answer_xpath = None
                if answer_selector is not None:
                    answer_xpath = merge_xpaths([
                        text_xpath(xpath, text=True)
                        for xpath in xpaths_from_css_selector(
                            answer_selector
                        )
                    ])

                add_warc_direct_answers_parser(
                    config=config,
                    provider_id=provider.meta.id,
                    url_pattern_regex=parser_config.get("url_pattern"),
                    priority=parser_count - index,
                    parser_type="xpath",
                    xpath=results_xpath,
                    url_xpath=url_xpath,
                    text_xpath=answer_xpath,
                )
0 commit comments