Skip to content

Commit 218ae44

Browse files
authored
Merge pull request #45 from webis-de/direct-answers
Direct answers
2 parents cf8300e + 14ddc9a commit 218ae44

File tree

4 files changed

+366
-2
lines changed

4 files changed

+366
-2
lines changed

archive_query_log/cli/parsers.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
UrlQueryParser, UrlPageParserType, UrlPageParser, \
1010
UrlOffsetParser, UrlOffsetParserType, WarcQueryParserType, \
1111
WarcQueryParser, WarcSnippetsParserType, WarcSnippetsParser, \
12+
WarcDirectAnswersParserType, WarcDirectAnswersParser, \
1213
WarcMainContentParserType, WarcMainContentParser
1314

1415

@@ -380,6 +381,66 @@ def warc_snippets_import(config: Config, services_path: Path) -> None:
380381
import_warc_snippets_parsers(config, services_path)
381382

382383

# Registers an otherwise empty command group on `parsers`; the commands for
# WARC direct answers parsers hang off this group.
# NOTE(review): kept as a comment rather than a docstring because the CLI
# framework presumably shows docstrings as help text -- confirm before
# converting.
@parsers.group()
def warc_direct_answers() -> None:
    return None
387+
388+
# Values accepted by the `--parser-type` option of the `add` command.
CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE = ["xpath"]
392+
393+
# NOTE(review): `min=0, min_open=False` admits a priority of exactly 0, while
# the ORM declares `priority` as a `RankFeature` field -- confirm that the
# index accepts zero for such fields.
@warc_direct_answers.command("add")
@option("--provider-id", type=str)
@option("--url-pattern-regex", type=str)
@option("--priority", type=FloatRange(min=0, min_open=False))
@option(
    "--parser-type",
    type=Choice(CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE),
    required=True,
)
@option("--xpath", type=str)
@option("--url-xpath", type=str)
@option("--text-xpath", type=str)
@pass_config
def warc_direct_answers_add(
    config: Config,
    provider_id: str | None,
    url_pattern_regex: str | None,
    priority: float | None,
    parser_type: str,
    xpath: str | None,
    url_xpath: str | None,
    text_xpath: str | None,
) -> None:
    # Validates the options, initialises the parser document class against
    # the configured Elasticsearch client, then hands every option over to
    # `add_warc_direct_answers_parser`.
    #
    # Raises ValueError for a parser type other than "xpath" and UsageError
    # when the "xpath" parser type is chosen without `--xpath`.
    #
    # NOTE(review): documented with comments on purpose; a docstring would
    # presumably end up in the command's help output -- confirm.

    # Imported inside the callback, exactly as the original does.
    from archive_query_log.parsers.warc_direct_answers import (
        add_warc_direct_answers_parser,
    )

    # Guard clauses: reject unknown parser types first, then incomplete
    # XPath configurations.
    if parser_type != "xpath":
        raise ValueError(f"Invalid parser type: {parser_type}")
    if xpath is None:
        raise UsageError("No XPath given.")
    parser_type_strict: WarcDirectAnswersParserType = "xpath"

    WarcDirectAnswersParser.init(using=config.es.client)
    add_warc_direct_answers_parser(
        config=config,
        provider_id=provider_id,
        url_pattern_regex=url_pattern_regex,
        priority=priority,
        parser_type=parser_type_strict,
        xpath=xpath,
        url_xpath=url_xpath,
        text_xpath=text_xpath,
    )
434+
435+
@warc_direct_answers.command("import")
@option(
    "-s", "--services-file", "services_path",
    type=PathType(path_type=Path, exists=True, file_okay=True,
                  dir_okay=False, readable=True, resolve_path=True,
                  allow_dash=False),
    default=Path("data") / "selected-services.yaml",
)
@pass_config
def warc_direct_answers_import(config: Config, services_path: Path) -> None:
    # The decorator stack above previously had no function of its own, so it
    # attached to the next definition in the file (the `warc_main_content`
    # group) and registered that as `warc_direct_answers import`. Giving the
    # stack its own callback keeps the two commands separate.
    #
    # Parameters:
    #   config: configuration object injected by `pass_config`.
    #   services_path: path to the services YAML file; must exist and be a
    #       readable file (enforced by the option type above).
    #
    # Raises UsageError unconditionally: no importer for direct answers
    # parsers is visible from here.
    # TODO(review): wire this up the same way `warc_snippets_import` calls
    # `import_warc_snippets_parsers(config, services_path)` once the
    # corresponding importer for direct answers parsers exists.
    raise UsageError(
        "Importing WARC direct answers parsers is not supported yet."
    )
442+
443+
# Registers an otherwise empty command group on `parsers`; the callback
# itself does nothing.
# NOTE(review): kept as a comment rather than a docstring because the CLI
# framework presumably shows docstrings as help text -- confirm before
# converting.
@parsers.group()
def warc_main_content() -> None:
    return None

archive_query_log/namespaces.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,6 @@
1515
NAMESPACE_AQL, "warc_snippets_parser")
1616
NAMESPACE_WARC_MAIN_CONTENT_PARSER = uuid5(
1717
NAMESPACE_AQL, "warc_main_content_parser")
18-
NAMESPACE_WARC_DIRECT_ANSWER_PARSER = uuid5(
19-
NAMESPACE_AQL, "warc_direct_answer_parser")
18+
NAMESPACE_WARC_DIRECT_ANSWERS_PARSER = uuid5(
19+
NAMESPACE_AQL, "warc_direct_answers_parser")
2020
NAMESPACE_WARC_DOWNLOADER = uuid5(NAMESPACE_AQL, "warc_downloader")

archive_query_log/orm.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,16 @@ class Snippet(SnippetId):
190190
text: str | None = Text()
191191

192192

class DirectAnswerId(InnerDocument):
    """Inner document that carries only the identifier of a direct answer."""

    # Identifier, mapped as a keyword field.
    id: str = Keyword()
195+
196+
class DirectAnswer(DirectAnswerId):
    """Direct answer: the inherited ``id`` plus content, URL and text."""

    # Required text field.
    # NOTE(review): how `content` differs from `text` is not visible here.
    content: str = Text()
    # Optional URL, mapped as a keyword field.
    url: str | None = Keyword()
    # Optional text field.
    text: str | None = Text()
201+
202+
193203
class Serp(BaseDocument):
194204
archive: InnerArchive = Object(InnerArchive)
195205
provider: InnerProvider = Object(InnerProvider)
@@ -208,6 +218,8 @@ class Serp(BaseDocument):
208218
warc_query_parser: InnerParser | None = Object(InnerParser)
209219
warc_snippets: list[SnippetId] | None = Nested(SnippetId)
210220
warc_snippets_parser: InnerParser | None = Object(InnerParser)
221+
warc_direct_answers: list[DirectAnswerId] | None = Nested(DirectAnswerId)
222+
warc_direct_answers_parser: InnerParser | None = Object(InnerParser)
211223

212224
# rendered_warc_location: WarcLocation | None = Object(WarcLocation)
213225
# rendered_warc_downloader: InnerDownloader | None = (
@@ -437,6 +449,34 @@ class Index:
437449
}
438450

439451

# Closed set of parser type names; "xpath" is the only member.
WarcDirectAnswersParserType = Literal["xpath"]
455+
456+
class WarcDirectAnswersParser(BaseDocument):
    """Stored configuration of one parser for WARC direct answers.

    Documents of this class live in the ``aql_warc_direct_answers_parsers``
    index (see the nested ``Index`` class).
    """

    # Optional reference to the provider this parser is tied to.
    provider: InnerProviderId | None = Object(InnerProviderId)
    # Optional regular expression source; consumed by `url_pattern`.
    url_pattern_regex: str | None = Keyword()
    # Optional priority, stored as a rank feature with positive score impact.
    priority: float | None = RankFeature(positive_score_impact=True)
    # Parser kind; "xpath" is the only value the type allows.
    parser_type: WarcDirectAnswersParserType = Keyword()
    # XPath settings of the parser.
    xpath: str | None = Keyword()
    url_xpath: str | None = Keyword()
    text_xpath: str | None = Keyword()

    @cached_property
    def url_pattern(self) -> Pattern | None:
        """Return ``pattern(url_pattern_regex)``.

        Raises:
            ValueError: If ``url_pattern_regex`` is ``None``.
        """
        # NOTE(review): the annotation allows ``None``, yet this method
        # never returns it -- a missing regex raises instead. Confirm which
        # of the two is intended.
        regex = self.url_pattern_regex
        if regex is not None:
            return pattern(regex)
        raise ValueError("No URL pattern regex.")

    class Index:
        name = "aql_warc_direct_answers_parsers"
        settings = {"number_of_shards": 1, "number_of_replicas": 2}
478+
479+
440480
WarcMainContentParserType = Literal[
441481
"resiliparse",
442482
]

0 commit comments

Comments
 (0)