Skip to content

Commit 140dd17

Browse files
Fix SERP snippet parsing
1 parent c26951a commit 140dd17

File tree

2 files changed

+10
-1
lines changed

2 files changed

+10
-1
lines changed

archive_query_log/orm.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -406,6 +406,7 @@ class InnerSerp(InnerDocument):
406406
class Result(BaseDocument):
407407
archive: InnerArchive = Object(InnerArchive)
408408
provider: InnerProvider = Object(InnerProvider)
409+
capture: InnerCapture = Object(InnerCapture)
409410
serp: InnerSerp = Object(InnerSerp)
410411
snippet: Snippet = Object(Snippet)
411412
snippet_parser: InnerParser | None = Object(InnerParser)

archive_query_log/parsers/warc_snippets.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,7 @@ def add_warc_snippets_parser(
7171

7272
def _parse_warc_snippets(
7373
parser: WarcSnippetsParser,
74+
serp_id: str,
7475
capture_url: str,
7576
warc_store: WarcS3Store,
7677
warc_location: WarcLocation,
@@ -121,6 +122,7 @@ def _parse_warc_snippets(
121122
with_tail=True,
122123
)
123124
snippet_id_components = (
125+
serp_id,
124126
parser.id,
125127
str(hash(content)),
126128
str(i),
@@ -175,7 +177,12 @@ def _parse_serp_warc_snippets_action(
175177
for parser in _warc_snippets_parsers(config, serp.provider.id):
176178
# Try to parse the snippets.
177179
warc_snippets = _parse_warc_snippets(
178-
parser, serp.capture.url, config.s3.warc_store, serp.warc_location)
180+
parser=parser,
181+
serp_id=serp.id,
182+
capture_url=serp.capture.url,
183+
warc_store=config.s3.warc_store,
184+
warc_location=serp.warc_location,
185+
)
179186
if warc_snippets is None:
180187
# Parsing was not successful, e.g., URL pattern did not match.
181188
continue
@@ -188,6 +195,7 @@ def _parse_serp_warc_snippets_action(
188195
meta={"id": snippet.id},
189196
archive=serp.archive,
190197
provider=serp.provider,
198+
capture=serp.capture,
191199
serp=InnerSerp(
192200
id=serp.id,
193201
),

0 commit comments

Comments
 (0)