We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 130c6f8, commit 22ffb26 (copy full SHA for 22ffb26)
archive_query_log/parsers/html.py
@@ -0,0 +1,18 @@
1
+from io import BytesIO
2
+from shutil import copyfileobj
3
+from warnings import warn
4
+
5
+from warcio.recordloader import ArcWarcRecord
6
7
8
# Media types whose payload is handed back as markup. "text/xml" is kept so
# records that were accepted before this fix are still accepted.
_MARKUP_MIME_TYPES = frozenset({"text/html", "text/xml"})


def read_html_string(record: ArcWarcRecord) -> str | None:
    """Return the payload of an HTML (or XML) response record as a string.

    The media type is taken from the record's ``Content-Type`` HTTP header;
    parameters such as ``charset`` are ignored and the comparison is
    case-insensitive and whitespace-tolerant.

    :param record: Record whose ``http_headers`` and ``content_stream()``
        are read.
    :return: The payload decoded as UTF-8, or ``None`` when no
        ``Content-Type`` is available (a ``UserWarning`` is emitted) or when
        the media type is neither ``text/html`` nor ``text/xml``.
    :raises UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    # NOTE(review): presumably records without an HTTP header block expose
    # ``http_headers`` as None -- confirm against warcio. They are treated
    # like a missing Content-Type instead of raising AttributeError.
    headers = record.http_headers
    content_type: str | None = (
        headers.get_header("Content-Type") if headers is not None else None
    )
    if content_type is None:
        warn(UserWarning("No MIME type given."))
        return None

    # Drop parameters ("; charset=...") and normalise, because media types
    # are case-insensitive and may be followed by optional whitespace.
    mime_type = content_type.split(";", maxsplit=1)[0].strip().lower()
    if mime_type not in _MARKUP_MIME_TYPES:
        return None

    with BytesIO() as content_buffer:
        copyfileobj(record.content_stream(), content_buffer)
        return content_buffer.getvalue().decode("utf-8")
0 commit comments