Skip to content

Commit 22ffb26

Browse files
Add HTML utils
1 parent 130c6f8 commit 22ffb26

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

archive_query_log/parsers/html.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from io import BytesIO
2+
from shutil import copyfileobj
3+
from warnings import warn
4+
5+
from warcio.recordloader import ArcWarcRecord
6+
7+
8+
def read_html_string(record: ArcWarcRecord) -> str | None:
    """Read the payload of an HTML response record as a string.

    The record's ``Content-Type`` header decides whether the payload is
    read at all. Header parameters (e.g. ``; charset=...``) are ignored
    and the payload is always decoded as UTF-8.

    :param record: Archived record whose HTTP headers and content stream
        are inspected.
    :return: The decoded payload, or ``None`` if no ``Content-Type``
        header is available (a ``UserWarning`` is issued in that case)
        or if the MIME type is not an HTML type.
    :raises UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    # Records without parsed HTTP headers cannot declare a MIME type;
    # treat them like a missing header instead of failing on ``None``.
    headers = record.http_headers
    mime_type: str | None = (
        headers.get_header("Content-Type") if headers is not None else None
    )
    if mime_type is None:
        warn(UserWarning("No MIME type given."))
        return None
    # Drop parameters such as "; charset=utf-8". Media types are
    # case-insensitive and may be surrounded by optional whitespace.
    mime_type = mime_type.split(";", maxsplit=1)[0].strip().lower()
    # This helper reads HTML, so accept the HTML media types (the previous
    # check for "text/xml" rejected every actual HTML response).
    if mime_type not in ("text/html", "application/xhtml+xml"):
        return None
    with BytesIO() as content_buffer:
        copyfileobj(record.content_stream(), content_buffer)
        return content_buffer.getvalue().decode("utf-8")

0 commit comments

Comments
 (0)