Skip to content

Commit

Permalink
Implement basic inclusion of images, JavaScript, and CSS
Browse files Browse the repository at this point in the history
This is kind of an MVP? It really needs a lot more functionality to make it
more usable, but I'm not mad at it for an afternoon's work.
  • Loading branch information
jimwins committed Mar 7, 2024
1 parent c7bb361 commit f369229
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 7 deletions.
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,23 @@ pip install frozen-soup
```
## Usage

Usage instructions go here.
```bash
python -mfrozen_soup https://www.example.com
```

```python
import frozen_soup

output = frozen_soup.freeze_to_string("https://www.example.com")
```

## Development

To contribute to this library, first check out the code. Then create a new virtual environment:
```bash
cd frozen-soup
python -m venv venv
source venv/bin/activate
python -m venv .venv
source .venv/bin/activate
```
Now install the dependencies and test dependencies:
```bash
Expand Down
70 changes: 68 additions & 2 deletions frozen_soup/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,75 @@

import requests

def freeze_to_string(url: str, session: Optional[requests.Session]) -> str:
from bs4 import BeautifulSoup

from urllib.parse import urljoin
import base64

def _get_ref_as_dataurl(base_url: str, ref_url: str, session: requests.Session) -> str:
    """Fetch a referenced resource and return it as a base64 ``data:`` URL.

    Args:
        base_url: URL of the document that contains the reference; used to
            resolve ``ref_url`` when it is relative.
        ref_url: The reference as written in the document (absolute or
            relative).
        session: Session used to perform the GET request.

    Returns:
        A string of the form ``data:<content-type>;base64,<payload>``.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Local imports: only needed for the missing-Content-Type fallback below.
    import mimetypes
    from urllib.parse import urlsplit

    url = urljoin(base_url, ref_url)
    response = session.get(url)
    if response.status_code != 200:
        raise Exception(
            f"Unable to generate base64-encoded value for {url} "
            f"(HTTP {response.status_code})"
        )

    # Grab the content-type from the response headers. When the server did
    # not send one, guess from the URL path (query string excluded) and fall
    # back to a generic binary type rather than emitting "data:None;...".
    content_type = response.headers.get('Content-Type')
    if not content_type:
        guessed_type, _ = mimetypes.guess_type(urlsplit(url).path)
        content_type = guessed_type or 'application/octet-stream'

    # Encode the content to base64 - don't use urlsafe_b64encode because
    # the browser won't be passing this on to a server, and they don't
    # expect the 'URL-safe' substitutions
    encoded_content = base64.b64encode(response.content).decode("utf-8")

    return f"data:{content_type};base64,{encoded_content}"

def _is_fetchable(ref: Optional[str]) -> bool:
    """Return True when ``ref`` is non-empty and not already a ``data:`` URL."""
    return bool(ref) and not ref.strip().lower().startswith('data:')


def freeze_to_string(
    url: str,
    session: Optional[requests.Session] = None,
    formatter: str = 'html5'
) -> str:
    """Fetch ``url`` and return its HTML with external resources inlined.

    Images and ``rel="icon"`` links are rewritten to ``data:`` URLs,
    ``rel="stylesheet"`` links are replaced by ``<style>`` elements holding
    the fetched CSS, and ``<script src>`` elements get the fetched code as
    their content. Elements without a reference, or whose reference is
    already a ``data:`` URL, are left untouched.

    Args:
        url: Address of the HTML document to freeze.
        session: Session used for every request; a new ``requests.Session``
            is created when omitted.
        formatter: Passed through to ``soup.decode(formatter=...)`` to
            control how the result is serialized.

    Returns:
        The serialized document.

    Raises:
        Exception: If a referenced image, icon, stylesheet, or script does
            not come back with status 200.
    """
    if session is None:
        session = requests.Session()

    # NOTE: the status of the main document is not checked; whatever body
    # the server returned is parsed as-is.
    r = session.get(url)

    soup = BeautifulSoup(r.text, 'html.parser')

    # Inline images
    for img in soup.find_all('img'):
        # .get() avoids a KeyError on <img> elements that have no src
        if _is_fetchable(img.get('src')):
            img['src'] = _get_ref_as_dataurl(url, img['src'], session)

    # Handle <link> elements
    for link in soup.find_all('link'):
        # Capture href once so both branches resolve the original reference
        href = link.get('href')
        if not _is_fetchable(href):
            continue
        rels = link.get_attribute_list('rel')

        # Inline rel="icon"
        if 'icon' in rels:
            link['href'] = _get_ref_as_dataurl(url, href, session)

        # Turn rel="stylesheet" into <style>
        if 'stylesheet' in rels:
            response = session.get(urljoin(url, href))
            if response.status_code != 200:
                raise Exception(f"Unable to replace style {href}")
            style = soup.new_tag('style')
            style.string = response.text
            # Carry over media=""
            if link.get('media'):
                style['media'] = link['media']
            # TODO anything else?
            # TODO should replace url() in CSS with data URLs
            link.replace_with(style)

    # Inline <script src="">
    for script in soup.find_all('script'):
        src = script.get('src')
        if not _is_fetchable(src):
            continue
        response = session.get(urljoin(url, src))
        if response.status_code != 200:
            raise Exception(f"Unable to replace script contents {src}")
        script.string = response.text
        # TODO what other attributes do we care about?
        # TODO parse/rewrite JavaScript to handle `import`?
        del script['src']

    # Serialize with the formatter chosen by the caller
    return soup.decode(formatter=formatter)
36 changes: 36 additions & 0 deletions frozen_soup/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import sys
import os

import argparse

from . import freeze_to_string

# Based on what pip does, but try to be brief and not hardcode our name
def get_prog() -> str:
    """Return the program name to show in argparse usage/help output.

    When started through ``__main__.py`` or ``-c`` the result is
    ``"<interpreter as typed> -m <package>"``; otherwise it is the basename
    of ``sys.argv[0]``. If either cannot be determined, the bare top-level
    package name is returned.
    """
    package, _, _ = __loader__.name.partition(".")
    try:
        invoked_as = os.path.basename(sys.argv[0])
        if invoked_as not in ("__main__.py", "-c"):
            return invoked_as
        # go back to orig_argv[0] to get what the user used
        return f"{sys.orig_argv[0]} -m {package}"
    except (AttributeError, TypeError, IndexError):
        return package

def main() -> int:
    """Parse the command line, freeze the given URL, and print the result.

    Returns:
        0, intended for use as the process exit status.
    """
    cli = argparse.ArgumentParser(
        prog=get_prog(),
        description='Create a single-file version of an HTML file',
    )
    cli.add_argument('url')
    options = cli.parse_args()

    frozen = freeze_to_string(options.url)
    print(frozen)

    return 0

# Entry point for direct execution: main()'s return value becomes the
# process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
Binary file added tests/test_data/1x1.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
68 changes: 66 additions & 2 deletions tests/test_frozen_soup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest

import requests
from requests_testadapter import TestAdapter, TestSession

Expand All @@ -7,11 +8,74 @@
@pytest.fixture
def session() -> requests.Session:
    """Return a TestSession with canned responses mounted under http://test/."""
    # Local import keeps the fixture self-contained.
    from pathlib import Path

    # Resolve the GIF relative to this file so the tests do not depend on the
    # current working directory; read_bytes() also closes the file handle.
    gif_bytes = (Path(__file__).parent / "test_data" / "1x1.gif").read_bytes()

    s = TestSession()
    s.mount('http://test/simple-string', TestAdapter(b'Alive!'))

    # Images: one valid GIF, one 404, and pages referencing each
    s.mount(
        "http://test/1x1.gif",
        TestAdapter(
            stream=gif_bytes,
            headers={
                'Content-type': 'image/gif',
            },
        ),
    )
    s.mount("http://test/bad.gif", TestAdapter(b'BAD', status=404))
    s.mount("http://test/html-one-image", TestAdapter(b'<img src="/1x1.gif">'))
    s.mount("http://test/html-bad-image", TestAdapter(b'<img src="/bad.gif">'))

    # Stylesheet and the <link> pages that use it / the icon
    s.mount(
        "http://test/style.css",
        TestAdapter(b'* { color: white }', headers={'Content-type': 'text/css'})
    )

    s.mount("http://test/html-link-icon", TestAdapter(b'<link rel="icon" href="1x1.gif">'))
    s.mount("http://test/html-link-style", TestAdapter(b'<link rel="stylesheet" href="style.css">'))

    # External script and the page that references it
    s.mount(
        "http://test/code.js",
        TestAdapter(b'/* Code! */', headers={'Content-type': 'application/javascript'})
    )

    s.mount("http://test/html-script", TestAdapter(b'<script src="code.js"></script>'))

    return s


def test_freeze_to_string(session):
    """A plain-text body with nothing to inline is returned unchanged."""
    out = freeze_to_string('http://test/simple-string', session)

    # The fixture serves b'Alive!' for this URL; the old "Mock!" expectation
    # contradicted it and could never hold alongside this assertion.
    assert out == 'Alive!'


@pytest.fixture
def data_url():
    """Return the expected data: URL for the 1x1.gif test image."""
    expected_gif_url = "data:image/gif;base64,R0lGODlhAQABAIAAAMRUFwAAACH5BAAAAAAALAAAAAABAAEAAAICRAEAOw=="
    return expected_gif_url

def test_single_image(session, data_url):
    """An <img> pointing at a reachable file gets a data: URL as its src."""
    expected = f'<img src="{data_url}">'

    assert freeze_to_string('http://test/html-one-image', session) == expected

def test_single_image_as_xhtml(session, data_url):
    """With formatter='html' the void <img> element is serialized self-closed."""
    expected = f'<img src="{data_url}"/>'
    frozen = freeze_to_string('http://test/html-one-image', session, formatter='html')

    assert frozen == expected

def test_bad_image(session):
    """An <img> whose source returns 404 makes freezing raise."""
    with pytest.raises(Exception):
        # Result intentionally discarded: only the raised exception matters.
        freeze_to_string('http://test/html-bad-image', session)

def test_link_icon(session, data_url):
    """A rel="icon" link has its href replaced by a data: URL."""
    expected = f'<link href="{data_url}" rel="icon">'

    assert freeze_to_string('http://test/html-link-icon', session) == expected

def test_link_style(session):
    """A rel="stylesheet" link is replaced by a <style> holding the CSS."""
    expected = '<style>* { color: white }</style>'
    frozen = freeze_to_string('http://test/html-link-style', session)

    assert frozen == expected

def test_script(session):
    """A <script src> is rewritten to an inline script with the fetched code."""
    out = freeze_to_string('http://test/html-script', session)

    assert out == '<script>/* Code! */</script>'

0 comments on commit f369229

Please sign in to comment.