Commit af8aa20

REF,TST: tox.ini, setup.py, tests/: pytest tests with fixtures
1 parent b14c9d7 commit af8aa20

8 files changed: +267 −9 lines changed


.gitignore

+3
@@ -43,3 +43,6 @@ docs/_build
 
 # UndoTree
 *.un~
+
+# pytest-xprocess
+.xprocess/

requirements-dev.txt

+9 −1

@@ -1,5 +1,13 @@
+
+pytest
+pytest-cov
+pytest-xprocess
+flake8
+
 #sarge
--e hg+https://bitbucket.org/vinay.sajip/sarge#egg=sarge
+-e git+https://bitbucket.org/vinay.sajip/sarge#egg=sarge
 
 #structlog
 -e git+https://github.com/hynek/structlog#egg=structlog
+
+matplotlib

setup.py

+7

@@ -21,6 +21,13 @@
 requires = [
     "sarge",
     "structlog",
+    "requests",
+    "beautifulsoup4",
+    "URLObject",
+    "NLTK",
+    "textblob",
+    "networkx",
+    "pydot",
 ]
 
 setup(

tests/test_crawl.py

+224 (new file)

#!/usr/bin/env python

import itertools
import socket
import subprocess
import sys
from collections import Counter
from pathlib import Path

import bs4
import requests
import pytest
from xprocess import ProcessStarter

from wrdrd.tools.crawl import (
    crawl_url,
    wrdcrawler,
    expand_link,
    same_netloc,
    strip_fragment,
    extract_links,
    build_networkx_graph,
    write_nxgraph_to_dot,
    write_nxgraph_to_json,
    sum_counters,
    tokenize,
    iteritems,
    StringIO,
    word_frequencies,
)


# @pytest.fixture(scope="session")
# def find_free_port():
#     with socket.socket() as s:
#         s.bind(("localhost", 0))
#         port = s.getsockname()[1]
#     return port


@pytest.fixture(scope="session")
def myserver(xprocess):
    # Serve docs/_build/html with http.server in an xprocess-managed process
    dirname = Path(__file__).parent.parent / "docs" / "_build" / "html"
    port = 40025  # find_free_port

    class Starter(ProcessStarter):
        pattern = "^Serving HTTP on"
        args = [
            sys.executable,
            "-m",
            "http.server",
            "--directory",
            str(dirname),
            "-b",
            "localhost",
            str(port),  # argv entries must be strings
        ]

    url = f"http://localhost:{port}/"
    print(("setup", dirname, port, url))
    logfile = xprocess.ensure("myserver", Starter)
    yield logfile, url
    process = xprocess.getinfo("myserver")
    process.kill()
    subprocess.call(["kill", "-s", "SIGQUIT", str(process.pid)])
    print(("teardown", dirname, port, url))


def test_tokenize():
    input_ = "d'yer mak'er is a great song, don't you think?"
    expected_output = [
        "d'yer",
        "mak'er",
        "is",
        "a",
        "great",
        "song",
        ",",
        "do",
        "n't",
        "you",
        "think",
        "?",
    ]
    output = list(tokenize(input_))
    assert output == expected_output  # (input_, output, expected_output)


def test_word_frequencies():
    url = "./"
    keywords = ["cat", "dog", "mouse", "mouse", "whale", "dog"]
    output = word_frequencies(url, keywords)
    assert output.url == url
    assert output.frequencies == Counter(
        {"dog": 2, "mouse": 2, "cat": 1, "whale": 1}
    )


def test_crawl_url(myserver):
    START_URL = myserver[1]
    output = StringIO.StringIO()
    crawled = crawl_url(START_URL, output=output)
    assert crawled


def test_wrdcrawler(myserver):
    START_URL = myserver[1]
    output = StringIO.StringIO()
    output = wrdcrawler(START_URL, output=output)
    output.seek(0)
    print(output.read())
    assert output

    # print(pformat(keyword_counts))


def test_expand_link():
    test_data = (
        (
            ("http://localhost/index.html", "About.html"),
            "http://localhost/About.html",
        ),
        (
            ("http://localhost:8080#Test", "About.html"),
            "http://localhost:8080/About.html",
        ),
        (
            ("http://localhost?query", "About.html#Test"),
            "http://localhost/About.html#Test",
        ),
    )
    for input_, expected_output in test_data:
        output = expand_link(*input_)
        assert output == expected_output


def test_strip_fragment():
    test_data = (
        ("http://localhost/#test", "http://localhost/"),
        ("http://localhost:8080?query#Test", "http://localhost:8080?query"),
        ("http://localhost?query", "http://localhost?query"),
    )
    for input_, expected_output in test_data:
        output = strip_fragment(input_)
        assert output == expected_output


def test_same_netloc():
    test_data = (
        (("http://localhost/index.html", "http://localhost/"), True),
        (("http://localhost:8080#Test", "http://localhost/"), False),
        (("http://localhost:8080#Test", "http://localhost"), False),
    )
    for input_, expected_output in test_data:
        output = same_netloc(*input_)
        try:
            assert output == expected_output
        except Exception:
            print(input_)
            raise


def test_sum_counters():
    c1 = {"a": 2, "b": 1, "c": 3}
    c2 = {"a": 1, "b": 2, "d": 3}
    csum = sum_counters([c1, c2])
    for k, v in iteritems(csum):
        assert v == 3  # k


def test_other(myserver):
    url = myserver[1]
    resp = requests.get(url)
    bs = bs4.BeautifulSoup(resp.content)
    links = list(extract_links(url, bs))
    for key, group in itertools.groupby(links, lambda x: x.parent_id):
        print("## %s" % key)
        print(list(group))


def test_build_networkx_graph(myserver, tmpdir):
    url = myserver[1]
    output = sys.stdout
    resp = requests.get(url)
    bs = bs4.BeautifulSoup(resp.content)
    links = list(extract_links(url, bs))
    g = build_networkx_graph(url, links)  # , output=output)
    assert len(g)

    output = StringIO.StringIO()
    write_nxgraph_to_dot(g, output)
    output.seek(0)
    print(output.read())
    output.seek(0)
    assert output.read()
    output.seek(0)
    dotpath = tmpdir / "nxgraph_dot.dot"
    with open(dotpath, "w") as f:
        f.write(output.read())
    dotcontent = dotpath.read_text(encoding="utf8")
    assert dotcontent
    assert dotcontent.startswith("strict digraph")

    output = StringIO.StringIO()
    write_nxgraph_to_json(g, output)
    output.seek(0)
    print(output.read())
    output.seek(0)
    assert output.read()
    output.seek(0)
    forcejsonpath = tmpdir / "nxgraph_force.json"
    with open(forcejsonpath, "w") as f:
        f.write(output.read())

    import matplotlib.pyplot as plt
    import networkx

    networkx.draw_circular(g)
    svgpath = tmpdir / "nxgraph_draw_circular.svg"
    plt.savefig(svgpath)
    svgcontents = svgpath.read_text(encoding="utf8")
    assert svgcontents
    assert "<svg" in svgcontents
    # plt.show()
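
The myserver fixture pins port 40025 rather than using the commented-out find_free_port idea above it. A minimal sketch of how such a fixture could work (not part of this commit; free_port is a hypothetical name): bind port 0 so the OS assigns a free port, then close the socket before returning, which leaves a small race window before http.server re-binds the port.

import socket

import pytest


@pytest.fixture(scope="session")
def free_port():
    # Hypothetical helper, not part of this commit: bind port 0 so the
    # OS assigns an unused port, then record it.
    with socket.socket() as s:
        s.bind(("localhost", 0))
        port = s.getsockname()[1]
    # The socket is closed here, so the port is free again -- modulo a
    # small race window before the server process re-binds it.
    return port

The fixture would then be requested alongside xprocess, e.g. def myserver(xprocess, free_port), in place of the hard-coded literal.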

tests/test_domain.py

Whitespace-only changes.

tests/test_stripsinglehtml.py

Whitespace-only changes.

tox.ini

+20 −4

@@ -1,9 +1,25 @@
 [tox]
-envlist = py26, py27, py33
+envlist = py37, docs, py27
 
 [testenv]
 setenv =
-    PYTHONPATH = {toxinidir}:{toxinidir}/wrdsbc
-commands = python setup.py test
+    ; PYTHONPATH = {toxinidir}:{toxinidir}/wrdrd
+    PYTHONWARNINGS = all,ignore::ImportWarning:pkgutil,ignore::ImportWarning:importlib._bootstrap,ignore::ImportWarning:importlib._bootstrap_external,ignore::ImportWarning:pytest_cov.plugin,ignore::DeprecationWarning:site,ignore::DeprecationWarning:_pytest.assertion.rewrite,ignore::DeprecationWarning:_pytest.fixtures,ignore::DeprecationWarning:distutils
+commands = pytest -v --cov=wrdrd {posargs} ./tests
 deps =
-    -r{toxinidir}/requirements.txt
+    pytest
+    pytest-cov
+    pytest-xprocess
+    matplotlib
+    -r requirements.txt
+
+[testenv:docs]
+basepython = python3
+description =
+    Build documentation.
+extras =
+    docs
+deps =
+    -r requirements-docs.txt
+commands =
+    python setup.py build_sphinx {posargs}
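
Two details in the new [testenv] are worth noting. The {posargs} placeholder forwards anything after -- on the tox command line into pytest (for example, tox -e py37 -- -k test_tokenize). The PYTHONWARNINGS value is a comma-separated list of action::category:module warning filters; roughly, its first entries correspond to the following warnings-module calls (an illustrative sketch, not code from this commit):

import warnings

# "all" resolves to the "always" action: show every warning
warnings.simplefilter("always")
# "ignore::ImportWarning:pkgutil" silences ImportWarning raised from pkgutil
warnings.filterwarnings("ignore", category=ImportWarning, module="pkgutil")
# "ignore::DeprecationWarning:site" likewise, for DeprecationWarning from site
warnings.filterwarnings("ignore", category=DeprecationWarning, module="site")

In practice the interpreter applies these filters itself at startup; the sketch only spells out what the filter strings mean.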

wrdrd/tools/crawl.py

+4 −4

@@ -217,7 +217,7 @@ def get_stop_words():
     STOP_WORDS.pop("about")
 
 
-# get_stop_words()
+get_stop_words()
 
 
 def get_text_from_bs(bs):
@@ -268,7 +268,7 @@ def extract_words_from_bs(bs):
 KeywordFrequency = namedtuple("KeywordFrequency", ("url", "frequencies"))
 
 
-def word_frequencies(url, keywords):
+def word_frequencies(url, keywords, stopwords=STOP_WORDS):
     """
     Get frequencies (counts) for a set of (non-stopword) keywords
 
@@ -280,7 +280,7 @@ def word_frequencies(url, keywords):
     """
     words = (x.lower() for x in keywords)
     return KeywordFrequency(
-        url, Counter(w for w in words if len(w) > 1 and w not in STOP_WORDS)
+        url, Counter(w for w in words if len(w) > 1 and w not in stopwords)
     )
 
 
@@ -705,7 +705,7 @@ def write_nxgraph_to_dot(g, output):
     """
     import networkx
 
-    return networkx.drawing.write_dot(g, output)
+    return networkx.drawing.nx_pydot.write_dot(g, output)
 
 
 def write_nxgraph_to_json(g, output):
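
These hunks make the crawler testable in isolation: the stop-word set becomes an injectable stopwords parameter (defaulting to the module-level STOP_WORDS, which get_stop_words() now populates at import time), and write_nxgraph_to_dot goes through networkx.drawing.nx_pydot, where newer networkx releases keep write_dot (hence the pydot entry added to setup.py above). A minimal illustration of the new parameter (hypothetical values, assuming the API shown in the diff):

from wrdrd.tools.crawl import word_frequencies

# Keywords are lowercased and filtered against the injected stopword set.
kf = word_frequencies("./", ["Cat", "the", "cat"], stopwords={"the"})
assert kf.url == "./"
assert kf.frequencies == {"cat": 2}  # "the" dropped as a stopword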
