Skip to content

Commit

Permalink
Merge pull request #31 from elsiehupp/fix-bytes-regex
Browse files Browse the repository at this point in the history
Cast XML bytes to str to avoid encoding issues
  • Loading branch information
robkam committed Dec 10, 2022
2 parents 1975127 + efcde30 commit 58baa10
Show file tree
Hide file tree
Showing 20 changed files with 136 additions and 79 deletions.
Binary file modified dist/wikiteam3-3.0.0-py3-none-any.whl
Binary file not shown.
Binary file modified dist/wikiteam3-3.0.0.tar.gz
Binary file not shown.
9 changes: 9 additions & 0 deletions wikiteam3/dumpgenerator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,4 +306,13 @@ def getParameters(params=[]):
print("Which expands to:")
print(" " + config["path"])

if config["delay"] == 0.5:
print("--delay is the default value of 0.5")
print(
"There will be a 0.5 second delay between HTTP calls in order to keep the server from timing you out."
)
print(
"If you know that this is unnecessary, you can manually specify '--delay 0.0'."
)

return config, other
39 changes: 34 additions & 5 deletions wikiteam3/dumpgenerator/delay.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,37 @@
import itertools
import threading
import time
import sys


def delay(config=None, session=None):
    """Add a delay if configured for that.

    Args:
        config: Mapping holding a numeric ``"delay"`` entry, in seconds.
            When omitted, an empty mapping is used, so the ``"delay"``
            lookup raises ``KeyError`` exactly as the former ``{}``
            default did.
        session: Not used by this function; accepted because call sites
            pass it by keyword.

    Raises:
        KeyError: If ``config`` has no ``"delay"`` entry.
    """
    # Avoid a shared mutable default while keeping the old lookup semantics.
    if config is None:
        config = {}

    seconds = config["delay"]
    if seconds > 0:
        print("Sleeping... %.2f seconds..." % (seconds))
        time.sleep(seconds)
class Delay:
    """Pause between HTTP calls while animating a growing row of dots.

    Instantiating the class performs the whole delay: ``Delay(config=...)``
    returns only after ``config["delay"]`` seconds have elapsed (or
    immediately when that value is not positive).
    """

    # True whenever no animation should be running; animate() polls it.
    done: bool = True
    # Dots to draw on the next frame; animate() appends one per frame.
    ellipses: str = "."

    def animate(self):
        """Redraw the dot row every 0.1 seconds until ``self.done`` is true."""
        try:
            while not self.done:
                sys.stdout.write("\r " + self.ellipses)
                sys.stdout.flush()
                self.ellipses += "."
                time.sleep(0.1)
        except KeyboardInterrupt:
            sys.exit()

    def __init__(self, config=None, session=None):
        """Add a delay if configured for that.

        Args:
            config: Mapping holding a numeric ``"delay"`` entry, in seconds.
                When omitted, an empty mapping is used, so the ``"delay"``
                lookup raises ``KeyError`` as the former ``{}`` default did.
            session: Not used here; accepted because call sites pass it
                by keyword.

        Raises:
            KeyError: If ``config`` has no ``"delay"`` entry.
        """
        # Avoid a shared mutable default while keeping the old lookup semantics.
        if config is None:
            config = {}

        seconds = config["delay"]
        if seconds <= 0:
            return

        self.done = False
        # Daemon thread: a stuck animation must never keep the process alive.
        ellipses_animation = threading.Thread(target=self.animate, daemon=True)
        ellipses_animation.start()
        try:
            time.sleep(seconds)
        finally:
            # Runs even if the sleep is interrupted, so the animation always
            # stops instead of printing dots forever.
            self.done = True
            # Wait for the last frame so nothing is drawn after the clear.
            ellipses_animation.join()
            # Blank out exactly as many columns as the animation may have
            # used (one leading space plus the dots), then return the cursor
            # to the start of the line.
            sys.stdout.write("\r" + " " * (len(self.ellipses) + 1) + "\r")
            sys.stdout.flush()
2 changes: 1 addition & 1 deletion wikiteam3/dumpgenerator/greeter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def welcome():
"# Copyright (C) 2011-%d WikiTeam developers #\n"
% (datetime.datetime.now().year)
)
message += """
message += """# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
Expand Down
20 changes: 10 additions & 10 deletions wikiteam3/dumpgenerator/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
import urllib

from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .exceptions import PageMissingError
from .get_json import getJSON
Expand Down Expand Up @@ -46,7 +46,7 @@ def generateImageDump(config={}, other={}, images=[], start="", session=None):
lock = False
if lock:
continue
delay(config=config, session=session)
Delay(config=config, session=session)

# saving file
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
Expand Down Expand Up @@ -162,12 +162,12 @@ def generateImageDump(config={}, other={}, images=[], start="", session=None):
text=f"File {imagepath}/{filename2}.desc could not be created by OS",
)

delay(config=config, session=session)
Delay(config=config, session=session)
c += 1
if c % 10 == 0:
print(" Downloaded %d images" % (c))
print(f"\n-> Downloaded {c} images\n")

print("Downloaded %d images" % (c))
print(f"\n-> Downloaded {c} images\n")

def getImageNames(config={}, session=None):
"""Get list of image names"""
Expand Down Expand Up @@ -203,8 +203,8 @@ def getImageNamesScraper(config={}, session=None):
params={"title": "Special:Imagelist", "limit": limit, "offset": offset},
timeout=30,
)
raw = r.text
delay(config=config, session=session)
raw = str(r.text)
Delay(config=config, session=session)
# delicate wiki
if re.search(
r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)",
Expand All @@ -225,7 +225,7 @@ def getImageNamesScraper(config={}, session=None):
print("No more retries, exit...")
break

raw = cleanHTML(raw)
raw = str(cleanHTML(raw))
# archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
# wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
# href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
Expand Down Expand Up @@ -311,7 +311,7 @@ def getImageNamesAPI(config={}, session=None):
r = session.get(url=config["api"], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)

if "query" in jsonimages:
aifrom = ""
Expand Down Expand Up @@ -386,7 +386,7 @@ def getImageNamesAPI(config={}, session=None):
r = session.get(url=config["api"], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)

if "query" in jsonimages:
gapfrom = ""
Expand Down
2 changes: 1 addition & 1 deletion wikiteam3/dumpgenerator/index_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def checkIndex(index=None, cookies=None, session=None):
if r.status_code >= 400:
print(f"ERROR: The wiki returned status code HTTP {r.status_code}")
return False
raw = r.text
raw = str(r.text)
print("Checking index.php...", index)
# Workaround for issue 71
if (
Expand Down
8 changes: 4 additions & 4 deletions wikiteam3/dumpgenerator/index_php.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from .delay import delay
from .delay import Delay
from .util import removeIP


Expand All @@ -12,8 +12,8 @@ def saveIndexPHP(config={}, session=None):
else:
print("Downloading index.php (Main Page) as index.html")
r = session.post(url=config["index"], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
raw = str(r.text)
Delay(config=config, session=session)
raw = str(removeIP(raw=raw))
with open("%s/index.html" % (config["path"]), "w", encoding="utf-8") as outfile:
outfile.write(str(raw))
4 changes: 2 additions & 2 deletions wikiteam3/dumpgenerator/logs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .delay import delay
from .delay import Delay


def saveLogs(config={}, session=None):
Expand All @@ -19,4 +19,4 @@ def saveLogs(config={}, session=None):
<option value="">Todos los registros</option>
</select>
"""
delay(config=config, session=session)
Delay(config=config, session=session)
8 changes: 4 additions & 4 deletions wikiteam3/dumpgenerator/namespaces.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re

from .delay import delay
from .delay import Delay
from .get_json import getJSON


Expand All @@ -13,8 +13,8 @@ def getNamespacesScraper(config={}, session=None):
r = session.post(
url=config["index"], params={"title": "Special:Allpages"}, timeout=30
)
raw = r.text
delay(config=config, session=session)
raw = str(r.text)
Delay(config=config, session=session)

# [^>]*? to include selected="selected"
m = re.compile(
Expand Down Expand Up @@ -59,7 +59,7 @@ def getNamespacesAPI(config={}, session=None):
timeout=30,
)
result = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
try:
nsquery = result["query"]["namespaces"]
except KeyError:
Expand Down
8 changes: 4 additions & 4 deletions wikiteam3/dumpgenerator/page_special_version.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from .delay import delay
from .delay import Delay
from .util import removeIP


Expand All @@ -14,9 +14,9 @@ def saveSpecialVersion(config={}, session=None):
r = session.post(
url=config["index"], params={"title": "Special:Version"}, timeout=10
)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
raw = str(r.text)
Delay(config=config, session=session)
raw = str(removeIP(raw=raw))
with open(
"%s/Special:Version.html" % (config["path"]), "w", encoding="utf-8"
) as outfile:
Expand Down
22 changes: 13 additions & 9 deletions wikiteam3/dumpgenerator/page_titles.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import re
import sys
from urllib.parse import urlparse

import mwclient

from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .namespaces import getNamespacesAPI, getNamespacesScraper
from .util import cleanHTML, undoHTMLEntities
Expand All @@ -19,7 +20,7 @@ def getPageTitlesAPI(config={}, session=None):
continue

c = 0
print(" Retrieving titles in the namespace %d" % (namespace))
sys.stdout.write(" Retrieving titles in the namespace %d" % (namespace))
apiurl = urlparse(config["api"])
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme
Expand All @@ -34,8 +35,11 @@ def getPageTitlesAPI(config={}, session=None):
print("Probably a loop, switching to next namespace")
titles = list(set(titles))

print(" %d titles retrieved in the namespace %d" % (c, namespace))
delay(config=config, session=session)
sys.stdout.write(
"\r %d titles retrieved in the namespace %d\n" % (c, namespace)
)
sys.stdout.flush()
Delay(config=config, session=session)


def getPageTitlesScraper(config={}, session=None):
Expand All @@ -48,8 +52,8 @@ def getPageTitlesScraper(config={}, session=None):
config["index"], namespace
)
r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
raw = str(r.text)
raw = str(cleanHTML(raw))

r_title = 'title="(?P<title>[^>]+)">'
r_suballpages = ""
Expand Down Expand Up @@ -114,10 +118,10 @@ def getPageTitlesScraper(config={}, session=None):
if name not in checked_suballpages:
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
Delay(config=config, session=session)
r = session.get(url=url, timeout=10)
# print ('Fetching URL: ', url)
raw = r.text
raw = str(r.text)
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk
print(
Expand All @@ -131,7 +135,7 @@ def getPageTitlesScraper(config={}, session=None):
"pages",
)

delay(config=config, session=session)
Delay(config=config, session=session)
oldfr = currfr
c += 1

Expand Down
20 changes: 10 additions & 10 deletions wikiteam3/dumpgenerator/page_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
maxretries = config["retries"] # x retries and skip
increment = 20 # increment every retry

while not re.search(r"</mediawiki>", xml):
while not re.search(r"</mediawiki>", str(xml)):
if c > 0 and c < maxretries:
wait = (
increment * c < maxseconds and increment * c or maxseconds
Expand Down Expand Up @@ -86,7 +86,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
xml = ""
c += 1

return xml
return str(xml)


def getXMLPage(config={}, title="", verbose=True, session=None):
Expand Down Expand Up @@ -114,7 +114,7 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
if "templates" in config and config["templates"]:
params["templates"] = 1

xml = getXMLPageCore(params=params, config=config, session=session)
xml = str(getXMLPageCore(params=params, config=config, session=session))
if xml == "":
raise ExportAbortedError(config["index"])
if "</page>" not in xml:
Expand All @@ -132,8 +132,8 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
# else, warning about Special:Export truncating large page histories
r_timestamp = "<timestamp>([^<]+)</timestamp>"

numberofedits = 0
numberofedits += len(re.findall(r_timestamp, xml))
edit_count = 0
edit_count += len(re.findall(r_timestamp, xml))

# search for timestamps in xml to avoid analysing empty pages like
# Special:Allpages and the random one
Expand Down Expand Up @@ -183,16 +183,16 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
params["limit"] = params["limit"] / 2
continue
xml = xml2
numberofedits += len(re.findall(r_timestamp, xml))
edit_count += len(re.findall(r_timestamp, xml))
else:
params["offset"] = "" # no more edits in this page history
yield "</page>\n"

if verbose:
if numberofedits == 1:
if edit_count == 1:
uprint(" %s, 1 edit" % (title.strip()))
else:
uprint(" %s, %d edits" % (title.strip(), numberofedits))
uprint(" %s, %d edits" % (title.strip(), edit_count))


def makeXmlPageFromRaw(xml):
Expand Down Expand Up @@ -252,11 +252,11 @@ def makeXmlFromPage(page):
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")
return str(etree.tostring(p, pretty_print=True, encoding="utf-8"))


def fixBOM(request):
    """Strip Unicode BOM.

    If the decoded body starts with U+FEFF, switch ``request.encoding`` to
    ``"utf-8-sig"`` and then return ``request.text``.

    Args:
        request: Object exposing a ``text`` string and a writable
            ``encoding`` attribute.
            NOTE(review): presumably a ``requests.Response``, whose ``text``
            is re-decoded after ``encoding`` changes - confirm against callers.

    Returns:
        The value of ``request.text`` read after the encoding adjustment,
        passed through ``str()``.
    """
    if request.text.startswith("\ufeff"):
        request.encoding = "utf-8-sig"
    return str(request.text)
4 changes: 2 additions & 2 deletions wikiteam3/dumpgenerator/site_info.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import os

from .delay import delay
from .delay import Delay
from .get_json import getJSON


Expand Down Expand Up @@ -51,7 +51,7 @@ def saveSiteInfo(config={}, session=None):
timeout=10,
)
result = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
with open(
"%s/siteinfo.json" % (config["path"]), "w", encoding="utf-8"
) as outfile:
Expand Down
Loading

0 comments on commit 58baa10

Please sign in to comment.