Merge pull request #31 from elsiehupp/fix-bytes-regex

Cast XML bytes to str to avoid encoding issues
mediawiki-client-tools · Dec 10, 2022 · 58baa10 · 58baa10
2 parents 1975127 + efcde30
commit 58baa10
Show file tree

Hide file tree

Showing 20 changed files with 136 additions and 79 deletions.
diff --git a/dist/wikiteam3-3.0.0-py3-none-any.whl b/dist/wikiteam3-3.0.0-py3-none-any.whl
diff --git a/dist/wikiteam3-3.0.0.tar.gz b/dist/wikiteam3-3.0.0.tar.gz
diff --git a/wikiteam3/dumpgenerator/cli.py b/wikiteam3/dumpgenerator/cli.py
@@ -306,4 +306,13 @@ def getParameters(params=[]):
         print("Which expands to:")
         print("  " + config["path"])
 
+    if config["delay"] == 0.5:
+        print("--delay is the default value of 0.5")
+        print(
+            "There will be a 0.5 second delay between HTTP calls in order to keep the server from timing you out."
+        )
+        print(
+            "If you know that this is unnecessary, you can manually specify '--delay 0.0'."
+        )
+
     return config, other
diff --git a/wikiteam3/dumpgenerator/delay.py b/wikiteam3/dumpgenerator/delay.py
@@ -1,8 +1,52 @@
+import itertools
+import threading
 import time
+import sys
 
 
-def delay(config={}, session=None):
-    """Add a delay if configured for that"""
-    if config["delay"] > 0:
-        print("Sleeping... %.2f seconds..." % (config["delay"]))
-        time.sleep(config["delay"])
+class Delay:
+    """Block for ``config["delay"]`` seconds while animating dots on stdout.
+
+    The class is used like a function: constructing an instance performs the
+    whole pause, and callers discard the instance.
+    """
+
+    done: bool = True
+    ellipses: str = "."
+
+    def animate(self):
+        """Redraw a growing row of dots every 0.1 s until ``self.done`` is set."""
+        # No KeyboardInterrupt handler: Python raises it only in the main
+        # thread, so a handler in this worker thread could never run.
+        while not self.done:
+            sys.stdout.write("\r    " + self.ellipses)
+            sys.stdout.flush()
+            self.ellipses += "."
+            time.sleep(0.1)
+
+    def __init__(self, config={}, session=None):
+        """Sleep for ``config["delay"]`` seconds if it is greater than zero.
+
+        Args:
+            config: Mapping that must contain a numeric ``"delay"`` key.
+            session: Unused; accepted so existing call sites keep working.
+        """
+        if config["delay"] > 0:
+            self.done = False
+
+            ellipses_animation = threading.Thread(target=self.animate)
+            ellipses_animation.start()
+
+            try:
+                time.sleep(config["delay"])
+            finally:
+                # Also runs on Ctrl-C; otherwise ``done`` stays False and the
+                # non-daemon animation thread keeps the process alive forever.
+                self.done = True
+                # Wait for the last frame so no dots are drawn after the clear.
+                ellipses_animation.join()
+
+                # Blank exactly what was drawn (4-space indent plus the dots);
+                # a fixed width leaves stray dots after a long delay.
+                sys.stdout.write("\r" + " " * (4 + len(self.ellipses)) + "\r")
+                sys.stdout.flush()
diff --git a/wikiteam3/dumpgenerator/greeter.py b/wikiteam3/dumpgenerator/greeter.py
@@ -28,7 +28,7 @@ def welcome():
         "# Copyright (C) 2011-%d WikiTeam developers                           #\n"
         % (datetime.datetime.now().year)
     )
-    message += """
+    message += """#                                                                       #
 # This program is free software: you can redistribute it and/or modify  #
 # it under the terms of the GNU General Public License as published by  #
 # the Free Software Foundation, either version 3 of the License, or     #

diff --git a/wikiteam3/dumpgenerator/image.py b/wikiteam3/dumpgenerator/image.py
@@ -3,7 +3,7 @@
 import sys
 import urllib
 
-from .delay import delay
+from .delay import Delay
 from .domain import domain2prefix
 from .exceptions import PageMissingError
 from .get_json import getJSON
@@ -46,7 +46,7 @@ def generateImageDump(config={}, other={}, images=[], start="", session=None):
                 lock = False
             if lock:
                 continue
-            delay(config=config, session=session)
+            Delay(config=config, session=session)
 
             # saving file
             # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
@@ -162,12 +162,12 @@ def generateImageDump(config={}, other={}, images=[], start="", session=None):
                     text=f"File {imagepath}/{filename2}.desc could not be created by OS",
                 )
 
-            delay(config=config, session=session)
+            Delay(config=config, session=session)
             c += 1
             if c % 10 == 0:
-                print("    Downloaded %d images" % (c))
+                print(f"\n->  Downloaded {c} images\n")
 
-        print("Downloaded %d images" % (c))
+        print(f"\n->  Downloaded {c} images\n")
 
     def getImageNames(config={}, session=None):
         """Get list of image names"""
@@ -203,8 +203,8 @@ def getImageNamesScraper(config={}, session=None):
                 params={"title": "Special:Imagelist", "limit": limit, "offset": offset},
                 timeout=30,
             )
-            raw = r.text
-            delay(config=config, session=session)
+            raw = str(r.text)
+            Delay(config=config, session=session)
             # delicate wiki
             if re.search(
                 r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)",
@@ -225,7 +225,7 @@ def getImageNamesScraper(config={}, session=None):
                     print("No more retries, exit...")
                     break
 
-            raw = cleanHTML(raw)
+            raw = str(cleanHTML(raw))
             # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
             # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
             # href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
@@ -311,7 +311,7 @@ def getImageNamesAPI(config={}, session=None):
             r = session.get(url=config["api"], params=params, timeout=30)
             handleStatusCode(r)
             jsonimages = getJSON(r)
-            delay(config=config, session=session)
+            Delay(config=config, session=session)
 
             if "query" in jsonimages:
                 aifrom = ""
@@ -386,7 +386,7 @@ def getImageNamesAPI(config={}, session=None):
                 r = session.get(url=config["api"], params=params, timeout=30)
                 handleStatusCode(r)
                 jsonimages = getJSON(r)
-                delay(config=config, session=session)
+                Delay(config=config, session=session)
 
                 if "query" in jsonimages:
                     gapfrom = ""

diff --git a/wikiteam3/dumpgenerator/index_check.py b/wikiteam3/dumpgenerator/index_check.py
@@ -7,7 +7,7 @@ def checkIndex(index=None, cookies=None, session=None):
     if r.status_code >= 400:
         print(f"ERROR: The wiki returned status code HTTP {r.status_code}")
         return False
-    raw = r.text
+    raw = str(r.text)
     print("Checking index.php...", index)
     # Workaround for issue 71
     if (

diff --git a/wikiteam3/dumpgenerator/index_php.py b/wikiteam3/dumpgenerator/index_php.py
@@ -1,6 +1,6 @@
 import os
 
-from .delay import delay
+from .delay import Delay
 from .util import removeIP
 
 
@@ -12,8 +12,8 @@ def saveIndexPHP(config={}, session=None):
     else:
         print("Downloading index.php (Main Page) as index.html")
         r = session.post(url=config["index"], params={}, timeout=10)
-        raw = r.text
-        delay(config=config, session=session)
-        raw = removeIP(raw=raw)
+        raw = str(r.text)
+        Delay(config=config, session=session)
+        raw = str(removeIP(raw=raw))
         with open("%s/index.html" % (config["path"]), "w", encoding="utf-8") as outfile:
             outfile.write(str(raw))
diff --git a/wikiteam3/dumpgenerator/logs.py b/wikiteam3/dumpgenerator/logs.py
@@ -1,4 +1,4 @@
-from .delay import delay
+from .delay import Delay
 
 
 def saveLogs(config={}, session=None):
@@ -19,4 +19,4 @@ def saveLogs(config={}, session=None):
     <option value="">Todos los registros</option>
     </select>
 """
-    delay(config=config, session=session)
+    Delay(config=config, session=session)
diff --git a/wikiteam3/dumpgenerator/namespaces.py b/wikiteam3/dumpgenerator/namespaces.py
@@ -1,6 +1,6 @@
 import re
 
-from .delay import delay
+from .delay import Delay
 from .get_json import getJSON
 
 
@@ -13,8 +13,8 @@ def getNamespacesScraper(config={}, session=None):
         r = session.post(
             url=config["index"], params={"title": "Special:Allpages"}, timeout=30
         )
-        raw = r.text
-        delay(config=config, session=session)
+        raw = str(r.text)
+        Delay(config=config, session=session)
 
         # [^>]*? to include selected="selected"
         m = re.compile(
@@ -59,7 +59,7 @@ def getNamespacesAPI(config={}, session=None):
             timeout=30,
         )
         result = getJSON(r)
-        delay(config=config, session=session)
+        Delay(config=config, session=session)
         try:
             nsquery = result["query"]["namespaces"]
         except KeyError:

diff --git a/wikiteam3/dumpgenerator/page_special_version.py b/wikiteam3/dumpgenerator/page_special_version.py
@@ -1,6 +1,6 @@
 import os
 
-from .delay import delay
+from .delay import Delay
 from .util import removeIP
 
 
@@ -14,9 +14,9 @@ def saveSpecialVersion(config={}, session=None):
         r = session.post(
             url=config["index"], params={"title": "Special:Version"}, timeout=10
         )
-        raw = r.text
-        delay(config=config, session=session)
-        raw = removeIP(raw=raw)
+        raw = str(r.text)
+        Delay(config=config, session=session)
+        raw = str(removeIP(raw=raw))
         with open(
             "%s/Special:Version.html" % (config["path"]), "w", encoding="utf-8"
         ) as outfile:

diff --git a/wikiteam3/dumpgenerator/page_titles.py b/wikiteam3/dumpgenerator/page_titles.py
@@ -1,9 +1,10 @@
 import re
+import sys
 from urllib.parse import urlparse
 
 import mwclient
 
-from .delay import delay
+from .delay import Delay
 from .domain import domain2prefix
 from .namespaces import getNamespacesAPI, getNamespacesScraper
 from .util import cleanHTML, undoHTMLEntities
@@ -19,7 +20,7 @@ def getPageTitlesAPI(config={}, session=None):
             continue
 
         c = 0
-        print("    Retrieving titles in the namespace %d" % (namespace))
+        sys.stdout.write("    Retrieving titles in the namespace %d" % (namespace))
         apiurl = urlparse(config["api"])
         site = mwclient.Site(
             apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme
@@ -34,8 +35,11 @@ def getPageTitlesAPI(config={}, session=None):
             print("Probably a loop, switching to next namespace")
             titles = list(set(titles))
 
-        print("    %d titles retrieved in the namespace %d" % (c, namespace))
-        delay(config=config, session=session)
+        sys.stdout.write(
+            "\r    %d titles retrieved in the namespace %d\n" % (c, namespace)
+        )
+        sys.stdout.flush()
+        Delay(config=config, session=session)
 
 
 def getPageTitlesScraper(config={}, session=None):
@@ -48,8 +52,8 @@ def getPageTitlesScraper(config={}, session=None):
             config["index"], namespace
         )
         r = session.get(url=url, timeout=30)
-        raw = r.text
-        raw = cleanHTML(raw)
+        raw = str(r.text)
+        raw = str(cleanHTML(raw))
 
         r_title = 'title="(?P<title>[^>]+)">'
         r_suballpages = ""
@@ -114,10 +118,10 @@ def getPageTitlesScraper(config={}, session=None):
                 if name not in checked_suballpages:
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
-                    delay(config=config, session=session)
+                    Delay(config=config, session=session)
                     r = session.get(url=url, timeout=10)
                     # print ('Fetching URL: ', url)
-                    raw = r.text
+                    raw = str(r.text)
                     raw = cleanHTML(raw)
                     rawacum += raw  # merge it after removed junk
                     print(
@@ -131,7 +135,7 @@ def getPageTitlesScraper(config={}, session=None):
                         "pages",
                     )
 
-                delay(config=config, session=session)
+                Delay(config=config, session=session)
             oldfr = currfr
             c += 1
 

diff --git a/wikiteam3/dumpgenerator/page_xml.py b/wikiteam3/dumpgenerator/page_xml.py
@@ -23,7 +23,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
     maxretries = config["retries"]  # x retries and skip
     increment = 20  # increment every retry
 
-    while not re.search(r"</mediawiki>", xml):
+    while not re.search(r"</mediawiki>", str(xml)):
         if c > 0 and c < maxretries:
             wait = (
                 increment * c < maxseconds and increment * c or maxseconds
@@ -86,7 +86,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
             xml = ""
         c += 1
 
-    return xml
+    return str(xml)
 
 
 def getXMLPage(config={}, title="", verbose=True, session=None):
@@ -114,7 +114,7 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
     if "templates" in config and config["templates"]:
         params["templates"] = 1
 
-    xml = getXMLPageCore(params=params, config=config, session=session)
+    xml = str(getXMLPageCore(params=params, config=config, session=session))
     if xml == "":
         raise ExportAbortedError(config["index"])
     if "</page>" not in xml:
@@ -132,8 +132,8 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
     # else, warning about Special:Export truncating large page histories
     r_timestamp = "<timestamp>([^<]+)</timestamp>"
 
-    numberofedits = 0
-    numberofedits += len(re.findall(r_timestamp, xml))
+    edit_count = 0
+    edit_count += len(re.findall(r_timestamp, xml))
 
     # search for timestamps in xml to avoid analysing empty pages like
     # Special:Allpages and the random one
@@ -183,16 +183,16 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
                         params["limit"] = params["limit"] / 2
                         continue
                     xml = xml2
-                    numberofedits += len(re.findall(r_timestamp, xml))
+                    edit_count += len(re.findall(r_timestamp, xml))
             else:
                 params["offset"] = ""  # no more edits in this page history
     yield "</page>\n"
 
     if verbose:
-        if numberofedits == 1:
+        if edit_count == 1:
             uprint("    %s, 1 edit" % (title.strip()))
         else:
-            uprint("    %s, %d edits" % (title.strip(), numberofedits))
+            uprint("    %s, %d edits" % (title.strip(), edit_count))
 
 
 def makeXmlPageFromRaw(xml):
@@ -252,11 +252,11 @@ def makeXmlFromPage(page):
     except KeyError as e:
         print(e)
         raise PageMissingError(page["title"], e)
-    return etree.tostring(p, pretty_print=True, encoding="unicode")
+    return str(etree.tostring(p, pretty_print=True, encoding="utf-8"))
 
 
 def fixBOM(request):
     """Strip Unicode BOM"""
     if request.text.startswith("\ufeff"):
         request.encoding = "utf-8-sig"
-    return request.text
+    return str(request.text)
diff --git a/wikiteam3/dumpgenerator/site_info.py b/wikiteam3/dumpgenerator/site_info.py
@@ -1,7 +1,7 @@
 import json
 import os
 
-from .delay import delay
+from .delay import Delay
 from .get_json import getJSON
 
 
@@ -51,7 +51,7 @@ def saveSiteInfo(config={}, session=None):
                     timeout=10,
                 )
             result = getJSON(r)
-            delay(config=config, session=session)
+            Delay(config=config, session=session)
             with open(
                 "%s/siteinfo.json" % (config["path"]), "w", encoding="utf-8"
             ) as outfile: