fix: use httpx instead of requests to avoid Bandcamp blocking

When using requests/urllib3, Bandcamp responds to all requests with 403 errors. Investigating why, I tried: - using curl to send the same request: it worked - writing a tiny Python script to `GET bandcamp.com/` with requests: it failed with 403 - waiting a week to see if it resolved itself: no luck - changing the above-mentioned script to use http.client or httpx: it worked. I think that in this case, Bandcamp's Web Application Firewall (WAF) blocks the requests based not on their contents but on an artifact of how urllib3 builds/sends the data, since curl with the exact same headers works. Instead of trying to identify the exact reason (which is quite hard without any info on Bandcamp's WAF) and then fixing or working around it, I rewrote the very little required HTTP code to use httpx and sidestep the issue.
snejus · Aug 4, 2024 · 46c51eb · 46c51eb
1 parent ffacfe1
commit 46c51eb
Show file tree

Hide file tree

Showing 6 changed files with 122 additions and 177 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,9 @@
 
 ### Fixed
 
-- `search`: properly escape query strings for better results with special characters
+- `search`:
+  - properly escape query strings for better results with special characters
+  - change HTTP client implementation to avoid Bandcamp "403 Forbidden" responses
 
 ## [0.19.1] 2024-05-10
 

diff --git a/beetsplug/bandcamp/__init__.py b/beetsplug/bandcamp/__init__.py
@@ -22,16 +22,15 @@
 import re
 from contextlib import contextmanager
 from functools import lru_cache, partial
-from html import unescape
 from itertools import chain
 from operator import itemgetter
 from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Literal, Sequence
 
-import requests
-from beets import IncludeLazyConfig, __version__, config, library, plugins
+from beets import IncludeLazyConfig, config, library, plugins
 
 from beetsplug import fetchart  # type: ignore[attr-defined]
 
+from .http import HTTPError, http_get_text
 from .metaguru import Metaguru
 from .search import search_bandcamp
 
@@ -57,12 +56,6 @@
 
 ALBUM_URL_IN_TRACK = re.compile(r'<a id="buyAlbumLink" href="([^"]+)')
 LABEL_URL_IN_COMMENT = re.compile(r"Visit (https:[\w/.-]+\.[a-z]+)")
-USER_AGENT = f"beets/{__version__} +http://beets.radbox.org/"
-
-
-@lru_cache(maxsize=None)
-def get_response(url: str) -> requests.Response:
-    return requests.get(url, headers={"User-Agent": USER_AGENT})
 
 
 class BandcampRequestsHandler:
@@ -79,13 +72,11 @@ def _info(self, msg_template: str, *args: Sequence[str]) -> None:
 
     def _get(self, url: str) -> str:
         """Return text contents of the url response."""
-        response = get_response(url)
         try:
-            response.raise_for_status()
-        except requests.HTTPError as e:
+            return http_get_text(url)
+        except HTTPError as e:
             self._info("{}", e)
             return ""
-        return unescape(response.text)
 
     def guru(self, url: str) -> Metaguru:
         return Metaguru.from_html(self._get(url), config=self.config.flatten())

diff --git a/beetsplug/bandcamp/http.py b/beetsplug/bandcamp/http.py
@@ -0,0 +1,21 @@
+from functools import lru_cache
+from html import unescape
+from urllib.parse import urlsplit
+
+from beets import __version__
+import httpx
+
+HTTPError = httpx.HTTPError
+
+USER_AGENT = f"beets/{__version__} +https://beets.io/"
+
+_client = httpx.Client(headers={"User-Agent": USER_AGENT})
+
+@lru_cache(maxsize=None)
+def http_get_text(url: str) -> str:
+    """Return the HTML-unescaped body of a GET to url; results are cached per url."""
+    # Goes through the shared module-level client, which sets the User-Agent header.
+    response = _client.get(url)
+    response.raise_for_status()  # error status -> httpx.HTTPStatusError (an HTTPError)
+    # NOTE(review): httpx does not follow redirects by default (requests did) - verify.
+    return unescape(response.text)
diff --git a/beetsplug/bandcamp/search.py b/beetsplug/bandcamp/search.py
@@ -7,7 +7,7 @@
 from typing import Any, Callable, Dict, List
 from urllib.parse import quote_plus
 
-import requests
+from .http import http_get_text
 
 JSONDict = Dict[str, Any]
 SEARCH_URL = "https://bandcamp.com/search?page={}&q={}"
@@ -95,17 +95,11 @@ def parse_and_sort_results(html: str, **kwargs: str) -> List[JSONDict]:
     return [{"index": i + 1, **r} for i, r in enumerate(results)]
 
 
-def get_url(url: str) -> str:
-    response = requests.get(url)
-    response.raise_for_status()
-    return unescape(response.text)
-
-
 def search_bandcamp(
     query: str = "",
     search_type: str = "",
     page: int = 1,
-    get: Callable[[str], str] = get_url,
+    get: Callable[[str], str] = http_get_text,
     **kwargs: Any,
 ) -> List[JSONDict]:
     """Return a list with item JSONs of type search_type matching the query."""