Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing URLs from item comments, add -p / --page flag to cmdline #57

Merged
merged 3 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
## Unreleased

## [0.19.0] 2024-05-07

### Fixed

- (#56) Support parsing URLs that do not end with **.com** in item comments when importing
music that was bought on Bandcamp.

### Added

- Add a new flag to the command-line application for searching Bandcamp:
**`[-p PAGE, --page PAGE]`**, which selects the results page to show and makes it possible to see further search results.

## [0.18.0] 2024-04-28

Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Navigate to your `beets` virtual environment and install the plug-in with
The plugin exposes some of its functionality through a command-line application `beetcamp`:

```xml
usage: beetcamp [-h] [-a] [-l] [-t] [release_url | query]
usage: beetcamp [-h] [-a] [-l] [-t] [-o INDEX] [-p PAGE] (release_url | query)

Get bandcamp release metadata from the given <release-url> or perform
bandcamp search with <query>. Anything that does not start with https://
Expand All @@ -65,13 +65,14 @@ positional arguments:
release_url Release URL, starting with https:// OR
query Search query

options:
optional arguments:
-h, --help show this help message and exit
-a, --album Search albums
-l, --label Search labels and artists
-t, --track Search tracks
-o INDEX, --open INDEX
Open search result indexed by INDEX in the browser
-p PAGE, --page PAGE The results page to show, 1 by default
```

- Use `beetcamp <bandcamp-release-url>` to return release metadata in JSON format.
Expand Down
48 changes: 31 additions & 17 deletions beetsplug/bandcamp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
}

ALBUM_URL_IN_TRACK = re.compile(r'<a id="buyAlbumLink" href="([^"]+)')
LABEL_URL_IN_COMMENT = re.compile(r"Visit (https:[\w/.-]+com)")
LABEL_URL_IN_COMMENT = re.compile(r"Visit (https:[\w/.-]+\.[a-z]+)")
USER_AGENT = f"beets/{__version__} +http://beets.radbox.org/"


Expand Down Expand Up @@ -169,6 +169,13 @@ def loaded(self) -> None:
plugin.sources = [bandcamp_fetchart, *plugin.sources]
break

@staticmethod
def parse_label_url(text: str) -> str | None:
if m := LABEL_URL_IN_COMMENT.match(text):
return m.group(1)

return None

def _find_url_in_item(
self, item: library.Item, name: str, _type: CandidateType
) -> str:
Expand All @@ -189,16 +196,14 @@ def _find_url_in_item(
the number of previous releases that also did not have any valid
alphanums. Therefore, we cannot make a reliable guess here.
"""
url = getattr(item, f"mb_{_type}id", "")
if _from_bandcamp(url):
if (url := getattr(item, f"mb_{_type}id", "")) and _from_bandcamp(url):
self._info("Fetching the URL attached to the first item, {}", url)
return url

if (m := LABEL_URL_IN_COMMENT.match(item.comments)) and (
if (label_url := self.parse_label_url(item.comments)) and (
urlified_name := urlify(name)
):
label = m.group(1)
url = f"{label}/{_type}/{urlified_name}"
url = f"{label_url}/{_type}/{urlified_name}"
self._info("Trying our guess {} before searching", url)
return url
return ""
Expand All @@ -207,15 +212,15 @@ def candidates(
self, items: List[library.Item], artist: str, album: str, *_: Any, **__: Any
) -> Iterable[AlbumInfo]:
"""Return a sequence of album candidates matching given artist and album."""
item = items[0]
label = ""
if items and album == items[0].album and artist == items[0].albumartist:
label = items[0].label
url = self._find_url_in_item(items[0], album, "album")
if url:
initial_guess = self.get_album_info(url)
if initial_guess:
yield from initial_guess
return
if items and album == item.album and artist == item.albumartist:
label = item.label
if (url := self._find_url_in_item(item, album, "album")) and (
initial_guess := self.get_album_info(url)
):
yield from initial_guess
return

if "various" in artist.lower():
artist = ""
Expand All @@ -228,12 +233,12 @@ def item_candidates(
self, item: library.Item, artist: str, title: str
) -> Iterable[TrackInfo]:
"""Return a sequence of singleton candidates matching given artist and title."""
url = self._find_url_in_item(item, title, "track")
label = ""
if item and title == item.title and artist == item.artist:
label = item.label
initial_guess = self.get_track_info(url) if url else None
if initial_guess:
if (url := self._find_url_in_item(item, title, "track")) and (
initial_guess := self.get_track_info(url)
):
yield initial_guess
return

Expand Down Expand Up @@ -350,6 +355,15 @@ def __call__(
type=int,
help="Open search result indexed by INDEX in the browser",
)
parser.add_argument(
"-p",
"--page",
action="store",
dest="page",
type=int,
default=1,
help="The results page to show, 1 by default",
)

return parser.parse_args()

Expand Down
28 changes: 17 additions & 11 deletions beetsplug/bandcamp/search.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Module with bandcamp search functionality."""

import re
from difflib import SequenceMatcher
from html import unescape
Expand All @@ -8,7 +9,7 @@
import requests

JSONDict = Dict[str, Any]
SEARCH_URL = "https://bandcamp.com/search?q={}"
SEARCH_URL = "https://bandcamp.com/search?page={}&q={}"


def _f(field: str) -> str:
Expand Down Expand Up @@ -40,12 +41,14 @@ def to_ascii(string: str) -> str:

def get_similarity(query: str, result: str) -> float:
"""Return the similarity between two strings normalized to [0, 1].

We take into account how well the result matches the query, e.g.
query: "foo"
query: "foobar"
result: "foo bar"
Similarity is then:
(2 * (len("foo") / len("foo")) + len("foo") / len("foo bar")) / 3
2/3 of the result is how much of the query is found in the result,
(2 * (len("foo") / len("foobar")) + len("foo") / len("foo bar")) / 3

2/3 of the weight is how much of the query is found in the result,
and 1/3 is a penalty for the non-matching part.
"""
a, b = to_ascii(query), to_ascii(result)
Expand All @@ -70,8 +73,13 @@ def get_matches(text: str) -> JSONDict:


def parse_and_sort_results(html: str, **kwargs: str) -> List[JSONDict]:
"""Given the html string, parse metadata for each entity and sort them
by the field/value pairs given in kwargs.
"""Extract search results from `html` and sort them by similarity to kwargs.

Bandcamp search may be unpredictable, therefore search results get sorted
regarding their similarity to what's being queried.

`kwargs` contains field and value pairs we compare the results with. Usually,
this has 'label', 'artist' and 'name' ('title' or 'album') fields.
"""
results: List[JSONDict] = []
for block in html.split("searchresult data-search")[1:]:
Expand All @@ -95,14 +103,12 @@ def get_url(url: str) -> str:
def search_bandcamp(
query: str = "",
search_type: str = "",
page: int = 1,
get: Callable[[str], str] = get_url,
**kwargs: Any,
) -> List[JSONDict]:
"""Return a list with item JSONs of type search_type matching the query.
Bandcamp search may be unpredictable, therefore search results get sorted
regarding their similarity to what's being queried.
"""
url = SEARCH_URL.format(query)
"""Return a list with item JSONs of type search_type matching the query."""
url = SEARCH_URL.format(page, query)
if search_type:
url += "&item_type=" + search_type
kwargs["name"] = query
Expand Down
Loading
Loading