diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index c662110e90..88e0616feb 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -22,12 +22,12 @@ import itertools import os.path import re -import unicodedata import urllib from contextlib import contextmanager from functools import cached_property from html import unescape from typing import Any, Iterator +from urllib.parse import urlparse import requests from typing_extensions import TypedDict @@ -183,7 +183,7 @@ def generate_alternatives(string, patterns): return itertools.product(artists, multi_titles) -def slug(text): +def slug(text: str) -> str: """Make a URL-safe, human-readable version of the given text This will do the following: @@ -193,10 +193,6 @@ def slug(text): 3. strip whitespace 4. replace other non-word characters with dashes 5. strip extra dashes - - This somewhat duplicates the :func:`Google.slugify` function but - slugify is not as generic as this one, which can be reused - elsewhere. """ return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-") @@ -666,19 +662,6 @@ def is_lyrics(self, text, artist=None): self.debug("Bad triggers detected: {}", bad_triggers_occ) return len(bad_triggers_occ) < 2 - def slugify(self, text): - """Normalize a string and remove non-alphanumeric characters.""" - text = re.sub(r"[-'_\s]", "_", text) - text = re.sub(r"_+", "_", text).strip("_") - pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses - text = re.sub(pat, r"\g<1>", text).strip() - try: - text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore") - text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8"))) - except UnicodeDecodeError: - self.debug("Failed to normalize '{}'", text) - return text - BY_TRANS = ["by", "par", "de", "von"] LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] @@ -686,12 +669,10 @@ def is_page_candidate(self, url_link, url_title, title, artist): """Return True if the URL title makes it a good candidate to be a page that contains lyrics of title by artist. """ - title = self.slugify(title.lower()) - artist = self.slugify(artist.lower()) - sitename = re.search( - "//([^/]+)/.*", self.slugify(url_link.lower()) - ).group(1) - url_title = self.slugify(url_title.lower()) + title = slug(title) + artist = re.escape(slug(artist)) + sitename = urlparse(url_link).netloc + url_title = slug(url_title) # Check if URL title contains song title (exact match) if url_title.find(title) != -1: @@ -700,14 +681,13 @@ def is_page_candidate(self, url_link, url_title, title, artist): # or try extracting song title from URL title and check if # they are close enough tokens = ( - [by + "_" + artist for by in self.BY_TRANS] + [by + "-" + artist for by in self.BY_TRANS] + [artist, sitename, sitename.replace("www.", "")] + self.LYRICS_TRANS ) - tokens = [re.escape(t) for t in tokens] song_title = re.sub("(%s)" % "|".join(tokens), "", url_title) - song_title = song_title.strip("_|") + song_title = song_title.strip("-|") typo_ratio = 0.9 ratio = difflib.SequenceMatcher(None, song_title, title).ratio() return ratio >= typo_ratio diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 0fc345e352..a93f271deb 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -14,13 +14,10 @@ """Tests for the 'lyrics' plugin.""" -import itertools import os import re -import unittest from functools import partial from http import HTTPStatus -from unittest.mock import patch import pytest @@ -43,11 +40,7 @@ ) -class LyricsPluginTest(unittest.TestCase): - def setUp(self): - """Set up configuration.""" - lyrics.LyricsPlugin() - +class TestLyricsUtils: def test_search_artist(self): item = Item(artist="Alice ft. Bob", title="song") assert ("Alice ft. Bob", ["song"]) in lyrics.search_pairs(item) @@ -159,6 +152,24 @@ def test_scrape_merge_paragraphs(self): text = "one

two

three" assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree" + @pytest.mark.parametrize( + "text, expected", + [ + ("test", "test"), + ("Mørdag", "mordag"), + ("l'été c'est fait pour jouer", "l-ete-c-est-fait-pour-jouer"), + ("\xe7afe au lait (boisson)", "cafe-au-lait-boisson"), + ("Multiple spaces -- and symbols! -- merged", "multiple-spaces-and-symbols-merged"), # noqa: E501 + ("\u200bno-width-space", "no-width-space"), + ("El\u002dp", "el-p"), + ("\u200bblackbear", "blackbear"), + ("\u200d", ""), + ("\u2010", ""), + ], + ) # fmt: skip + def test_slug(self, text, expected): + assert lyrics.slug(text) == expected + @pytest.fixture(scope="module") def lyrics_root_dir(pytestconfig: pytest.Config): @@ -333,10 +344,6 @@ def test_is_page_candidate( def test_bad_lyrics(self, backend, lyrics): assert not backend.is_lyrics(lyrics) - def test_slugify(self, backend): - text = "http://site.com/\xe7afe-au_lait(boisson)" - assert backend.slugify(text) == "http://site.com/cafe_au_lait" - class TestGeniusLyrics(LyricsPluginBackendMixin): @pytest.fixture(scope="class") @@ -358,52 +365,6 @@ def test_scrape_genius_lyrics( assert len(result.splitlines()) == expected_line_count - @patch.object(lyrics.Genius, "scrape_lyrics") - @patch.object(lyrics.Backend, "fetch_text", return_value=True) - def test_json(self, mock_fetch_text, mock_scrape, backend): - """Ensure we're finding artist matches""" - with patch.object( - lyrics.Genius, - "_search", - return_value={ - "response": { - "hits": [ - { - "result": { - "primary_artist": { - "name": "\u200bblackbear", - }, - "url": "blackbear_url", - } - }, - { - "result": { - "primary_artist": {"name": "El\u002dp"}, - "url": "El-p_url", - } - }, - ] - } - }, - ) as mock_json: - # genius uses zero-width-spaces (\u200B) for lowercase - # artists so we make sure we can match those - assert backend.fetch("blackbear", "Idfc") is not None - mock_fetch_text.assert_called_once_with("blackbear_url") - mock_scrape.assert_called_once_with(True) - - # genius uses the hyphen minus (\u002D) as their dash - assert backend.fetch("El-p", "Idfc") is not None - mock_fetch_text.assert_called_with("El-p_url") - mock_scrape.assert_called_with(True) - - # test no matching artist - assert backend.fetch("doesntexist", "none") is None - - # test invalid json - mock_json.return_value = {"response": {"hits": []}} - assert backend.fetch("blackbear", "Idfc") is None - class TestTekstowoLyrics(LyricsPluginBackendMixin): @pytest.fixture(scope="class") @@ -519,31 +480,3 @@ def test_pick_lyrics_match(self, fetch_lyrics, expected_lyrics): ) def test_synced_config_option(self, fetch_lyrics, expected_lyrics): assert fetch_lyrics() == expected_lyrics - - -class SlugTests(unittest.TestCase): - def test_slug(self): - # plain ascii passthrough - text = "test" - assert lyrics.slug(text) == "test" - - # german unicode and capitals - text = "Mørdag" - assert lyrics.slug(text) == "mordag" - - # more accents and quotes - text = "l'été c'est fait pour jouer" - assert lyrics.slug(text) == "l-ete-c-est-fait-pour-jouer" - - # accents, parens and spaces - text = "\xe7afe au lait (boisson)" - assert lyrics.slug(text) == "cafe-au-lait-boisson" - text = "Multiple spaces -- and symbols! -- merged" - assert lyrics.slug(text) == "multiple-spaces-and-symbols-merged" - text = "\u200bno-width-space" - assert lyrics.slug(text) == "no-width-space" - - # variations of dashes should get standardized - dashes = ["\u200d", "\u2010"] - for dash1, dash2 in itertools.combinations(dashes, 2): - assert lyrics.slug(dash1) == lyrics.slug(dash2)