Split release track names parsing into TrackNames class

snejus · Mar 18, 2024 · 92451c1 · 92451c1
1 parent 5f04de1
commit 92451c1
Show file tree

Hide file tree

Showing 7 changed files with 227 additions and 229 deletions.
diff --git a/beetsplug/bandcamp/helpers.py b/beetsplug/bandcamp/helpers.py
@@ -1,7 +1,5 @@
 """Module with a Helpers class that contains various static, independent functions."""
 
-from __future__ import annotations
-
 import itertools as it
 import operator as op
 import re
@@ -108,7 +106,9 @@ class MediaInfo(NamedTuple):
     r"CD ?\d+",
 ]
 
-_remix_pat = r"(?P<remix>((?P<remixer>[^])]+) )?\b((re)?mix|edit|bootleg)\b[^])]*)"
+REMIX = re.compile(
+    r"(?P<remix>((?P<remixer>[^])]+) )?\b((re)?mix|edit|bootleg)\b[^])]*)", re.I
+)
 CAMELCASE = re.compile(r"(?<=[a-z])(?=[A-Z])")
 
 
@@ -133,8 +133,8 @@ def split_artist_title(m: re.Match[str]) -> str:
     (re.compile(r"(- )?\( *"), "("),                      # hi - ( bye)      -> hi (bye)
     (re.compile(r" \)+|(\)+$)"), ")"),                    # hi (bye ))       -> hi (bye)
     (re.compile(r"- Reworked"), "(Reworked)"),            # bye - Reworked   -> bye (Reworked)    # noqa
-    (re.compile(rf"(\({_remix_pat})$", re.I), r"\1)"),    # bye - (Some Mix  -> bye - (Some Mix)  # noqa
-    (re.compile(rf"- *({_remix_pat})$", re.I), r"(\1)"),  # bye - Some Mix   -> bye (Some Mix)    # noqa
+    (re.compile(rf"(\({REMIX.pattern})$", re.I), r"\1)"),    # bye - (Some Mix  -> bye - (Some Mix)  # noqa
+    (re.compile(rf"- *({REMIX.pattern})$", re.I), r"(\1)"),  # bye - Some Mix   -> bye (Some Mix)    # noqa
     (re.compile(r'(^|- )[“"]([^”"]+)[”"]( \(|$)'), r"\1\2\3"),   # "bye" -> bye; hi - "bye" -> hi - bye  # noqa
     (re.compile(r"\((the )?(remixes)\)", re.I), r"\2"),   # Album (Remixes)  -> Album Remixes     # noqa
     (re.compile(r"examine-.+CD\d+_([^_-]+)[_-](.*)"), split_artist_title),  # See https://examine-archive.bandcamp.com/album/va-examine-archive-international-sampler-xmn01 # noqa

diff --git a/beetsplug/bandcamp/metaguru.py b/beetsplug/bandcamp/metaguru.py
@@ -17,7 +17,8 @@
 
 from .album import AlbumName
 from .helpers import PATTERNS, Helpers, MediaInfo
-from .tracks import Track, Tracks
+from .track import Track
+from .tracks import Tracks
 
 NEW_BEETS = int(beets_version.split(".")[1]) > 4
 
@@ -184,7 +185,7 @@ def mediums(self) -> int:
     @cached_property
     def general_catalognum(self) -> str:
         """Find catalog number in the media-agnostic release metadata and cache it."""
-        return self._tracks.single_catalognum or self.parse_catalognum(
+        return self._tracks.catalognum or self.parse_catalognum(
             album=self.meta["name"],
             description=self.comments,
             label=self.label if not self._singleton else "",

diff --git a/beetsplug/bandcamp/track.py b/beetsplug/bandcamp/track.py
@@ -5,7 +5,7 @@
 from functools import cached_property
 from typing import List, Optional, Tuple
 
-from .helpers import CATNUM_PAT, PATTERNS, Helpers, JSONDict, _remix_pat
+from .helpers import CATNUM_PAT, PATTERNS, REMIX, Helpers, JSONDict
 
 digiwords = r"""
     # must contain at least one of
@@ -33,7 +33,7 @@
 
 @dataclass
 class Remix:
-    PATTERN = re.compile(rf" *[\[(] *{_remix_pat}[])]", re.I)
+    PATTERN = re.compile(rf" *[\[(] *{REMIX.pattern}[])]", re.I)
 
     delimited: str
     remixer: str
@@ -66,22 +66,6 @@ class Track:
     digi_only: bool = False
     track_alt: Optional[str] = None
 
-    @classmethod
-    def from_json(cls, json: JSONDict, label: str) -> "Track":
-        try:
-            artist = json["inAlbum"]["byArtist"]["name"]
-        except KeyError:
-            artist = ""
-        artist = artist or json.get("byArtist", {}).get("name", "")
-        data = {
-            "json_item": json,
-            "json_artist": artist,
-            "track_id": json["@id"],
-            "index": json.get("position"),
-            "catalognum": json.get("catalognum"),
-        }
-        return cls(**cls.parse_name(data, json["name"], label))
-
     @staticmethod
     def clean_digi_name(name: str) -> Tuple[str, bool]:
         """Clean the track title from digi-only artifacts.
@@ -111,43 +95,59 @@ def find_featuring(data: JSONDict) -> JSONDict:
                     break
         return data
 
-    @staticmethod
-    def parse_name(data: JSONDict, name: str, label: str) -> JSONDict:
-        # remove label from the end of the track name
-        # see https://gutterfunkuk.bandcamp.com/album/gutterfunk-all-subject-to-vibes-various-artists-lp  # noqa
-        if name.endswith(label):
-            name = name.replace(label, "").strip(" -")
-
-        json_artist, artist_digi_only = Track.clean_digi_name(data["json_artist"])
-        name, name_digi_only = Track.clean_digi_name(name)
-        data["digi_only"] = name_digi_only or artist_digi_only
-
-        data["json_artist"] = Helpers.clean_name(json_artist) if json_artist else ""
+    @classmethod
+    def parse_name(cls, name: str, artist: str, index: Optional[int]) -> JSONDict:
+        result: JSONDict = {}
+        artist, artist_digi_only = cls.clean_digi_name(artist)
+        name, name_digi_only = cls.clean_digi_name(name)
+        result["digi_only"] = name_digi_only or artist_digi_only
+
+        if artist:
+            artist = Helpers.clean_name(artist)
         name = Helpers.clean_name(name).strip().lstrip("-")
 
+        # find the track_alt and remove it from the name
         m = PATTERNS["track_alt"].search(name)
         if m:
-            data["track_alt"] = m.group(1).replace(".", "").upper()
+            result["track_alt"] = m.group(1).replace(".", "").upper()
             name = name.replace(m.group(), "")
 
-        if not data.get("catalognum"):
-            # check whether track name contains the catalog number within parens
-            # or square brackets
-            # see https://objection999x.bandcamp.com/album/eruption-va-obj012
-            m = CATNUM_PAT["delimited"].search(name)
-            if m:
-                data["catalognum"] = m.group(1)
-                name = name.replace(m.group(), "").strip()
-        name = re.sub(rf"^0*{data.get('index', 0)}(?!\W\d)\W+", "", name)
+        # check whether track name contains the catalog number within parens
+        # or square brackets
+        # see https://objection999x.bandcamp.com/album/eruption-va-obj012
+        m = CATNUM_PAT["delimited"].search(name)
+        if m:
+            result["catalognum"] = m.group(1)
+            name = name.replace(m.group(), "").strip()
 
+        # Remove leading index
+        if index:
+            name = re.sub(rf"^0*{index}(?!\W\d)\W+", "", name)
+
+        # find the remixer and remove it from the name
         remix = Remix.from_name(name)
         if remix:
-            data.update(remix=remix)
+            result["remix"] = remix
             name = name.replace(remix.delimited, "").rstrip()
 
-        data["name"] = name
-        data = Track.find_featuring(data)
-        return data
+        result["name"] = name
+        return Track.find_featuring({**result, "json_artist": artist})
+
+    @classmethod
+    def make(cls, json: JSONDict, name: str) -> "Track":
+        try:
+            artist = json["inAlbum"]["byArtist"]["name"]
+        except KeyError:
+            artist = json.get("byArtist", {}).get("name", "")
+
+        index = json.get("position")
+        data = {
+            "json_item": json,
+            "track_id": json["@id"],
+            "index": index,
+            **cls.parse_name(name, artist, index),
+        }
+        return cls(**data)
 
     @cached_property
     def duration(self) -> Optional[int]:
@@ -177,6 +177,7 @@ def full_name(self) -> str:
     @cached_property
     def title_without_remix(self) -> str:
         """Split the track name, deduce the title and return it.
+
         The extra complexity here is to ensure that it does not cut off a title
         that ends with ' - -', like in '(DJ) NICK JERSEY - 202memo - - -'.
         """
@@ -199,9 +200,7 @@ def title(self) -> str:
 
     @cached_property
     def artist(self) -> str:
-        """Take the name, remove the title, ensure it does not duplicate any remixers
-        and return the resulting artist.
-        """
+        """Return name without the title and the remixer."""
         title_start_idx = self.full_name.rfind(self.title_without_remix)
         artist = Remix.PATTERN.sub("", self.full_name[:title_start_idx].strip(", -"))
         if self.remix:

diff --git a/beetsplug/bandcamp/track_names.py b/beetsplug/bandcamp/track_names.py
@@ -0,0 +1,180 @@
+"""Module for parsing track names."""
+
+import operator as op
+import re
+from collections import Counter
+from contextlib import suppress
+from dataclasses import dataclass
+from functools import reduce
+from typing import Iterator, List, Optional, Tuple
+
+from ordered_set import OrderedSet
+
+from .helpers import CATNUM_PAT, REMIX
+
+
+@dataclass
+class TrackNames:
+    """Responsible for parsing track names in the entire release context."""
+
+    # Title [Some Album EP]
+    ALBUM_IN_TITLE = re.compile(r"[- ]*\[([^\]]+ [EL]P)\]+", re.I)
+    # Artist | Title: a single special character surrounded by spaces
+    DELIMITER_PAT = re.compile(r" ([^\w&()+/[\] ]) ")
+    # Artist "Title"
+    TITLE_IN_QUOTES = re.compile(r'^(.+[^ -])[ -]+"([^"]+)"$')
+
+    original: List[str]
+    names: List[str]
+    album: Optional[str] = None
+    catalognum: Optional[str] = None
+
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over the parsed track names."""
+        return iter(self.names)
+
+    @classmethod
+    def split_quoted_titles(cls, names: List[str]) -> List[str]:
+        """Split names by their quoted ending when every name has one.
+
+        If there is more than one name and each of them ends with a part in double
+        quotes, like 'Artist "Title"', return them as 'Artist - Title'. Otherwise,
+        return the names unchanged.
+        """
+        if len(names) > 1:
+            matches = list(filter(None, map(cls.TITLE_IN_QUOTES.match, names)))
+            if len(matches) == len(names):
+                return [m.expand(r"\1 - \2") for m in matches]
+
+        return names
+
+    @classmethod
+    def find_common_track_delimiter(cls, names: List[str]) -> str:
+        """Return the track parts delimiter that is in effect in the current release.
+
+        In some (rare) situations track parts are delimited by a pipe character
+        or some UTF-8 equivalent of a dash.
+
+        This checks every track for the first character (see the regex for exclusions)
+        that splits it. The character that splits the most and more than half of
+        the tracks is the character we need. A single track therefore always
+        decides its own delimiter.
+
+        If no such character is found, or if there are no tracks, return a dash '-'.
+        """
+        if not names:
+            return "-"
+
+        def get_delim(string: str) -> str:
+            m = cls.DELIMITER_PAT.search(string)
+            return m.group(1) if m else "-"
+
+        delim, count = Counter(map(get_delim, names)).most_common(1).pop()
+        return delim if count > len(names) / 2 else "-"
+
+    @classmethod
+    def normalize_delimiter(cls, names: List[str]) -> List[str]:
+        """Ensure the same delimiter splits artist and title in all names."""
+        delim = cls.find_common_track_delimiter(names)
+        return [n.replace(f" {delim} ", " - ") for n in names]
+
+    @staticmethod
+    def remove_label(names: List[str], label: str) -> List[str]:
+        """Remove label name from the end of track names.
+
+        Only the trailing occurrence is removed, so a label that also appears
+        elsewhere in the name (for example, as the artist) is kept. The names are
+        returned unchanged when the label is empty.
+
+        See https://gutterfunkuk.bandcamp.com/album/gutterfunk-all-subject-to-vibes-various-artists-lp
+        """
+        if not label:
+            # every string ends with "", so without this check spaces and dashes
+            # would get stripped from every name
+            return names
+
+        return [
+            (n[: -len(label)].strip(" -") if n.endswith(label) else n) for n in names
+        ]
+
+    @staticmethod
+    def eject_common_catalognum(names: List[str]) -> Tuple[Optional[str], List[str]]:
+        """Return catalognum found in every track title.
+
+        1. Split each track name into words
+        2. Find the list of words that are common to all tracks
+        3. Check the *first* and the *last* word for the catalog number
+           - If found, return it and remove it from every track name
+
+        Return None and the unchanged names when there are no names.
+        """
+        catalognum = None
+
+        # reduce() raises TypeError on an empty sequence without an initial value
+        token_sets = [OrderedSet(n.split()) for n in names]
+        common_words = reduce(op.and_, token_sets) if token_sets else None
+        if common_words:
+            matches = (CATNUM_PAT["anywhere"].search(common_words[i]) for i in [0, -1])
+            with suppress(StopIteration):
+                catalognum, word = next((m.group(1), m.string) for m in matches if m)
+                names = [n.replace(word, "").strip() for n in names]
+
+        return catalognum, names
+
+    @staticmethod
+    def parenthesize_remixes(names: List[str]) -> List[str]:
+        """Reformat broken remix titles for an album with a single root title.
+
+        1. Check whether this release has a single root title
+        2. Find remixes that do not have parens around them
+        3. Add parens
+
+        Return the names unchanged when there are none.
+        """
+        # reduce() raises TypeError on an empty sequence without an initial value
+        token_sets = [OrderedSet(n.split()) for n in names]
+        common_words = reduce(op.and_, token_sets) if token_sets else ()
+        joined = " ".join(common_words)
+        if joined in names:  # it is one of the track names (root title)
+            remix_parts = [n.replace(joined, "").lstrip() for n in names]
+            return [
+                (n.replace(rp, f"({rp})") if REMIX.fullmatch(rp) else n)
+                for n, rp in zip(names, remix_parts)
+            ]
+
+        return names
+
+    @classmethod
+    def eject_album_name(cls, names: List[str]) -> Tuple[Optional[str], List[str]]:
+        """Return the album name found in the track names and names without it.
+
+        The album is expected within square brackets and to end with EP or LP,
+        like in 'Title [Some Album EP]'. It is only ejected when exactly one
+        distinct album name is found across the names. Otherwise, return None
+        and the unchanged names.
+        """
+        matches = list(map(cls.ALBUM_IN_TITLE.search, names))
+        albums = {m.group(1).replace('"', "") for m in matches if m}
+        if len(albums) != 1:
+            return None, names
+
+        return albums.pop(), [
+            (n.replace(m.group(), "") if m else n) for m, n in zip(matches, names)
+        ]
+
+    @classmethod
+    def make(cls, original: List[str], label: str) -> "TrackNames":
+        """Parse the original track names and return the resulting TrackNames.
+
+        Quoted titles are split, the delimiter is normalized, the label is removed
+        and remixes are parenthesized, in this order. Then the common catalog number
+        and the album name are ejected from the names.
+        """
+        names = cls.parenthesize_remixes(
+            cls.remove_label(
+                cls.normalize_delimiter(cls.split_quoted_titles(original)), label
+            )
+        )
+        catalognum, names = cls.eject_common_catalognum(names)
+        album, names = cls.eject_album_name(names)
+        return cls(original, names, album=album, catalognum=catalognum)