Scaruffi/scaruffi/api.py

import logging
import re
from dataclasses import dataclass

from bs4 import BeautifulSoup, NavigableString
import requests

import scaruffi.log


# Root of the website; the page URLs below are all derived from it.
SITE_URL = "https://scaruffi.com"
# Page holding the general index of musicians.
GENERAL_INDEX = f"{SITE_URL}/music/groups.html"
# Template of the per-decade ratings pages; the placeholder receives the
# decade as a zero-padded two-digit number through str.format().
RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html"


@dataclass
class Release:
    """A music release, as parsed from an entry of a ratings list.

    Only the title is mandatory: artist defaults to an empty string and year
    to 0 when they are not provided.
    """

    title: str
    artist: str = ""
    year: int = 0  # Usually the release year, not the recording year.


class ScaruffiApi:

    def __init__(self, log_level=logging.WARNING):
        """Create the API client and its logger.

        log_level is forwarded as the `level` argument of
        scaruffi.log.get_logger and defaults to logging.WARNING.
        """
        logger = scaruffi.log.get_logger("scaruffi", level=log_level)
        self.log = logger

    def _get_soup(self, url):
        """Fetch url and return its content parsed as a BeautifulSoup tree.

        Returns None when the page could not be fetched or came back empty.
        """
        markup = self._get_page(url)
        # Parse with the html5lib backend; skip parsing when nothing was fetched.
        return BeautifulSoup(markup, "html5lib") if markup else None

    def _get_page(self, url, timeout=30):
        """Download url with an HTTP GET and return the response body as text.

        timeout is the number of seconds passed to requests.get; without it a
        stalled server would block the call forever. A timeout surfaces as a
        requests exception and is handled like any other request failure.

        Returns None, after logging an error, when the request raises or when
        the server answers with a status code other than 200.
        """
        self.log.debug(f"GET {url}")
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            self.log.error(f"An exception occured during HTTP GET: {exc}")
            return None
        sc = response.status_code
        if sc != 200:
            self.log.error(f"Server returned HTTP response {sc} to {url}.")
            return None
        return response.text

    def get_musicians(self, offset=0, limit=20):
        """Get a list of musicians, or None on error.

        The names are the link texts found in the largest table of the general
        index page. offset is the index of the first name to return and limit
        the maximum number of names; pass limit=None to get every name from
        offset onwards.

        Returns None when the page cannot be fetched or contains no table.
        """
        soup = self._get_soup(GENERAL_INDEX)
        if not soup:
            return None
        tables = soup.find_all("table")
        if not tables:
            # max() raises ValueError on an empty sequence; report the
            # unexpected page layout as an error instead of crashing.
            self.log.error("No table found in the general index.")
            return None
        # Semantic Web? Just find the fattest table.
        mu_table = max(tables, key=lambda t: len(t.text))
        musicians = [a_tag.text for a_tag in mu_table.find_all("a")]
        end = None if limit is None else offset + limit
        return musicians[offset:end]

    def get_ratings(self, decade):
        """Get a dict of ratings to a release list for this decade.

        The decade must be an integer in the [0, 99] range, or a full year
        (1960 for example). Returns None on error.
        """
        if 1900 <= decade:
            decade %= 100
        if not (0 <= decade < 100 and decade % 10 == 0):
            self.log.error(f"Invalid decade value: {decade}.")
            return None
        soup = self._get_soup(RATINGS_DECADES.format(decade))
        if not soup:
            return None
        ratings_table = max(soup.find_all("table"), key=lambda t: len(t.text))
        lists = ratings_table("ul")
        if len(lists) == 1:
            return self._get_ratings_from_unique_list(lists[0])
        else:
            return self._get_ratings_from_lists(lists)

    def _get_ratings_from_unique_list(self, messy_list):
        """Get ratings from decades where one list contains all ratings."""
        ratings = {}
        current_key = None
        for tag in messy_list:
            if isinstance(tag, NavigableString):
                continue
            # Get an entry for the current rating.
            if tag.name == "li":
                release = self._parse_release(tag.text)
                if not current_key:
                    self.log.critical(f"Release {release} without rating.")
                    return None
                ratings[current_key].append(release)
            # Detect a new rating list.
            # Do it after getting entries in tag due to bad HTML.
            text = tag.text.strip()
            if text:
                rating = self._match_rating(text.split()[-1])
                if rating is not None:
                    current_key = rating
                    ratings[current_key] = []
        return ratings

    def _get_ratings_from_lists(self, lists):
        """Get ratings from several lists, one per rating.

        For some decades, there are two "lists of lists": one for albums per
        ratings and one for EP/mini albums per ratings.
        """
        ratings = {}
        rating = None
        for ul in lists:
            for child in ul:
                tag = child.name
                if not tag:
                    continue
                if tag in ("p", "span"):
                    parsed_rating = self._match_rating(child.text)
                    if parsed_rating:
                        rating = parsed_rating
                    if rating not in ratings:
                        ratings[rating] = []
                    continue
                if rating is None:
                    self.log.critical("Failed to find rating tag in list.")
                    return None
                if tag != "li":
                    self.log.warning(f"Unused tag in ratings list: {tag}.")
                    continue
                release = self._parse_release(child.text)
                ratings[rating].append(release)
        return ratings

    # Matches a rating such as "7/10" or "7.5/10", surrounding whitespace
    # allowed; group 1 is the numeric part. The dot is escaped so that only a
    # real decimal point is accepted: an unescaped "." would also match text
    # like "7x5/10", which float() cannot convert.
    RATING_RE = re.compile(r"\s*(\d(\.\d)?)/10\s*")

    def _match_rating(self, text):
        """Return the rating found at the start of text as a float, or None.

        Surrounding whitespace is ignored. Blank text, or text that does not
        start with a rating, gives None.
        """
        candidate = text.strip()
        if not candidate:
            return None
        found = self.RATING_RE.match(candidate)
        return float(found.group(1)) if found else None

    def _parse_release(self, entry):
        """Fill a release fields using entry, as well as we can.

        entry is expected to look like "artist: title (year)". Without any
        ": " separator, the parsed title is used as the artist too. When the
        title/year part cannot be parsed, the raw text is kept as the title
        and the year keeps its default value.

        Always returns a Release.
        """
        entry = entry.strip("\r\n :")  # Remove bogus spaces and colons.
        artist, *title_parts = entry.split(": ")
        if not title_parts:
            self.log.info(f"No colon in {entry}, using both as artist & title.")
            title_and_year = self._parse_release_title_year(entry)
            if not title_and_year:
                return Release(title=entry)
            title, year = title_and_year
            return Release(artist=title, title=title, year=year)
        # Usual case is 2 parts ("artist: title"), but in case one of them
        # contains ": " as well, assume that it is part of the title, not
        # the artist name.
        # Entries written as "artist : title" leave whitespace around the
        # artist name; drop it, as is done for the title.
        artist = artist.strip()
        first, *rest = title_parts
        title_and_year_str = ": ".join([first.strip(), *rest])
        title_and_year = self._parse_release_title_year(title_and_year_str)
        if not title_and_year:
            return Release(artist=artist, title=title_and_year_str)
        title, year = title_and_year
        return Release(artist=artist, title=title, year=year)

    # Captures the title (group 1) and the four-digit year (group 2) of a
    # "title (year)" string. At most one whitespace character may separate the
    # title from the opening parenthesis. A "-NN" suffix after the year (year
    # ranges) is matched but not captured.
    RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)")

    def _parse_release_title_year(self, title_year):
        """Parse title and year in the approximate "title (year)" format.

        In some instances, the year is actually a range of years, in the YYYY-YY
        format. Sometimes there is no space between title and year."""
        match = self.RATING_TITLE_AND_YEAR_RE.match(title_year)
        if not match:
            self.log.error(f"Failed to split title/year in \"{title_year}\".")
            return None
        groups = match.groups()
        if len(groups) != 2 or None in groups:
            self.log.error(f"Failed to parse title/year in \"{title_year}\".")
            return None
        title, year = groups
        try:
            year = int(year)
        except ValueError:
            self.log.error(f"Failed to parse \"{year}\" as an integer.")
            year = 0
        return title, year