From 22b180157740a1f3eff530789e86de5cc2e21018 Mon Sep 17 00:00:00 2001 From: dece Date: Wed, 23 Sep 2020 13:37:33 +0200 Subject: [PATCH] Add ratings per decade methods --- __main__.py | 195 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 183 insertions(+), 12 deletions(-) diff --git a/__main__.py b/__main__.py index 3a86075..41ddbe9 100644 --- a/__main__.py +++ b/__main__.py @@ -1,24 +1,64 @@ #!/usr/bin/env python3 """A simple library to get data from scaruffi.com.""" +import argparse import logging +import re +from dataclasses import dataclass -from bs4 import BeautifulSoup as Soup - +from bs4 import BeautifulSoup, NavigableString import requests import log -LOG = log.get_logger("scaruffi", level=logging.WARNING) -GENERAL_INDEX_URL = "https://scaruffi.com/music/groups.html" +LOG = None + +SITE_URL = "https://scaruffi.com" +GENERAL_INDEX = SITE_URL + "/music/groups.html" +RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html" + + + +@dataclass +class Release: + title: str + artist: str = "" + year: int = 0 # Usually the release year, not the recording year. def main(): - print(get_musicians()) + parser = argparse.ArgumentParser() + parser.add_argument("-v", "--verbose", action="store_true", + help="Print debug logs") + parser.add_argument("-r", "--ratings", type=int, + help="Get ratings for a decade (e.g. 60)") + parser.add_argument("-m", "--musicians", action="store_true", + help="Get the list of musicians") + parser.add_argument("--offset", type=int, default=0, + help="Offset for paginated queries (default is 0)") + parser.add_argument("--limit", type=int, default=20, + help="Limit for paginated queries (default is 20)") + args = parser.parse_args() + + log_level = logging.DEBUG if args.verbose else logging.WARNING + global LOG + LOG = log.get_logger("scaruffi", level=log_level) + + if args.musicians: + musicians = get_musicians(args.offset, args.limit) + for musician in musicians: + print(musician) + elif args.ratings is not None: + ratings = get_ratings(args.ratings) + if ratings: + for rating, releases in ratings.items(): + print(rating) + for rel in releases: + print(f"- {rel.artist} - {rel.title} ({rel.year})") -def _get_url(url): +def _get_page(url): LOG.debug(f"GET {url}") try: response = requests.get(url) @@ -32,16 +72,147 @@ def _get_url(url): return response.text -def get_musicians(offset=0, limit=20): - """Get a list of musicians.""" - html = _get_url(GENERAL_INDEX_URL) +def _get_soup(url): + html = _get_page(url) if not html: return None + return BeautifulSoup(html, "html5lib") - soup = Soup(html, 'html5lib') + +def get_musicians(offset=0, limit=20): + """Get a list of musicians, or None on error.""" + soup = _get_soup(GENERAL_INDEX) + if not soup: + return None # Semantic Web? Just find the fattest table. - mu_table = max(soup.find_all('table'), key=lambda t: len(t.text)) - return [a_tag.text for a_tag in mu_table.find_all("a")] + mu_table = max(soup.find_all("table"), key=lambda t: len(t.text)) + musicians = [a_tag.text for a_tag in mu_table.find_all("a")] + return musicians[offset : offset + limit] + + +def get_ratings(decade): + """Get a dict of ratings to a release list for this decade. + + The decade must be an integer in the [0, 99] range, or a full year + (1960 for example). Returns None on error. + """ + if 1900 <= decade: + decade %= 100 + if not (0 <= decade < 100 and decade % 10 == 0): + LOG.error(f"Invalid decade value: {decade}.") + return None + soup = _get_soup(RATINGS_DECADES.format(decade)) + if not soup: + return None + ratings_table = max(soup.find_all("table"), key=lambda t: len(t.text)) + num_lists = len(ratings_table("ul")) + if num_lists == 1: + return _get_ratings_from_unique_list(ratings_table.ul) + else: + return _get_ratings_from_lists(ratings_table("ul")) + + +def _get_ratings_from_unique_list(messy_list): + """Get ratings from decades where one list contains all ratings.""" + ratings = {} + current_key = None + for tag in messy_list: + if isinstance(tag, NavigableString): + continue + # Get an entry for the current rating. + if tag.name == "li": + release = _parse_release(tag.text) + if not current_key: + LOG.critical(f"Found release {release} without rating.") + return None + ratings[current_key].append(release) + # Detect a new rating list. + # Do it after getting entries in tag due to bad HTML. + text = tag.text.strip() + if text: + rating = _match_rating(text.split()[-1]) + if rating is not None: + current_key = rating + ratings[current_key] = [] + return ratings + + +def _get_ratings_from_lists(lists): + """Get ratings from several lists, one per rating.""" + ratings = {} + for ul in lists: + rating_tag = ul.span + if rating_tag: + rating = _match_rating(rating_tag.text) + if rating is None: + LOG.critical("Failed to find rating tag in list.") + return None + releases = [_parse_release(li.text) for li in ul("li")] + ratings[rating] = releases + return ratings + + +RATING_RE = re.compile(r"\s*(\d(.\d)?)/10\s*") + + +def _match_rating(text): + """Try to match text as a rating and return the rating, or None.""" + if not text.strip(): + return None + match = RATING_RE.match(text.strip()) + if match: + return float(match.group(1)) + + +def _parse_release(entry): + """Fill a release fields using entry, as well as we can.""" + entry = entry.strip("\r\n :") # Remove bogus spaces and colons. + parts = entry.split(": ") + if len(parts) == 1: + LOG.info(f"No colon in {entry}, using both as artist and title.") + title_and_year = _parse_release_title_year(entry) + if not title_and_year: + return Release(title=entry) + title, year = title_and_year + artist = title + else: + # Usual case is 2 parts ("artist: title"), but in case one of them + # contains ": " as well, assume that it is part of the title, not the + # artist name. + artist = parts[0] + title_and_year_str = parts[1].strip() + if len(parts) > 2: + title_and_year_str += ": " + ": ".join(parts[2:]) + title_and_year = _parse_release_title_year(title_and_year_str) + if not title_and_year: + return Release(artist=artist, title=title_and_year_str) + title, year = title_and_year + return Release(artist=artist, title=title, year=year) + + +RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)") + + +def _parse_release_title_year(title_and_year): + """Parse title and year in the approximate "title (year)" format. + + In some instances, the year is actually a range of years, in the YYYY-YY + format. Sometimes there is no space between title and year.""" + match = RATING_TITLE_AND_YEAR_RE.match(title_and_year) + if not match: + LOG.error(f"Failed to split title and year in \"{title_and_year}\".") + return None + groups = match.groups() + if len(groups) != 2 or None in groups: + LOG.error(f"Failed to parse title and year in \"{title_and_year}\".") + return None + title, year = groups + try: + year = int(year) + except ValueError: + LOG.error(f"Failed to parse year string \"{year}\" as an integer.") + year = 0 + return title, year if __name__ == "__main__":