Add ratings per decade methods

This commit is contained in:
dece 2020-09-23 13:37:33 +02:00
parent a3ff6d8c4a
commit 22b1801577

View file

@ -1,24 +1,64 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""A simple library to get data from scaruffi.com.""" """A simple library to get data from scaruffi.com."""
import argparse
import logging import logging
import re
from dataclasses import dataclass
from bs4 import BeautifulSoup as Soup from bs4 import BeautifulSoup, NavigableString
import requests import requests
import log import log
# Default logger, so that the library functions can log when this module is
# imported rather than run as a script. main() rebinds LOG with the level
# selected on the command line.
LOG = log.get_logger("scaruffi", level=logging.WARNING)

SITE_URL = "https://scaruffi.com"
# Page listing the musicians.
GENERAL_INDEX = SITE_URL + "/music/groups.html"
# Ratings page template, formatted with the decade as two digits.
RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html"
@dataclass
class Release:
    """A release entry: a title, plus artist and year when they are known."""

    title: str
    artist: str = ""
    # Usually the release year, not the recording year.
    year: int = 0
def main():
    """Command-line entry point: print musicians or ratings for a decade."""
    global LOG

    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print debug logs")
    parser.add_argument("-r", "--ratings", type=int,
                        help="Get ratings for a decade (e.g. 60)")
    parser.add_argument("-m", "--musicians", action="store_true",
                        help="Get the list of musicians")
    parser.add_argument("--offset", type=int, default=0,
                        help="Offset for paginated queries (default is 0)")
    parser.add_argument("--limit", type=int, default=20,
                        help="Limit for paginated queries (default is 20)")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.WARNING
    LOG = log.get_logger("scaruffi", level=log_level)

    if args.musicians:
        musicians = get_musicians(args.offset, args.limit)
        # get_musicians returns None on error; do not iterate over None.
        if musicians is None:
            return
        for musician in musicians:
            print(musician)
    elif args.ratings is not None:
        ratings = get_ratings(args.ratings)
        # get_ratings returns None on error; an empty dict prints nothing.
        if ratings:
            for rating, releases in ratings.items():
                print(rating)
                for rel in releases:
                    print(f"- {rel.artist} - {rel.title} ({rel.year})")
def _get_url(url): def _get_page(url):
LOG.debug(f"GET {url}") LOG.debug(f"GET {url}")
try: try:
response = requests.get(url) response = requests.get(url)
@ -32,16 +72,147 @@ def _get_url(url):
return response.text return response.text
def _get_soup(url):
    """Fetch url and parse it with html5lib.

    Returns None when _get_page gives back nothing for this URL.
    """
    page = _get_page(url)
    return BeautifulSoup(page, "html5lib") if page else None
def get_musicians(offset=0, limit=20):
    """Get a list of musicians, or None on error.

    The full list is read from the general index page, then sliced to
    return at most `limit` names starting at position `offset`.
    """
    soup = _get_soup(GENERAL_INDEX)
    if not soup:
        return None

    # Semantic Web? Just find the fattest table.
    # default=None avoids a ValueError when the page contains no table.
    mu_table = max(
        soup.find_all("table"), key=lambda t: len(t.text), default=None
    )
    if mu_table is None:
        LOG.error("No table found in the musicians index.")
        return None

    musicians = [a_tag.text for a_tag in mu_table.find_all("a")]
    return musicians[offset : offset + limit]
def get_ratings(decade):
    """Get a dict of ratings to a release list for this decade.

    The decade must be a multiple of ten in the [0, 90] range (60 for
    example), or a full year from 1900 on (1960 for example), which is
    reduced modulo 100 before validation. Returns None on error.
    """
    if 1900 <= decade:
        decade %= 100
    if not (0 <= decade < 100 and decade % 10 == 0):
        LOG.error(f"Invalid decade value: {decade}.")
        return None

    soup = _get_soup(RATINGS_DECADES.format(decade))
    if not soup:
        return None

    # Same heuristic as for musicians: the biggest table holds the data.
    # default=None avoids a ValueError when the page contains no table.
    ratings_table = max(
        soup.find_all("table"), key=lambda t: len(t.text), default=None
    )
    if ratings_table is None:
        LOG.error("No table found in the ratings page.")
        return None

    # Some decades use one list for every rating, others one list each.
    lists = ratings_table("ul")
    if len(lists) == 1:
        return _get_ratings_from_unique_list(lists[0])
    return _get_ratings_from_lists(lists)
def _get_ratings_from_unique_list(messy_list):
    """Get ratings from decades where one list contains all ratings.

    Walks the children of `messy_list` in order. Each "li" child is parsed
    as a release for the rating seen last; any child whose last word looks
    like a rating starts the next group. Returns a dict mapping ratings to
    lists of releases, or None if a release shows up before any rating.
    """
    ratings = {}
    current_key = None
    for tag in messy_list:
        if isinstance(tag, NavigableString):
            continue

        # Get an entry for the current rating.
        if tag.name == "li":
            release = _parse_release(tag.text)
            # Compare with None: a 0.0 rating is falsy but still valid.
            if current_key is None:
                LOG.critical(f"Found release {release} without rating.")
                return None
            ratings[current_key].append(release)

        # Detect a new rating list.
        # Do it after getting entries in tag due to bad HTML.
        text = tag.text.strip()
        if text:
            rating = _match_rating(text.split()[-1])
            if rating is not None:
                current_key = rating
                # Keep releases already collected if a rating is repeated.
                ratings.setdefault(current_key, [])
    return ratings
def _get_ratings_from_lists(lists):
    """Get ratings from several lists, one per rating.

    The rating of each list is read from its first "span" tag and its
    releases from its "li" tags. Returns a dict mapping ratings to lists of
    releases, or None as soon as a list has no readable rating.
    """
    ratings = {}
    for ul in lists:
        rating_tag = ul.span
        # Compute the rating afresh for every list, so that a list without
        # a span can neither reuse the previous rating nor hit an unbound
        # variable.
        rating = None
        if rating_tag is not None:
            rating = _match_rating(rating_tag.text)
        if rating is None:
            LOG.critical("Failed to find rating tag in list.")
            return None
        ratings[rating] = [_parse_release(li.text) for li in ul("li")]
    return ratings
RATING_RE = re.compile(r"\s*(\d(.\d)?)/10\s*")
def _match_rating(text):
"""Try to match text as a rating and return the rating, or None."""
if not text.strip():
return None
match = RATING_RE.match(text.strip())
if match:
return float(match.group(1))
def _parse_release(entry):
    """Build a Release from a raw list entry, as well as we can.

    The expected format is "artist: title (year)". Without any ": " in the
    entry, the parsed title is used for the artist as well. When title and
    year cannot be split, the unparsed text is kept as the title.
    """
    entry = entry.strip("\r\n :")  # Remove bogus spaces and colons.
    artist, *title_parts = entry.split(": ")

    if not title_parts:
        LOG.info(f"No colon in {entry}, using both as artist and title.")
        parsed = _parse_release_title_year(entry)
        if not parsed:
            return Release(title=entry)
        title, year = parsed
        return Release(artist=title, title=title, year=year)

    # Usual case is a single part after the artist, but when there are more
    # ": " separators, assume that they belong to the title, not the artist
    # name. Only the first title part is stripped.
    raw_title = ": ".join([title_parts[0].strip(), *title_parts[1:]])
    parsed = _parse_release_title_year(raw_title)
    if not parsed:
        return Release(artist=artist, title=raw_title)
    title, year = parsed
    return Release(artist=artist, title=title, year=year)
RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)")
def _parse_release_title_year(title_and_year):
"""Parse title and year in the approximate "title (year)" format.
In some instances, the year is actually a range of years, in the YYYY-YY
format. Sometimes there is no space between title and year."""
match = RATING_TITLE_AND_YEAR_RE.match(title_and_year)
if not match:
LOG.error(f"Failed to split title and year in \"{title_and_year}\".")
return None
groups = match.groups()
if len(groups) != 2 or None in groups:
LOG.error(f"Failed to parse title and year in \"{title_and_year}\".")
return None
title, year = groups
try:
year = int(year)
except ValueError:
LOG.error(f"Failed to parse year string \"{year}\" as an integer.")
year = 0
return title, year
if __name__ == "__main__": if __name__ == "__main__":