api: put functions in a class

This commit is contained in:
dece 2020-11-05 19:03:31 +01:00
parent 2eb84bd746
commit c54f4f0fe7
2 changed files with 151 additions and 158 deletions

View file

@ -1,3 +1,4 @@
import logging
import re import re
from dataclasses import dataclass from dataclasses import dataclass
@ -7,8 +8,6 @@ import requests
import scaruffi.log import scaruffi.log
LOG = None
SITE_URL = "https://scaruffi.com" SITE_URL = "https://scaruffi.com"
GENERAL_INDEX = SITE_URL + "/music/groups.html" GENERAL_INDEX = SITE_URL + "/music/groups.html"
RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html" RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html"
@ -21,35 +20,33 @@ class Release:
year: int = 0 # Usually the release year, not the recording year. year: int = 0 # Usually the release year, not the recording year.
def setup_logging(*args, **kwargs): class ScaruffiApi:
global LOG
LOG = scaruffi.log.get_logger(*args, **kwargs)
def __init__(self, log_level=logging.WARNING):
self.log = scaruffi.log.get_logger("scaruffi", level=log_level)
def _get_page(url): def _get_soup(self, url):
LOG.debug(f"GET {url}") html = self._get_page(url)
try:
response = requests.get(url)
except requests.exceptions.RequestException as exc:
LOG.error(f"An exception occured during HTTP GET: {exc}")
return None
sc = response.status_code
if sc != 200:
LOG.error(f"Server returned HTTP response {sc} to {url}.")
return None
return response.text
def _get_soup(url):
html = _get_page(url)
if not html: if not html:
return None return None
return BeautifulSoup(html, "html5lib") return BeautifulSoup(html, "html5lib")
def _get_page(self, url):
self.log.debug(f"GET {url}")
try:
response = requests.get(url)
except requests.exceptions.RequestException as exc:
self.log.error(f"An exception occured during HTTP GET: {exc}")
return None
sc = response.status_code
if sc != 200:
self.log.error(f"Server returned HTTP response {sc} to {url}.")
return None
return response.text
def get_musicians(offset=0, limit=20): def get_musicians(self, offset=0, limit=20):
"""Get a list of musicians, or None on error.""" """Get a list of musicians, or None on error."""
soup = _get_soup(GENERAL_INDEX) soup = self._get_soup(GENERAL_INDEX)
if not soup: if not soup:
return None return None
# Semantic Web? Just find the fattest table. # Semantic Web? Just find the fattest table.
@ -57,8 +54,7 @@ def get_musicians(offset=0, limit=20):
musicians = [a_tag.text for a_tag in mu_table.find_all("a")] musicians = [a_tag.text for a_tag in mu_table.find_all("a")]
return musicians[offset : offset + limit] return musicians[offset : offset + limit]
def get_ratings(self, decade):
def get_ratings(decade):
"""Get a dict of ratings to a release list for this decade. """Get a dict of ratings to a release list for this decade.
The decade must be an integer in the [0, 99] range, or a full year The decade must be an integer in the [0, 99] range, or a full year
@ -67,20 +63,19 @@ def get_ratings(decade):
if 1900 <= decade: if 1900 <= decade:
decade %= 100 decade %= 100
if not (0 <= decade < 100 and decade % 10 == 0): if not (0 <= decade < 100 and decade % 10 == 0):
LOG.error(f"Invalid decade value: {decade}.") self.log.error(f"Invalid decade value: {decade}.")
return None return None
soup = _get_soup(RATINGS_DECADES.format(decade)) soup = self._get_soup(RATINGS_DECADES.format(decade))
if not soup: if not soup:
return None return None
ratings_table = max(soup.find_all("table"), key=lambda t: len(t.text)) ratings_table = max(soup.find_all("table"), key=lambda t: len(t.text))
num_lists = len(ratings_table("ul")) num_lists = len(ratings_table("ul"))
if num_lists == 1: if num_lists == 1:
return _get_ratings_from_unique_list(ratings_table.ul) return self._get_ratings_from_unique_list(ratings_table.ul)
else: else:
return _get_ratings_from_lists(ratings_table("ul")) return self._get_ratings_from_lists(ratings_table("ul"))
def _get_ratings_from_unique_list(self, messy_list):
def _get_ratings_from_unique_list(messy_list):
"""Get ratings from decades where one list contains all ratings.""" """Get ratings from decades where one list contains all ratings."""
ratings = {} ratings = {}
current_key = None current_key = None
@ -89,95 +84,89 @@ def _get_ratings_from_unique_list(messy_list):
continue continue
# Get an entry for the current rating. # Get an entry for the current rating.
if tag.name == "li": if tag.name == "li":
release = _parse_release(tag.text) release = self._parse_release(tag.text)
if not current_key: if not current_key:
LOG.critical(f"Found release {release} without rating.") self.log.critical(f"Release {release} without rating.")
return None return None
ratings[current_key].append(release) ratings[current_key].append(release)
# Detect a new rating list. # Detect a new rating list.
# Do it after getting entries in tag due to bad HTML. # Do it after getting entries in tag due to bad HTML.
text = tag.text.strip() text = tag.text.strip()
if text: if text:
rating = _match_rating(text.split()[-1]) rating = self._match_rating(text.split()[-1])
if rating is not None: if rating is not None:
current_key = rating current_key = rating
ratings[current_key] = [] ratings[current_key] = []
return ratings return ratings
def _get_ratings_from_lists(self, lists):
def _get_ratings_from_lists(lists):
"""Get ratings from several lists, one per rating.""" """Get ratings from several lists, one per rating."""
ratings = {} ratings = {}
for ul in lists: for ul in lists:
rating_tag = ul.span rating_tag = ul.span
if rating_tag: if rating_tag:
rating = _match_rating(rating_tag.text) rating = self._match_rating(rating_tag.text)
if rating is None: if rating is None:
LOG.critical("Failed to find rating tag in list.") self.log.critical("Failed to find rating tag in list.")
return None return None
releases = [_parse_release(li.text) for li in ul("li")] releases = [self._parse_release(li.text) for li in ul("li")]
ratings[rating] = releases ratings[rating] = releases
return ratings return ratings
RATING_RE = re.compile(r"\s*(\d(.\d)?)/10\s*")
RATING_RE = re.compile(r"\s*(\d(.\d)?)/10\s*") def _match_rating(self, text):
def _match_rating(text):
"""Try to match text as a rating and return the rating, or None.""" """Try to match text as a rating and return the rating, or None."""
if not text.strip(): if not text.strip():
return None return None
match = RATING_RE.match(text.strip()) match = self.RATING_RE.match(text.strip())
if match: if match:
return float(match.group(1)) return float(match.group(1))
def _parse_release(self, entry):
def _parse_release(entry):
"""Fill a release fields using entry, as well as we can.""" """Fill a release fields using entry, as well as we can."""
entry = entry.strip("\r\n :") # Remove bogus spaces and colons. entry = entry.strip("\r\n :") # Remove bogus spaces and colons.
parts = entry.split(": ") parts = entry.split(": ")
if len(parts) == 1: if len(parts) == 1:
LOG.info(f"No colon in {entry}, using both as artist and title.") self.log.info(f"No colon in {entry}, using both as artist & title.")
title_and_year = _parse_release_title_year(entry) title_and_year = self._parse_release_title_year(entry)
if not title_and_year: if not title_and_year:
return Release(title=entry) return Release(title=entry)
title, year = title_and_year title, year = title_and_year
artist = title artist = title
else: else:
# Usual case is 2 parts ("artist: title"), but in case one of them # Usual case is 2 parts ("artist: title"), but in case one of them
# contains ": " as well, assume that it is part of the title, not the # contains ": " as well, assume that it is part of the title, not
# artist name. # the artist name.
artist = parts[0] artist = parts[0]
title_and_year_str = parts[1].strip() title_and_year_str = parts[1].strip()
if len(parts) > 2: if len(parts) > 2:
title_and_year_str += ": " + ": ".join(parts[2:]) title_and_year_str += ": " + ": ".join(parts[2:])
title_and_year = _parse_release_title_year(title_and_year_str) title_and_year = self._parse_release_title_year(title_and_year_str)
if not title_and_year: if not title_and_year:
return Release(artist=artist, title=title_and_year_str) return Release(artist=artist, title=title_and_year_str)
title, year = title_and_year title, year = title_and_year
return Release(artist=artist, title=title, year=year) return Release(artist=artist, title=title, year=year)
RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)")
RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)") def _parse_release_title_year(self, title_year):
def _parse_release_title_year(title_and_year):
"""Parse title and year in the approximate "title (year)" format. """Parse title and year in the approximate "title (year)" format.
In some instances, the year is actually a range of years, in the YYYY-YY In some instances, the year is actually a range of years, in the YYYY-YY
format. Sometimes there is no space between title and year.""" format. Sometimes there is no space between title and year."""
match = RATING_TITLE_AND_YEAR_RE.match(title_and_year) match = self.RATING_TITLE_AND_YEAR_RE.match(title_year)
if not match: if not match:
LOG.error(f"Failed to split title and year in \"{title_and_year}\".") self.log.error(f"Failed to split title/year in \"{title_year}\".")
return None return None
groups = match.groups() groups = match.groups()
if len(groups) != 2 or None in groups: if len(groups) != 2 or None in groups:
LOG.error(f"Failed to parse title and year in \"{title_and_year}\".") self.log.error(f"Failed to parse title/year in \"{title_year}\".")
return None return None
title, year = groups title, year = groups
try: try:
year = int(year) year = int(year)
except ValueError: except ValueError:
LOG.error(f"Failed to parse year string \"{year}\" as an integer.") self.log.error(f"Failed to parse \"{year}\" as an integer.")
year = 0 year = 0
return title, year return title, year

View file

@ -1,21 +1,25 @@
import logging
import unittest import unittest
from scaruffi import api from scaruffi.api import ScaruffiApi
class TestScaruffi(unittest.TestCase): class TestScaruffi(unittest.TestCase):
def setUpClass(): def setUp(self):
api.setup_logging("test") self.api = ScaruffiApi()
def tearDown(self):
self.api = None
def test_get_musicians(self): def test_get_musicians(self):
musicians = api.get_musicians() musicians = self.api.get_musicians()
self.assertEqual(len(musicians), 20) self.assertEqual(len(musicians), 20)
def test_get_ratings(self): def test_get_ratings(self):
self.assertIsNotNone(api.get_ratings(1960)) self.assertIsNotNone(self.api.get_ratings(1960))
self.assertIsNotNone(api.get_ratings(1970)) self.assertIsNotNone(self.api.get_ratings(1970))
self.assertIsNotNone(api.get_ratings(1980)) self.assertIsNotNone(self.api.get_ratings(1980))
self.assertIsNotNone(api.get_ratings(1990)) self.assertIsNotNone(self.api.get_ratings(1990))
self.assertIsNotNone(api.get_ratings(2000)) self.assertIsNotNone(self.api.get_ratings(2000))
self.assertIsNotNone(api.get_ratings(2010)) self.assertIsNotNone(self.api.get_ratings(2010))