From c54f4f0fe7615a224864ffc755f9e3bcb3c81368 Mon Sep 17 00:00:00 2001 From: dece Date: Thu, 5 Nov 2020 19:03:31 +0100 Subject: [PATCH] api: put functions in a class --- scaruffi/api.py | 309 ++++++++++++++++++++++------------------------ scaruffi/tests.py | 24 ++-- 2 files changed, 163 insertions(+), 170 deletions(-) diff --git a/scaruffi/api.py b/scaruffi/api.py index a9c4256..e0ebb75 100644 --- a/scaruffi/api.py +++ b/scaruffi/api.py @@ -1,3 +1,4 @@ +import logging import re from dataclasses import dataclass @@ -7,8 +8,6 @@ import requests import scaruffi.log -LOG = None - SITE_URL = "https://scaruffi.com" GENERAL_INDEX = SITE_URL + "/music/groups.html" RATINGS_DECADES = SITE_URL + "/ratings/{:02}.html" @@ -21,163 +20,153 @@ class Release: year: int = 0 # Usually the release year, not the recording year. -def setup_logging(*args, **kwargs): - global LOG - LOG = scaruffi.log.get_logger(*args, **kwargs) - - -def _get_page(url): - LOG.debug(f"GET {url}") - try: - response = requests.get(url) - except requests.exceptions.RequestException as exc: - LOG.error(f"An exception occured during HTTP GET: {exc}") - return None - sc = response.status_code - if sc != 200: - LOG.error(f"Server returned HTTP response {sc} to {url}.") - return None - return response.text - - -def _get_soup(url): - html = _get_page(url) - if not html: - return None - return BeautifulSoup(html, "html5lib") - - -def get_musicians(offset=0, limit=20): - """Get a list of musicians, or None on error.""" - soup = _get_soup(GENERAL_INDEX) - if not soup: - return None - # Semantic Web? Just find the fattest table. - mu_table = max(soup.find_all("table"), key=lambda t: len(t.text)) - musicians = [a_tag.text for a_tag in mu_table.find_all("a")] - return musicians[offset : offset + limit] - - -def get_ratings(decade): - """Get a dict of ratings to a release list for this decade. - - The decade must be an integer in the [0, 99] range, or a full year - (1960 for example). Returns None on error. - """ - if 1900 <= decade: - decade %= 100 - if not (0 <= decade < 100 and decade % 10 == 0): - LOG.error(f"Invalid decade value: {decade}.") - return None - soup = _get_soup(RATINGS_DECADES.format(decade)) - if not soup: - return None - ratings_table = max(soup.find_all("table"), key=lambda t: len(t.text)) - num_lists = len(ratings_table("ul")) - if num_lists == 1: - return _get_ratings_from_unique_list(ratings_table.ul) - else: - return _get_ratings_from_lists(ratings_table("ul")) - - -def _get_ratings_from_unique_list(messy_list): - """Get ratings from decades where one list contains all ratings.""" - ratings = {} - current_key = None - for tag in messy_list: - if isinstance(tag, NavigableString): - continue - # Get an entry for the current rating. - if tag.name == "li": - release = _parse_release(tag.text) - if not current_key: - LOG.critical(f"Found release {release} without rating.") +class ScaruffiApi: + + def __init__(self, log_level=logging.WARNING): + self.log = scaruffi.log.get_logger("scaruffi", level=log_level) + + def _get_soup(self, url): + html = self._get_page(url) + if not html: + return None + return BeautifulSoup(html, "html5lib") + + def _get_page(self, url): + self.log.debug(f"GET {url}") + try: + response = requests.get(url) + except requests.exceptions.RequestException as exc: + self.log.error(f"An exception occured during HTTP GET: {exc}") + return None + sc = response.status_code + if sc != 200: + self.log.error(f"Server returned HTTP response {sc} to {url}.") + return None + return response.text + + def get_musicians(self, offset=0, limit=20): + """Get a list of musicians, or None on error.""" + soup = self._get_soup(GENERAL_INDEX) + if not soup: + return None + # Semantic Web? Just find the fattest table. + mu_table = max(soup.find_all("table"), key=lambda t: len(t.text)) + musicians = [a_tag.text for a_tag in mu_table.find_all("a")] + return musicians[offset : offset + limit] + + def get_ratings(self, decade): + """Get a dict of ratings to a release list for this decade. + + The decade must be an integer in the [0, 99] range, or a full year + (1960 for example). Returns None on error. + """ + if 1900 <= decade: + decade %= 100 + if not (0 <= decade < 100 and decade % 10 == 0): + self.log.error(f"Invalid decade value: {decade}.") + return None + soup = self._get_soup(RATINGS_DECADES.format(decade)) + if not soup: + return None + ratings_table = max(soup.find_all("table"), key=lambda t: len(t.text)) + num_lists = len(ratings_table("ul")) + if num_lists == 1: + return self._get_ratings_from_unique_list(ratings_table.ul) + else: + return self._get_ratings_from_lists(ratings_table("ul")) + + def _get_ratings_from_unique_list(self, messy_list): + """Get ratings from decades where one list contains all ratings.""" + ratings = {} + current_key = None + for tag in messy_list: + if isinstance(tag, NavigableString): + continue + # Get an entry for the current rating. + if tag.name == "li": + release = self._parse_release(tag.text) + if not current_key: + self.log.critical(f"Release {release} without rating.") + return None + ratings[current_key].append(release) + # Detect a new rating list. + # Do it after getting entries in tag due to bad HTML. + text = tag.text.strip() + if text: + rating = self._match_rating(text.split()[-1]) + if rating is not None: + current_key = rating + ratings[current_key] = [] + return ratings + + def _get_ratings_from_lists(self, lists): + """Get ratings from several lists, one per rating.""" + ratings = {} + for ul in lists: + rating_tag = ul.span + if rating_tag: + rating = self._match_rating(rating_tag.text) + if rating is None: + self.log.critical("Failed to find rating tag in list.") return None - ratings[current_key].append(release) - # Detect a new rating list. - # Do it after getting entries in tag due to bad HTML. - text = tag.text.strip() - if text: - rating = _match_rating(text.split()[-1]) - if rating is not None: - current_key = rating - ratings[current_key] = [] - return ratings - - -def _get_ratings_from_lists(lists): - """Get ratings from several lists, one per rating.""" - ratings = {} - for ul in lists: - rating_tag = ul.span - if rating_tag: - rating = _match_rating(rating_tag.text) - if rating is None: - LOG.critical("Failed to find rating tag in list.") + releases = [self._parse_release(li.text) for li in ul("li")] + ratings[rating] = releases + return ratings + + RATING_RE = re.compile(r"\s*(\d(.\d)?)/10\s*") + + def _match_rating(self, text): + """Try to match text as a rating and return the rating, or None.""" + if not text.strip(): + return None + match = self.RATING_RE.match(text.strip()) + if match: + return float(match.group(1)) + + def _parse_release(self, entry): + """Fill a release fields using entry, as well as we can.""" + entry = entry.strip("\r\n :") # Remove bogus spaces and colons. + parts = entry.split(": ") + if len(parts) == 1: + self.log.info(f"No colon in {entry}, using both as artist & title.") + title_and_year = self._parse_release_title_year(entry) + if not title_and_year: + return Release(title=entry) + title, year = title_and_year + artist = title + else: + # Usual case is 2 parts ("artist: title"), but in case one of them + # contains ": " as well, assume that it is part of the title, not + # the artist name. + artist = parts[0] + title_and_year_str = parts[1].strip() + if len(parts) > 2: + title_and_year_str += ": " + ": ".join(parts[2:]) + title_and_year = self._parse_release_title_year(title_and_year_str) + if not title_and_year: + return Release(artist=artist, title=title_and_year_str) + title, year = title_and_year + return Release(artist=artist, title=title, year=year) + + RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)") + + def _parse_release_title_year(self, title_year): + """Parse title and year in the approximate "title (year)" format. + + In some instances, the year is actually a range of years, in the YYYY-YY + format. Sometimes there is no space between title and year.""" + match = self.RATING_TITLE_AND_YEAR_RE.match(title_year) + if not match: + self.log.error(f"Failed to split title/year in \"{title_year}\".") + return None + groups = match.groups() + if len(groups) != 2 or None in groups: + self.log.error(f"Failed to parse title/year in \"{title_year}\".") return None - releases = [_parse_release(li.text) for li in ul("li")] - ratings[rating] = releases - return ratings - - -RATING_RE = re.compile(r"\s*(\d(.\d)?)/10\s*") - - -def _match_rating(text): - """Try to match text as a rating and return the rating, or None.""" - if not text.strip(): - return None - match = RATING_RE.match(text.strip()) - if match: - return float(match.group(1)) - - -def _parse_release(entry): - """Fill a release fields using entry, as well as we can.""" - entry = entry.strip("\r\n :") # Remove bogus spaces and colons. - parts = entry.split(": ") - if len(parts) == 1: - LOG.info(f"No colon in {entry}, using both as artist and title.") - title_and_year = _parse_release_title_year(entry) - if not title_and_year: - return Release(title=entry) - title, year = title_and_year - artist = title - else: - # Usual case is 2 parts ("artist: title"), but in case one of them - # contains ": " as well, assume that it is part of the title, not the - # artist name. - artist = parts[0] - title_and_year_str = parts[1].strip() - if len(parts) > 2: - title_and_year_str += ": " + ": ".join(parts[2:]) - title_and_year = _parse_release_title_year(title_and_year_str) - if not title_and_year: - return Release(artist=artist, title=title_and_year_str) - title, year = title_and_year - return Release(artist=artist, title=title, year=year) - - -RATING_TITLE_AND_YEAR_RE = re.compile(r"(.+?)\s?\((\d{4})(?:-\d+)?\)") - - -def _parse_release_title_year(title_and_year): - """Parse title and year in the approximate "title (year)" format. - - In some instances, the year is actually a range of years, in the YYYY-YY - format. Sometimes there is no space between title and year.""" - match = RATING_TITLE_AND_YEAR_RE.match(title_and_year) - if not match: - LOG.error(f"Failed to split title and year in \"{title_and_year}\".") - return None - groups = match.groups() - if len(groups) != 2 or None in groups: - LOG.error(f"Failed to parse title and year in \"{title_and_year}\".") - return None - title, year = groups - try: - year = int(year) - except ValueError: - LOG.error(f"Failed to parse year string \"{year}\" as an integer.") - year = 0 - return title, year + title, year = groups + try: + year = int(year) + except ValueError: + self.log.error(f"Failed to parse \"{year}\" as an integer.") + year = 0 + return title, year diff --git a/scaruffi/tests.py b/scaruffi/tests.py index b3fa9e7..9c1619c 100644 --- a/scaruffi/tests.py +++ b/scaruffi/tests.py @@ -1,21 +1,25 @@ +import logging import unittest -from scaruffi import api +from scaruffi.api import ScaruffiApi class TestScaruffi(unittest.TestCase): - def setUpClass(): - api.setup_logging("test") + def setUp(self): + self.api = ScaruffiApi() + + def tearDown(self): + self.api = None def test_get_musicians(self): - musicians = api.get_musicians() + musicians = self.api.get_musicians() self.assertEqual(len(musicians), 20) def test_get_ratings(self): - self.assertIsNotNone(api.get_ratings(1960)) - self.assertIsNotNone(api.get_ratings(1970)) - self.assertIsNotNone(api.get_ratings(1980)) - self.assertIsNotNone(api.get_ratings(1990)) - self.assertIsNotNone(api.get_ratings(2000)) - self.assertIsNotNone(api.get_ratings(2010)) + self.assertIsNotNone(self.api.get_ratings(1960)) + self.assertIsNotNone(self.api.get_ratings(1970)) + self.assertIsNotNone(self.api.get_ratings(1980)) + self.assertIsNotNone(self.api.get_ratings(1990)) + self.assertIsNotNone(self.api.get_ratings(2000)) + self.assertIsNotNone(self.api.get_ratings(2010))