From 415af8de988d873b27073fd254f5746b3174cc67 Mon Sep 17 00:00:00 2001
From: Adrien Abraham
Date: Wed, 28 Sep 2022 19:04:09 +0200
Subject: [PATCH] scrape-djtracklists

---
 scrape-djtracklists.py | 121 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/scrape-djtracklists.py b/scrape-djtracklists.py
index 1aa809d..e965cae 100755
--- a/scrape-djtracklists.py
+++ b/scrape-djtracklists.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
-"""Download tracklists from djtracklists.com."""
+"""Download tracklists from djtracklists.com as CSV."""
 
 import argparse
+import csv
+import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
@@ -12,31 +14,71 @@ from bs4 import BeautifulSoup
 
 
 @dataclass
 class Track:
+    """One track, parsed from a tracklist page."""
     title: str
     artists: list[str]
     mix: Optional[str]
     mix_artists: Optional[list[str]]
     timestamp: str
 
+    def format_artists(self):
+        return " & ".join(self.artists)
+
+    def format_mix_artists(self):
+        return " & ".join(self.mix_artists) if self.mix_artists else ""
+
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-s", "--series", help="download this series")
-    parser.add_argument("-t", "--tracklist", help="download this tracklist")
+    parser.add_argument("-s", "--series",
+                        help="download this series (provide URL of 1st page)")
+    parser.add_argument("-t", "--tracklist",
+                        help="download this tracklist (provide URL)")
+    parser.add_argument("--pretty", help="pretty print a CSV file.")
     args = parser.parse_args()
-    if tracklist_url := args.tracklist:
-        tracklist = download_tracklist(tracklist_url)
-        name = tracklist_url.rstrip("/").rsplit("/", maxsplit=1)[-1]
-        file_name = Path.cwd() / (name + ".txt")
-        save_tracklist(tracklist, file_name)
+    if csv_file_name := args.pretty:
+        pretty_print_csv(csv_file_name)
+    elif series_url := args.series:
+        download_series(series_url)
+    elif tracklist_url := args.tracklist:
+        download_tracklist(tracklist_url)
+
+
+def download_series(series_url: str):
+    while series_url:
+        print("Processing series URL", series_url)
+        response = requests.get(series_url, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Download all tracklists.
+        for tracklist_link in soup.find_all("a", class_="mix"):
+            tracklist_url = tracklist_link["href"]
+            print("Processing tracklist URL", tracklist_url)
+            download_tracklist(tracklist_url)
+            time.sleep(1)  # throttle
+        # Look for the next page button.
+        for page_link in soup.find_all("a", class_="pagenumber"):
+            if "Next" in page_link.string:
+                series_url = page_link["href"]
+                break
+        else:
+            series_url = None
+
+
+def download_tracklist(url: str):
+    tracklist = get_tracklist_from_url(url)
+    name = url.rstrip("/").rsplit("/", maxsplit=1)[-1]
+    file_name = Path.cwd() / (name + ".csv")
+    save_tracklist_as_csv(tracklist, file_name)
 
 
 def is_track_row(css_class: str) -> bool:
     return css_class in ("on", "off")
 
 
-def download_tracklist(url: str) -> list[Track]:
+def get_tracklist_from_url(url: str) -> list[Track]:
+    """Get tracklist from the Web and parse it into a list of Track objects."""
     response = requests.get(url, timeout=10)
     response.raise_for_status()
     soup = BeautifulSoup(response.text, "html.parser")
@@ -44,13 +86,22 @@ def download_tracklist(url: str) -> list[Track]:
     for row in soup.find_all("div", class_=is_track_row):
         artists = []
         mix_artists = []
+
         try:
             title = row.find("a", class_="track").string
             mix = row.find("a", class_="release").string
         except AttributeError:
-            title = row.find("b").string
+            try:
+                title = row.find("b").string
+            except AttributeError:
+                title = "(unknown title)"
             mix = None
-        timestamp = row.find("span", class_="index_time").string
+
+        try:
+            timestamp = row.find("span", class_="index_time").string
+        except AttributeError:
+            timestamp = "(unknown timestamp)"
+
         for artist in row.find_all("a", class_="artist"):
             prev_tag = artist.previous_sibling.string
             if getattr(prev_tag, "string", "").strip() == "remixed by":
@@ -69,21 +120,49 @@ def download_tracklist(url: str) -> list[Track]:
     return tracklist
 
 
-def save_tracklist(tracklist: list[Track], file_name: Path):
+def save_tracklist_as_csv(tracklist: list[Track], file_name: Path):
     try:
-        with open(file_name, "wt", encoding="utf8") as file:
+        with open(file_name, "wt", encoding="utf8", newline="") as file:
+            writer = csv.writer(file)
+            writer.writerow(
+                ["Timestamp", "Artists", "Title", "Mix", "Remix artists"]
+            )
             for track in tracklist:
-                artists = " & ".join(track.artists)
-                line = f"{track.timestamp} — {artists} — {track.title}"
-                if track.mix:
-                    line += f" ({track.mix})"
-                if track.mix_artists:
-                    mix_artists = " & ".join(track.mix_artists)
-                    line += f" remixed by {mix_artists}"
-                file.write(line + "\n")
+                writer.writerow([
+                    track.timestamp,
+                    track.format_artists(),
+                    track.title,
+                    track.mix or "",
+                    track.format_mix_artists(),
+                ])
     except OSError as exc:
         print(f"Can't save tracklist: {exc}")
 
 
+def pretty_print_csv(csv_file_name: str):
+    try:
+        with open(csv_file_name, "rt", encoding="utf8", newline="") as file:
+            reader = csv.reader(file)
+            first_line_skipped = False
+            for row in reader:
+                if not first_line_skipped:
+                    first_line_skipped = True
+                    continue
+                ts, artists, title, mix, remix_artists = row
+                if ":" in ts:
+                    ts_min, ts_sec = ts.split(":")
+                    ts_min = int(ts_min)
+                    ts_h, ts_min = ts_min // 60, ts_min % 60
+                    ts = f"{ts_h:02}:{ts_min:02}:{ts_sec}"
+                print(f"{ts} {artists or '(unknown)'} — {title}", end="")
+                if mix:
+                    print(f" ({mix})", end="")
+                if remix_artists:
+                    print(f" by {remix_artists}", end="")
+                print()
+    except OSError as exc:
+        print(f"Can't read CSV: {exc}")
+
+
 if __name__ == "__main__":
     main()