From 559646bc1147c2a50cef510ec484d0bffbf85380 Mon Sep 17 00:00:00 2001 From: Adrien Abraham Date: Fri, 23 Sep 2022 18:50:16 +0200 Subject: [PATCH] scrape-djtracklists: WIP --- scrape-djtracklists.py | 66 +++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/scrape-djtracklists.py b/scrape-djtracklists.py index 18ab845..1aa809d 100755 --- a/scrape-djtracklists.py +++ b/scrape-djtracklists.py @@ -2,11 +2,23 @@ """Download tracklists from djtracklists.com.""" import argparse +from dataclasses import dataclass +from pathlib import Path +from typing import Optional import requests from bs4 import BeautifulSoup +@dataclass +class Track: + title: str + artists: list[str] + mix: Optional[str] + mix_artists: Optional[list[str]] + timestamp: str + + def main(): parser = argparse.ArgumentParser() parser.add_argument("-s", "--series", help="download this series") @@ -14,31 +26,63 @@ def main(): args = parser.parse_args() if tracklist_url := args.tracklist: - download_tracklist(tracklist_url) + tracklist = download_tracklist(tracklist_url) + name = tracklist_url.rstrip("/").rsplit("/", maxsplit=1)[-1] + file_name = Path.cwd() / (name + ".txt") + save_tracklist(tracklist, file_name) def is_track_row(css_class: str) -> bool: - return css_class == "on" or css_class == "off" + return css_class in ("on", "off") -def download_tracklist(url: str): - response = requests.get(url) +def download_tracklist(url: str) -> list[Track]: + response = requests.get(url, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") + tracklist = [] for row in soup.find_all("div", class_=is_track_row): - print("*" * 80) + artists = [] + mix_artists = [] try: - print("track", row.find("a", class_="track").string) - print("release", row.find("a", class_="release").string) + title = row.find("a", class_="track").string + mix = row.find("a", class_="release").string except AttributeError: - print("track", row.find("b").string) + title = row.find("b").string + mix = None + timestamp = row.find("span", class_="index_time").string for artist in row.find_all("a", class_="artist"): prev_tag = artist.previous_sibling.string if getattr(prev_tag, "string", "").strip() == "remixed by": - print("remixing artist", artist.string) + mix_artists.append(artist.string) else: - print("artist", artist.string) - print("\n" * 10) + artists.append(artist.string) + tracklist.append( + Track( + title=title, + mix=mix, + artists=artists, + mix_artists=mix_artists or None, + timestamp=timestamp, + ) + ) + return tracklist + + +def save_tracklist(tracklist: list[Track], file_name: Path): + try: + with open(file_name, "wt", encoding="utf8") as file: + for track in tracklist: + artists = " & ".join(track.artists) + line = f"{track.timestamp} — {artists} — {track.title}" + if track.mix: + line += f" ({track.mix})" + if track.mix_artists: + mix_artists = " & ".join(track.mix_artists) + line += f" remixed by {mix_artists}" + file.write(line + "\n") + except OSError as exc: + print(f"Can't save tracklist: {exc}") if __name__ == "__main__":