1
0
Fork 0

scrape-djtracklists

This commit is contained in:
Adrien Abraham 2022-09-28 19:04:09 +02:00
parent 559646bc11
commit 415af8de98

View file

@ -1,7 +1,9 @@
#!/usr/bin/env python3
"""Download tracklists from djtracklists.com."""
"""Download tracklists from djtracklists.com as CSV."""
import argparse
import csv
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@ -12,31 +14,71 @@ from bs4 import BeautifulSoup
@dataclass
class Track:
    """One track, parsed from a tracklist page."""

    # Track title as shown on the page.
    title: str
    # Names of the credited artists, in page order.
    artists: list[str]
    # Name of the mix/release of the track, or None when the page has none.
    mix: Optional[str]
    # Names of the remix artists; None or empty when there are none.
    mix_artists: Optional[list[str]]
    # Position of the track in the set, kept as the raw string from the page.
    timestamp: str

    def format_artists(self) -> str:
        """Return the artists joined with " & " (empty string if none)."""
        return " & ".join(self.artists)

    def format_mix_artists(self) -> str:
        """Return the remix artists joined with " & ".

        Returns an empty string when ``mix_artists`` is None or empty.
        """
        if not self.mix_artists:
            return ""
        return " & ".join(self.mix_artists)
def main(argv: Optional[list[str]] = None):
    """Parse the command line and run the requested action.

    Exactly one action is performed, in this order of precedence:
    ``--pretty`` (pretty print an existing CSV file), ``--series``
    (download every tracklist of a series), ``--tracklist`` (download a
    single tracklist). When no option is given, the usage help is printed.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]`` when None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s", "--series",
        help="download this series (provide URL of 1st page)",
    )
    parser.add_argument(
        "-t", "--tracklist",
        help="download this tracklist (provide URL)",
    )
    parser.add_argument("--pretty", help="pretty print a CSV file.")
    args = parser.parse_args(argv)

    if csv_file_name := args.pretty:
        pretty_print_csv(csv_file_name)
    elif series_url := args.series:
        download_series(series_url)
    elif tracklist_url := args.tracklist:
        download_tracklist(tracklist_url)
    else:
        # Nothing was requested: show the usage instead of exiting silently.
        parser.print_help()
def download_series(series_url: str):
    """Download every tracklist of a series, following its pagination.

    Each series page is fetched and parsed; every ``<a class="mix">`` link
    is handed to ``download_tracklist``. The loop then follows the
    ``<a class="pagenumber">`` link whose text contains "Next" and stops
    when a page has no such link.

    Args:
        series_url: URL of the first page of the series.

    Raises:
        requests.HTTPError: if a series page answers with an error status
            (raised by ``raise_for_status``).
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    next_url = series_url
    while next_url:
        page_url = next_url
        print("Processing series URL", page_url)
        response = requests.get(page_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Download all tracklists. href=True skips anchors without a target;
        # urljoin leaves absolute links untouched and resolves relative ones.
        for tracklist_link in soup.find_all("a", class_="mix", href=True):
            tracklist_url = urljoin(page_url, tracklist_link["href"])
            print("Processing tracklist URL", tracklist_url)
            download_tracklist(tracklist_url)
            time.sleep(1)  # throttle

        # Look for the next page button. get_text() is used instead of
        # .string because .string is None for links with nested markup.
        next_url = None
        for page_link in soup.find_all("a", class_="pagenumber", href=True):
            if "Next" in page_link.get_text():
                next_url = urljoin(page_url, page_link["href"])
                break
def download_tracklist(url: str):
    """Fetch one tracklist and save it as a CSV file in the current directory.

    The file is named after the last path segment of the URL, with a
    ``.csv`` suffix.

    Args:
        url: URL of the tracklist page.
    """
    # Local import: only this function needs it.
    from urllib.parse import urlsplit

    tracklist = get_tracklist_from_url(url)

    # Use only the URL path so a query string or fragment never ends up in
    # the file name; fall back to the host for path-less URLs.
    parts = urlsplit(url)
    name = parts.path.rstrip("/").rsplit("/", maxsplit=1)[-1]
    name = name or parts.netloc or "tracklist"

    file_name = Path.cwd() / (name + ".csv")
    save_tracklist_as_csv(tracklist, file_name)
def is_track_row(css_class: Optional[str]) -> bool:
    """Return True if *css_class* marks a track row ("on" or "off").

    Meant to be used as a ``class_`` filter for BeautifulSoup searches,
    which pass None for tags that have no class attribute; None (like any
    other value) simply yields False.
    """
    return css_class in ("on", "off")
def download_tracklist(url: str) -> list[Track]:
def get_tracklist_from_url(url: str) -> list[Track]:
"""Get tracklist from the Web and parse it into a list of Track objects."""
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
@ -44,13 +86,22 @@ def download_tracklist(url: str) -> list[Track]:
for row in soup.find_all("div", class_=is_track_row):
artists = []
mix_artists = []
try:
title = row.find("a", class_="track").string
mix = row.find("a", class_="release").string
except AttributeError:
title = row.find("b").string
try:
title = row.find("b").string
except AttributeError:
title = "(unknown title)"
mix = None
timestamp = row.find("span", class_="index_time").string
try:
timestamp = row.find("span", class_="index_time").string
except AttributeError:
timestamp = "(unknown timestamp)"
for artist in row.find_all("a", class_="artist"):
prev_tag = artist.previous_sibling.string
if getattr(prev_tag, "string", "").strip() == "remixed by":
@ -69,21 +120,49 @@ def download_tracklist(url: str) -> list[Track]:
return tracklist
def save_tracklist_as_csv(tracklist: "list[Track]", file_name: Path):
    """Write *tracklist* to *file_name* as a UTF-8 CSV file.

    The first row is a header; each following row holds one track:
    timestamp, artists, title, mix (empty when missing) and remix artists
    (empty when missing).

    An ``OSError`` while opening or writing the file is reported on
    standard output and not propagated.

    Args:
        tracklist: Tracks to write, in order.
        file_name: Destination path; an existing file is overwritten.
    """
    header = ["Timestamp", "Artists", "Title", "Mix", "Remix artists"]
    rows = (
        [
            track.timestamp,
            track.format_artists(),
            track.title,
            track.mix or "",
            track.format_mix_artists(),
        ]
        for track in tracklist
    )
    try:
        # newline="" lets the csv module control line endings itself.
        with open(file_name, "wt", encoding="utf8", newline="") as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(rows)
    except OSError as exc:
        print(f"Can't save tracklist: {exc}")
def _format_timestamp(ts: str) -> str:
    """Convert a "minutes:seconds" timestamp to "hh:mm:ss".

    Anything that is not exactly two colon-separated fields with an integer
    minutes part (no colon, already h:mm:ss, placeholder text) is returned
    unchanged.
    """
    minutes, sep, seconds = ts.partition(":")
    if not sep or ":" in seconds:
        return ts
    try:
        hours, minutes_left = divmod(int(minutes), 60)
    except ValueError:
        return ts
    return f"{hours:02}:{minutes_left:02}:{seconds}"


def pretty_print_csv(csv_file_name: str):
    """Print a tracklist CSV file in a human-readable form, one track per line.

    The first row is treated as the header and skipped. Each remaining row
    must have the five columns timestamp, artists, title, mix and remix
    artists; rows with any other number of columns (blank lines included)
    are skipped. Timestamps given as minutes:seconds are shown as hh:mm:ss.

    An ``OSError`` while opening or reading the file is reported on
    standard output and not propagated.

    Args:
        csv_file_name: Path of the CSV file to print.
    """
    try:
        with open(csv_file_name, "rt", encoding="utf8", newline="") as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header row, if any
            for row in reader:
                if len(row) != 5:
                    # Blank or malformed line: nothing sensible to print.
                    continue
                ts, artists, title, mix, remix_artists = row
                line = (
                    f"{_format_timestamp(ts)} "
                    f"{artists or '(unknown)'} - {title}"
                )
                if mix:
                    line += f" ({mix})"
                if remix_artists:
                    line += f" by {remix_artists}"
                print(line)
    except OSError as exc:
        print(f"Can't read CSV: {exc}")
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()