#!/usr/bin/env python3
"""Scrape media files and attachments from a cookie-authenticated music site.

Given a course page URL (``--course``) or a single video page URL
(``--video``), download the media linked from the player container(s) and
every attachment served under ``/scripts/files/``. Authentication is done by
forwarding a PHP session cookie (``--phpsessid``).
"""
import argparse
from pathlib import Path
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
import requests

# Avoid hanging forever on an unresponsive server (seconds).
REQUEST_TIMEOUT = 30


def _url_file_name(url: str) -> str:
    """Return the last path component of *url*, used as the local file name."""
    return urlparse(url).path.split("/")[-1]


def scrape_course(course_url: str, cookies: dict, output_dir: Path) -> None:
    """Scrape every video page linked from a course page.

    Raises requests.HTTPError on a non-2xx response for the course page.
    """
    response = requests.get(course_url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="lxml")
    for link in soup.find_all("a", class_="card-container-link"):
        href = link.get("href")
        if not href:  # <a> without href would have raised KeyError before
            continue
        video_url = urljoin(course_url, href)
        scrape_video(video_url, cookies, output_dir)


def scrape_video(video_url: str, cookies: dict, output_dir: Path) -> None:
    """Download the player media and attachments of one video page.

    Two link sources are handled: the first <a> inside each
    div.player-container (the media itself), and any <a> whose href starts
    with /scripts/files/ (attachments).
    """
    print(f"Video {video_url}")
    response = requests.get(video_url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="lxml")

    # Media files linked from the player container(s).
    for container in soup.find_all("div", class_="player-container"):
        link = container.find("a")
        if link is None or not link.get("href"):
            continue
        # Resolve against the page URL so relative hrefs work too,
        # consistently with the attachment loop below.
        file_url = urljoin(video_url, link["href"])
        download_file(file_url, cookies, output_dir / _url_file_name(file_url))

    # Attachments served from /scripts/files/.
    for link in soup.find_all("a"):
        href = link.get("href", "")  # was link["href"]: KeyError on bare <a>
        if not href.startswith("/scripts/files/"):
            continue
        file_url = urljoin(video_url, href)
        download_file(file_url, cookies, output_dir / _url_file_name(file_url))


def download_file(url: str, cookies: dict, output_path: Path) -> None:
    """Stream *url* to *output_path* in 8 KiB chunks.

    Raises requests.HTTPError on a non-2xx response.
    """
    print(f'Downloading: "{url}"')
    print(f' → "{output_path}"')
    with requests.get(url, cookies=cookies, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(output_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)


def main() -> None:
    argparser = argparse.ArgumentParser(description=__doc__)
    argparser.add_argument("--course")
    argparser.add_argument("--video")  # either video or course
    argparser.add_argument("--phpsessid", help="PHPSESSID")
    argparser.add_argument("-o", "--output")
    args = argparser.parse_args()
    # Only send the cookie when one was actually provided; a value of None
    # is not a valid cookie value.
    cookies = {"PHPSESSID": args.phpsessid} if args.phpsessid else {}
    output_dir = Path(args.output) if args.output else Path.cwd()
    # exist_ok avoids the check-then-create race of the original.
    output_dir.mkdir(parents=True, exist_ok=True)

    if course_url := args.course:
        scrape_course(course_url, cookies, output_dir)
    elif video_url := args.video:
        scrape_video(video_url, cookies, output_dir)
    else:
        print("Nothing to do.")


if __name__ == "__main__":
    main()