1
0
Fork 0
Scripts/scrape-interesting-breton-music-website.py

75 lines
2.6 KiB
Python
Executable file

#!/usr/bin/env python3
# Ahem…
import argparse
from pathlib import Path
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests
def scrape_course(course_url: str, cookies: dict, output_dir: Path):
    """Scrape every video page linked from a course page.

    The course page is fetched with the given cookies, and each anchor
    carrying the ``card-container-link`` class is resolved against
    ``course_url`` and handed to ``scrape_video``.

    Args:
        course_url: URL of the course overview page.
        cookies: Cookies sent with the request.
        output_dir: Directory passed through to ``scrape_video``.

    Raises:
        requests.HTTPError: If the course page request returns an error status.
    """
    page = requests.get(course_url, cookies=cookies)
    page.raise_for_status()
    document = BeautifulSoup(page.text, features="lxml")

    video_cards = document.find_all("a", class_="card-container-link")
    for card in video_cards:
        # Card hrefs may be relative, so resolve them against the course URL.
        scrape_video(urljoin(course_url, card["href"]), cookies, output_dir)
def scrape_video(video_url: str, cookies: dict, output_dir: Path):
    """Download the files linked from a single video page.

    Two kinds of links are collected from the page:

    * the first anchor inside each ``div.player-container``;
    * every anchor whose ``href`` starts with ``/scripts/files/``.

    Every link is resolved against ``video_url`` before downloading, anchors
    without an ``href`` are ignored, and each resolved URL is downloaded at
    most once per call.

    Args:
        video_url: URL of the video page.
        cookies: Cookies sent with every request.
        output_dir: Directory the downloaded files are written into.

    Raises:
        requests.HTTPError: If the page or a file request returns an error
            status.
    """
    print(f"Video {video_url}")
    response = requests.get(video_url, cookies=cookies)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="lxml")

    # URLs already handled, so a file matched by both loops is fetched once.
    seen = set()

    for container in soup.find_all("div", class_="player-container"):
        link = container.find("a")
        if link is None:
            continue
        href = link.get("href")
        if not href:
            # An anchor without a target has nothing to download.
            continue
        _download_linked_file(href, video_url, cookies, output_dir, seen)

    # href=True skips anchors that carry no href attribute at all, which
    # would otherwise raise KeyError on lookup.
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if not href.startswith("/scripts/files/"):
            continue
        _download_linked_file(href, video_url, cookies, output_dir, seen)


def _download_linked_file(href, base_url, cookies, output_dir, seen):
    """Resolve *href* against *base_url* and download it into *output_dir* once."""
    # urljoin leaves absolute URLs untouched and makes relative ones usable.
    file_url = urljoin(base_url, href)
    if file_url in seen:
        return
    seen.add(file_url)

    file_name = urlparse(file_url).path.split("/")[-1]
    if not file_name:
        # The URL path is empty or ends with "/", so there is no name to
        # save under; output_dir / "" would be the directory itself.
        print(f'Skipping "{file_url}": no file name in URL')
        return
    download_file(file_url, cookies, output_dir / file_name)
def download_file(url: str, cookies: dict, output_path: Path):
    """Stream the resource at *url* into *output_path*.

    The body is written in 8 KiB chunks to a sibling ``<name>.part`` file,
    which is renamed onto ``output_path`` only after the whole response has
    been written. If the transfer fails part-way, the partial file is removed
    so that a truncated download is never left under the final name.

    Args:
        url: Absolute URL of the file to download.
        cookies: Cookies sent with the request.
        output_path: Destination file path; an existing file is replaced.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    print(f'Downloading: "{url}"')
    print(f'"{output_path}"')

    output_path = Path(output_path)
    partial_path = output_path.with_name(output_path.name + ".part")

    with requests.get(url, cookies=cookies, stream=True) as response:
        # Check the status before creating any file on disk.
        response.raise_for_status()
        try:
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            partial_path.replace(output_path)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            partial_path.unlink(missing_ok=True)
def main():
    """Parse the command line and scrape the requested course or video.

    Options:
        --course     URL of a course page; every linked video is scraped.
        --video      URL of a single video page. Ignored when --course is
                     also given.
        --phpsessid  Value of the PHPSESSID session cookie to send.
        -o/--output  Output directory (created if missing). Defaults to the
                     current working directory.

    When neither --course nor --video is given, "Nothing to do." is printed.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--course")
    argparser.add_argument("--video")  # either video or course
    argparser.add_argument("--phpsessid", help="PHPSESSID")
    argparser.add_argument("-o", "--output")
    args = argparser.parse_args()

    # Leave the cookie out entirely when no session id was supplied instead
    # of handing requests a cookie whose value is None.
    cookies = {"PHPSESSID": args.phpsessid} if args.phpsessid else {}

    output_dir = Path(args.output) if args.output else Path.cwd()
    # exist_ok avoids the check-then-create race of a separate exists() test.
    output_dir.mkdir(parents=True, exist_ok=True)

    if course_url := args.course:
        scrape_course(course_url, cookies, output_dir)
    elif video_url := args.video:
        scrape_video(video_url, cookies, output_dir)
    else:
        print("Nothing to do.")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()