"""Download definition pages from the CNRTL website for a list of forms.

Usage: SCRIPT FORMS_LIST_FILE [START_FROM]

FORMS_LIST_FILE is read as UTF-8 text, one form per line. When START_FROM
is given, every form before the first line equal to START_FROM is skipped.
For each form, the ``div#lexicontent`` element of every definition tab is
written to ``<current directory>/<form>/<variant>.txt``.
"""

import os
import sys
import time
import urllib.parse
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from requests import models

BASE_URL = "https://www.cnrtl.fr/definition/"

# Pause between two forms, in seconds (time.sleep() takes seconds, not ms).
DELAY_BETWEEN_FORMS = 5

# Maximum time, in seconds, to wait for the server before giving up on a
# request; without it requests.get() may block forever.
REQUEST_TIMEOUT = 30


def main() -> None:
    """Read the forms list named on the command line and fetch every form.

    Exits with a message when the forms list argument is missing or the
    file cannot be opened.
    """
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} FORMS_LIST_FILE [START_FROM]")

    forms_list_filepath = sys.argv[1]
    start_from = sys.argv[2] if len(sys.argv) > 2 else ""

    # Only the open() call is guarded: network errors raised by requests are
    # OSError subclasses too and must not be reported as a file problem.
    try:
        forms_list_file = open(forms_list_filepath, "rt", encoding="utf-8")
    except OSError:
        sys.exit("Could not open forms list file.")

    with forms_list_file:
        for line in forms_list_file:
            form = line.rstrip()
            if not form:
                # A blank line would request the bare BASE_URL and save the
                # result directly in the current directory.
                continue

            if start_from:
                if form != start_from:
                    continue
                # Resume point reached: process this form and all following.
                start_from = ""

            get_definitions(form)
            time.sleep(DELAY_BETWEEN_FORMS)


def get_definitions(form: str) -> None:
    """Fetch and save every definition tab available for *form*.

    The first page is saved as variant 0; one extra page is requested for
    each additional ``li`` found in the ``div#vtoolbar`` element. Exits the
    program if that element is absent from the first page.
    """
    url = BASE_URL + urllib.parse.quote(form)
    print(f"Processing {url}")
    soup = BeautifulSoup(get_page(url), "html.parser")
    process_definition_page(soup, form, 0)

    # Check other definitions.
    tabs_bar = soup.find("div", id="vtoolbar")
    if not tabs_bar:
        sys.exit("No tabs bar!\n")

    num_defs = len(tabs_bar.find_all("li"))
    for variant in range(1, num_defs):  # skip the first, we got it above.
        variant_url = f"{url}/{variant}"
        print(f"Processing {variant_url}")
        variant_soup = BeautifulSoup(get_page(variant_url), "html.parser")
        process_definition_page(variant_soup, form, variant)


def get_page(url: str) -> str:
    """Return the body of *url* as text.

    A status code other than 200 is reported on stderr, but the body is
    still returned so the caller decides whether it is usable.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        sys.stderr.write(f"ERROR {response.status_code} at {url}\n")
    return response.text


def process_definition_page(soup: BeautifulSoup, form: str, variant: int) -> None:
    """Extract ``div#lexicontent`` from *soup* and save it for *form*.

    Exits the program if the page has no such element.
    """
    definition = soup.find("div", id="lexicontent")
    if not definition:
        sys.exit("No definition!\n")
    save_definition(form, variant, definition.prettify())


def save_definition(name: str, variant: int, content: str) -> None:
    """Write *content* to ``<cwd>/<name>/<variant>.txt`` as UTF-8.

    Failures to create the directory or write the file are reported on
    stderr and do not stop the program.
    """
    def_dir = Path.cwd() / name
    def_file_path = def_dir / f"{variant}.txt"
    try:
        def_dir.mkdir(exist_ok=True)
        # Explicit UTF-8: the definitions contain non-ASCII characters that
        # a locale-dependent default encoding may be unable to encode.
        with open(def_file_path, "wt", encoding="utf-8") as def_file:
            def_file.write(content)
    except OSError as exc:
        sys.stderr.write(f"Could not save definition at {def_file_path}: {exc}\n")


if __name__ == "__main__":
    main()