"""Download CNRTL dictionary definitions for a list of word forms.

Usage: <script> FORMS_LIST_FILE [START_FROM]

FORMS_LIST_FILE holds one form per line.  When START_FROM is given, every
form before the first line equal to it is skipped.  Each definition found is
saved as ``<form>/<variant>.txt`` under the current working directory.
"""

import sys
import time
import urllib.parse
from pathlib import Path

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.cnrtl.fr/definition/"
# Forms the site reported as "Terme introuvable"; used to avoid looking the
# same unknown form up again while following suggested alternatives.
CRIMINAL_SCUM = []

# Seconds to wait between two forms of the input list.
REQUEST_DELAY = 5
# Seconds before an HTTP request is abandoned instead of hanging forever.
REQUEST_TIMEOUT = 30


def main():
    """Read the forms list named on the command line and fetch every form."""
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} FORMS_LIST_FILE [START_FROM]")

    forms_list_filepath = sys.argv[1]
    start_from = sys.argv[2] if len(sys.argv) > 2 else ""

    # Only the open() call is guarded: requests' exceptions are OSError
    # subclasses, so guarding the whole loop would report network failures
    # as a problem with the forms list file.
    try:
        forms_list_file = open(forms_list_filepath, "rt", encoding="utf-8")
    except OSError:
        sys.exit("Could not open forms list file.")

    with forms_list_file:
        for line in forms_list_file:
            form = line.rstrip()
            if not form:
                # A blank line would otherwise request the bare base URL.
                continue
            if start_from:
                if start_from != form:
                    continue
                # Found the resume point: process it and everything after.
                start_from = ""
            get_definitions(form)
            time.sleep(REQUEST_DELAY)


def get_definitions(form):
    """Fetch and save all definition variants of ``form``.

    If the page for ``form`` has no definition, the page is inspected for a
    "Terme introuvable" list of alternative forms, each of which is then
    fetched recursively.  Exits the program on an unexpected page layout.
    """
    url = BASE_URL + urllib.parse.quote(form)
    print(f"Processing {url}")
    soup = BeautifulSoup(get_page(url), "html.parser")

    if process_definition_page(soup, form, 0):
        # Check other definitions.
        tabs_bar = soup.find("div", id="vtoolbar")
        if not tabs_bar:
            sys.exit("No tabs bar!\n")
        num_defs = len(tabs_bar.find_all("li"))
        for variant in range(1, num_defs):  # Skip the first, we got it above.
            variant_url = f"{url}/{variant}"
            print(f"Processing {variant_url}")
            variant_soup = BeautifulSoup(get_page(variant_url), "html.parser")
            process_definition_page(variant_soup, form, variant)
        return

    # Might be an ambiguity?
    contentbox = soup.find("div", id="contentbox")
    if not contentbox:
        sys.exit("No contentbox!")
    h2 = contentbox.find("h2")
    if not h2:
        sys.exit("Bork! no definition nor ambiguity list")

    title = str(h2.string).strip()
    if title == "Terme introuvable":
        # Record the form before recursing so that alternatives linking back
        # to it do not cause an endless loop.
        CRIMINAL_SCUM.append(form)
        for link in contentbox.find_all("a"):
            # get_text() also handles links with nested markup, for which
            # ``link.string`` is None (and str(None) would look up "None").
            alternative_form = link.get_text(strip=True)
            if not alternative_form or alternative_form in CRIMINAL_SCUM:
                continue
            get_definitions(alternative_form)
    elif title == "Erreur":
        print("Nothing there :O")
    else:
        sys.exit(f"Unknown error title: {h2}")


def get_page(url):
    """Return the body text of ``url``.

    Exits the program if the request fails, times out, or returns a status
    other than 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        sys.exit(f"ERROR fetching {url}: {exc}")
    if response.status_code != 200:
        sys.exit(f"ERROR {response.status_code} at {url}\n")
    return response.text


def process_definition_page(soup, form, variant):
    """Save the definition found in ``soup``, if any.

    Returns True when the page contains a ``div#lexicontent`` element (which
    is then saved via ``save_definition``), False otherwise.
    """
    definition = soup.find("div", id="lexicontent")
    if not definition:
        return False
    save_definition(form, variant, definition.prettify())
    return True


def save_definition(name, variant, content):
    """Write ``content`` to ``<name>/<variant>.txt`` under the working directory.

    Exits the program if the directory or the file cannot be written.
    """
    # Windows/NTFS does not allow directory names ending in a dot.  Path
    # separators are replaced because forms may come from scraped link text
    # and must not escape the working directory.
    safe_name = name.rstrip(".").replace("/", "_").replace("\\", "_") or "_"
    def_dir = Path.cwd() / safe_name
    def_file_path = def_dir / f"{variant}.txt"
    try:
        def_dir.mkdir(exist_ok=True)
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the characters of the definition HTML.
        with open(def_file_path, "wt", encoding="utf-8") as def_file:
            def_file.write(content)
    except OSError as exc:
        sys.exit(f"Could not save definition at {def_file_path}: {exc}")


if __name__ == "__main__":
    main()