# build-form-list.py
#
# Recovered from patch fb43e28c6e91c9a14d3e48ef899d4868dc281436
# ("init", by dece, Mon, 15 Mar 2021 20:12:02 +0100).
#
# Walks the paginated per-letter index under PER_FIRST_LETTER_URL and writes
# the lexical forms found for each letter A-Z to lexical_forms_<LETTER>.txt,
# one form per line.
import string
import sys
import time

import requests
from bs4 import BeautifulSoup


BASE_URL = "https://www.cnrtl.fr"
PER_FIRST_LETTER_URL = BASE_URL + "/portailindex/LEXI/TLFI/"
# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 30


def main():
    """Dump the lexical forms of every letter A-Z to its own text file.

    Exits with an error message on the first OSError raised while a letter
    is being processed.
    """
    for letter in string.ascii_uppercase:
        try:
            # Explicit UTF-8: the forms contain accented characters and the
            # platform default encoding may not be able to represent them.
            with open(
                f"lexical_forms_{letter}.txt", "wt", encoding="utf-8"
            ) as letter_file:
                process_letter(letter, letter_file)
        except OSError as exc:
            # This also catches requests' exceptions (they derive from
            # IOError), so the message names the letter rather than assuming
            # the failure came from the file.
            sys.exit(f"ERROR while processing letter {letter}: {exc}")


def process_letter(letter, letter_file):
    """Follow the index pages of `letter`, writing every form found.

    Args:
        letter: upper-case first letter, appended to PER_FIRST_LETTER_URL.
        letter_file: writable text file receiving one form per line.

    Exits the program if a page is not answered with HTTP 200.
    """
    next_url = PER_FIRST_LETTER_URL + letter
    while True:
        print(f"Processing {next_url}")
        response = requests.get(next_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            sys.exit(f"ERROR {response.status_code} at {next_url}")
        next_page = process_page(response.text, letter_file)
        if not next_page:
            break
        # The "next" link is site-relative, so prefix it with the host.
        next_url = BASE_URL + next_page


def process_page(page, output):
    """Write the forms of one index page and return the next page's link.

    Args:
        page: HTML text of the index page.
        output: writable text file receiving one form per line.

    Returns:
        The href of the link in the second cell of the table following the
        forms table, or None when there is no such table, cell, link or
        href (i.e. nothing left to follow).
    """
    soup = BeautifulSoup(page, "html.parser")
    hometab = soup.find("table", class_="hometab")
    if hometab is None:
        # Unexpected page layout: report it instead of failing on None.
        sys.stderr.write("No forms table in page!\n")
        return None

    for cell in hometab("td"):
        # .string is None for cells without a single text child; skip those
        # along with empty cells.
        form = cell.string
        if form:
            output.write(str(form) + "\n")

    nav_table = hometab.find_next_sibling("table")
    if nav_table is None:
        return None
    nav_cells = nav_table("td")
    if len(nav_cells) < 2:
        return None
    next_page_link = nav_cells[1].find("a")
    if next_page_link is None:
        return None
    # Tag.get() yields None when the attribute is missing.
    return next_page_link.get("href")


if __name__ == "__main__":
    main()
# get_definitions.py
#
# Reads a list of lexical forms (one per line) and, for each form, downloads
# its definition page(s) below BASE_URL, saving the "lexicontent" block of
# every variant to ./<form>/<variant>.txt.
#
# Usage: get_definitions.py FORMS_LIST_FILE [START_FROM]
import os
import sys
import urllib.parse
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from requests import models


BASE_URL = "https://www.cnrtl.fr/definition/"
# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 30


def main():
    """Fetch the definitions of every form listed in the file given as argv[1].

    If argv[2] is given, forms are skipped until one equal to it is read;
    processing resumes from that form (inclusive).
    """
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} FORMS_LIST_FILE [START_FROM]")
    forms_list_filepath = sys.argv[1]
    start_from = sys.argv[2] if len(sys.argv) > 2 else ""

    # Only the open() is guarded: network errors raised further down also
    # derive from OSError and must not be reported as a problem with the
    # forms list file.
    try:
        forms_list_file = open(forms_list_filepath, "rt", encoding="utf-8")
    except OSError:
        sys.exit("Could not open forms list file.")

    with forms_list_file:
        for line in forms_list_file:
            form = line.rstrip()
            if not form:
                # A blank line would request the bare BASE_URL and save the
                # result directly into the working directory.
                continue
            if start_from:
                if start_from != form:
                    continue
                # Resume point reached: stop skipping, process this form.
                start_from = ""
            get_definitions(form)


def get_definitions(form):
    """Download and save every definition variant available for `form`.

    Variant 0 is the page at BASE_URL + form; the others are found at
    <that url>/<n>, with one variant per <li> of the page's "vtoolbar" div.
    """
    url = BASE_URL + urllib.parse.quote(form)
    print(f"Processing {url}")
    soup = BeautifulSoup(get_page(url), "html.parser")
    process_definition_page(soup, form, 0)

    # Check other definitions.
    tabs_bar = soup.find("div", id="vtoolbar")
    if tabs_bar is None:
        sys.stderr.write("No tabs bar!\n")
        return
    num_defs = len(tabs_bar("li"))
    for variant in range(1, num_defs):  # skip the first, we got it above.
        variant_url = f"{url}/{variant}"
        print(f"Processing {variant_url}")
        variant_soup = BeautifulSoup(get_page(variant_url), "html.parser")
        process_definition_page(variant_soup, form, variant)


def get_page(url):
    """Return the body of `url` as text.

    A non-200 status is reported on stderr; the body is returned regardless
    so the caller decides whether it contains anything usable.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        sys.stderr.write(f"ERROR {response.status_code} at {url}\n")
    return response.text


def process_definition_page(soup, form, variant):
    """Save the "lexicontent" div of a parsed definition page, if present.

    Args:
        soup: parsed definition page.
        form: lexical form the page belongs to (used as directory name).
        variant: index of the definition variant (used as file name).
    """
    definition = soup.find("div", id="lexicontent")
    if definition is None:
        sys.stderr.write("No definition!\n")
        return
    save_definition(form, variant, definition.prettify())


def save_definition(name, variant, content):
    """Write `content` to <cwd>/<name>/<variant>.txt, creating the directory.

    Failures to create the directory or write the file are reported on
    stderr and do not interrupt the run.
    """
    def_dir = Path.cwd() / name
    def_file_path = def_dir / f"{variant}.txt"
    try:
        def_dir.mkdir(exist_ok=True)
        # Explicit UTF-8: the pages contain characters the platform default
        # encoding may not be able to represent.
        with open(def_file_path, "wt", encoding="utf-8") as def_file:
            def_file.write(content)
    except OSError as exc:
        sys.stderr.write(
            f"Could not save definition at {def_file_path}: {exc}\n"
        )


if __name__ == "__main__":
    main()