You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
TLFi/scripts/get-definitions.py

104 lines
3.1 KiB

import sys
import time
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Prefix of every definition page URL; the URL-quoted form is appended to it.
BASE_URL = "https://www.cnrtl.fr/definition/"
# Forms whose page came back titled "Terme introuvable". Consulted so that
# such a form is not requested again when it shows up as an alternative form.
CRIMINAL_SCUM = []
def main():
    """Download the definitions of every form listed in a text file.

    Command line: ``get-definitions.py FORMS_LIST [START_FROM]``.

    FORMS_LIST is read one form per line (trailing whitespace is stripped,
    blank lines are ignored). When START_FROM is given, every line before
    the first one equal to it is skipped and processing resumes from that
    line, inclusive. Each processed form is followed by a five-second pause.

    Exits with a message when FORMS_LIST is missing from the command line
    or the file cannot be opened.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: get-definitions.py FORMS_LIST [START_FROM]")
    forms_list_filepath = sys.argv[1]
    start_from = sys.argv[2] if len(sys.argv) > 2 else ""

    # Guard only the open() call. Wrapping the whole loop would also catch
    # OSError subclasses raised while processing a form (network errors,
    # directory creation failures) and misreport them as a problem with the
    # forms list file.
    try:
        forms_list_file = open(forms_list_filepath, "rt")
    except OSError:
        sys.exit("Could not open forms list file.")

    with forms_list_file:
        for line in forms_list_file:
            form = line.rstrip()
            if not form:
                # A blank line would otherwise request the bare base URL.
                continue
            if start_from:
                if form != start_from:
                    continue
                # Resume point reached: stop skipping from here on.
                start_from = ""
            get_definitions(form)
            time.sleep(5)
def get_definitions(form):
    """Fetch and save every definition page available for *form*.

    The page at ``BASE_URL + quoted form`` is fetched first. If it contains
    a definition, that definition is saved as variant 0 and the remaining
    variants (one per tab in the page's tab bar) are fetched and saved too.
    Otherwise the page is inspected for a "term not found" or error notice.

    Args:
        form: The word form to look up.

    Exits the program with a message when the page has none of the
    expected structures.
    """
    url = BASE_URL + urllib.parse.quote(form)
    print(f"Processing {url}")
    soup = BeautifulSoup(get_page(url), "html.parser")
    if process_definition_page(soup, form, 0):
        _get_other_variants(soup, url, form)
    else:
        _handle_missing_definition(soup, form)


def _get_other_variants(soup, url, form):
    """Fetch and save variants 1..N-1, where N is the number of tabs."""
    tabs_bar = soup.find("div", id="vtoolbar")
    if tabs_bar is None:
        sys.exit("No tabs bar!\n")
    num_defs = len(tabs_bar("li"))
    # Variant 0 is the page the caller has already saved.
    for variant in range(1, num_defs):
        variant_url = f"{url}/{variant}"
        print(f"Processing {variant_url}")
        variant_soup = BeautifulSoup(get_page(variant_url), "html.parser")
        process_definition_page(variant_soup, form, variant)


def _handle_missing_definition(soup, form):
    """React to a page without a definition, based on its <h2> title."""
    contentbox = soup.find("div", id="contentbox")
    if contentbox is None:
        sys.exit("No contentbox!")
    h2 = contentbox.find("h2")
    if h2 is None:
        sys.exit("Bork! no definition nor ambiguity list")

    title = str(h2.string).strip()
    if title == "Terme introuvable":
        # Term not found: remember the form, then try the text of every
        # link in the content box as an alternative form.
        CRIMINAL_SCUM.append(form)
        for link in contentbox("a"):
            # .string is None when the link has no single string child;
            # str(None) would otherwise be looked up as the form "None".
            if link.string is None:
                continue
            alternative_form = str(link.string).strip()
            if not alternative_form or alternative_form in CRIMINAL_SCUM:
                continue
            get_definitions(alternative_form)
    elif title == "Erreur":
        print("Nothing there :O")
    else:
        sys.exit(f"Unknown error title: {h2}")
def get_page(url, timeout=30):
    """Return the body of *url* as text, exiting the program on failure.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as text.

    Exits the program with a message when the request fails or the
    response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        sys.exit(f"ERROR fetching {url}: {exc}\n")
    if response.status_code != 200:
        sys.exit(f"ERROR {response.status_code} at {url}\n")
    return response.text
def process_definition_page(soup, form, variant):
    """Save the definition found in *soup*, if there is one.

    Looks for the ``<div id="lexicontent">`` element. When present, its
    prettified markup is handed to save_definition() under *form* and
    *variant*.

    Returns:
        True when a definition was found and saved, False otherwise.
    """
    lexicon_div = soup.find("div", id="lexicontent")
    if lexicon_div:
        save_definition(form, variant, lexicon_div.prettify())
        return True
    return False
def save_definition(name, variant, content):
    """Write *content* to ``<cwd>/<name>/<variant>.txt``.

    Args:
        name: Form the definition belongs to; used as the directory name
            after trailing dots are removed.
        variant: Variant number; used as the file's base name.
        content: Text to store.

    Exits the program with a message when the directory or the file
    cannot be created or written.
    """
    # Trailing dots are dropped from the directory name; the original
    # author's note attributes the need for this to NTFS.
    def_dir = Path.cwd() / name.rstrip(".")
    def_file_path = def_dir / f"{variant}.txt"
    try:
        # mkdir sits inside the try so that a failure to create the
        # directory gets the same error message as a failed write.
        def_dir.mkdir(exist_ok=True)
        # Explicit UTF-8: the locale default encoding may be unable to
        # represent every character of the page and raise on write.
        with open(def_file_path, "wt", encoding="utf-8") as def_file:
            def_file.write(content)
    except OSError as exc:
        sys.exit(f"Could not save definition at {def_file_path}: {exc}")
# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()