2021-03-15 20:12:02 +01:00
|
|
|
import sys
|
2021-03-15 20:19:30 +01:00
|
|
|
import time
|
2021-03-15 20:12:02 +01:00
|
|
|
import urllib.parse
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
# URL prefix of the CNRTL definition pages; the percent-encoded form being
# looked up is appended to it to build the page address.
BASE_URL = "https://www.cnrtl.fr/definition/"
|
2021-03-16 00:46:07 +01:00
|
|
|
# Forms whose page was titled "Terme introuvable". The list is consulted before
# following a suggested alternative form, so a form already recorded here is
# not requested again. It grows for the lifetime of the process.
CRIMINAL_SCUM = []
|
2021-03-15 20:12:02 +01:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Download the definitions of every form listed in a text file.

    Command line: ``FORMS_LIST_FILE [START_FROM]``. The file holds one form
    per line. When ``START_FROM`` is given, every line before the first one
    equal to it is skipped, which lets an interrupted run be resumed.

    Exits with an error message when the file argument is missing or the
    file cannot be opened.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: FORMS_LIST_FILE [START_FROM]")

    forms_list_filepath = sys.argv[1]
    start_from = sys.argv[2] if len(sys.argv) > 2 else ""

    # Guard only the open() call. An OSError raised later while fetching or
    # saving (the exceptions of `requests` derive from OSError) must not be
    # reported as a problem with the forms list file.
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the encoding the forms lists are actually saved in.
    try:
        forms_list_file = open(forms_list_filepath, "rt")
    except OSError:
        sys.exit("Could not open forms list file.")

    with forms_list_file:
        for line in forms_list_file:
            form = line.rstrip()
            if not form:
                # A blank line would otherwise be requested as an empty form.
                continue
            if start_from:
                if form != start_from:
                    continue
                # Resume point reached: process this form and all later ones.
                start_from = ""
            get_definitions(form)
            # Pause between forms (presumably to go easy on the server).
            time.sleep(5)
|
|
|
|
|
|
|
|
|
|
|
|
def get_definitions(form):
    """Fetch and save every definition page available for *form*.

    The page at ``BASE_URL + quoted form`` is downloaded first. If it holds a
    definition, it is saved as variant 0 and one extra page per remaining
    ``<li>`` of the ``vtoolbar`` div is fetched from ``<url>/<variant>``.

    Without a definition, the ``<h2>`` title of the ``contentbox`` div decides
    what happens:

    * "Terme introuvable": *form* is recorded in ``CRIMINAL_SCUM`` and every
      link text in the content box is looked up recursively, unless it was
      already recorded there.
    * "Erreur": a message is printed and nothing else is done.
    * anything else, or a missing box/title: the program exits.
    """
    url = BASE_URL + urllib.parse.quote(form)
    print(f"Processing {url}")
    soup = BeautifulSoup(get_page(url), "html.parser")

    if process_definition_page(soup, form, 0):
        # The first tab is saved; walk the remaining tabs of the toolbar.
        tabs_bar = soup.find("div", id="vtoolbar")
        if not tabs_bar:
            exit("No tabs bar!\n")
        tab_count = len(tabs_bar.find_all("li"))
        for variant in range(1, tab_count):
            variant_url = f"{url}/{variant}"
            print(f"Processing {variant_url}")
            variant_soup = BeautifulSoup(get_page(variant_url), "html.parser")
            process_definition_page(variant_soup, form, variant)
        return

    # No definition on the page: it may list alternative forms instead.
    contentbox = soup.find("div", id="contentbox")
    if not contentbox:
        exit("No contentbox!")

    h2 = contentbox.find("h2")
    if not h2:
        exit("Bork! no definition nor ambiguity list")

    title = str(h2.string).strip()
    if title == "Erreur":
        print("Nothing there :O")
        return
    if title != "Terme introuvable":
        exit(f"Unknown error title: {h2}")

    # Record the form before following links so it is not requested again
    # when an alternative page links back to it.
    CRIMINAL_SCUM.append(form)
    for link in contentbox.find_all("a"):
        alternative_form = str(link.string).strip()
        if alternative_form not in CRIMINAL_SCUM:
            get_definitions(alternative_form)
|
2021-03-15 20:12:02 +01:00
|
|
|
|
|
|
|
|
|
|
|
def get_page(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The decoded body of the response.

    Exits the program with an error message when the request fails or the
    server answers with a status code other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report connection problems and timeouts here, with the URL, instead
        # of letting them surface as an unrelated error further up.
        sys.exit(f"ERROR {exc} at {url}\n")

    if response.status_code != 200:
        sys.exit(f"ERROR {response.status_code} at {url}\n")
    return response.text
|
|
|
|
|
|
|
|
|
|
|
|
def process_definition_page(soup, form, variant):
    """Save the definition found in *soup*, if there is one.

    The definition is the ``div`` whose id is ``lexicontent``. When present,
    its prettified markup is handed to ``save_definition`` under *form* and
    *variant*.

    Returns:
        True when a definition was found and saved, False otherwise.
    """
    lexicontent = soup.find("div", id="lexicontent")
    if lexicontent:
        save_definition(form, variant, lexicontent.prettify())
        return True
    return False
|
2021-03-15 20:12:02 +01:00
|
|
|
|
|
|
|
|
|
|
|
def save_definition(name, variant, content):
    """Write *content* to ``<cwd>/<name>/<variant>.txt``.

    Args:
        name: Form the definition belongs to; used as the directory name.
        variant: Index of the definition page; used as the file name stem.
        content: Text to store.

    The file is written as UTF-8 so that characters outside the platform's
    default code page cannot make the write fail.

    Exits the program with an error message when the directory or the file
    cannot be created.
    """
    # Trailing dots are dropped from the directory name because of NTFS
    # (presumably because Windows does not keep them; see the original note).
    def_dir = Path.cwd() / name.rstrip(".")
    def_file_path = def_dir / f"{variant}.txt"
    try:
        def_dir.mkdir(exist_ok=True)
        def_file_path.write_text(content, encoding="utf-8")
    except OSError as exc:
        sys.exit(f"Could not save definition at {def_file_path}: {exc}")
|
2021-03-15 20:12:02 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|