init
This commit is contained in:
commit
fb43e28c6e
59
build-form-list.py
Normal file
59
build-form-list.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
import string
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Root of the CNRTL website; relative "next page" links are appended to it.
BASE_URL = "https://www.cnrtl.fr"
# Index of lexical forms; a first letter is appended to select a listing.
PER_FIRST_LETTER_URL = f"{BASE_URL}/portailindex/LEXI/TLFI/"
|
||||
|
||||
|
||||
def main():
    """Scrape the lexical forms index, writing one output file per letter.

    For every uppercase ASCII letter, ``lexical_forms_<letter>.txt`` is
    created in the current directory and filled by ``process_letter``.
    The program exits with an error message on the first ``OSError``.
    """
    for letter in string.ascii_uppercase:
        try:
            with open(f"lexical_forms_{letter}.txt", "wt") as letter_file:
                process_letter(letter, letter_file)
        except OSError as exc:
            # requests' exceptions derive from IOError/OSError, so this
            # handler also reports network failures raised while scraping,
            # not only problems with the output file.
            # sys.exit is used because the bare ``exit`` builtin is only
            # injected by the ``site`` module and is not always available.
            sys.exit(f"ERROR while processing letter {letter}: {exc}")
|
||||
|
||||
|
||||
def process_letter(letter, letter_file, timeout=30):
    """Write every lexical form listed under ``letter`` to ``letter_file``.

    Starts at the index page for ``letter`` and keeps following the
    "next page" link returned by ``process_page`` until there is none.

    Args:
        letter: First letter selecting the index listing to scrape.
        letter_file: Writable text file receiving one form per line.
        timeout: Seconds to wait for each HTTP request before giving up.

    Exits the program if a page does not answer with HTTP 200.
    """
    next_url = PER_FIRST_LETTER_URL + letter
    while next_url:
        print(f"Processing {next_url}")
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(next_url, timeout=timeout)
        if response.status_code != 200:
            sys.exit(f"ERROR {response.status_code} at {next_url}")
        next_page = process_page(response.text, letter_file)
        # ``process_page`` returns a site-relative link, or None on the last page.
        next_url = BASE_URL + next_page if next_page else None
|
||||
|
||||
|
||||
def process_page(page, output):
    """Extract the lexical forms of one index page and locate the next page.

    Args:
        page: HTML text of an index page.
        output: Writable text file; each form is written on its own line.

    Returns:
        The ``href`` of the "next page" link, or ``None`` when no such link
        is found or the page has no ``hometab`` table at all.
    """
    soup = BeautifulSoup(page, "html.parser")
    hometab = soup.find("table", class_="hometab")
    if hometab is None:
        # An unexpected page layout previously crashed with AttributeError.
        sys.stderr.write("No forms table!\n")
        return None

    for cell in hometab("td"):
        # ``.string`` is None for cells without a single text child; those
        # cells, like empty strings, are skipped.
        form = cell.string
        if form:
            output.write(f"{form}\n")

    # The table right after the forms table holds the navigation links;
    # the link to the next page is looked up in its second cell.
    nav_table = hometab.find_next_sibling("table")
    if nav_table is None:
        return None
    nav_cells = nav_table("td")
    if len(nav_cells) < 2:
        return None
    next_page_link = nav_cells[1].find("a")
    if next_page_link is None:
        return None
    # ``Tag.get`` yields None when the anchor has no href attribute.
    return next_page_link.get("href")
|
||||
|
||||
|
||||
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
80
get_definitions.py
Normal file
80
get_definitions.py
Normal file
|
@ -0,0 +1,80 @@
|
|||
import os
|
||||
import sys
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from requests import models
|
||||
|
||||
|
||||
# Base URL of the definition pages; the URL-quoted lexical form is appended to it.
BASE_URL = "https://www.cnrtl.fr/definition/"
|
||||
|
||||
|
||||
def main():
    """Download the definitions of every form listed in a forms file.

    Command line: ``FORMS_LIST_FILE [START_FROM]``. The file holds one
    lexical form per line. When ``START_FROM`` is given, forms are skipped
    until that exact form is met; processing resumes with it.
    """
    if len(sys.argv) < 2:
        # Previously a missing argument crashed with IndexError.
        sys.exit(f"Usage: {sys.argv[0]} FORMS_LIST_FILE [START_FROM]")
    forms_list_filepath = sys.argv[1]
    start_from = sys.argv[2] if len(sys.argv) > 2 else ""

    # Only the open() is guarded: errors raised later while scraping or
    # saving must not be misreported as a failure to open the list file.
    try:
        forms_list_file = open(forms_list_filepath, "rt")
    except OSError:
        sys.exit("Could not open forms list file.")

    with forms_list_file:
        for line in forms_list_file:
            form = line.rstrip()
            if not form:
                # A blank line would otherwise be fetched as an empty form.
                continue
            if start_from:
                if start_from != form:
                    continue
                # Resume point reached: stop skipping from here on.
                start_from = ""
            get_definitions(form)
|
||||
|
||||
|
||||
def get_definitions(form):
    """Fetch and save every definition variant available for ``form``.

    The main page is saved as variant 0. The number of ``li`` entries in
    the ``vtoolbar`` div gives the total variant count; the remaining
    variants are fetched from ``<url>/<variant>``.
    """
    first_url = BASE_URL + urllib.parse.quote(form)
    print(f"Processing {first_url}")
    first_soup = BeautifulSoup(get_page(first_url), "html.parser")
    process_definition_page(first_soup, form, 0)

    # The tabs bar lists one entry per definition variant.
    toolbar = first_soup.find("div", id="vtoolbar")
    if not toolbar:
        sys.stderr.write("No tabs bar!\n")
        return

    total_variants = len(toolbar("li"))
    variant = 1  # variant 0 was handled above
    while variant < total_variants:
        variant_url = f"{first_url}/{variant}"
        print(f"Processing {variant_url}")
        variant_soup = BeautifulSoup(get_page(variant_url), "html.parser")
        process_definition_page(variant_soup, form, variant)
        variant += 1
|
||||
|
||||
|
||||
def get_page(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body of the response. A status other than 200 is reported on
        stderr, but the body is still returned to the caller.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        sys.stderr.write(f"ERROR {response.status_code} at {url}\n")
    return response.text
|
||||
|
||||
|
||||
def process_definition_page(soup, form, variant):
    """Save the ``lexicontent`` div of a parsed definition page.

    The prettified markup of the div is handed to ``save_definition``;
    when the page has no such div, a message is written to stderr instead.
    """
    content_div = soup.find("div", id="lexicontent")
    if content_div:
        save_definition(form, variant, content_div.prettify())
        return
    sys.stderr.write("No definition!\n")
|
||||
|
||||
|
||||
def save_definition(name, variant, content):
    """Write ``content`` to ``<cwd>/<name>/<variant>.txt``.

    Args:
        name: Lexical form; used as the name of the directory to write in.
        variant: Index of the definition variant; used as the file stem.
        content: Text to store.

    Failures to create the directory or write the file are reported on
    stderr and do not interrupt the caller.
    """
    def_dir = Path.cwd() / name
    def_file_path = def_dir / f"{variant}.txt"
    try:
        # mkdir is inside the try so a failure is reported like a write
        # failure instead of aborting the whole run.
        def_dir.mkdir(exist_ok=True)
        # Explicit UTF-8: the locale encoding may be unable to encode
        # characters present in the scraped page.
        with open(def_file_path, "wt", encoding="utf-8") as def_file:
            def_file.write(content)
    except OSError as exc:
        sys.stderr.write(f"Could not save definition at {def_file_path}: {exc}\n")
|
||||
|
||||
|
||||
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue