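"""Scrape the lexical forms indexed by the TLFi from cnrtl.fr.

Writes one file per initial letter (lexical_forms_A.txt ... lexical_forms_Z.txt),
one form per line.
"""
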
import string
import sys
import time

import requests
from bs4 import BeautifulSoup


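# Per-letter entry points into the TLFi lexical index on the CNRTL portal.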
BASE_URL = "https://www.cnrtl.fr"
PER_FIRST_LETTER_URL = BASE_URL + "/portailindex/LEXI/TLFI/"


def main():
    for letter in string.ascii_uppercase:
        try:
            with open(f"lexical_forms_{letter}.txt", "wt") as letter_file:
                process_letter(letter, letter_file)
        except OSError as exc:
            sys.exit(f"ERROR {exc}")


def process_letter(letter, letter_file):
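    """Walk every index page for `letter`, writing each form to `letter_file`."""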
    next_url = PER_FIRST_LETTER_URL + letter
    while True:
        print(f"Processing {next_url}")
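        # The timeout (30 s is an arbitrary bound) keeps one stalled
        # request from hanging the whole run.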
        response = requests.get(next_url, timeout=30)
        if response.status_code != 200:
            sys.exit(f"ERROR {response.status_code} at {next_url}")
        next_page = process_page(response.text, letter_file)
        if not next_page:
            break
        next_url = BASE_URL + next_page
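        # Brief pause between index pages to stay polite to the server
        # (the delay length is a guess; tune as needed).
        time.sleep(1)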


def process_page(page, output):
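    """Write the lexical forms found on one index page to `output`.

    Returns the next page's relative URL, or None when there is no next page.
    """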
    soup = BeautifulSoup(page, "html.parser")
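    # The forms are the text of the <td> cells in the "hometab" index table.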
    hometab = soup.find("table", class_="hometab")
    if hometab is None:
        return None
    lexical_forms = [td.string for td in hometab("td")]
    for form in lexical_forms:
        if form:
            output.write(str(form) + "\n")
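    # The pagination controls are in the table following the index table;
    # the link to the next page, when present, sits in its second cell.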
    nav_table = hometab.find_next_sibling("table")
    if not nav_table:
        return None
    nav_table_cells = nav_table("td")
    if len(nav_table_cells) < 2:
        return None
    second_cell = nav_table_cells[1]
    next_page_link = second_cell.find("a")
    if not next_page_link:
        return None
    try:
        return next_page_link["href"]
    except KeyError:
        return None


if __name__ == "__main__":
    main()