"""Scrape lexical forms from the CNRTL TLFi portal index.

For each letter A-Z, walks the paginated index at
https://www.cnrtl.fr/portailindex/LEXI/TLFI/<letter> and writes one
lexical form per line into lexical_forms_<letter>.txt.
"""

import string
import sys
import time  # NOTE(review): unused here — kept in case rate-limiting is added

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.cnrtl.fr"
PER_FIRST_LETTER_URL = BASE_URL + "/portailindex/LEXI/TLFI/"

# Seconds to wait for each HTTP response; without a timeout, requests.get
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def main():
    """Scrape every letter's index into its own output file."""
    for letter in string.ascii_uppercase:
        try:
            # Explicit UTF-8: lexical forms contain accented characters and
            # the platform default encoding may not handle them.
            with open(
                f"lexical_forms_{letter}.txt", "wt", encoding="utf-8"
            ) as letter_file:
                process_letter(letter, letter_file)
        except OSError as exc:
            # sys.exit instead of the site-provided exit(): the latter is
            # meant for interactive use and is not guaranteed to exist.
            sys.exit(f"ERROR: cannot write output for letter {letter}: {exc}")


def process_letter(letter, letter_file):
    """Follow the paginated index for *letter*, writing every form found.

    Stops when a page offers no "next page" link.
    """
    next_url = PER_FIRST_LETTER_URL + letter
    while True:
        print(f"Processing {next_url}")
        response = requests.get(next_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            sys.exit(f"ERROR {response.status_code} at {next_url}")
        next_page = process_page(response.text, letter_file)
        if not next_page:
            break
        next_url = BASE_URL + next_page


def process_page(page, output):
    """Extract lexical forms from one index page and write them to *output*.

    Returns the relative URL of the next page, or None when this is the
    last page or the page layout is not the expected one.
    """
    soup = BeautifulSoup(page, "html.parser")
    hometab = soup.find("table", class_="hometab")
    if hometab is None:
        # Unexpected layout (e.g. an error page): nothing to extract,
        # and no next page to follow. Previously this crashed with a
        # TypeError when calling None.
        return None
    for cell in hometab("td"):
        form = cell.string
        if form:  # skip empty / decorative cells
            output.write(str(form) + "\n")
    # The navigation table immediately follows the forms table; its second
    # cell holds the forward ("next page") link.
    nav_table = hometab.find_next_sibling("table")
    if not nav_table:
        return None
    nav_cells = nav_table("td")
    if len(nav_cells) < 2:
        return None
    next_page_link = nav_cells[1].find("a")
    if not next_page_link:
        return None
    # Tag.get returns None when the anchor has no href attribute,
    # replacing the try/except KeyError around subscripting.
    return next_page_link.get("href")


if __name__ == "__main__":
    main()