From ba27b4cd9abbc92e663f9217f24ce7d9e4e7c970 Mon Sep 17 00:00:00 2001
From: dece
Date: Tue, 16 Mar 2021 00:38:31 +0100
Subject: [PATCH] get-definitions: handle ambiguous and missing defs

---
 get-definitions.py | 53 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/get-definitions.py b/get-definitions.py
index 4a9a603..2a596f4 100644
--- a/get-definitions.py
+++ b/get-definitions.py
@@ -37,32 +37,55 @@ def get_definitions(form):
     print(f"Processing {url}")
     content = get_page(url)
     soup = BeautifulSoup(content, "html.parser")
-    process_definition_page(soup, form, 0)
-    # Check other definitions.
-    tabs_bar = soup.find("div", id="vtoolbar")
-    if not tabs_bar:
-        exit("No tabs bar!\n")
-    num_defs = len(tabs_bar("li"))
-    for variant in range(1, num_defs):  # skip the first, we got it above.
-        variant_url = url + f"/{variant}"
-        print(f"Processing {variant_url}")
-        content = get_page(variant_url)
-        soup = BeautifulSoup(content, "html.parser")
-        process_definition_page(soup, form, variant)
+    success = process_definition_page(soup, form, 0)
+    if success:
+        # Check other definitions.
+        tabs_bar = soup.find("div", id="vtoolbar")
+        if not tabs_bar:
+            exit("No tabs bar!\n")
+        num_defs = len(tabs_bar("li"))
+        for variant in range(1, num_defs):  # Skip the first, we got it above.
+            variant_url = url + f"/{variant}"
+            print(f"Processing {variant_url}")
+            content = get_page(variant_url)
+            soup = BeautifulSoup(content, "html.parser")
+            process_definition_page(soup, form, variant)
+    else:
+        # Might be an ambiguity?
+        contentbox = soup.find("div", id="contentbox")
+        if not contentbox:
+            exit("No contentbox!")
+        h2 = contentbox.find("h2")
+        if h2:
+            title = str(h2.string).strip()
+            if title == "Terme introuvable":
+                links = contentbox("a")
+                for link in links:
+                    alternative_form = str(link.string).strip()
+                    if alternative_form == form:
+                        continue
+                    get_definitions(alternative_form)
+            elif title == "Erreur":
+                print("Nothing there :O")
+            else:
+                exit(f"Unknown error title: {h2}")
+        else:
+            exit("Bork! no definition nor ambiguity list")
 
 
 def get_page(url):
     response = requests.get(url)
     if response.status_code != 200:
-        sys.stderr.write(f"ERROR {response.status_code} at {url}\n")
+        exit(f"ERROR {response.status_code} at {url}\n")
     return response.text
 
 
 def process_definition_page(soup, form, variant):
     definition = soup.find("div", id="lexicontent")
     if not definition:
-        exit("No definition!\n")
+        return False
     save_definition(form, variant, definition.prettify())
+    return True
 
 
 def save_definition(name, variant, content):
@@ -73,7 +96,7 @@ def save_definition(name, variant, content):
         with open(def_file_path, "wt") as def_file:
             def_file.write(content)
     except OSError as exc:
-        sys.stderr.write(f"Could not save definition at {def_file_path}: {exc}")
+        exit(f"Could not save definition at {def_file_path}: {exc}")
 
 
 if __name__ == "__main__":