get-definitions: handle ambiguous and missing defs

This commit is contained in:
dece 2021-03-16 00:38:31 +01:00
parent d4eb485f45
commit ba27b4cd9a

View file

@ -37,32 +37,55 @@ def get_definitions(form):
print(f"Processing {url}") print(f"Processing {url}")
content = get_page(url) content = get_page(url)
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content, "html.parser")
process_definition_page(soup, form, 0) success = process_definition_page(soup, form, 0)
if success:
# Check other definitions. # Check other definitions.
tabs_bar = soup.find("div", id="vtoolbar") tabs_bar = soup.find("div", id="vtoolbar")
if not tabs_bar: if not tabs_bar:
exit("No tabs bar!\n") exit("No tabs bar!\n")
num_defs = len(tabs_bar("li")) num_defs = len(tabs_bar("li"))
for variant in range(1, num_defs): # skip the first, we got it above. for variant in range(1, num_defs): # Skip the first, we got it above.
variant_url = url + f"/{variant}" variant_url = url + f"/{variant}"
print(f"Processing {variant_url}") print(f"Processing {variant_url}")
content = get_page(variant_url) content = get_page(variant_url)
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content, "html.parser")
process_definition_page(soup, form, variant) process_definition_page(soup, form, variant)
else:
# Might be an ambiguity?
contentbox = soup.find("div", id="contentbox")
if not contentbox:
exit("No contentbox!")
h2 = contentbox.find("h2")
if h2:
title = str(h2.string).strip()
if title == "Terme introuvable":
links = contentbox("a")
for link in links:
alternative_form = str(link.string).strip()
if alternative_form == form:
continue
get_definitions(alternative_form)
elif title == "Erreur":
print("Nothing there :O")
else:
exit(f"Unknown error title: {h2}")
else:
exit("Bork! no definition nor ambiguity list")
def get_page(url):
    """Fetch `url` over HTTP and return the response body as text.

    The program is terminated with an error message when the request
    cannot be completed or when the server answers with any status code
    other than 200.
    """
    try:
        # Requests waits forever unless given a timeout; bound the wait so a
        # stalled server cannot hang the whole scraping run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Same fail-fast policy as for bad status codes below.
        exit(f"ERROR {exc} at {url}\n")
    if response.status_code != 200:
        exit(f"ERROR {response.status_code} at {url}\n")
    return response.text
def process_definition_page(soup, form, variant):
    """Extract the definition block from a parsed page and save it.

    Looks up the `<div id="lexicontent">` element in `soup`. When it is
    present, its prettified markup is handed to `save_definition` together
    with `form` and `variant`.

    Returns True when a definition was found and saved, False when the page
    contains no such element, so the caller can decide how to handle a page
    without a definition.
    """
    definition = soup.find("div", id="lexicontent")
    # find() yields None when nothing matches; test for that explicitly
    # instead of relying on the truthiness of the returned element.
    if definition is None:
        return False
    save_definition(form, variant, definition.prettify())
    return True
def save_definition(name, variant, content): def save_definition(name, variant, content):
@ -73,7 +96,7 @@ def save_definition(name, variant, content):
with open(def_file_path, "wt") as def_file: with open(def_file_path, "wt") as def_file:
def_file.write(content) def_file.write(content)
except OSError as exc: except OSError as exc:
sys.stderr.write(f"Could not save definition at {def_file_path}: {exc}") exit(f"Could not save definition at {def_file_path}: {exc}")
if __name__ == "__main__": if __name__ == "__main__":