tlfi: use compressed def files

This commit is contained in:
dece 2021-11-07 00:26:57 +01:00
parent 5737895738
commit f3d8f08578

38
tlfi.py
View file

@ -8,6 +8,7 @@ maybe colors!
import argparse import argparse
import difflib import difflib
import re import re
import gzip
import unicodedata import unicodedata
from pathlib import Path from pathlib import Path
@ -25,33 +26,34 @@ except ImportError:
def main():
    """Run the TLFi command-line interface.

    Parse the command line, resolve the query with lookup() against the
    lexical forms file, then show every definition file that
    get_definition_paths() returns for the resolved form.

    Raises:
        SystemExit: when lookup() returns None (no form to show).
    """
    # Local import: only this early-exit path needs it.
    import sys

    ap = argparse.ArgumentParser(description="TLFi CLI")
    ap.add_argument("query", help="mot(s) à chercher")
    ap.add_argument("-f", "--lexical-forms", default="lexical_forms.txt",
                    help="fichier des formes lexicales")
    ap.add_argument("-d", "--definitions", default="definitions",
                    help="répertoire des définitions")
    args = ap.parse_args()

    lookup_res = lookup(args.query, args.lexical_forms)
    if lookup_res is None:
        # sys.exit() instead of the bare exit() helper: the latter is
        # injected by the `site` module for interactive use and is absent
        # under `python -S` or in frozen builds.
        sys.exit()

    # get_definition_paths() returns a (possibly empty) list, so iterating
    # it directly covers the "nothing found" case.
    for d in get_definition_paths(lookup_res, args.definitions):
        show_definition(d)
def lookup(query, lexical_form_path): def lookup(query, lexical_forms_path):
"""Return a form for which a definition might exist, else None. """Return a form for which a definition might exist, else None.
If we are sure the lexical form does not have definitions, suggest similar If we are sure the lexical form does not have definitions, suggest similar
words to the user. words to the user.
""" """
with open(lexical_form_path, "rt") as lf_file: with open(lexical_forms_path, "rt") as lexical_forms_file:
forms = lf_file.readlines() forms = lexical_forms_file.readlines()
if query + "\n" in forms: if query + "\n" in forms:
return query return query
print("Did you mean:") print("Suggestions :")
suggestions = ( suggestions = (
form for form in map(str.rstrip, forms) form for form in map(str.rstrip, forms)
if difflib.SequenceMatcher(None, query, form).ratio() > 0.8 if difflib.SequenceMatcher(None, query, form).ratio() > 0.8
@ -61,24 +63,24 @@ def lookup(query, lexical_form_path):
return None return None
def get_definition_paths(query, definitions):
    """Return a list of definition file paths for this lexical form.

    Definition files are looked up in
    ``<definitions>/<FIRST LETTER>/<query>/``, where the first letter is the
    query's first character with combining marks removed, upper-cased.
    Only entries whose name ends with ``.txt.gz`` are returned.

    Args:
        query: the lexical form to look up.
        definitions: path of the root definitions directory.

    Returns:
        A list of ``Path`` objects, empty when the first letter cannot be
        determined or when the form's directory does not exist (a message
        is printed in both cases).
    """
    # query[:1] instead of query[0]: an empty query then falls through to
    # the "no first letter" branch below instead of raising IndexError.
    nfkd = unicodedata.normalize("NFKD", query[:1])
    # Keep the first non-combining character of the decomposition, which
    # drops accents ("é" is decomposed into "e" + combining acute).
    first_char = next((c for c in nfkd if not unicodedata.combining(c)), "")
    if not first_char:
        print("Pas compris la première lettre")
        return []
    path = Path(definitions) / first_char.upper() / query
    try:
        # Match on the file name; Path.suffix would only report ".gz".
        return [f for f in path.iterdir() if f.name.endswith(".txt.gz")]
    except FileNotFoundError:
        print("Définition non trouvée.")
        return []
def show_definition(def_path): def show_definition(def_path):
"""Print a definition from a definition file.""" """Print a definition from a definition file."""
with open(def_path, "rt") as def_file: with gzip.open(def_path, "rt") as def_file:
html = def_file.read() html = def_file.read()
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
content = parse_tag(soup.div.div) content = parse_tag(soup.div.div)
@ -104,6 +106,8 @@ def parse_tag(tag):
content = f"{t.red}{content}{t.normal}" content = f"{t.red}{content}{t.normal}"
if "tlf_csyntagme" in classes: if "tlf_csyntagme" in classes:
content = f"{t.green}{content}{t.normal}" content = f"{t.green}{content}{t.normal}"
if "tlf_cmot" in classes:
content = f"{t.reverse}{content}{t.normal}"
if tag.name == "b": if tag.name == "b":
content = f"{t.bold}{content}{t.normal}" content = f"{t.bold}{content}{t.normal}"
if tag.name == "i": if tag.name == "i":