From 49db7931a3acb474bd8078cbbb05b5eaf79beff4 Mon Sep 17 00:00:00 2001 From: dece Date: Mon, 28 Nov 2022 18:13:29 +0100 Subject: [PATCH] rework tool to work with the sqlite database --- tlfi/__main__.py | 115 ++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 61 deletions(-) diff --git a/tlfi/__main__.py b/tlfi/__main__.py index a704a58..88d713e 100644 --- a/tlfi/__main__.py +++ b/tlfi/__main__.py @@ -7,12 +7,13 @@ formatting almost like in the TLFi. import argparse import difflib -import gzip +import lzma +import os import re -import unicodedata -from functools import cache -from os import environ +import sqlite3 +import sys from pathlib import Path +from typing import Generator from bs4 import BeautifulSoup, NavigableString @@ -25,80 +26,53 @@ except ImportError: return "" t = DummyTerminal() +TAG_STRIP_RE = re.compile(r"\s+") + def main(): ap = argparse.ArgumentParser(description="TLFi CLI") ap.add_argument("query", help="mot(s) à chercher") - ap.add_argument("-f", "--lexical-forms", - default=get_root_path() / "lexical_forms.txt", - help="fichier des formes lexicales") - ap.add_argument("-d", "--definitions", - default=get_root_path() / "definitions", - help="répertoire des définitions") + ap.add_argument("-d", "--database", help="base de données") args = ap.parse_args() - lookup_res = lookup(args.query, args.lexical_forms) - if lookup_res is None: - exit() + if args.database: + db_path = Path(args.database) + elif (env_db_path := os.environ.get("TLFI_DATABASE")): + db_path = Path(env_db_path) + else: + sys.exit("Pas de base de données et TLFI_DATABASE n'est pas défini.") - if (defs := get_definition_paths(lookup_res, args.definitions)): - for d in defs: - show_definition(d) + lexical_form = args.query + connection = sqlite3.connect(db_path) + printed = 0 + for definition_html in get_definitions(lexical_form, connection): + pretty_print_definition(definition_html) + printed += 1 + if printed == 0: + print("Forme lexicale non trouvée. Suggestions :") + for suggestion in get_suggestions(lexical_form, connection): + print(f"* {suggestion}") -@cache -def get_root_path(): - return Path(environ.get("TLFI_ROOT", ".")) - - -def lookup(query, lexical_forms_path): - """Return a form for which a definition might exist, else None. - - If we are sure the lexical form does not have definitions, suggest similar - words to the user. - """ - with open(lexical_forms_path, "rt") as lexical_forms_file: - forms = lexical_forms_file.readlines() - if query + "\n" in forms: - return query - - print("Suggestions :") - suggestions = ( - form for form in map(str.rstrip, forms) - if difflib.SequenceMatcher(None, query, form).ratio() > 0.8 +def get_definitions( + lexical_form: str, + connection: sqlite3.Connection +) -> Generator[str, None, None]: + cursor = connection.cursor() + result = cursor.execute( + "SELECT definition FROM definitions WHERE lexical_form = ?", + (lexical_form,) ) - for form in suggestions: - print(f"- {form}") - return None + for (blob,) in result.fetchall(): + yield lzma.decompress(blob).decode() -def get_definition_paths(query, definitions) -> list: - """Return a list of definition file paths for this lexical form.""" - nfkd = unicodedata.normalize("NFKD", query[0]) - first_char = next((c for c in nfkd if not unicodedata.combining(c)), "") - if not first_char: - print("Pas compris la première lettre…") - return [] - path = Path(definitions) / first_char.upper() / query - try: - return [f for f in path.iterdir() if str(f).endswith(".txt.gz")] - except FileNotFoundError: - print("Définition non trouvée.") - return [] - - -def show_definition(def_path): - """Print a definition from a definition file.""" - with gzip.open(def_path, "rt") as def_file: - html = def_file.read() +def pretty_print_definition(html: str): soup = BeautifulSoup(html, "html.parser") content = parse_tag(soup.div.div) print(content) -TAG_STRIP_RE = re.compile(r"\s+") - - def parse_tag(tag) -> str: if isinstance(tag, NavigableString): return TAG_STRIP_RE.sub(" ", tag) @@ -124,5 +98,24 @@ def parse_tag(tag) -> str: return content +def get_suggestions( + query: str, + connection: sqlite3.Connection +) -> Generator[str, None, None]: + """Return a form for which a definition might exist, else None. + + If we are sure the lexical form does not have definitions, suggest similar + words to the user. + """ + cursor = connection.cursor() + result = cursor.execute("SELECT lexical_form FROM definitions") + yielded = [] + for (form,) in result.fetchall(): + if difflib.SequenceMatcher(None, query, form).ratio() > 0.8: + if form not in yielded: + yield form + yielded.append(form) + + if __name__ == "__main__": main()