rework tool to work with the sqlite database

This commit is contained in:
dece 2022-11-28 18:13:29 +01:00
parent c66f0a5409
commit 49db7931a3

View file

@ -7,12 +7,13 @@ formatting almost like in the TLFi.
import argparse import argparse
import difflib import difflib
import gzip import lzma
import os
import re import re
import unicodedata import sqlite3
from functools import cache import sys
from os import environ
from pathlib import Path from pathlib import Path
from typing import Generator
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
@ -25,80 +26,53 @@ except ImportError:
return "" return ""
t = DummyTerminal() t = DummyTerminal()
TAG_STRIP_RE = re.compile(r"\s+")
def main():
    """Command-line entry point: print the definitions of a lexical form.

    The database path comes from ``-d/--database`` or, failing that, from
    the ``TLFI_DATABASE`` environment variable. Every definition stored for
    the queried form is pretty-printed; when none exists, similar lexical
    forms are listed as suggestions instead.

    Exits with an error message when no database is configured or when the
    configured path is not an existing file.
    """
    ap = argparse.ArgumentParser(description="TLFi CLI")
    ap.add_argument("query", help="mot(s) à chercher")
    ap.add_argument("-d", "--database", help="base de données")
    args = ap.parse_args()

    if args.database:
        db_path = Path(args.database)
    elif (env_db_path := os.environ.get("TLFI_DATABASE")):
        db_path = Path(env_db_path)
    else:
        sys.exit("Pas de base de données et TLFI_DATABASE n'est pas défini.")

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would then fail later with an obscure "no such table"
    # error; refuse early with an explicit message instead.
    if not db_path.is_file():
        sys.exit(f"Base de données introuvable : {db_path}")

    lexical_form = args.query
    connection = sqlite3.connect(db_path)
    try:
        found = False
        for definition_html in get_definitions(lexical_form, connection):
            pretty_print_definition(definition_html)
            found = True
        if not found:
            print("Forme lexicale non trouvée. Suggestions :")
            for suggestion in get_suggestions(lexical_form, connection):
                print(f"* {suggestion}")
    finally:
        # Release the database handle even if printing or a query fails.
        connection.close()
def get_definitions(
    lexical_form: str,
    connection: sqlite3.Connection
) -> Generator[str, None, None]:
    """Yield the HTML of every definition stored for a lexical form.

    Rows are read from the ``definitions`` table where ``lexical_form``
    equals the given form exactly. Each ``definition`` value is an
    LZMA-compressed blob; it is decompressed and decoded (UTF-8, the
    ``bytes.decode`` default) before being yielded.

    Nothing is yielded when the form has no definition.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(
            "SELECT definition FROM definitions WHERE lexical_form = ?",
            (lexical_form,)
        )
        # Iterate the cursor directly so blobs are fetched and decompressed
        # one at a time instead of all being loaded up front.
        for (blob,) in cursor:
            yield lzma.decompress(blob).decode()
    finally:
        cursor.close()
def pretty_print_definition(html: str):
    """Render one definition's HTML as text and print it.

    The HTML is parsed with BeautifulSoup's ``html.parser``; the first
    ``<div>`` nested inside the document's first ``<div>`` is passed to
    ``parse_tag`` and the resulting string is printed.
    """
    document = BeautifulSoup(html, "html.parser")
    print(parse_tag(document.div.div))
TAG_STRIP_RE = re.compile(r"\s+")
def parse_tag(tag) -> str: def parse_tag(tag) -> str:
if isinstance(tag, NavigableString): if isinstance(tag, NavigableString):
return TAG_STRIP_RE.sub(" ", tag) return TAG_STRIP_RE.sub(" ", tag)
@ -124,5 +98,24 @@ def parse_tag(tag) -> str:
return content return content
def get_suggestions(
    query: str,
    connection: sqlite3.Connection,
    cutoff: float = 0.8
) -> Generator[str, None, None]:
    """Yield lexical forms from the database that resemble ``query``.

    Every ``lexical_form`` in the ``definitions`` table is compared with
    ``query`` using ``difflib.SequenceMatcher``; forms whose similarity
    ratio is strictly greater than ``cutoff`` are yielded, each one at most
    once, in the order the database returns them.
    """
    cursor = connection.cursor()
    try:
        cursor.execute("SELECT lexical_form FROM definitions")
        seen = set()
        for (form,) in cursor:
            # A form has one row per definition; score each form only once.
            if form in seen:
                continue
            seen.add(form)
            matcher = difflib.SequenceMatcher(None, query, form)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(), so they can reject most candidates without changing
            # which forms are finally accepted.
            if (
                matcher.real_quick_ratio() > cutoff
                and matcher.quick_ratio() > cutoff
                and matcher.ratio() > cutoff
            ):
                yield form
    finally:
        cursor.close()
# Run the CLI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()