rework tool to work with the sqlite database

This commit is contained in:
dece 2022-11-28 18:13:29 +01:00
parent c66f0a5409
commit 49db7931a3

View file

@ -7,12 +7,13 @@ formatting almost like in the TLFi.
import argparse
import difflib
import gzip
import lzma
import os
import re
import unicodedata
from functools import cache
from os import environ
import sqlite3
import sys
from pathlib import Path
from typing import Generator
from bs4 import BeautifulSoup, NavigableString
@ -25,80 +26,53 @@ except ImportError:
return ""
t = DummyTerminal()
# Matches any run of whitespace; parse_tag replaces each run with one space.
TAG_STRIP_RE = re.compile(r"\s+")
def main(argv=None):
    """Look up a lexical form in the TLFi SQLite database and print it.

    The database path comes from ``-d/--database`` or, failing that, from
    the ``TLFI_DATABASE`` environment variable; the program exits with an
    error message when neither is set. Every definition stored for the
    queried form is pretty-printed. When there is none, similar lexical
    forms are printed as suggestions instead.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser(description="TLFi CLI")
    ap.add_argument("query", help="mot(s) à chercher")
    ap.add_argument("-d", "--database", help="base de données")
    args = ap.parse_args(argv)

    # The explicit option takes precedence over the environment variable.
    if args.database:
        db_path = Path(args.database)
    elif (env_db_path := os.environ.get("TLFI_DATABASE")):
        db_path = Path(env_db_path)
    else:
        sys.exit("Pas de base de données et TLFI_DATABASE n'est pas défini.")

    lexical_form = args.query
    connection = sqlite3.connect(db_path)
    try:
        # get_definitions is a generator, so count what was actually printed
        # to find out whether the form exists at all.
        printed = 0
        for definition_html in get_definitions(lexical_form, connection):
            pretty_print_definition(definition_html)
            printed += 1
        if printed == 0:
            print("Forme lexicale non trouvée. Suggestions :")
            for suggestion in get_suggestions(lexical_form, connection):
                print(f"* {suggestion}")
    finally:
        # Release the database handle even if printing fails midway.
        connection.close()
@cache
def get_root_path() -> Path:
    """Return the TLFi data root directory.

    The value is taken from the ``TLFI_ROOT`` environment variable and falls
    back to the current directory. The result is cached, so the environment
    is only consulted on the first call.
    """
    return Path(environ.get("TLFI_ROOT", "."))
def lookup(query, lexical_forms_path):
    """Return a form for which a definition might exist, else None.

    If we are sure the lexical form does not have definitions, suggest similar
    words to the user.

    Args:
        query: Lexical form to look for.
        lexical_forms_path: Path of a text file listing one lexical form
            per line.

    Returns:
        ``query`` when it is listed in the file. Otherwise ``None``, after
        printing every listed form whose difflib similarity ratio with
        ``query`` exceeds 0.8.
    """
    with open(lexical_forms_path, "rt") as lexical_forms_file:
        # Strip only the line terminator so that the last line matches even
        # when the file does not end with a newline.
        forms = [line.rstrip("\n") for line in lexical_forms_file]
    if query in forms:
        return query
    print("Suggestions :")
    for form in map(str.rstrip, forms):
        if difflib.SequenceMatcher(None, query, form).ratio() > 0.8:
            print(f"- {form}")
    return None


def get_definitions(
    lexical_form: str,
    connection: sqlite3.Connection
) -> Generator[str, None, None]:
    """Yield the HTML of every definition stored for ``lexical_form``.

    Rows are read from the ``definitions`` table; each ``definition`` value
    is an LZMA-compressed blob that is decompressed and decoded to text.

    Args:
        lexical_form: Exact lexical form to search for.
        connection: Open connection to the database.

    Yields:
        One decoded definition per matching row; nothing when the form has
        no row.
    """
    cursor = connection.cursor()
    result = cursor.execute(
        "SELECT definition FROM definitions WHERE lexical_form = ?",
        (lexical_form,)
    )
    for (blob,) in result.fetchall():
        yield lzma.decompress(blob).decode()
def get_definition_paths(query, definitions) -> list:
    """Return a list of definition file paths for this lexical form.

    Files are searched in ``<definitions>/<LETTER>/<query>/``, where LETTER
    is the first character of ``query`` with combining marks removed and
    upper-cased. Only entries whose name ends with ``.txt.gz`` are kept.

    Args:
        query: Lexical form to look for.
        definitions: Root directory of the definition files.

    Returns:
        The matching paths in sorted order, or an empty list (after printing
        a message) when the first letter cannot be determined or the
        directory does not exist.
    """
    # Slice instead of indexing so an empty query reaches the "not
    # understood" branch rather than raising IndexError.
    nfkd = unicodedata.normalize("NFKD", query[:1])
    first_char = next((c for c in nfkd if not unicodedata.combining(c)), "")
    if not first_char:
        print("Pas compris la première lettre…")
        return []
    path = Path(definitions) / first_char.upper() / query
    try:
        # iterdir() order is arbitrary; sort for a deterministic result.
        return sorted(f for f in path.iterdir() if f.name.endswith(".txt.gz"))
    except FileNotFoundError:
        print("Définition non trouvée.")
        return []
def show_definition(def_path):
    """Print a definition from a definition file.

    Args:
        def_path: Path of a gzip-compressed file containing the
            definition's HTML.
    """
    with gzip.open(def_path, "rt") as def_file:
        html = def_file.read()
    pretty_print_definition(html)


def pretty_print_definition(html: str) -> None:
    """Parse a definition's HTML and print its formatted text.

    The text is produced by parse_tag from the first ``<div>`` nested
    inside the first ``<div>`` of the document.

    Args:
        html: HTML source of one definition.
    """
    soup = BeautifulSoup(html, "html.parser")
    content = parse_tag(soup.div.div)
    print(content)
# Matches any run of whitespace; parse_tag replaces each run with one space.
TAG_STRIP_RE = re.compile(r"\s+")
def parse_tag(tag) -> str:
if isinstance(tag, NavigableString):
return TAG_STRIP_RE.sub(" ", tag)
@ -124,5 +98,24 @@ def parse_tag(tag) -> str:
return content
def get_suggestions(
    query: str,
    connection: sqlite3.Connection
) -> Generator[str, None, None]:
    """Yield the stored lexical forms that closely resemble ``query``.

    Every ``lexical_form`` value of the ``definitions`` table is compared
    with ``query`` using difflib.SequenceMatcher. Forms whose similarity
    ratio is strictly greater than 0.8 are yielded once each, in the order
    the rows are returned.

    Args:
        query: Lexical form typed by the user.
        connection: Open connection to the database.

    Yields:
        Each distinct similar lexical form.
    """
    cursor = connection.cursor()
    result = cursor.execute("SELECT lexical_form FROM definitions")
    # A form can span several rows; remember every form already examined so
    # the costly similarity ratio is computed at most once per form.
    seen = set()
    for (form,) in result.fetchall():
        if form in seen:
            continue
        seen.add(form)
        if difflib.SequenceMatcher(None, query, form).ratio() > 0.8:
            yield form
# Run the CLI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()