Rework the tool to work with the SQLite database
This commit is contained in:
parent
c66f0a5409
commit
49db7931a3
115
tlfi/__main__.py
115
tlfi/__main__.py
|
@ -7,12 +7,13 @@ formatting almost like in the TLFi.
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import difflib
|
import difflib
|
||||||
import gzip
|
import lzma
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import sqlite3
|
||||||
from functools import cache
|
import sys
|
||||||
from os import environ
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
|
||||||
|
@ -25,80 +26,53 @@ except ImportError:
|
||||||
return ""
|
return ""
|
||||||
t = DummyTerminal()
|
t = DummyTerminal()
|
||||||
|
|
||||||
|
TAG_STRIP_RE = re.compile(r"\s+")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
ap = argparse.ArgumentParser(description="TLFi CLI")
|
ap = argparse.ArgumentParser(description="TLFi CLI")
|
||||||
ap.add_argument("query", help="mot(s) à chercher")
|
ap.add_argument("query", help="mot(s) à chercher")
|
||||||
ap.add_argument("-f", "--lexical-forms",
|
ap.add_argument("-d", "--database", help="base de données")
|
||||||
default=get_root_path() / "lexical_forms.txt",
|
|
||||||
help="fichier des formes lexicales")
|
|
||||||
ap.add_argument("-d", "--definitions",
|
|
||||||
default=get_root_path() / "definitions",
|
|
||||||
help="répertoire des définitions")
|
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
lookup_res = lookup(args.query, args.lexical_forms)
|
if args.database:
|
||||||
if lookup_res is None:
|
db_path = Path(args.database)
|
||||||
exit()
|
elif (env_db_path := os.environ.get("TLFI_DATABASE")):
|
||||||
|
db_path = Path(env_db_path)
|
||||||
|
else:
|
||||||
|
sys.exit("Pas de base de données et TLFI_DATABASE n'est pas défini.")
|
||||||
|
|
||||||
if (defs := get_definition_paths(lookup_res, args.definitions)):
|
lexical_form = args.query
|
||||||
for d in defs:
|
connection = sqlite3.connect(db_path)
|
||||||
show_definition(d)
|
printed = 0
|
||||||
|
for definition_html in get_definitions(lexical_form, connection):
|
||||||
|
pretty_print_definition(definition_html)
|
||||||
|
printed += 1
|
||||||
|
if printed == 0:
|
||||||
|
print("Forme lexicale non trouvée. Suggestions :")
|
||||||
|
for suggestion in get_suggestions(lexical_form, connection):
|
||||||
|
print(f"* {suggestion}")
|
||||||
|
|
||||||
|
|
||||||
@cache
|
def get_definitions(
|
||||||
def get_root_path():
|
lexical_form: str,
|
||||||
return Path(environ.get("TLFI_ROOT", "."))
|
connection: sqlite3.Connection
|
||||||
|
) -> Generator[str, None, None]:
|
||||||
|
cursor = connection.cursor()
|
||||||
def lookup(query, lexical_forms_path):
|
result = cursor.execute(
|
||||||
"""Return a form for which a definition might exist, else None.
|
"SELECT definition FROM definitions WHERE lexical_form = ?",
|
||||||
|
(lexical_form,)
|
||||||
If we are sure the lexical form does not have definitions, suggest similar
|
|
||||||
words to the user.
|
|
||||||
"""
|
|
||||||
with open(lexical_forms_path, "rt") as lexical_forms_file:
|
|
||||||
forms = lexical_forms_file.readlines()
|
|
||||||
if query + "\n" in forms:
|
|
||||||
return query
|
|
||||||
|
|
||||||
print("Suggestions :")
|
|
||||||
suggestions = (
|
|
||||||
form for form in map(str.rstrip, forms)
|
|
||||||
if difflib.SequenceMatcher(None, query, form).ratio() > 0.8
|
|
||||||
)
|
)
|
||||||
for form in suggestions:
|
for (blob,) in result.fetchall():
|
||||||
print(f"- {form}")
|
yield lzma.decompress(blob).decode()
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_definition_paths(query, definitions) -> list:
|
def pretty_print_definition(html: str):
|
||||||
"""Return a list of definition file paths for this lexical form."""
|
|
||||||
nfkd = unicodedata.normalize("NFKD", query[0])
|
|
||||||
first_char = next((c for c in nfkd if not unicodedata.combining(c)), "")
|
|
||||||
if not first_char:
|
|
||||||
print("Pas compris la première lettre…")
|
|
||||||
return []
|
|
||||||
path = Path(definitions) / first_char.upper() / query
|
|
||||||
try:
|
|
||||||
return [f for f in path.iterdir() if str(f).endswith(".txt.gz")]
|
|
||||||
except FileNotFoundError:
|
|
||||||
print("Définition non trouvée.")
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def show_definition(def_path):
|
|
||||||
"""Print a definition from a definition file."""
|
|
||||||
with gzip.open(def_path, "rt") as def_file:
|
|
||||||
html = def_file.read()
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
content = parse_tag(soup.div.div)
|
content = parse_tag(soup.div.div)
|
||||||
print(content)
|
print(content)
|
||||||
|
|
||||||
|
|
||||||
TAG_STRIP_RE = re.compile(r"\s+")
|
|
||||||
|
|
||||||
|
|
||||||
def parse_tag(tag) -> str:
|
def parse_tag(tag) -> str:
|
||||||
if isinstance(tag, NavigableString):
|
if isinstance(tag, NavigableString):
|
||||||
return TAG_STRIP_RE.sub(" ", tag)
|
return TAG_STRIP_RE.sub(" ", tag)
|
||||||
|
@ -124,5 +98,24 @@ def parse_tag(tag) -> str:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
def get_suggestions(
    query: str,
    connection: sqlite3.Connection,
    threshold: float = 0.8,
) -> Generator[str, None, None]:
    """Yield the lexical forms in the database that look similar to *query*.

    Every ``lexical_form`` value of the ``definitions`` table is compared to
    *query* with ``difflib.SequenceMatcher``; forms whose similarity ratio is
    strictly greater than *threshold* are yielded, each one at most once, in
    the order the rows are returned by the database.

    Args:
        query: the lexical form typed by the user.
        connection: an open connection to a database that has a
            ``definitions`` table with a ``lexical_form`` column.
        threshold: exclusive lower bound on the similarity ratio.

    Yields:
        Each distinct matching lexical form.
    """
    cursor = connection.cursor()
    cursor.execute("SELECT lexical_form FROM definitions")
    # All rows are fetched before the first yield, so no result set is left
    # half-read while the caller is processing suggestions.
    rows = cursor.fetchall()

    seen = set()
    for (form,) in rows:
        # A form may appear on several rows; skip repeats before doing any
        # comparison work.
        if form in seen:
            continue
        matcher = difflib.SequenceMatcher(None, query, form)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so failing either one rules the form out without running
        # the expensive full comparison. The set of results is unchanged.
        if (
            matcher.real_quick_ratio() > threshold
            and matcher.quick_ratio() > threshold
            and matcher.ratio() > threshold
        ):
            seen.add(form)
            yield form
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
Loading…
Reference in a new issue