Compare commits

...

6 Commits

1
.gitignore vendored

@ -1,3 +1,4 @@
__pycache__/
/build/
/dist/
/*.egg-info/

@ -0,0 +1,4 @@
TLFi
====
http://atilf.atilf.fr/

@ -1,3 +1,6 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[tool.mypy]
ignore_missing_imports = true

@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""Build an SQLite database from definition files."""
import gzip
import lzma
import os
import sqlite3
import sys
from pathlib import Path
def main():
    """Build ``tlfi.db`` in the current directory from the TLFi data tree.

    The data root is read from the ``TLFI_ROOT`` environment variable. The
    lexical forms file is looked up at ``<root>/lexical_forms.txt`` and the
    definitions at ``<root>/definitions``.

    Raises:
        SystemExit: if ``TLFI_ROOT`` is unset or empty.
    """
    tlfi_root = os.environ.get("TLFI_ROOT")
    if not tlfi_root:
        sys.exit("No TLFI_ROOT environment variable.")
    tlfi_root_path = Path(tlfi_root)
    lexical_forms_path = tlfi_root_path / "lexical_forms.txt"
    definitions_path = tlfi_root_path / "definitions"
    db_path = Path.cwd() / "tlfi.db"
    conn = sqlite3.connect(db_path)
    try:
        create_database(conn)
        fill_database(conn, lexical_forms_path, definitions_path)
        # Commit only once everything was inserted successfully.
        conn.commit()
    finally:
        # Always release the database handle, even when filling fails.
        conn.close()
def create_database(conn: sqlite3.Connection):
    """Create the ``definitions`` table and its ``idx_forms`` index.

    Both statements use ``if not exists``, so running this against an
    already initialised database changes nothing.
    """
    table_sql = (
        "create table if not exists definitions ("
        " id integer primary key,"
        " lexical_form text,"
        " definition blob not null"
        ");"
    )
    index_sql = (
        "create index if not exists idx_forms on definitions(lexical_form);"
    )
    db_cursor = conn.cursor()
    for statement in (table_sql, index_sql):
        db_cursor.execute(statement)
def fill_database(
    conn: sqlite3.Connection,
    lexical_forms_path: Path,
    definitions_path: Path
):
    """Insert every definition found under ``definitions_path``.

    The tree is walked two levels deep: each sub-directory of
    ``definitions_path`` is scanned for form directories, and each form
    directory is handed to ``insert_forms`` with its name as the lexical
    form. Entries that are not directories are skipped at both levels.
    Directories are visited in sorted order so repeated builds insert rows
    in the same order.

    The lexical forms file is only used to report how many forms it lists.

    Args:
        conn: open connection to the target database.
        lexical_forms_path: text file with one lexical form per line.
        definitions_path: root directory of the definitions tree.
    """
    # NOTE(review): assumes the forms file is UTF-8 -- confirm. An explicit
    # encoding keeps the build independent of the platform locale.
    with open(lexical_forms_path, "rt", encoding="utf-8") as forms_file:
        form_count = sum(1 for _ in forms_file)
    print(f"{form_count} forms in the lexical forms file.")
    cursor = conn.cursor()
    for letter in sorted(definitions_path.iterdir()):
        if not letter.is_dir():
            continue
        for form_dir in sorted(letter.iterdir()):
            # A stray file here would make insert_forms fail on iterdir().
            if not form_dir.is_dir():
                continue
            form = form_dir.name
            print(form)
            insert_forms(cursor, form, form_dir)
def insert_forms(cursor: sqlite3.Cursor, form: str, form_dir: Path):
    """Insert one row per gzip-compressed definition file in ``form_dir``.

    Each file is decompressed, re-compressed with LZMA and stored in the
    ``definitions`` table under ``form``. Sub-directories are skipped and
    files are processed in sorted order so row ids are reproducible.

    Args:
        cursor: cursor on a database that has the ``definitions`` table.
        form: lexical form the definitions belong to.
        form_dir: directory holding the gzip files for this form.
    """
    for gz_file in sorted(form_dir.iterdir()):
        if not gz_file.is_file():
            continue
        # NOTE(review): assumes the definition files are UTF-8 -- confirm.
        # Decoding and encoding with the same explicit codec keeps the
        # stored bytes independent of the platform locale.
        with gzip.open(gz_file, "rt", encoding="utf-8") as form_file:
            html = form_file.read()
        def_data = lzma.compress(html.encode("utf-8"))
        cursor.execute(
            "insert into definitions (lexical_form, definition) values (?, ?)",
            (form, def_data)
        )
if __name__ == "__main__":
main()

@ -1,6 +1,6 @@
[metadata]
name = tlfi
version = 0.0.2
version = 0.0.3
description = TLFi command-line interface
long_description = file: README.md
long_description_content_type = text/markdown
@ -16,6 +16,7 @@ classifiers =
packages = tlfi
python_requires = >= 3.7
setup_requires = setuptools >= 38.3.0
install_requires = blessings ~= 1.6
[options.entry_points]
console_scripts =

@ -7,98 +7,65 @@ formatting almost like in the TLFi.
import argparse
import difflib
import gzip
import lzma
import os
import re
import unicodedata
from functools import cache
from os import environ
import sqlite3
import sys
from pathlib import Path
from typing import Generator
from blessings import Terminal
from bs4 import BeautifulSoup, NavigableString
try:
from blessings import Terminal
t = Terminal()
except ImportError:
class DummyTerminal:
def __getattr__(self, _):
return ""
t = DummyTerminal()
T = Terminal()
TAG_STRIP_RE = re.compile(r"\s+")
def main():
ap = argparse.ArgumentParser(description="TLFi CLI")
ap.add_argument("query", help="mot(s) à chercher")
ap.add_argument("-f", "--lexical-forms",
default=get_root_path() / "lexical_forms.txt",
help="fichier des formes lexicales")
ap.add_argument("-d", "--definitions",
default=get_root_path() / "definitions",
help="répertoire des définitions")
ap.add_argument("-d", "--database", help="base de données")
args = ap.parse_args()
lookup_res = lookup(args.query, args.lexical_forms)
if lookup_res is None:
exit()
if (defs := get_definition_paths(lookup_res, args.definitions)):
for d in defs:
show_definition(d)
@cache
def get_root_path():
return Path(environ.get("TLFI_ROOT", "."))
if args.database:
db_path = Path(args.database)
elif (env_db_path := os.environ.get("TLFI_DATABASE")):
db_path = Path(env_db_path)
else:
sys.exit("Pas de base de données et TLFI_DATABASE n'est pas défini.")
lexical_form = args.query
connection = sqlite3.connect(db_path)
printed = 0
for definition_html in get_definitions(lexical_form, connection):
pretty_print_definition(definition_html)
printed += 1
if printed == 0:
print("Forme lexicale non trouvée. Suggestions :")
for suggestion in get_suggestions(lexical_form, connection):
print(f"* {suggestion}")
def get_definitions(
    lexical_form: str,
    connection: sqlite3.Connection
) -> Generator[str, None, None]:
    """Yield the decoded text of every definition stored for a form.

    Rows whose ``lexical_form`` equals the argument are fetched from the
    ``definitions`` table; each blob is LZMA-decompressed and decoded.
    Nothing is yielded when no row matches.
    """
    query = "SELECT definition FROM definitions WHERE lexical_form = ?"
    rows = connection.cursor().execute(query, (lexical_form,)).fetchall()
    yield from (lzma.decompress(row[0]).decode() for row in rows)
def lookup(query, lexical_forms_path):
    """Return ``query`` if it is a known lexical form, else None.

    The forms file is read with one form per line. When ``query`` is not
    listed, similar forms (similarity ratio above 0.8) are printed as
    suggestions and None is returned.

    Args:
        query: lexical form typed by the user.
        lexical_forms_path: path of the lexical forms text file.
    """
    # NOTE(review): assumes the forms file is UTF-8 -- confirm.
    with open(lexical_forms_path, "rt", encoding="utf-8") as lexical_forms_file:
        # Strip the line terminator so a final line lacking one still matches.
        forms = [line.rstrip("\n") for line in lexical_forms_file]
    if query in forms:
        return query
    print("Suggestions :")
    for form in map(str.rstrip, forms):
        if difflib.SequenceMatcher(None, query, form).ratio() > 0.8:
            print(f"- {form}")
    return None
def get_definition_paths(query, definitions) -> list:
    """Return a list of definition file paths for this lexical form.

    The form directory is ``<definitions>/<LETTER>/<query>``, where LETTER
    is the first character of ``query`` with combining marks removed and
    upper-cased. Only entries whose name ends with ``.txt.gz`` are returned,
    in sorted order.

    An empty list is returned, after printing a message, when the first
    letter cannot be determined (including an empty query) or when the form
    directory does not exist or is not a directory.
    """
    # Slicing instead of indexing keeps an empty query from raising.
    nfkd = unicodedata.normalize("NFKD", query[:1])
    first_char = next((c for c in nfkd if not unicodedata.combining(c)), "")
    if not first_char:
        print("Pas compris la première lettre…")
        return []
    path = Path(definitions) / first_char.upper() / query
    try:
        return sorted(f for f in path.iterdir() if f.name.endswith(".txt.gz"))
    except (FileNotFoundError, NotADirectoryError):
        print("Définition non trouvée.")
        return []
def show_definition(def_path):
"""Print a definition from a definition file."""
with gzip.open(def_path, "rt") as def_file:
html = def_file.read()
def pretty_print_definition(html: str):
    """Parse a definition's HTML and print the text built by ``parse_tag``.

    Only the first ``div`` nested inside the document's first ``div`` is
    rendered.
    """
    document = BeautifulSoup(html, "html.parser")
    print(parse_tag(document.div.div))
TAG_STRIP_RE = re.compile(r"\s+")
def parse_tag(tag) -> str:
if isinstance(tag, NavigableString):
return TAG_STRIP_RE.sub(" ", tag)
@ -110,19 +77,38 @@ def parse_tag(tag) -> str:
if tag.name == "span":
classes = tag.get("class") or []
if "tlf_cdefinition" in classes:
content = f"{t.yellow}{content}{t.normal}"
content = f"{T.yellow}{content}{T.normal}"
if "tlf_cdomaine" in classes:
content = f"{t.red}{content}{t.normal}"
content = f"{T.red}{content}{T.normal}"
if "tlf_csyntagme" in classes:
content = f"{t.green}{content}{t.normal}"
content = f"{T.green}{content}{T.normal}"
if "tlf_cmot" in classes:
content = f"{t.reverse}{content}{t.normal}"
content = f"{T.reverse}{content}{T.normal}"
if tag.name == "b":
content = f"{t.bold}{content}{t.normal}"
content = f"{T.bold}{content}{T.normal}"
if tag.name == "i":
content = f"{t.italic}{content}{t.no_italic}"
content = f"{T.italic}{content}{T.no_italic}"
return content
def get_suggestions(
    query: str,
    connection: sqlite3.Connection
) -> Generator[str, None, None]:
    """Yield stored lexical forms that closely resemble ``query``.

    Every ``lexical_form`` in the ``definitions`` table is compared with
    ``query``; forms whose similarity ratio exceeds 0.8 are yielded once
    each, in the order the table returns them. Nothing is yielded when no
    form is close enough.
    """
    cursor = connection.cursor()
    result = cursor.execute("SELECT lexical_form FROM definitions")
    seen = set()
    for (form,) in result.fetchall():
        # A NULL form cannot be compared; a repeated form was already
        # judged, so skip both before the costly similarity computation.
        if form is None or form in seen:
            continue
        seen.add(form)
        if difflib.SequenceMatcher(None, query, form).ratio() > 0.8:
            yield form
if __name__ == "__main__":
main()

Loading…
Cancel
Save