73 lines
2 KiB
Python
73 lines
2 KiB
Python
#!/usr/bin/env python3
|
|
"""Build an SQLite database from definition files."""
|
|
|
|
import gzip
|
|
import lzma
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def main():
    """Build ``tlfi.db`` in the current working directory.

    The input location is taken from the ``TLFI_ROOT`` environment variable.
    The paths ``lexical_forms.txt`` and ``definitions`` are derived from it
    and handed to ``fill_database``.

    Raises:
        SystemExit: if ``TLFI_ROOT`` is unset or empty.
    """
    tlfi_root = os.environ.get("TLFI_ROOT")
    if not tlfi_root:
        sys.exit("No TLFI_ROOT environment variable.")
    tlfi_root_path = Path(tlfi_root)
    lexical_forms_path = tlfi_root_path / "lexical_forms.txt"
    definitions_path = tlfi_root_path / "definitions"

    db_path = Path.cwd() / "tlfi.db"
    conn = sqlite3.connect(db_path)
    try:
        create_database(conn)
        fill_database(conn, lexical_forms_path, definitions_path)
        # Commit only once everything has been inserted successfully.
        conn.commit()
    finally:
        # Always release the database handle, even when the build fails
        # part-way through; uncommitted rows are discarded in that case.
        conn.close()
|
|
|
|
|
|
def create_database(conn: sqlite3.Connection):
    """Create the ``definitions`` table and its ``idx_forms`` index.

    Both statements use ``if not exists``, so calling this on a database
    that already has the schema leaves it unchanged.
    """
    table_sql = (
        "create table if not exists definitions ("
        " id integer primary key,"
        " lexical_form text,"
        " definition blob not null"
        ");"
    )
    index_sql = (
        "create index if not exists idx_forms on definitions(lexical_form);"
    )
    db_cursor = conn.cursor()
    # Table first: the index refers to it.
    for statement in (table_sql, index_sql):
        db_cursor.execute(statement)
|
|
|
|
|
|
def fill_database(
    conn: sqlite3.Connection,
    lexical_forms_path: Path,
    definitions_path: Path
):
    """Insert every definition found under ``definitions_path``.

    The lexical forms file is only read to report how many forms it lists.
    The definitions directory is walked two levels deep: each sub-directory
    of ``definitions_path`` is scanned, and each directory inside it is
    treated as one lexical form named after the directory.

    Args:
        conn: open connection whose ``definitions`` table already exists.
        lexical_forms_path: text file with one lexical form per line.
        definitions_path: root directory of the definition files.
    """
    # Decode explicitly rather than with the locale default, so the script
    # behaves the same on every platform.
    # NOTE(review): assumes the forms file is UTF-8 -- confirm.
    with open(lexical_forms_path, "rt", encoding="utf-8") as lexical_forms_file:
        forms = [form.rstrip() for form in lexical_forms_file]
    print(f"{len(forms)} forms in the lexical forms file.")
    cursor = conn.cursor()
    for letter in definitions_path.iterdir():
        if not letter.is_dir():
            continue
        for form_dir in letter.iterdir():
            # Same guard as one level up: a stray regular file here would
            # otherwise make insert_forms fail when it lists the "directory".
            if not form_dir.is_dir():
                continue
            form = form_dir.name
            print(form)
            insert_forms(cursor, form, form_dir)
|
|
|
|
|
|
def insert_forms(cursor: sqlite3.Cursor, form: str, form_dir: Path):
    """Insert one row per gzip file found in ``form_dir``.

    Each file is decompressed, read as text, re-compressed with LZMA and
    stored in the ``definition`` column next to ``form``.

    Args:
        cursor: cursor on a database that has the ``definitions`` table.
        form: lexical form stored in the ``lexical_form`` column.
        form_dir: directory whose entries are gzip-compressed text files.
    """
    for gz_file in form_dir.iterdir():
        # Decode with the same codec used for re-encoding below. With the
        # locale default, a non-UTF-8 locale would store mis-encoded text.
        # NOTE(review): assumes the gzip payloads are UTF-8 -- confirm.
        with gzip.open(gz_file, "rt", encoding="utf-8") as form_file:
            html = form_file.read()
        def_data = lzma.compress(html.encode("utf-8"))
        cursor.execute(
            "insert into definitions (lexical_form, definition) values (?, ?)",
            (form, def_data)
        )
|
|
|
|
|
|
# Script entry point: build the database only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|