scripts/build-database.py: a measly 421.3 MiB!

This commit is contained in:
dece 2022-11-28 16:55:23 +01:00
parent b8bb8c893b
commit c66f0a5409

72
scripts/build-database.py Normal file
View file

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""Build an SQLite database from definition files."""
import gzip
import lzma
import os
import sqlite3
import sys
from pathlib import Path
def main():
    """Build ``tlfi.db`` in the current working directory.

    The data root is taken from the ``TLFI_ROOT`` environment variable; the
    lexical forms file (``lexical_forms.txt``) and the ``definitions``
    directory are looked up directly beneath it.

    Raises:
        SystemExit: if ``TLFI_ROOT`` is unset or empty.
    """
    tlfi_root = os.environ.get("TLFI_ROOT")
    if not tlfi_root:
        sys.exit("No TLFI_ROOT environment variable.")
    tlfi_root_path = Path(tlfi_root)
    lexical_forms_path = tlfi_root_path / "lexical_forms.txt"
    definitions_path = tlfi_root_path / "definitions"
    db_path = Path.cwd() / "tlfi.db"
    conn = sqlite3.connect(db_path)
    try:
        create_database(conn)
        fill_database(conn, lexical_forms_path, definitions_path)
        conn.commit()
    finally:
        # Always release the database handle, even when the build fails
        # part-way; closing without a commit drops any pending transaction.
        conn.close()
def create_database(conn: sqlite3.Connection):
    """Ensure the ``definitions`` table and its ``idx_forms`` index exist.

    Both statements carry ``if not exists``, so running this against a
    database that already has the schema changes nothing.
    """
    schema_statements = (
        (
            "create table if not exists definitions ("
            " id integer primary key,"
            " lexical_form text,"
            " definition blob not null"
            ");"
        ),
        "create index if not exists idx_forms on definitions(lexical_form);",
    )
    db_cursor = conn.cursor()
    for statement in schema_statements:
        db_cursor.execute(statement)
def fill_database(
    conn: sqlite3.Connection,
    lexical_forms_path: Path,
    definitions_path: Path
):
    """Insert every definition found under *definitions_path*.

    The tree is walked as ``<definitions_path>/<letter>/<form>/``. Each form
    directory is passed to ``insert_forms`` with the directory name used as
    the lexical form. Entries that are not directories are skipped at both
    levels. Before the walk, the number of lines in *lexical_forms_path* is
    printed.

    Args:
        conn: open connection; rows are inserted through a cursor created
            from it and are NOT committed here.
        lexical_forms_path: text file whose lines are counted and reported.
        definitions_path: root directory holding the per-letter directories.
    """
    # Only the line count is reported, so stream the file instead of keeping
    # every form in memory.
    # NOTE(review): assumes the forms file is UTF-8 -- confirm.
    with open(lexical_forms_path, "rt", encoding="utf-8") as lexical_forms_file:
        num_forms = sum(1 for _ in lexical_forms_file)
    print(f"{num_forms} forms in the lexical forms file.")
    cursor = conn.cursor()
    # iterdir() yields entries in arbitrary order; sorting keeps the row ids
    # reproducible from one build to the next.
    for letter in sorted(definitions_path.iterdir()):
        if not letter.is_dir():
            continue
        for form_dir in sorted(letter.iterdir()):
            # A stray file here would make insert_forms fail on iterdir().
            if not form_dir.is_dir():
                continue
            form = form_dir.name
            print(form)
            insert_forms(cursor, form, form_dir)
def insert_forms(cursor: sqlite3.Cursor, form: str, form_dir: Path):
    """Insert one ``definitions`` row per gzip file found in *form_dir*.

    Each file is decompressed, decoded as text, re-encoded as UTF-8 and
    stored LZMA-compressed in the ``definition`` column, with *form* as the
    ``lexical_form``.

    Args:
        cursor: cursor on a database that has the ``definitions`` table.
        form: lexical form stored alongside every definition of this call.
        form_dir: directory whose entries are all opened with ``gzip.open``.
    """
    # iterdir() order is arbitrary; sort so that several definitions of the
    # same form always receive their ids in the same order.
    for gz_file in sorted(form_dir.iterdir()):
        # Decode explicitly as UTF-8: the payload is re-encoded as UTF-8
        # below, so relying on the locale's default encoding could corrupt
        # non-ASCII text on a non-UTF-8 system.
        # NOTE(review): assumes the source files are UTF-8 -- confirm.
        with gzip.open(gz_file, "rt", encoding="utf-8") as form_file:
            html = form_file.read()
        def_data = lzma.compress(html.encode())
        cursor.execute(
            "insert into definitions (lexical_form, definition) values (?, ?)",
            (form, def_data)
        )
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()