#!/usr/bin/env python3
"""Build an SQLite database from definition files.

The script reads the directory named by the ``TLFI_ROOT`` environment
variable, which is expected to contain:

* ``lexical_forms.txt`` -- one lexical form per line (only counted here);
* ``definitions/<dir>/<form>/<file>`` -- gzip-compressed text files, where
  ``<form>`` (the directory name) is stored as the lexical form.

Each definition file becomes one row of the ``definitions`` table in
``tlfi.db`` (created in the current working directory); its content is stored
LZMA-compressed in the ``definition`` column.
"""

# Provenance: reconstructed from the patch
# "scripts/build-database.py: a measly 421.3 MiB!"
# (commit c66f0a5409bdf87fb1a67cd66043e5f480717474, dece, Mon, 28 Nov 2022).

import gzip
import lzma
import os
import sqlite3
import sys
from pathlib import Path


def main() -> None:
    """Create ``tlfi.db`` in the working directory and fill it.

    Exits with an error message when ``TLFI_ROOT`` is unset or empty.
    """
    tlfi_root = os.environ.get("TLFI_ROOT")
    if not tlfi_root:
        sys.exit("No TLFI_ROOT environment variable.")
    tlfi_root_path = Path(tlfi_root)
    lexical_forms_path = tlfi_root_path / "lexical_forms.txt"
    definitions_path = tlfi_root_path / "definitions"

    db_path = Path.cwd() / "tlfi.db"
    conn = sqlite3.connect(db_path)
    try:
        create_database(conn)
        fill_database(conn, lexical_forms_path, definitions_path)
        conn.commit()
    finally:
        # Always release the database file, even when the import fails
        # half-way; uncommitted inserts are discarded in that case.
        conn.close()


def create_database(conn: sqlite3.Connection) -> None:
    """Create the ``definitions`` table and its index if they are missing.

    Both statements use ``if not exists``, so calling this on an already
    initialised database is a no-op.
    """
    cursor = conn.cursor()
    cursor.execute(
        "create table if not exists definitions ("
        " id integer primary key,"
        " lexical_form text,"
        " definition blob not null"
        ");"
    )
    cursor.execute(
        "create index if not exists idx_forms on definitions(lexical_form);"
    )


def fill_database(
    conn: sqlite3.Connection,
    lexical_forms_path: Path,
    definitions_path: Path,
) -> None:
    """Insert every definition found under *definitions_path*.

    The lexical forms file is only used to print how many forms it lists.
    The definitions tree is walked two levels deep; each second-level
    directory name is taken as a lexical form and handed to
    :func:`insert_forms`. Entries that are not directories are skipped at
    both levels. Nothing is committed here; the caller owns the transaction.
    """
    # NOTE(review): assumes the forms file is UTF-8 -- confirm against the
    # tool that generates it.
    with open(lexical_forms_path, "rt", encoding="utf-8") as lexical_forms_file:
        forms = [form.rstrip() for form in lexical_forms_file]
    print(f"{len(forms)} forms in the lexical forms file.")

    cursor = conn.cursor()
    for letter in definitions_path.iterdir():
        if not letter.is_dir():
            continue
        for form_dir in letter.iterdir():
            # A stray regular file here would make iterdir() raise
            # NotADirectoryError and abort the whole build.
            if not form_dir.is_dir():
                continue
            form = form_dir.name
            print(form)
            insert_forms(cursor, form, form_dir)


def insert_forms(cursor: sqlite3.Cursor, form: str, form_dir: Path) -> None:
    """Insert one row per file of *form_dir*, all under lexical form *form*.

    Every entry of *form_dir* is opened as a gzip-compressed text file; its
    text is re-encoded as UTF-8, LZMA-compressed and stored in the
    ``definition`` column.
    """
    for gz_file in form_dir.iterdir():
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale: the text is re-encoded as UTF-8 just below, and decoding
        # with another default encoding would silently store mojibake.
        # NOTE(review): assumes the definition files are UTF-8 -- confirm.
        with gzip.open(gz_file, "rt", encoding="utf-8") as form_file:
            html = form_file.read()
        def_data = lzma.compress(html.encode("utf-8"))
        cursor.execute(
            "insert into definitions (lexical_form, definition) values (?, ?)",
            (form, def_data)
        )


if __name__ == "__main__":
    main()