scripts/build-database.py: a measly 421.3 MiB!
This commit is contained in:
parent
b8bb8c893b
commit
c66f0a5409
72
scripts/build-database.py
Normal file
72
scripts/build-database.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Build an SQLite database from definition files."""
|
||||
|
||||
import gzip
|
||||
import lzma
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """Build ``tlfi.db`` in the current working directory.

    The ``TLFI_ROOT`` environment variable must name a directory holding
    ``lexical_forms.txt`` and a ``definitions`` directory. The schema is
    created if missing, every definition is inserted, and the result is
    committed.

    Raises:
        SystemExit: if ``TLFI_ROOT`` is unset or empty.
    """
    tlfi_root = os.environ.get("TLFI_ROOT")
    if not tlfi_root:
        sys.exit("No TLFI_ROOT environment variable.")
    tlfi_root_path = Path(tlfi_root)
    lexical_forms_path = tlfi_root_path / "lexical_forms.txt"
    definitions_path = tlfi_root_path / "definitions"

    db_path = Path.cwd() / "tlfi.db"
    conn = sqlite3.connect(db_path)
    try:
        create_database(conn)
        fill_database(conn, lexical_forms_path, definitions_path)
        conn.commit()
    finally:
        # Release the database handle even when the import fails midway.
        # close() does not commit, so a failed run leaves no half-written
        # pending transaction behind.
        conn.close()
|
||||
|
||||
|
||||
def create_database(conn: sqlite3.Connection):
    """Create the ``definitions`` table and its ``idx_forms`` index.

    Both statements use ``if not exists``, so calling this on an already
    initialised database is harmless.
    """
    table_ddl = (
        "create table if not exists definitions ("
        " id integer primary key,"
        " lexical_form text,"
        " definition blob not null"
        ");"
    )
    index_ddl = (
        "create index if not exists idx_forms on definitions(lexical_form);"
    )
    db_cursor = conn.cursor()
    # The table must exist before the index that refers to it.
    for statement in (table_ddl, index_ddl):
        db_cursor.execute(statement)
|
||||
|
||||
|
||||
def fill_database(
    conn: sqlite3.Connection,
    lexical_forms_path: Path,
    definitions_path: Path
):
    """Insert every definition found under *definitions_path* into *conn*.

    The expected layout is ``definitions_path/<letter>/<form>/<archives>``.
    Each ``<form>`` directory name is used as the lexical form and handed to
    ``insert_forms`` together with its directory. The lexical forms file is
    only used to report how many forms it lists.

    Args:
        conn: open SQLite connection that already has the ``definitions``
            table.
        lexical_forms_path: text file with one lexical form per line.
        definitions_path: root directory of the per-letter sub-directories.
    """
    # Decode explicitly instead of relying on the locale's default encoding.
    # NOTE(review): assumes the forms file is UTF-8 — confirm against the
    # tool that produces it.
    with open(lexical_forms_path, "rt", encoding="utf-8") as lexical_forms_file:
        # Only the count is needed, so the lines are not kept in memory.
        form_count = sum(1 for _ in lexical_forms_file)
    print(f"{form_count} forms in the lexical forms file.")
    cursor = conn.cursor()
    # iterdir() yields entries in arbitrary order; sorting makes the
    # generated row ids reproducible from one build to the next.
    for letter in sorted(definitions_path.iterdir()):
        if not letter.is_dir():
            continue
        for form_dir in sorted(letter.iterdir()):
            # Stray files next to the form directories would otherwise make
            # insert_forms fail when it tries to list them.
            if not form_dir.is_dir():
                continue
            form = form_dir.name
            print(form)
            insert_forms(cursor, form, form_dir)
|
||||
|
||||
|
||||
def insert_forms(cursor: sqlite3.Cursor, form: str, form_dir: Path):
    """Insert one row per gzip archive found in *form_dir*.

    Each archive is decompressed, re-compressed with LZMA and stored in the
    ``definition`` column next to *form*.

    Args:
        cursor: cursor on a database that has the ``definitions`` table.
        form: lexical form stored in the ``lexical_form`` column.
        form_dir: directory whose entries are gzip-compressed text files.
    """
    # Sorted so that rows for a form are always inserted in the same order.
    for gz_file in sorted(form_dir.iterdir()):
        # Decode with the same codec used for re-encoding below. The locale
        # default could differ from UTF-8 and corrupt or reject non-ASCII
        # text.
        with gzip.open(gz_file, "rt", encoding="utf-8") as form_file:
            html = form_file.read()
        def_data = lzma.compress(html.encode("utf-8"))
        cursor.execute(
            "insert into definitions (lexical_form, definition) values (?, ?)",
            (form, def_data)
        )
|
||||
|
||||
|
||||
# Run the build only when this file is executed as a script, so importing
# the module does not touch the environment or create a database.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue