scripts/build-database.py: a measly 421.3 MiB!
This commit is contained in:
parent
b8bb8c893b
commit
c66f0a5409
72
scripts/build-database.py
Normal file
72
scripts/build-database.py
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Build an SQLite database from definition files."""
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import lzma
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build ``tlfi.db`` in the current working directory.

    The data root is taken from the ``TLFI_ROOT`` environment variable. The
    lexical forms file (``lexical_forms.txt``) and the ``definitions``
    directory are looked up directly beneath it.

    Raises:
        SystemExit: if ``TLFI_ROOT`` is unset or empty.
    """
    tlfi_root = os.environ.get("TLFI_ROOT")
    if not tlfi_root:
        sys.exit("No TLFI_ROOT environment variable.")
    tlfi_root_path = Path(tlfi_root)
    lexical_forms_path = tlfi_root_path / "lexical_forms.txt"
    definitions_path = tlfi_root_path / "definitions"

    db_path = Path.cwd() / "tlfi.db"
    conn = sqlite3.connect(db_path)
    try:
        create_database(conn)
        fill_database(conn, lexical_forms_path, definitions_path)
        # Commit once, after everything has been inserted successfully.
        conn.commit()
    finally:
        # Always release the database handle, even if the build fails midway.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def create_database(conn: sqlite3.Connection):
    """Ensure the ``definitions`` table and its lexical-form index exist.

    Both statements use ``if not exists``, so running this against a database
    that already has the schema leaves it untouched.
    """
    table_ddl = (
        "create table if not exists definitions ("
        " id integer primary key,"
        " lexical_form text,"
        " definition blob not null"
        ");"
    )
    index_ddl = (
        "create index if not exists idx_forms on definitions(lexical_form);"
    )
    db_cursor = conn.cursor()
    # The table must exist before the index that refers to it.
    for statement in (table_ddl, index_ddl):
        db_cursor.execute(statement)
|
||||||
|
|
||||||
|
|
||||||
|
def fill_database(
    conn: sqlite3.Connection,
    lexical_forms_path: Path,
    definitions_path: Path
):
    """Insert every definition found under *definitions_path* into the database.

    The expected layout is ``definitions_path/<letter>/<form>/<files>``: each
    sub-directory of a letter directory is named after a lexical form and its
    files are handed to ``insert_forms``. Entries that are not directories are
    skipped at both levels.

    The lexical forms file is only read to report how many forms it lists; the
    rows inserted come from the directory tree. Nothing is committed here; the
    caller owns the transaction.

    Args:
        conn: open connection to a database that already has the schema.
        lexical_forms_path: text file with one lexical form per line.
        definitions_path: root of the per-letter definition directories.
    """
    # Explicit encoding so the result does not depend on the locale.
    # NOTE(review): assumes the forms file is UTF-8; confirm against the data.
    with open(
        lexical_forms_path, "rt", encoding="utf-8"
    ) as lexical_forms_file:
        forms = [form.rstrip() for form in lexical_forms_file]
    print(f"{len(forms)} forms in the lexical forms file.")
    cursor = conn.cursor()
    for letter in definitions_path.iterdir():
        if not letter.is_dir():
            continue
        for form_dir in letter.iterdir():
            # A stray file here would make insert_forms fail on iterdir().
            if not form_dir.is_dir():
                continue
            form = form_dir.name
            print(form)
            insert_forms(cursor, form, form_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def insert_forms(cursor: sqlite3.Cursor, form: str, form_dir: Path):
    """Insert one row per gzip file in *form_dir* for the lexical form *form*.

    Each file is decompressed as text, re-encoded as UTF-8, compressed with
    LZMA and stored in the ``definition`` column next to *form*.

    Args:
        cursor: cursor on a database containing the ``definitions`` table.
        form: lexical form to store in the ``lexical_form`` column.
        form_dir: directory whose entries are all gzip-compressed text files.
    """
    for gz_file in form_dir.iterdir():
        # Decode explicitly as UTF-8 so it matches the UTF-8 re-encoding below
        # whatever the locale is. Text mode is kept so that newline
        # normalisation stays as before.
        # NOTE(review): assumes the source files are UTF-8; confirm.
        with gzip.open(gz_file, "rt", encoding="utf-8") as form_file:
            html = form_file.read()
        def_data = lzma.compress(html.encode("utf-8"))
        cursor.execute(
            "insert into definitions (lexical_form, definition) values (?, ?)",
            (form, def_data)
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue