# Building the Morphology Database from Morpheus

## Overview

This document describes how to build a comprehensive Latin/Greek morphology SQLite database using the open-source Morpheus engine from the Perseus Project. The result is a self-contained `.db` file containing the full LSJ (Greek) and Lewis & Short (Latin) lexicons — every headword, every inflected form — that the Lector app reads at runtime with no external API dependencies.

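**Expected output:** ~116,000 lemmas, ~500,000+ unique forms, 50–100 MB SQLite database. (Note: the build script in Step 3 stores one `form` per word it submits to Morpheus, so the number of unique forms in the database is bounded by the size of the input word lists — headwords plus passage words. Reaching ~500,000 forms requires feeding inflected forms as input, not only headwords.)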

---

## Prerequisites

On the VPS:
- Docker installed (`sudo apt install docker.io`)
- Python 3.10+ with `sqlite3` (built-in) and `unicodedata` (built-in)
- `pip install betacode` (for Beta Code → Unicode conversion)
- ~5GB free disk space for the build process (Morpheus image + intermediate files + final DB)
- The Lector repo cloned

---

## Step 1: Pull and test Morpheus

```bash
docker pull perseidsproject/morpheus

# Test Greek (input is Unicode, Morpheus handles conversion internally)
echo "ἄνθρωπος" | docker run -i --rm -e MORPHLIB=stemlib perseidsproject/morpheus -S

# Test Latin
echo "hominem" | docker run -i --rm -e MORPHLIB=stemlib perseidsproject/morpheus -L -S
```

You should see XML output with lemma and morphological analysis. If this works, Morpheus is ready.

---

## Step 2: Extract every headword from Morpheus source data

The Morpheus repo contains the complete LSJ (Greek) and Lewis & Short (Latin) lexicons as stem source files. We need to extract every headword, convert from Beta Code to Unicode, and build word lists.

### 2a: Clone the Morpheus data

```bash
cd /var/www/lector
git clone --depth 1 https://github.com/perseids-tools/morpheus-perseids.git morpheus-data
```

### 2b: Extract headwords

Create `scripts/extract-headwords.py`:

```python
#!/usr/bin/env python3
"""
Extract all headwords from Morpheus stemsrc files and convert
Beta Code to Unicode Greek.

Outputs:
  scripts/wordlist-greek.txt   — all Greek headwords (Unicode)
  scripts/wordlist-latin.txt   — all Latin headwords
"""

import os
import re
import glob

try:
    from betacode import conv as beta_conv
    HAS_BETACODE = True
except ImportError:
    HAS_BETACODE = False
    print("WARNING: betacode package not installed. Install with: pip install betacode")
    print("         Greek headwords will remain in Beta Code and need manual conversion.")


def beta_to_unicode(beta: str) -> str:
    """Return the Unicode Greek rendering of a Perseus-style Beta Code string.

    The input is handed back unchanged when the optional ``betacode``
    package is not installed, or when the converter raises on this input.
    """
    # Beta Code conventions that show up in the Morpheus data:
    #   *  prefix marks a capital letter           (*a -> Α)
    #   )  and  (  follow the vowel as breathings  (a) -> ἀ, a( -> ἁ)
    #   +  follows the vowel as a diaeresis        (i+ -> ϊ)
    if HAS_BETACODE:
        try:
            return beta_conv.beta_to_uni(beta)
        except Exception:
            # Best effort: fall through and keep the Beta Code as-is.
            pass
    return beta


def extract_lemmas_from_file(filepath: str) -> set:
    """Collect the distinct ``:le:`` values found in one Morpheus stemsrc file.

    A missing file yields an empty set. Each line is whitespace-stripped;
    lines starting with ``:le:`` contribute whatever follows the marker
    (stripped again). Homograph suffixes such as ``#1`` are kept verbatim,
    and empty values are ignored. Undecodable bytes are replaced rather
    than raising.
    """
    marker = ":le:"
    if not os.path.exists(filepath):
        return set()

    with open(filepath, "r", encoding="utf-8", errors="replace") as src:
        stripped_lines = (raw.strip() for raw in src)
        values = (
            text[len(marker):].strip()
            for text in stripped_lines
            if text.startswith(marker)
        )
        return {value for value in values if value}


def clean_latin_lemma(lemma: str) -> str:
    """Reduce a Morpheus Latin lemma to its plain spelling.

    Drops a trailing homograph marker (``#1``, ``#2``, ...), removes the
    ``^`` and ``_`` vowel-quantity marks, and trims surrounding whitespace.
    No macrons are produced: the quantity marks are discarded, not converted.
    """
    without_homograph = re.sub(r'#\d+$', '', lemma)
    quantity_marks = {ord('^'): None, ord('_'): None}
    return without_homograph.translate(quantity_marks).strip()


def clean_greek_lemma(beta: str) -> str:
    """Turn a Beta Code lemma into trimmed Unicode Greek.

    A trailing homograph marker (``#1``, ``#2``, ...) is removed before the
    string is passed to ``beta_to_unicode``.
    """
    without_homograph = re.sub(r'#\d+$', '', beta)
    return beta_to_unicode(without_homograph).strip()


def _collect_raw_lemmas(paths: list) -> set:
    """Union the ``:le:`` entries of every file in ``paths``, logging a per-file count."""
    collected = set()
    for path in paths:
        found = extract_lemmas_from_file(path)
        print(f"  {os.path.basename(path)}: {len(found)} lemmas")
        collected |= found
    return collected


def _write_lines(path: str, lines) -> None:
    """Write ``lines`` to ``path`` as UTF-8, one item per line."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(f"{line}\n" for line in lines)


def main():
    """Extract Greek and Latin headwords and write the two word-list files.

    Reads stemsrc files below ``morpheus-data/stemlib`` (relative to the
    current working directory) and writes:

      scripts/wordlist-greek.txt
      scripts/wordlist-latin.txt
      scripts/failed-greek-conversions.txt  (only when a lemma cleans to "")

    Prints an error and returns without writing anything when the Morpheus
    checkout is missing.
    """
    morpheus_dir = "morpheus-data/stemlib"

    if not os.path.exists(morpheus_dir):
        print(f"Error: {morpheus_dir} not found.")
        print("Clone the Morpheus repo first:")
        print("  git clone --depth 1 https://github.com/perseids-tools/morpheus-perseids.git morpheus-data")
        return

    # Create the output directory before the first write. The failed-conversion
    # report below is written before the word lists, so creating the directory
    # only ahead of the word lists would be too late for it.
    os.makedirs("scripts", exist_ok=True)

    # === GREEK ===
    print("Extracting Greek headwords...")
    greek_files = [
        f"{morpheus_dir}/Greek/stemsrc/lsj.nom",
        f"{morpheus_dir}/Greek/stemsrc/lsj.vbs",
        f"{morpheus_dir}/Greek/stemsrc/lsj.vbs.byhand",
        f"{morpheus_dir}/Greek/stemsrc/lsj.byhand",
        f"{morpheus_dir}/Greek/stemsrc/irreg.nom.src",
        f"{morpheus_dir}/Greek/stemsrc/irreg.vbs.src",
    ]
    # Also grab author-specific supplements. glob order is filesystem
    # dependent, so sort to keep the per-file log reproducible.
    greek_files.extend(sorted(glob.glob(f"{morpheus_dir}/Greek/stemsrc/nom[0-9]*")))
    for extra in sorted(glob.glob(f"{morpheus_dir}/Greek/stemsrc/vbs.*")):
        if extra not in greek_files:
            greek_files.append(extra)

    greek_lemmas_raw = _collect_raw_lemmas(greek_files)
    print(f"  Total raw Greek lemmas: {len(greek_lemmas_raw)}")

    # Convert to Unicode. A lemma counts as failed only when cleaning leaves
    # nothing behind; a lemma the converter could not handle comes back as
    # Beta Code and is kept.
    greek_lemmas = set()
    failed_conversions = []
    for beta in greek_lemmas_raw:
        uni = clean_greek_lemma(beta)
        if uni:
            greek_lemmas.add(uni)
        else:
            failed_conversions.append(beta)

    if failed_conversions:
        print(f"  Warning: {len(failed_conversions)} lemmas failed Beta Code conversion")
        _write_lines("scripts/failed-greek-conversions.txt", sorted(failed_conversions))

    # Write Greek word list
    _write_lines("scripts/wordlist-greek.txt", sorted(greek_lemmas))
    print(f"  Wrote {len(greek_lemmas)} Greek headwords to scripts/wordlist-greek.txt")

    # === LATIN ===
    print("\nExtracting Latin headwords...")
    latin_files = [
        f"{morpheus_dir}/Latin/stemsrc/ls.nom",
        f"{morpheus_dir}/Latin/stemsrc/vbs.latin",
        f"{morpheus_dir}/Latin/stemsrc/vbs.latin.bas",
        f"{morpheus_dir}/Latin/stemsrc/irreg.nom.src",
        f"{morpheus_dir}/Latin/stemsrc/irreg.vbs.src",
        f"{morpheus_dir}/Latin/stemsrc/vbs.irreg",
        f"{morpheus_dir}/Latin/stemsrc/nom.irreg",
        f"{morpheus_dir}/Latin/stemsrc/nom.latin.bas",
    ]
    latin_files.extend(sorted(glob.glob(f"{morpheus_dir}/Latin/stemsrc/nom.[0-9]*")))

    latin_lemmas_raw = _collect_raw_lemmas(latin_files)
    print(f"  Total raw Latin lemmas: {len(latin_lemmas_raw)}")

    # Clean Latin lemmas, dropping any that clean to the empty string.
    latin_lemmas = {
        cleaned
        for cleaned in (clean_latin_lemma(lemma) for lemma in latin_lemmas_raw)
        if cleaned
    }

    _write_lines("scripts/wordlist-latin.txt", sorted(latin_lemmas))
    print(f"  Wrote {len(latin_lemmas)} Latin headwords to scripts/wordlist-latin.txt")

    print(f"\nDone. Total: {len(greek_lemmas)} Greek + {len(latin_lemmas)} Latin = {len(greek_lemmas) + len(latin_lemmas)} headwords")


if __name__ == "__main__":
    main()
```

Run it:
```bash
pip install betacode
python3 scripts/extract-headwords.py
```

Expected output:
```
Extracting Greek headwords...
  lsj.nom: 51630 lemmas
  lsj.vbs: 15980 lemmas
  ...
  Total raw Greek lemmas: ~67000
  Wrote ~65000 Greek headwords to scripts/wordlist-greek.txt

Extracting Latin headwords...
  ls.nom: 41929 lemmas
  vbs.latin: 6800 lemmas
  ...
  Total raw Latin lemmas: ~48000
  Wrote ~47000 Latin headwords to scripts/wordlist-latin.txt

Done. Total: ~65000 Greek + ~47000 Latin = ~112000 headwords
```

---

## Step 3: Build the morphology database

Create `scripts/build-morphology-db.py`:

```python
#!/usr/bin/env python3
"""
Build the Lector morphology SQLite database by feeding every headword
from LSJ and Lewis & Short through the Morpheus morphological engine.

Usage:
    python3 scripts/build-morphology-db.py [--resume]

The --resume flag skips words already in the database (useful if
a previous run was interrupted).

Requires:
  - Docker with perseidsproject/morpheus image pulled
  - scripts/wordlist-greek.txt and scripts/wordlist-latin.txt
    (generated by extract-headwords.py)

Outputs:
  data/morphology.db
"""

import subprocess
import sqlite3
import unicodedata
import xml.etree.ElementTree as ET
import os
import sys
import re
import time
import argparse
from datetime import datetime

# Output location of the SQLite database. The path is relative, so it is
# resolved against the current working directory.
DB_PATH = "data/morphology.db"

# Process this many words per Morpheus invocation.
# Morpheus handles batches well — larger batches are faster but
# use more memory. 200 is a good balance.
BATCH_SIZE = 200

# Number of attempts made per batch before run_morpheus gives up on it
# and returns an empty string.
MAX_RETRIES = 3


# ============================================================
# NORMALIZATION
# ============================================================

def strip_diacritics(s: str) -> str:
    """Return a lowercase copy of ``s`` with every combining mark removed.

    The text is decomposed (NFD), characters of Unicode category ``Mn``
    (breathings, accents, iota subscript, ...) are dropped, and the
    remainder is recomposed (NFC) and lowercased, giving a key suitable for
    bare-keyboard matching.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_only = "".join(
        filter(lambda ch: unicodedata.category(ch) != "Mn", decomposed)
    )
    recomposed = unicodedata.normalize("NFC", base_only)
    return recomposed.lower()


def strip_macrons(s: str) -> str:
    """Return a lowercase copy of ``s`` with all macrons removed.

    Works on the decomposed form of the text, so it handles precomposed
    long vowels (``ā ē ī ō ū ȳ`` and their capitals) as well as a base
    letter followed by U+0304 COMBINING MACRON. Other diacritics are left
    in place. The result is recomposed to NFC.
    """
    combining_macron = "\u0304"
    decomposed = unicodedata.normalize("NFD", s)
    without_macrons = decomposed.replace(combining_macron, "")
    return unicodedata.normalize("NFC", without_macrons).lower()


def normalize_form(form: str, language: str) -> str:
    """Build the bare lookup key for ``form``.

    Greek forms (``language == "greek"``) go through ``strip_diacritics``;
    every other language value is treated as Latin and goes through
    ``strip_macrons``.
    """
    normalizer = strip_diacritics if language == "greek" else strip_macrons
    return normalizer(form)


# ============================================================
# MORPHEUS RUNNER
# ============================================================

def run_morpheus(words: list[str], language: str) -> str:
    """Run Morpheus on a batch of words and return its raw XML output.

    The words are sent one per line on stdin to a throw-away
    ``perseidsproject/morpheus`` container; ``-L`` is added for Latin.
    Up to ``MAX_RETRIES`` attempts are made. Failures (non-zero exit code,
    timeout, any other exception) are printed and never raised: after the
    last failed attempt an empty string is returned.
    """
    input_text = "\n".join(words) + "\n"

    cmd = [
        "docker", "run", "-i", "--rm",
        "-e", "MORPHLIB=stemlib",
        "perseidsproject/morpheus", "-S"
    ]
    if language == "latin":
        cmd.append("-L")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            result = subprocess.run(
                cmd,
                input=input_text,
                capture_output=True,
                text=True,
                # Pin the pipe encoding: the default is the locale's codec,
                # which cannot encode Greek under a non-UTF-8 locale.
                encoding="utf-8",
                timeout=300  # 5 min timeout for large batches
            )
            if result.returncode == 0:
                return result.stdout
            print(f"    Morpheus returned code {result.returncode}, attempt {attempt}/{MAX_RETRIES}")
            if result.stderr:
                print(f"    stderr: {result.stderr[:200]}")
        except subprocess.TimeoutExpired:
            print(f"    Timeout on attempt {attempt}/{MAX_RETRIES}")
        except Exception as e:
            print(f"    Error on attempt {attempt}/{MAX_RETRIES}: {e}")

        # Brief pause before the next attempt; pointless after the last one.
        if attempt < MAX_RETRIES:
            time.sleep(2)

    return ""


# ============================================================
# XML PARSER
# ============================================================

# Mapping from Morpheus verbose labels to short codes.
# parse_morpheus_xml looks each raw label up with ``MAP.get(raw, raw)``, so a
# label that is missing from its table is stored unchanged rather than dropped.
PERSON_MAP = {"1st": "1", "2nd": "2", "3rd": "3"}
NUMBER_MAP = {"singular": "sg", "dual": "dual", "plural": "pl"}
# Both spellings of "future perfect" (space and underscore) map to one code.
TENSE_MAP = {
    "present": "pres", "imperfect": "imperf", "future": "fut",
    "aorist": "aor", "perfect": "perf", "pluperfect": "plup",
    "future perfect": "futperf", "future_perfect": "futperf"
}
MOOD_MAP = {
    "indicative": "ind", "subjunctive": "subj", "optative": "opt",
    "imperative": "imper", "infinitive": "inf", "participle": "part",
    "gerundive": "gerundive", "gerund": "gerund", "supine": "supine"
}
# Both spellings of "mediopassive" (with and without hyphen) map to one code.
VOICE_MAP = {
    "active": "act", "passive": "pass", "middle": "mid",
    "mediopassive": "mid_pass", "medio-passive": "mid_pass",
    "deponent": "dep"
}
CASE_MAP = {
    "nominative": "nom", "genitive": "gen", "dative": "dat",
    "accusative": "acc", "vocative": "voc", "ablative": "abl",
    "locative": "loc"
}
GENDER_MAP = {
    "masculine": "m", "feminine": "f", "neuter": "n",
    "masculine/feminine": "m/f", "common": "c"
}


def parse_morpheus_xml(xml_text: str, language: str) -> list[dict]:
    """
    Turn Morpheus XML output into a flat list of morphology records.

    Expected XML structure:
    <words>
      <word>
        <form>ἄνθρωπος</form>
        <entry>
          <dict>
            <hdwd>ἄνθρωπος</hdwd>
            <pofs>noun</pofs>
            <decl>2nd</decl>
            <gend>masculine</gend>
          </dict>
          <infl>  <!-- can be single element or list -->
            <term><stem>ἀνθρωπ</stem><suff>ος</suff></term>
            <pofs>noun</pofs>
            <case>nominative</case>
            <gend>masculine</gend>
            <num>singular</num>
          </infl>
        </entry>
      </word>
    </words>

    One record is produced per <infl>; an <entry> without any <infl> still
    yields a single record carrying only the dictionary-level data. A
    <word> may hold several <entry> elements (ambiguous lemmatization).
    Words without <form> text and entries without <dict> are skipped.
    Returns an empty list when the XML cannot be parsed or salvaged.
    """
    def child_text(parent, tag, default=None):
        # Stripped text of the first direct child named `tag`, or `default`
        # when the child is absent or has no text.
        node = parent.find(tag)
        if node is not None and node.text:
            return node.text.strip()
        return default

    def short_gender(raw):
        return GENDER_MAP.get(raw, raw) if raw else None

    # The output may hold several top-level blocks, so give it a single
    # root to make it well-formed.
    wrapped = f"<root>{xml_text}</root>"
    try:
        root = ET.fromstring(wrapped)
    except ET.ParseError:
        # Salvage whatever complete <word>...</word> blocks are present.
        complete_words = re.findall(r'<word>.*?</word>', wrapped, re.DOTALL)
        if not complete_words:
            return []
        try:
            root = ET.fromstring(f"<root><words>{''.join(complete_words)}</words></root>")
        except ET.ParseError:
            return []

    records = []
    for word_elem in root.iter("word"):
        surface_form = child_text(word_elem, "form")
        if surface_form is None:
            continue

        for entry_elem in word_elem.iter("entry"):
            dict_elem = entry_elem.find("dict")
            if dict_elem is None:
                continue

            dict_gender = child_text(dict_elem, "gend")
            # Fields shared by every record of this entry.
            shared = {
                "form": surface_form,
                "form_bare": normalize_form(surface_form, language),
                "lemma": child_text(dict_elem, "hdwd", ""),
                "language": language,
                "part_of_speech": child_text(dict_elem, "pofs", ""),
            }

            inflections = entry_elem.findall("infl")
            if not inflections:
                # Record the lemma even without inflection details.
                records.append({
                    **shared,
                    "person": None, "number": None, "tense": None,
                    "mood": None, "voice": None, "case_": None,
                    "gender": short_gender(dict_gender),
                    "dialect": None,
                })
                continue

            for infl in inflections:
                person = child_text(infl, "pers")
                number = child_text(infl, "num")
                tense = child_text(infl, "tense")
                mood = child_text(infl, "mood")
                voice = child_text(infl, "voice")
                case = child_text(infl, "case")
                # The inflection's own gender wins over the dictionary one.
                gender = child_text(infl, "gend") or dict_gender
                dialect = child_text(infl, "dial")

                records.append({
                    **shared,
                    "person": PERSON_MAP.get(person, person),
                    "number": NUMBER_MAP.get(number, number),
                    "tense": TENSE_MAP.get(tense, tense),
                    "mood": MOOD_MAP.get(mood, mood),
                    "voice": VOICE_MAP.get(voice, voice),
                    "case_": CASE_MAP.get(case, case),
                    "gender": short_gender(gender),
                    "dialect": dialect,
                })

    return records


# ============================================================
# DATABASE
# ============================================================

def create_database(db_path: str) -> sqlite3.Connection:
    """Create a fresh database at ``db_path`` and return the open connection.

    The parent directory is created if needed. Existing ``morphology`` and
    ``meta`` tables are dropped and recreated empty, and the journal mode is
    switched to WAL.
    """
    parent_dir = os.path.dirname(db_path) or "."
    os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)

    # Enable WAL mode for better concurrent read performance
    conn.execute("PRAGMA journal_mode=WAL")

    # Main table: one row per (form, analysis) pair.
    conn.execute("DROP TABLE IF EXISTS morphology")
    conn.execute("""
        CREATE TABLE morphology (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            form TEXT NOT NULL,
            form_bare TEXT NOT NULL,
            lemma TEXT NOT NULL,
            language TEXT NOT NULL,
            part_of_speech TEXT NOT NULL,
            person TEXT,
            number TEXT,
            tense TEXT,
            mood TEXT,
            voice TEXT,
            case_ TEXT,
            gender TEXT,
            dialect TEXT
        )
    """)

    # Key/value table for build metadata.
    conn.execute("DROP TABLE IF EXISTS meta")
    conn.execute("""
        CREATE TABLE meta (
            key TEXT PRIMARY KEY,
            value TEXT
        )
    """)

    conn.commit()
    return conn


def open_database(db_path: str) -> sqlite3.Connection:
    """Connect to the database at ``db_path`` for resume mode, with WAL journaling requested."""
    connection = sqlite3.connect(db_path)
    connection.cursor().execute("PRAGMA journal_mode=WAL")
    return connection


def get_processed_forms(conn: sqlite3.Connection) -> set:
    """Return the set of distinct ``form`` values stored in ``morphology`` (used by resume mode)."""
    rows = conn.execute("SELECT DISTINCT form FROM morphology")
    return {form for (form,) in rows}


def insert_entries(conn: sqlite3.Connection, entries: list[dict]):
    """Bulk-insert morphology records and commit.

    Each dict in ``entries`` must supply every named parameter used in the
    statement below.
    """
    insert_sql = """
        INSERT INTO morphology (form, form_bare, lemma, language, part_of_speech,
                                person, number, tense, mood, voice, case_, gender, dialect)
        VALUES (:form, :form_bare, :lemma, :language, :part_of_speech,
                :person, :number, :tense, :mood, :voice, :case_, :gender, :dialect)
    """
    conn.executemany(insert_sql, entries)
    conn.commit()


def create_indices(conn: sqlite3.Connection):
    """Create the lookup indices on ``morphology`` and commit.

    Intended to run once all rows are in place, which is faster than
    maintaining the indices during the bulk insert. Safe to call again:
    every statement uses IF NOT EXISTS.
    """
    print("Creating indices...")
    statements = (
        "CREATE INDEX IF NOT EXISTS idx_morph_form ON morphology(form)",
        "CREATE INDEX IF NOT EXISTS idx_morph_bare ON morphology(form_bare)",
        "CREATE INDEX IF NOT EXISTS idx_morph_lemma ON morphology(lemma)",
        "CREATE INDEX IF NOT EXISTS idx_morph_lang ON morphology(language)",
        "CREATE INDEX IF NOT EXISTS idx_morph_pos ON morphology(part_of_speech)",
    )
    cursor = conn.cursor()
    for statement in statements:
        cursor.execute(statement)
    conn.commit()


# ============================================================
# WORD LIST LOADING
# ============================================================

def load_wordlist(filepath: str) -> list[str]:
    """Read a one-word-per-line file into a list of unique words.

    Lines are whitespace-stripped; blank lines and lines starting with
    ``#`` are skipped. The first occurrence of each word keeps its
    position. A missing file prints a warning and yields an empty list.
    """
    if not os.path.exists(filepath):
        print(f"  Warning: {filepath} not found")
        return []

    with open(filepath, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        # dict keys give order-preserving de-duplication.
        unique = dict.fromkeys(
            word for word in stripped if word and not word.startswith("#")
        )
    return list(unique)


def extract_passage_words() -> tuple[list[str], list[str]]:
    """
    Collect the unique words appearing in ``server/data/passages.ts``.

    Returns ``(greek_words, latin_words)``, each sorted. Both lists are
    empty when the file does not exist. The i-th ``rawText`` match is paired
    with the i-th ``language`` match; any language other than "greek" is
    counted as Latin.
    """
    passages_path = "server/data/passages.ts"
    if not os.path.exists(passages_path):
        return [], []

    with open(passages_path, "r", encoding="utf-8") as source:
        content = source.read()

    # rawText values and language tags are found independently and then
    # paired up by position.
    raw_texts = re.findall(r'rawText:\s*["`]([^"`]+)["`]', content)
    languages = re.findall(r'language:\s*"(\w+)"', content)

    greek_words = set()
    latin_words = set()
    for text, lang in zip(raw_texts, languages):
        bucket = greek_words if lang == "greek" else latin_words
        tokens = re.findall(r'[\w\u0370-\u03FF\u1F00-\u1FFF\u0300-\u036F]+', text)
        trimmed = (token.strip() for token in tokens)
        bucket.update(token for token in trimmed if token)

    return sorted(greek_words), sorted(latin_words)


# ============================================================
# BATCH PROCESSING
# ============================================================

def batch_process(words: list[str], language: str, conn: sqlite3.Connection,
                  processed: set = None):
    """Feed ``words`` to Morpheus in chunks of ``BATCH_SIZE`` and store the results.

    When ``processed`` is a non-empty set, words contained in it are skipped
    (resume mode). Progress is printed per batch. A batch that yields no
    output or no parses is reported and skipped. Returns the number of
    records inserted.
    """
    pending = [w for w in words if w not in processed] if processed else words
    already_done = len(words) - len(pending)
    if already_done > 0:
        print(f"  Skipping {already_done} already-processed words")

    remaining = len(pending)
    if remaining == 0:
        print(f"  Nothing to process for {language}")
        return 0

    batch_count = -(-remaining // BATCH_SIZE)  # ceiling division
    inserted = 0
    started = time.time()

    for batch_index, offset in enumerate(range(0, remaining, BATCH_SIZE), start=1):
        chunk = pending[offset:offset + BATCH_SIZE]

        # Throughput so far and the estimate derived from it; both are
        # reported as 0 until at least one batch has completed.
        elapsed = time.time() - started
        if elapsed > 0 and offset > 0:
            rate = offset / elapsed
        else:
            rate = 0
        eta = (remaining - offset) / rate / 60 if rate > 0 else 0

        print(f"  Batch {batch_index}/{batch_count} ({len(chunk)} words) "
              f"[{offset}/{remaining}, {rate:.0f} w/s, ETA {eta:.1f}m]", end="", flush=True)

        xml_output = run_morpheus(chunk, language)
        if not xml_output:
            print(f" → FAILED (no output)")
            continue

        entries = parse_morpheus_xml(xml_output, language)
        if not entries:
            print(f" → 0 entries (no parses)")
            continue

        insert_entries(conn, entries)
        inserted += len(entries)
        print(f" → {len(entries)} entries")

    return inserted


# ============================================================
# MAIN
# ============================================================

def main():
    """Build (or resume building) the morphology database at ``DB_PATH``.

    Steps: load the headword lists, add words found in the passages file,
    create or reopen the database, run Greek then Latin through Morpheus,
    create indices, record build metadata, and run ANALYZE and VACUUM.
    With ``--resume`` and an existing database, forms already stored are
    skipped; otherwise the tables are recreated from scratch.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    parser = argparse.ArgumentParser(description="Build Lector morphology database")
    parser.add_argument("--resume", action="store_true",
                        help="Resume interrupted build (skip already-processed words)")
    args = parser.parse_args()

    print("=" * 70)
    print("  Lector Morphology Database Builder")
    print("  Full LSJ + Lewis & Short lexicon via Morpheus")
    print("=" * 70)

    # 1. Load word lists
    print("\n1. Loading word lists...")

    greek_words = load_wordlist("scripts/wordlist-greek.txt")
    latin_words = load_wordlist("scripts/wordlist-latin.txt")
    print(f"   From headword extraction: {len(greek_words)} Greek, {len(latin_words)} Latin")

    # Also add any words from passages that might not be headwords
    greek_passage, latin_passage = extract_passage_words()
    greek_set = set(greek_words)
    latin_set = set(latin_words)
    greek_new = [w for w in greek_passage if w not in greek_set]
    latin_new = [w for w in latin_passage if w not in latin_set]
    greek_words.extend(greek_new)
    latin_words.extend(latin_new)
    print(f"   Added from passages: +{len(greek_new)} Greek, +{len(latin_new)} Latin")
    print(f"   Total: {len(greek_words)} Greek + {len(latin_words)} Latin = {len(greek_words) + len(latin_words)} words")

    # 2. Create or open database
    if args.resume and os.path.exists(DB_PATH):
        print(f"\n2. Resuming with existing database at {DB_PATH}...")
        conn = open_database(DB_PATH)
        processed = get_processed_forms(conn)
        print(f"   Already have {len(processed)} unique forms in database")
    else:
        print(f"\n2. Creating fresh database at {DB_PATH}...")
        conn = create_database(DB_PATH)
        processed = None

    # Close the connection even when a later step raises, so an interrupted
    # build does not leave the handle open.
    try:
        # 3. Process Greek
        print(f"\n3. Processing Greek through Morpheus...")
        print(f"   This will take a while — ~{len(greek_words) // BATCH_SIZE} batches")
        greek_entries = batch_process(greek_words, "greek", conn, processed)
        print(f"   Greek complete: {greek_entries:,} entries")

        # 4. Process Latin
        print(f"\n4. Processing Latin through Morpheus...")
        print(f"   ~{len(latin_words) // BATCH_SIZE} batches")
        latin_entries = batch_process(latin_words, "latin", conn, processed)
        print(f"   Latin complete: {latin_entries:,} entries")

        # 5. Create indices (doing this after insert is much faster)
        print("\n5. Creating database indices...")
        create_indices(conn)

        # 6. Record metadata
        print("\n6. Recording metadata...")
        c = conn.cursor()
        c.execute("SELECT COUNT(*) FROM morphology")
        total_entries = c.fetchone()[0]
        c.execute("SELECT COUNT(DISTINCT lemma) FROM morphology")
        total_lemmas = c.fetchone()[0]
        c.execute("SELECT COUNT(DISTINCT form) FROM morphology")
        total_forms = c.fetchone()[0]
        c.execute("SELECT COUNT(DISTINCT form) FROM morphology WHERE language='greek'")
        greek_forms = c.fetchone()[0]
        c.execute("SELECT COUNT(DISTINCT form) FROM morphology WHERE language='latin'")
        latin_forms = c.fetchone()[0]

        c.execute("DELETE FROM meta")
        meta = [
            # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated
            # and returns a naive value); the stored string ends in +00:00.
            ("built_at", datetime.now(timezone.utc).isoformat()),
            ("total_entries", str(total_entries)),
            ("total_lemmas", str(total_lemmas)),
            ("total_forms", str(total_forms)),
            ("greek_forms", str(greek_forms)),
            ("latin_forms", str(latin_forms)),
            ("source", "Morpheus (Perseus Project) — LSJ + Lewis & Short"),
            ("morpheus_repo", "https://github.com/perseids-tools/morpheus-perseids"),
            ("license", "MPL-2.0 (Morpheus), public domain (LSJ/L&S lexical data)"),
        ]
        c.executemany("INSERT INTO meta VALUES (?, ?)", meta)
        conn.commit()

        # 7. Optimize
        print("\n7. Optimizing database (VACUUM + ANALYZE)...")
        c.execute("ANALYZE")
        c.execute("VACUUM")
        conn.commit()

        db_size = os.path.getsize(DB_PATH) / 1024 / 1024

        print(f"\n{'=' * 70}")
        print(f"  BUILD COMPLETE")
        print(f"{'=' * 70}")
        print(f"  Total morphology entries:  {total_entries:,}")
        print(f"  Unique inflected forms:    {total_forms:,}")
        print(f"    Greek:                   {greek_forms:,}")
        print(f"    Latin:                   {latin_forms:,}")
        print(f"  Unique lemmas:             {total_lemmas:,}")
        print(f"  Database size:             {db_size:.1f} MB")
        print(f"  Path:                      {DB_PATH}")
        print(f"{'=' * 70}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
```

---

## Step 4: Run the full build

```bash
cd /var/www/lector

# 1. Install Python dependency
pip install betacode

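# 2. Clone Morpheus data (shallow clone of the repo; skip this if you already cloned it in Step 2a — git refuses to clone into an existing non-empty directory)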
git clone --depth 1 https://github.com/perseids-tools/morpheus-perseids.git morpheus-data

# 3. Extract all headwords from LSJ and Lewis & Short
python3 scripts/extract-headwords.py

# 4. Build the morphology database
#    This will take 30-90 minutes depending on your VPS CPU.
#    Uses Docker to run Morpheus against every headword.
python3 scripts/build-morphology-db.py

# If interrupted (e.g., VPS restart), resume where you left off:
# python3 scripts/build-morphology-db.py --resume
```

### Running in the background

For a long build, use `screen` or `nohup`:

```bash
# Option A: screen
screen -S morpheus-build
python3 scripts/build-morphology-db.py
# Ctrl+A, D to detach. screen -r morpheus-build to reattach.

# Option B: nohup
nohup python3 scripts/build-morphology-db.py > build.log 2>&1 &
tail -f build.log
```

---

## Step 5: Wire into the Lector app

Once the database exists at `data/morphology.db`, update the server to read from it.

### 5a: Install better-sqlite3

```bash
npm install better-sqlite3
npm install -D @types/better-sqlite3
```

### 5b: Create `server/morphology-db.ts`

```typescript
import Database from 'better-sqlite3';
import path from 'path';

const DB_PATH = process.env.MORPHOLOGY_DB_PATH
  || path.join(process.cwd(), 'data', 'morphology.db');

let db: Database.Database | null = null;
// Ensures the "unavailable" warning is logged once, not on every lookup.
let warnedUnavailable = false;

/**
 * Return the shared read-only handle to the morphology database, opening it
 * on first use. Returns null when the database cannot be opened or fails the
 * initial stats query; callers treat null as "database unavailable". A later
 * call tries to open the file again, so a database built after startup is
 * picked up without a restart.
 */
function getDb(): Database.Database | null {
  if (db) return db;

  try {
    // No journal_mode pragma here: the journal mode is a property of the
    // database file and a read-only handle cannot change it.
    db = new Database(DB_PATH, { readonly: true, fileMustExist: true });
    // Doubles as a sanity check that the expected table exists.
    const stats = getMorphStats();
    console.log(`Morphology DB loaded: ${stats.totalForms.toLocaleString()} forms, `
                + `${stats.totalLemmas.toLocaleString()} lemmas`);
    warnedUnavailable = false;
    return db;
  } catch (e) {
    // Do not keep a handle that failed the sanity check.
    if (db) {
      try { db.close(); } catch { /* ignore close errors on a broken handle */ }
      db = null;
    }
    if (!warnedUnavailable) {
      warnedUnavailable = true;
      const reason = e instanceof Error ? e.message : String(e);
      console.warn(`Morphology DB not available at ${DB_PATH} (${reason}) — using in-memory fallback only`);
    }
    return null;
  }
}

/**
 * One row of the `morphology` table: a single analysis of a single form.
 * The same `form` can appear in several rows (different lemmas or
 * inflections). Grammatical fields are null when the analysis has no value
 * for them.
 */
export interface MorphEntry {
  // Form as recorded by the build script (text of Morpheus's <form> element).
  form: string;
  // Lowercased lookup key: diacritics stripped for Greek, macrons for Latin.
  form_bare: string;
  // Dictionary headword reported by Morpheus; may be an empty string.
  lemma: string;
  // "greek" or "latin", as set by the build script.
  language: string;
  // Part of speech from the dictionary entry; may be an empty string.
  part_of_speech: string;
  // The fields below hold the build script's short codes (e.g. "3", "sg",
  // "pres", "ind", "act", "nom", "m"), or the raw Morpheus label when the
  // build script has no short code for it.
  person: string | null;
  number: string | null;
  tense: string | null;
  mood: string | null;
  voice: string | null;
  // Grammatical case; the trailing underscore mirrors the column name.
  case_: string | null;
  gender: string | null;
  // Dialect label as emitted by Morpheus, not mapped to a short code.
  dialect: string | null;
}

/** Remove duplicates while keeping first-seen order. */
function uniqueStrings(values: string[]): string[] {
  return values.filter((value, index) => values.indexOf(value) === index);
}

/**
 * Build the bare lookup key for user input: decompose, drop combining
 * diacritical marks (U+0300–U+036F: accents, breathings, iota subscript,
 * macrons), recompose and lowercase.
 */
function toBareKey(text: string): string {
  return text
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .normalize('NFC')
    .toLowerCase();
}

/**
 * Look up a word in the morphology database.
 *
 * Tries, in order, and returns the first non-empty result:
 *   1. exact match on `form` (as typed, then NFC-normalized);
 *   2. match on `form_bare` (lowercased input, then diacritics-stripped input);
 *   3. prefix match on `form_bare`, limited to 20 rows.
 *
 * Returns an empty array for blank input or when the database is not
 * available.
 */
export function lookupMorphDb(query: string): MorphEntry[] {
  const database = getDb();
  if (!database) return [];

  const q = query.trim();
  // Blank input would otherwise reach the prefix step as LIKE '%' and
  // return 20 arbitrary rows.
  if (!q) return [];

  // Plain equality (no COLLATE NOCASE) so the indices on form / form_bare
  // can be used; case differences are covered by the lowercased bare step.
  const exactStmt = database.prepare(
    'SELECT * FROM morphology WHERE form = ?'
  );
  for (const candidate of uniqueStrings([q, q.normalize('NFC')])) {
    const rows = exactStmt.all(candidate) as MorphEntry[];
    if (rows.length > 0) return rows;
  }

  // The lowercased input is tried first, then the fully stripped key, so
  // accented or macronized input still finds its bare form.
  const bareKeys = uniqueStrings([q.toLowerCase(), toBareKey(q)]);
  const bareStmt = database.prepare(
    'SELECT * FROM morphology WHERE form_bare = ?'
  );
  for (const key of bareKeys) {
    const rows = bareStmt.all(key) as MorphEntry[];
    if (rows.length > 0) return rows;
  }

  // Prefix match for partial input. Escape LIKE metacharacters so that
  // '%' and '_' typed by the user are matched literally.
  const prefixStmt = database.prepare(
    "SELECT * FROM morphology WHERE form_bare LIKE ? ESCAPE '\\' LIMIT 20"
  );
  for (const key of bareKeys) {
    const pattern = key.replace(/[\\%_]/g, '\\$&') + '%';
    const rows = prefixStmt.all(pattern) as MorphEntry[];
    if (rows.length > 0) return rows;
  }
  return [];
}

/**
 * Return row, distinct-lemma and distinct-form counts for the morphology table.
 * All counts are 0 when the database is not available.
 */
export function getMorphStats(): { totalEntries: number; totalLemmas: number; totalForms: number } {
  const database = getDb();
  if (!database) return { totalEntries: 0, totalLemmas: 0, totalForms: 0 };

  // One statement computes all three aggregates instead of issuing three
  // separate queries against the same table.
  const row = database
    .prepare(
      'SELECT COUNT(*) AS entries, '
      + 'COUNT(DISTINCT lemma) AS lemmas, '
      + 'COUNT(DISTINCT form) AS forms '
      + 'FROM morphology'
    )
    .get() as { entries: number; lemmas: number; forms: number };

  return { totalEntries: row.entries, totalLemmas: row.lemmas, totalForms: row.forms };
}

/**
 * Report whether the morphology database can be used.
 * Any error raised while obtaining the handle counts as "not available".
 */
export function isAvailable(): boolean {
  let available = false;
  try {
    available = getDb() !== null;
  } catch {
    // Leave `available` false: a throwing getDb() means no usable database.
  }
  return available;
}
```

### 5c: Update the parse route in `server/routes.ts`

The key change: check the SQLite morphology database first, fall back to the hand-built inflection table for anything not found.

```typescript
import { lookupMorphDb, isAvailable as morphDbAvailable } from './morphology-db';
import type { MorphEntry } from './morphology-db';
import { lookupWord as lookupInflectionTable } from './data/inflections';

// In the GET /api/parse handler, replace the lookup logic:

app.get("/api/parse", (req, res) => {
  // req.query.q can be missing, an array (?q=a&q=b) or a nested object;
  // only a plain string is a valid query, anything else is treated as empty.
  const rawQ = req.query.q;
  const q = (typeof rawQ === "string" ? rawQ : "").trim();
  if (!q) {
    return res.json({ query: q, found: false, entries: [] });
  }

  // 1. Try the comprehensive Morpheus-generated SQLite database
  const dbResults = lookupMorphDb(q);
  if (dbResults.length > 0) {
    // Group by lemma and format for the frontend
    const grouped = groupByLemma(dbResults);
    return res.json({ query: q, found: true, entries: grouped, source: "morpheus" });
  }

  // 2. Fall back to the hand-built in-memory inflection table
  const memResults = lookupInflectionTable(q);
  if (memResults.length > 0) {
    // Format memResults for the frontend (existing logic)
    // ... keep existing formatting code ...
    return res.json({ query: q, found: true, entries: formatted, source: "inflection-table" });
  }

  // 3. Not found — return external reference links.
  // Perseus needs the language in the URL: pick it from the script of the
  // query so Latin words are not sent to the Greek analyser.
  const perseusLang = /[\u0370-\u03FF\u1F00-\u1FFF]/.test(q) ? "greek" : "la";
  return res.json({
    query: q,
    found: false,
    entries: [],
    externalLinks: {
      logeion: `https://logeion.uchicago.edu/${encodeURIComponent(q)}`,
      perseus: `https://www.perseus.tufts.edu/hopper/morph?l=${encodeURIComponent(q)}&la=${perseusLang}`,
    }
  });
});

/**
 * Group database rows into one entry per (lemma, part of speech) pair.
 *
 * Each group carries the lemma header, a Logeion link for the headword and
 * the list of parses found for the queried form. Groups are returned in the
 * order their first row appears in `entries`.
 */
function groupByLemma(entries: MorphEntry[]) {
  // `logeionUrl` is part of the declared value type so the object literal
  // below passes TypeScript's excess-property check.
  const groups = new Map<string, { lemma: any; logeionUrl: string; parses: any[] }>();

  for (const entry of entries) {
    const key = entry.lemma + "|" + entry.part_of_speech;

    let group = groups.get(key);
    if (!group) {
      group = {
        lemma: {
          headword: entry.lemma,
          language: entry.language,
          partOfSpeech: entry.part_of_speech,
        },
        logeionUrl: `https://logeion.uchicago.edu/${encodeURIComponent(entry.lemma)}`,
        parses: [],
      };
      groups.set(key, group);
    }

    // Structured morphology: only the features present on this row are copied.
    const morph: any = { type: entry.part_of_speech };
    if (entry.person) morph.person = entry.person;
    if (entry.number) morph.number = entry.number;
    if (entry.tense) morph.tense = entry.tense;
    if (entry.mood) morph.mood = entry.mood;
    if (entry.voice) morph.voice = entry.voice;
    if (entry.case_) morph.case_ = entry.case_;
    if (entry.gender) morph.gender = entry.gender;

    // Build human-readable morphology string
    const parts: string[] = [];
    if (entry.person) {
      // Finite forms fuse person and number; "sg" and "pl" are shortened to "s" and "p".
      let numberAbbrev = entry.number || "";
      if (entry.number === "sg") numberAbbrev = "s";
      else if (entry.number === "pl") numberAbbrev = "p";
      parts.push(`${entry.person}${numberAbbrev}.`);
    }
    if (entry.tense) parts.push(`${entry.tense}.`);
    if (entry.voice) parts.push(`${entry.voice}.`);
    if (entry.mood) parts.push(`${entry.mood}.`);
    if (entry.case_) parts.push(`${entry.case_}.`);
    if (entry.gender) parts.push(`${entry.gender}.`);
    // Without a person, number is listed on its own at the end.
    if (entry.number && !entry.person) parts.push(`${entry.number}.`);

    group.parses.push({
      surfaceForm: entry.form,
      morphology: morph,
      morphologyStr: parts.join(" ") || entry.part_of_speech,
    });
  }

  return Array.from(groups.values());
}
```

### 5d: Graceful degradation

The design is intentionally layered:
1. If `data/morphology.db` exists → use it (comprehensive, ~500K forms)
2. If not → fall back to the in-memory inflection table (~2,700 forms)
3. If neither has the word → show Logeion/Perseus external links

This means the app works immediately after cloning (no database build needed),
and gets dramatically better once you run the Morpheus build.

---

## Step 6: Verify the database

After the build completes, spot-check:

```bash
# Run the spot checks against the database.
# The SQL is fed to sqlite3 through a heredoc so the whole block can be pasted
# into a shell as-is; the quoted delimiter stops the shell from expanding
# anything inside it.
sqlite3 data/morphology.db <<'SQL'
-- Check size
SELECT * FROM meta;

-- Test a Greek lookup
SELECT form, lemma, part_of_speech, tense, mood, voice, person, number
FROM morphology WHERE form_bare = 'ελυσα';

-- Test a Latin lookup
SELECT form, lemma, part_of_speech, tense, mood, voice, person, number
FROM morphology WHERE form = 'amabam';

-- Check coverage
SELECT language, COUNT(DISTINCT lemma) as lemmas, COUNT(DISTINCT form) as forms
FROM morphology GROUP BY language;

-- Find all forms of a lemma
SELECT DISTINCT form, tense, mood, voice, person, number, case_, gender
FROM morphology WHERE lemma = 'λύω' ORDER BY tense, mood, voice, person;
SQL
```

---

## Cleanup after build

Once the database is built and verified, you can remove the Morpheus data clone:

```bash
rm -rf morpheus-data
# Keep the wordlists in scripts/ for reference
```

The Docker image can stay (it's useful for future vocabulary expansion) or be removed:
```bash
docker rmi perseidsproject/morpheus
```

---

## Expanding coverage later

When you add new passages with unfamiliar vocabulary:

1. Extract new words not already in the database
2. Add them to the word list files
3. Re-run with `--resume` (only processes new words):
```bash
python3 scripts/build-morphology-db.py --resume
```

Or for a clean rebuild with everything:
```bash
python3 scripts/build-morphology-db.py
```

---

## Troubleshooting

**Morpheus Docker won't start:**
- Make sure Docker is running: `sudo systemctl start docker`
- Your user needs to be in the docker group: `sudo usermod -aG docker $USER` (then re-login)
- Check image: `docker images | grep morpheus`

**Greek Beta Code conversion fails:**
- Install betacode: `pip install betacode`
- Check `scripts/failed-greek-conversions.txt` for problem entries
- Some rare entries with unusual diacritics may fail — these can be manually added later

**Build is interrupted:**
- Just re-run with `--resume`: `python3 scripts/build-morphology-db.py --resume`
- It picks up where it left off

**Database is larger than expected:**
- Run `VACUUM` in sqlite3 to reclaim space
- Consider filtering out rare dialect forms: `DELETE FROM morphology WHERE dialect IS NOT NULL AND dialect NOT IN ('attic', 'prose')` — then run `VACUUM` again, because deleting rows does not by default shrink the file

**Morpheus returns no results for a word:**
- Morpheus doesn't know every word. Proper nouns, rare poetic forms, very late vocabulary may be missing.
- The in-memory fallback handles your curated high-priority lemmas.
- You can manually insert entries: `INSERT INTO morphology (form, form_bare, lemma, language, part_of_speech, ...) VALUES (...)`

**Memory usage on VPS:**
- The SQLite database stays on disk. SQLite reads pages on demand into a small page cache and the OS file cache keeps frequently used pages in memory, so the whole file is never loaded at once.
- Even a 100MB database typically uses only 10-20MB of resident memory for the active working set.
- This is far more efficient than the in-memory Map approach for large datasets.

---

## Appendix: Ingesting Full LSJ and Lewis & Short Entries

The `definitions` table stores both short definitions (Logeion/helmadik) and full dictionary entries (Perseus XML). Full entries are stored as HTML in the `full_def` column and served to the Analyze tab.

### What this adds

- ~123k Greek entries from LSJ (Liddell–Scott–Jones), sourced from 27 Perseus XML volumes
- ~54k Latin entries from Lewis & Short, sourced from a single Perseus XML file
- HTML rendering with hierarchical sense numbering (I → A → 1 → a), italic glosses, citation links

### Prerequisites

```bash
pip install betacode lxml requests
```

### Running the ingestion

```bash
# Both Greek (LSJ) and Latin (Lewis & Short) — takes ~15 minutes, downloads ~200 MB
python3 scripts/ingest-lexica.py \
  --db /var/lib/docker/volumes/lector_data/_data/morphology.db \
  --lang both

# Greek only (re-ingest after script changes)
python3 scripts/ingest-lexica.py \
  --db /var/lib/docker/volumes/lector_data/_data/morphology.db \
  --lang greek

# Use a local cache directory to avoid re-downloading (place XML files there)
python3 scripts/ingest-lexica.py \
  --db /var/lib/docker/volumes/lector_data/_data/morphology.db \
  --lang greek \
  --lsj-dir /path/to/lsj-xml-cache/
```

After ingestion, restart the app container to pick up the new entries:

```bash
docker restart lector-app
```

### Cross-reference stubs

LSJ has many headwords that are stubs pointing to canonical forms (e.g. δένδρον → δένδρεον). The script stores the stub as-is. To resolve stubs and fill them with the target entry's content, run:

```python
import sqlite3, re, unicodedata
from betacode.conv import beta_to_uni

def strip(s):
    """Return *s* lower-cased with all combining marks (category Mn) removed.

    The text is decomposed (NFD) so accents become separate code points,
    those marks are dropped, and the remainder is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    recomposed = unicodedata.normalize('NFC', ''.join(base_chars))
    return recomposed.lower()

def beta_to_bare(s):
    """Convert Beta Code *s* to Unicode and return its bare (stripped) form.

    Conversion is best-effort: if ``beta_to_uni`` raises, *s* is used as-is.
    """
    try:
        u = beta_to_uni(s)
    except Exception:
        # Catch conversion errors only; a bare ``except:`` would also swallow
        # KeyboardInterrupt and SystemExit.
        u = s
    return strip(u)

# Fill short cross-reference stubs with the full entry of the headword they
# point to. Only Greek entries shorter than 800 characters that contain a
# lex-ref span are considered.
REF_RE = re.compile(r'class="lex-ref">([^<\s][^<]*?)\.*\s*</span>')
# Reference texts that are abbreviations rather than headwords.
NON_TARGETS = ('v', 'v.', 'sq', 'cf')

db = sqlite3.connect('/var/lib/docker/volumes/lector_data/_data/morphology.db')
try:
    cur = db.cursor()
    cur.execute("SELECT lemma_bare, full_def FROM definitions WHERE language='greek' AND full_def IS NOT NULL AND length(full_def) < 800 AND full_def LIKE '%lex-ref%'")
    fixed = 0
    # fetchall() materialises the stubs first, so reusing `cur` inside the
    # loop does not disturb the iteration.
    for bare, fd in cur.fetchall():
        targets = [
            r.strip().rstrip('.')
            for r in REF_RE.findall(fd)
            if len(r.strip()) > 2 and r.strip().lower() not in NON_TARGETS
        ]
        if not targets:
            continue
        # The last reference in the stub is used as the target. It may be in
        # Unicode or in Beta Code, so both readings are tried; dict.fromkeys
        # drops the second lookup when both yield the same key.
        last_ref = targets[-1]
        for target_bare in dict.fromkeys((strip(last_ref), beta_to_bare(last_ref))):
            row = cur.execute("SELECT full_def FROM definitions WHERE lemma_bare=? AND language='greek'", (target_bare,)).fetchone()
            # Only replace when the target is longer than the stub itself.
            if row and row[0] and len(row[0]) > len(fd):
                cur.execute("UPDATE definitions SET full_def=? WHERE lemma_bare=? AND language='greek'", (row[0], bare))
                fixed += 1
                break
    db.commit()
    print(f"Fixed {fixed} stubs")
finally:
    # Always release the connection; if an error occurred before commit(),
    # the pending updates are discarded.
    db.close()
```

### Important: Beta Code handling

The LSJ XML body text is in English/Latin. Greek forms (headwords, quotes, etymologies, cross-references) are in Beta Code within specific XML elements. The ingestion script converts Beta Code to Unicode only within explicitly Greek-tagged elements — NOT for the entire entry body. This distinction is critical: passing English prose through the Beta Code converter silently corrupts it ("poet. imper." → "ποετ. ιμπερ.").

If you suspect corruption, check a known entry:

```bash
# Print the first 200 characters of the λύω entry. A quoted heredoc is used
# so the Python source needs no shell escaping.
python3 - <<'PY'
import sqlite3

db = sqlite3.connect('/var/lib/docker/volumes/lector_data/_data/morphology.db')
try:
    cur = db.cursor()
    cur.execute("SELECT full_def FROM definitions WHERE lemma_bare='λυω' AND language='greek'")
    row = cur.fetchone()
    # fetchone() returns None when the entry is missing, and full_def itself
    # may be NULL; report that instead of failing with a TypeError.
    if row and row[0]:
        print(row[0][:200])
    else:
        print("No full_def stored for λυω — has the Greek ingestion been run?")
finally:
    db.close()
PY
```

English words like "poet.", "imper.", "to loose" should appear as plain ASCII. If you see Greek characters where English should be, re-run the ingestion with the current script version.
