knowledge-graph-prototype-0/load_data.py at main · LevelInteractive/knowledge-graph-prototype-0

238 lines (194 loc) · 7.55 KB
#!/usr/bin/env python3
"""
Unified data loading script for all KG projects.

Checks kg_export against each project's processed_ids.json,
reports what's new, then runs ingestion for any project with new data.
"""
import json
import subprocess
import sys
from pathlib import Path
# Directory containing this script; every other path is built from it.
ROOT = Path(__file__).parent.resolve()

# Shared export that all projects are checked against.
EXPORT_DATA = ROOT / "kg_export" / "data"
EXPORT_FILES = ROOT / "kg_export" / "files"

# Per-project configuration, keyed by display name.
#   dir            - project directory
#   processed_file - JSON file loaded as the set of already-processed IDs
#   id_type        - "uuid": IDs are meeting UUIDs read from the export;
#                    "file_stem": IDs are stems of files in files_dir
#   files_dir      - (file_stem projects) directory of prepared files
#   prepare_script - (file_stem projects) script run before counting files
#   python         - interpreter used to run the project's scripts
PROJECTS = {
    "KG-1": {
        "dir": ROOT / "kg-1",
        "processed_file": ROOT / "kg-1" / "processed_ids.json",
        "id_type": "uuid",
        "python": ROOT / "kg-1" / "transcript-kg" / ".venv" / "bin" / "python",
    },
    "KG-2": {
        "dir": ROOT / "kg-2",
        "processed_file": ROOT / "kg-2" / "processed_ids.json",
        "id_type": "uuid",
        "python": ROOT / "kg-2" / "venv" / "bin" / "python",
    },
    "KG-3": {
        "dir": ROOT / "kg-3",
        "processed_file": ROOT / "kg-3" / "processed_ids.json",
        "id_type": "file_stem",
        "files_dir": ROOT / "kg-3" / "upload-files",
        "prepare_script": ROOT / "kg-3" / "prepare_uploads.py",
        "python": ROOT / "kg-3" / "app" / "backend" / "venv" / "bin" / "python",
    },
    "KG-4": {
        "dir": ROOT / "kg-4",
        "processed_file": ROOT / "kg-4" / "processed_ids.json",
        "id_type": "file_stem",
        "files_dir": ROOT / "kg-4" / "content_lists",
        "prepare_script": ROOT / "kg-4" / "prepare_content.py",
        "python": ROOT / "kg-4" / "venv" / "bin" / "python",
    },
}
def load_processed(path):
    """Return the set of IDs stored in the JSON file at ``path``.

    A path that does not exist yields an empty set.
    """
    if not path.exists():
        return set()
    with open(path) as handle:
        stored_ids = json.load(handle)
    return set(stored_ids)
def get_all_meeting_uuids():
    """Return the union of ``uuid`` values from both meeting export files.

    Reads zoom_meetings.json and zoom_past_meetings.json under EXPORT_DATA.
    """
    loaded = []
    # Load both files before touching any record, one file open at a time.
    for filename in ("zoom_meetings.json", "zoom_past_meetings.json"):
        with open(EXPORT_DATA / filename) as handle:
            loaded.append(json.load(handle))
    current_meetings, past_meetings = loaded
    uuids = {record["uuid"] for record in current_meetings}
    uuids |= {record["uuid"] for record in past_meetings}
    return uuids
def get_file_stems(directory, ext):
    """Collect the stems of entries in ``directory`` matching ``*<ext>``.

    Returns an empty set when ``directory`` does not exist.
    """
    stems = set()
    if directory.exists():
        for entry in directory.glob(f"*{ext}"):
            stems.add(entry.stem)
    return stems
def run_script(script_path, args=None, cwd=None, python=None):
    """Run a Python script as a child process and return its exit code.

    The interpreter is ``python`` when given, otherwise the interpreter
    running this script. ``args`` are appended to the command line, and the
    working directory defaults to the script's own directory. Output is not
    captured, so the child inherits this process's stdout/stderr.
    """
    interpreter = sys.executable
    if python:
        interpreter = str(python)

    extra_args = args if args else []
    command = [interpreter, str(script_path), *extra_args]

    workdir = cwd if cwd else str(script_path.parent)
    completed = subprocess.run(command, cwd=workdir)
    return completed.returncode
def check_export():
    """Confirm the export directory is present and print summary counts.

    Exits the process with status 1 if kg_export/data/ is missing. Otherwise
    prints how many meetings are in zoom_meetings.json and how many entries
    are in kg_export/files/ (0 when that directory is absent).

    Returns:
        The number of meetings in zoom_meetings.json.
    """
    if not EXPORT_DATA.exists():
        print("ERROR: kg_export/data/ not found")
        sys.exit(1)

    meetings_path = EXPORT_DATA / "zoom_meetings.json"
    with open(meetings_path) as handle:
        meetings = json.load(handle)
    meeting_total = len(meetings)
    print(f"  Export: {meeting_total} meetings in kg_export/data/")

    file_count = 0
    if EXPORT_FILES.exists():
        for _ in EXPORT_FILES.iterdir():
            file_count += 1
    print(f"  Export: {file_count:,} files in kg_export/files/")

    return meeting_total
def prepare_project_files(name, config):
    """Run the project's prepare script, if it has one.

    Projects without a ``prepare_script`` entry need no preparation and
    count as success.

    Returns:
        True when there is nothing to run or the script exits with 0;
        False when the script exits with a non-zero code.
    """
    if "prepare_script" in config:
        print(f"  Preparing {name} files...")
        exit_code = run_script(config["prepare_script"], python=config.get("python"))
        if exit_code != 0:
            print(f"  ERROR: {name} prepare script failed (exit {exit_code})")
            return False
    return True
def check_project(name, config):
    """Count a project's total, already-processed, and new IDs.

    For ``id_type == "uuid"`` the full ID set is every meeting UUID in the
    export; otherwise it is the set of stems of the prepared files in
    ``config["files_dir"]`` (``.txt`` files for KG-3, ``.json`` for the rest).

    Args:
        name: Project name; also selects the prepared-file extension.
        config: Project configuration with ``processed_file``, ``id_type``
            and, for file-stem projects, ``files_dir``.

    Returns:
        A ``(total, done, new)`` tuple of counts: all current IDs, those
        already in the processed file, and those that are not.
    """
    processed = load_processed(config["processed_file"])

    if config["id_type"] == "uuid":
        total_ids = get_all_meeting_uuids()
    else:
        # File-stem based — need to check prepared files
        files_dir = config["files_dir"]
        ext = ".txt" if name == "KG-3" else ".json"
        total_ids = get_file_stems(files_dir, ext)

    new_ids = total_ids - processed
    done = len(total_ids & processed)
    return len(total_ids), done, len(new_ids)
def run_project(name, config, new_count):
    """Run ingestion for a single project.

    Args:
        name: Project name; selects which ingestion script is run.
        config: Project configuration. ``dir`` locates the script and the
            optional ``python`` entry selects the interpreter.
        new_count: Number of new items; used only in the printed banner.

    Returns:
        True if the ingestion script exited with code 0. False if it exited
        non-zero, or if ``name`` has no known ingestion script.
    """
    # Ingestion entry point for each project, as path parts relative to "dir".
    ingest_scripts = {
        "KG-1": ("transcript-kg", "ingest_zts_data.py"),
        "KG-2": ("ingest_transcripts.py",),
        "KG-3": ("extract_direct.py",),
        "KG-4": ("ingest_raganything.py",),
    }

    print(f"\n{'='*50}")
    print(f"  Running {name} ({new_count} new)")
    print(f"{'='*50}\n")

    script_parts = ingest_scripts.get(name)
    if script_parts is None:
        # An unrecognised name used to fall through every branch and crash
        # on the unbound `rc`; report it as a failed run instead.
        print(f"  WARNING: no ingestion script known for {name}")
        return False

    # Run the main ingestion script
    script = config["dir"].joinpath(*script_parts)
    rc = run_script(script, python=config.get("python"))
    if rc != 0:
        print(f"  WARNING: {name} exited with code {rc}")
        return False
    return True
def main():
    """Command-line entry point: report new data per project and ingest it.

    Flags:
        --check  Report only; do not run ingestion.
        --only   Restrict to the named projects (kg-1 ... kg-4).

    Flow: verify the export, run prepare scripts for the selected projects
    that have one, print a per-project Total/Done/New table, then (unless
    --check was given or nothing is new) ask for confirmation and run
    ingestion for each project with new data, ending with a summary.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Unified data loader for all KG projects")
    parser.add_argument("--check", action="store_true", help="Check only, don't run ingestion")
    parser.add_argument("--only", nargs="+", choices=["kg-1", "kg-2", "kg-3", "kg-4"],
                        help="Only run specific projects (e.g., --only kg-1 kg-3)")
    args = parser.parse_args()

    print("=" * 50)
    print("  KG Data Loader")
    print("=" * 50)

    # Check export
    print("\nChecking export data...")
    check_export()

    # Determine which projects to check
    projects_to_check = PROJECTS
    if args.only:
        filter_names = set(o.upper() for o in args.only)
        projects_to_check = {k: v for k, v in PROJECTS.items() if k.upper() in filter_names}

    # Prepare files for KG-3/KG-4 so counts reflect current export
    for name, config in projects_to_check.items():
        if "prepare_script" in config:
            if not prepare_project_files(name, config):
                print(f"  WARNING: Could not prepare {name}, counts may be stale")

    # Check each project
    print("\nChecking projects...\n")
    print(f"  {'Project':<8} {'Total':>8} {'Done':>8} {'New':>8}")
    print(f"  {'-'*8} {'-'*8} {'-'*8} {'-'*8}")

    results = {}
    for name, config in projects_to_check.items():
        total, done, new = check_project(name, config)
        results[name] = {"total": total, "done": done, "new": new, "config": config}
        marker = " <--" if new > 0 else ""
        print(f"  {name:<8} {total:>8,} {done:>8,} {new:>8,}{marker}")

    projects_with_new = {name: v for name, v in results.items() if v["new"] > 0}

    if not projects_with_new:
        print("\n  All projects are up to date. Nothing to do.")
        return

    if args.check:
        print(f"\n  {len(projects_with_new)} project(s) have new data. Run without --check to ingest.")
        return

    # Confirm
    print(f"\n  Will run ingestion for: {', '.join(projects_with_new.keys())}")
    try:
        answer = input("  Proceed? [y/N] ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C at the prompt counts as "no".
        print("\n  Aborted.")
        return

    if answer != "y":
        print("  Aborted.")
        return

    # Run ingestion
    succeeded = []
    failed = []
    for name, info in projects_with_new.items():
        ok = run_project(name, info["config"], info["new"])
        if ok:
            succeeded.append(name)
        else:
            failed.append(name)

    # Final report
    print(f"\n{'='*50}")
    print("  Done!")
    if succeeded:
        print(f"  Succeeded: {', '.join(succeeded)}")
    if failed:
        print(f"  Failed:    {', '.join(failed)}")
    print(f"{'='*50}")
# Run the loader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

load_data.py

Latest commit

History

load_data.py

File metadata and controls