LOCK-R/run_ablation_matrix.py at main · aak204/LOCK-R

191 lines (164 loc) · 7.31 KB
from __future__ import annotations

import argparse
import csv
import json
import os
from copy import deepcopy
from pathlib import Path

from lockr.runners.benchmark import BenchmarkRunner
from lockr.schemas import BenchmarkConfig
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the ablation-matrix script.

    Options:
        --config: ``Path`` option, default ``configs/pilot_20.json``.
        --output-root: ``Path`` option, default ``outputs/ablation_matrix``.
        --fast: boolean flag requesting a reduced episode set.
    """
    cli = argparse.ArgumentParser(description="Run the extended LOCK-R ablation matrix.")
    # Both path-valued options share the same shape: a flag plus a default Path.
    path_options = (
        ("--config", Path("configs/pilot_20.json")),
        ("--output-root", Path("outputs/ablation_matrix")),
    )
    for flag, default_path in path_options:
        cli.add_argument(flag, type=Path, default=default_path)
    cli.add_argument(
        "--fast",
        action="store_true",
        help="Use a reduced episode set for faster diagnostics.",
    )
    return cli
def load_config(path: Path) -> BenchmarkConfig:
    """Read the UTF-8 JSON file at *path* and validate it as a ``BenchmarkConfig``."""
    raw_text = path.read_text(encoding="utf-8")
    payload = json.loads(raw_text)
    return BenchmarkConfig.model_validate(payload)
def clone_config(config: BenchmarkConfig) -> BenchmarkConfig:
    """Return a new ``BenchmarkConfig`` built by dumping *config* in JSON mode and re-validating it."""
    snapshot = config.model_dump(mode="json")
    return BenchmarkConfig.model_validate(snapshot)
def maybe_reduce_episodes(config: BenchmarkConfig, *, fast: bool) -> None:
    """Trim ``config.episodes`` to its first two entries when *fast* is truthy.

    When *fast* is falsy the config is left untouched. The attribute is
    rebound to a new slice; the original sequence object is not mutated.
    """
    if not fast:
        return
    config.episodes = config.episodes[:2]
def prepare_lm_config(
    base: BenchmarkConfig,
    suite_name: str,
    regimes: list[str],
    proposal_mode: str,
    verifier_mode: str,
    fast: bool,
) -> BenchmarkConfig:
    """Derive a language-model suite config from *base* without modifying it.

    Args:
        base: Config to clone.
        suite_name: Value stored as the clone's ``suite_name``.
        regimes: Value stored as the clone's ``regimes``.
        proposal_mode: Value stored as the agent's ``proposal_generation_mode``.
        verifier_mode: Value stored as the agent's ``verifier_generation_mode``.
        fast: When truthy, the clone's episode list is reduced.

    Returns:
        The cloned config with agent kind ``openai_compatible_json``, repair
        mode ``qwen_nonthinking_eval`` and ``parallel_workers`` set to 4.
    """
    derived = clone_config(base)
    derived.suite_name = suite_name
    maybe_reduce_episodes(derived, fast=fast)

    # All agent-level overrides go through one handle on the nested agent settings.
    agent = derived.agent
    agent.kind = "openai_compatible_json"
    agent.proposal_generation_mode = proposal_mode  # type: ignore[assignment]
    agent.verifier_generation_mode = verifier_mode  # type: ignore[assignment]
    agent.repair_generation_mode = "qwen_nonthinking_eval"

    derived.parallel_workers = 4
    derived.regimes = regimes  # type: ignore[assignment]
    return derived
def prepare_baseline_config(base: BenchmarkConfig, *, suite_name: str, fast: bool) -> BenchmarkConfig:
    """Derive the non-LM baseline suite config from *base* without modifying it.

    Args:
        base: Config to clone.
        suite_name: Value stored as the clone's ``suite_name``.
        fast: When truthy, the clone's episode list is reduced.

    Returns:
        The cloned config with agent kind ``heuristic_confirmation``,
        ``parallel_workers`` set to 4 and the three baseline regimes listed below.
    """
    config = clone_config(base)
    config.suite_name = suite_name
    maybe_reduce_episodes(config, fast=fast)
    config.agent.kind = "heuristic_confirmation"
    config.parallel_workers = 4
    config.regimes = [  # type: ignore[assignment]
        "heuristic_lexical_agent",
        "bayesian_scripted_agent",
        "oracle_query_exact_update",
    ]
    return config
def run_suite(config: BenchmarkConfig, output_root: Path) -> tuple[str, Path, dict[str, object]]:
    """Run one suite and report where its results went.

    The suite writes into ``output_root / config.suite_name``.

    Returns:
        A ``(suite name, output directory, summary dumped in JSON mode)`` tuple.
    """
    suite_dir = output_root / config.suite_name
    runner = BenchmarkRunner(config=config, output_dir=suite_dir)
    result = runner.run()
    return config.suite_name, suite_dir, result.model_dump(mode="json")
def write_matrix_csv(rows: list[dict[str, object]], path: Path) -> None:
    """Write the per-regime summary rows to *path* as a CSV file with a header row.

    Args:
        rows: One dict per (suite, regime) pair. ``csv.DictWriter`` raises
            ``ValueError`` for keys outside the column list below and writes
            an empty cell for any missing key.
        path: Destination file; missing parent directories are created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(
            handle,
            fieldnames=[
                "suite_name",
                "regime",
                "episode_count",
                "mean_r_mean",
                "r_mean_ci_low",
                "r_mean_ci_high",
                "mean_k_c",
                "mean_cear",
                "mean_delta_cal",
                "mean_num_negative_steps_seen",
                "mean_num_posterior_rewrites",
                "mean_num_unique_tools_used",
                "mean_tool_loop_repetition_rate",
                "accuracy",
            ],
        )
        writer.writeheader()
        writer.writerows(rows)
def write_matrix_markdown(rows: list[dict[str, object]], path: Path) -> None:
    """Write the per-regime summary rows to *path* as a Markdown table.

    Args:
        rows: One dict per (suite, regime) pair. Every metric value is passed
            through ``float()`` before formatting, so a missing key raises
            ``KeyError`` and a non-numeric value raises ``ValueError`` or
            ``TypeError``.
        path: Destination file; missing parent directories are created, in
            line with ``write_matrix_csv``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [
        "| Suite | Regime | Episodes | R_mean | 95% CI | K_c | CEAR | delta_cal | neg_steps | rewrites | unique_tools | loop_rate | Accuracy |",
        "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]
    for row in rows:
        lines.append(
            f"| {row['suite_name']} | {row['regime']} | {row['episode_count']} | "
            f"{float(row['mean_r_mean']):.4f} | "
            f"[{float(row['r_mean_ci_low']):.4f}, {float(row['r_mean_ci_high']):.4f}] | "
            f"{float(row['mean_k_c']):.4f} | {float(row['mean_cear']):.4f} | "
            f"{float(row['mean_delta_cal']):.4f} | "
            f"{float(row['mean_num_negative_steps_seen']):.2f} | "
            f"{float(row['mean_num_posterior_rewrites']):.2f} | "
            f"{float(row['mean_num_unique_tools_used']):.2f} | "
            f"{float(row['mean_tool_loop_repetition_rate']):.4f} | "
            f"{float(row['accuracy']):.3f} |"
        )
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
def main() -> None:
    """Run every suite of the ablation matrix and write the combined summaries.

    Steps:
        1. Parse the command line and load the base config from ``--config``.
        2. Build three LM suites and one baseline suite from that base config.
        3. Run each suite under ``--output-root`` and collect one row per
           regime summary.
        4. Write ``matrix_summary.csv`` and ``matrix_summary.md`` into
           ``--output-root``.
    """
    args = build_parser().parse_args()
    # setdefault only fills in a placeholder key; a key already present in the
    # environment is left as is.
    os.environ.setdefault("OPENAI_API_KEY", "lm-studio")
    base = load_config(args.config)
    suites = [
        prepare_lm_config(
            base,
            suite_name="lm_nonthinking_ablation",
            regimes=[
                "same_model_locked_agent",
                "blind_checker",
                "prompt_debias_baseline",
                "oracle_query_control",
            ],
            proposal_mode="qwen_nonthinking_eval",
            verifier_mode="qwen_nonthinking_eval",
            fast=args.fast,
        ),
        prepare_lm_config(
            base,
            suite_name="thinking_compare_same_blind_nonthinking",
            regimes=["same_model_locked_agent", "blind_checker"],
            proposal_mode="qwen_nonthinking_eval",
            verifier_mode="qwen_nonthinking_eval",
            fast=args.fast,
        ),
        prepare_lm_config(
            base,
            suite_name="thinking_compare_same_blind_thinking",
            regimes=["same_model_locked_agent", "blind_checker"],
            proposal_mode="qwen_precise_coding_thinking",
            verifier_mode="qwen_nonthinking_eval",
            fast=args.fast,
        ),
        prepare_baseline_config(base, suite_name="sanity_baselines", fast=args.fast),
    ]
    matrix_rows: list[dict[str, object]] = []
    for suite in suites:
        suite_name, output_dir, summary = run_suite(suite, args.output_root)
        print(f"completed {suite_name}: {output_dir}")
        # Flatten each regime summary into one row keyed like the CSV columns.
        for regime_summary in summary["regime_summaries"]:
            matrix_rows.append(
                {
                    "suite_name": suite_name,
                    "regime": regime_summary["regime"],
                    "episode_count": regime_summary["episode_count"],
                    "mean_r_mean": regime_summary["mean_r_mean"],
                    "r_mean_ci_low": regime_summary["r_mean_ci_low"],
                    "r_mean_ci_high": regime_summary["r_mean_ci_high"],
                    "mean_k_c": regime_summary["mean_k_c"],
                    "mean_cear": regime_summary["mean_cear"],
                    "mean_delta_cal": regime_summary["mean_delta_cal"],
                    "mean_num_negative_steps_seen": regime_summary["mean_num_negative_steps_seen"],
                    "mean_num_posterior_rewrites": regime_summary["mean_num_posterior_rewrites"],
                    "mean_num_unique_tools_used": regime_summary["mean_num_unique_tools_used"],
                    "mean_tool_loop_repetition_rate": regime_summary["mean_tool_loop_repetition_rate"],
                    "accuracy": regime_summary["accuracy"],
                }
            )
    write_matrix_csv(matrix_rows, args.output_root / "matrix_summary.csv")
    write_matrix_markdown(matrix_rows, args.output_root / "matrix_summary.md")
# Script entry point: run the ablation matrix only when executed directly.
if __name__ == "__main__":
    main()
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

run_ablation_matrix.py

Latest commit

History

run_ablation_matrix.py

File metadata and controls