mini_imo/imo_eval.py at main · lfopensource/mini_imo

345 lines (280 loc) · 10.2 KB
import json
from dataclasses import dataclass
from typing import Optional, Literal, Dict, Any, List
from openai import OpenAI
# ===========================
# 0. LLM client
# ===========================
class LLMConfig:
    model: str = "gpt-5.1-mini"  # use stronger model for proofs if you want
    temperature: float = 0.0
    max_output_tokens: int = 1024
class LLMClient:
    def __init__(self, cfg: LLMConfig, api_key: Optional[str] = None):
        self.cfg = cfg
        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
    def chat(self, system_prompt: str, user_prompt: str, json_mode: bool = False) -> str:
        kwargs: Dict[str, Any] = {
            "model": self.cfg.model,
            "temperature": self.cfg.temperature,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            "max_completion_tokens": self.cfg.max_output_tokens,
        if json_mode:
            kwargs["response_format"] = {"type": "json_object"}
        resp = self.client.chat.completions.create(**kwargs)
        return resp.choices[0].message.content
def json_dumps(obj: Dict[str, Any]) -> str:
    return json.dumps(obj, ensure_ascii=False, indent=2)
# ===========================
# 1. Solver (LLM that tries to solve the problem)
# ===========================
SOLVER_SYSTEM_PROMPT = """
You are a mathematical problem solver.
Think step by step and write a clear solution.
At the end, explicitly state your final answer on a separate line starting with:
FINAL ANSWER:
def solve_problem(llm: LLMClient, problem: str) -> str:
    # You could use a different model for solver vs grader if you want.
    return llm.chat(SOLVER_SYSTEM_PROMPT, problem, json_mode=False)
# ===========================
# 2. Answer (short-answer) AutoGrader
# ===========================
GradeLabel = Literal["Correct", "Incorrect"]
ANSWER_SYSTEM_PROMPT = """
You are a strict mathematical answer grader.
1. Read a math problem.
2. Read a model's full solution text.
3. Extract the model's final answer (a number, expression, set, etc.).
4. Compare it to the golden answer.
5. Decide if they are mathematically equivalent.
Grading rules:
- You only grade the final result, not the reasoning steps.
- Accept algebraically equivalent forms (e.g., n(n+1)/2 == (n^2+n)/2).
- Accept numerically equivalent forms (e.g., 0.5 == 1/2).
- For sets/lists where order is not specified, ignore ordering ({1,2} == {2,1}).
- No partial credit: if the final answer is incomplete, ambiguous, or wrong, grade as "Incorrect".
- If you cannot confidently extract a clear final answer, grade as "Incorrect".
Return a JSON object with keys:
- "extracted_answer": string
- "equivalent": boolean
- "grade": "Correct" or "Incorrect"
- "notes": short explanation (1–3 sentences).
class AnswerGradeResult:
    extracted_answer: str
    equivalent: bool
    grade: GradeLabel
    notes: str
def grade_answer(
    llm: LLMClient,
    problem: str,
    model_solution: str,
    golden_answer: str,
) -> AnswerGradeResult:
    payload = {
        "problem": problem,
        "model_solution": model_solution,
        "golden_answer": golden_answer,
    user_prompt = json_dumps(payload)
    raw = llm.chat(ANSWER_SYSTEM_PROMPT, user_prompt, json_mode=True)
        data = json.loads(raw)
    except Exception as e:
        return AnswerGradeResult(
            extracted_answer="",
            equivalent=False,
            grade="Incorrect",
            notes=f"Failed to parse grader JSON: {e}",
    grade = "Correct" if data.get("grade", "Incorrect") == "Correct" else "Incorrect"
    return AnswerGradeResult(
        extracted_answer=data.get("extracted_answer", "").strip(),
        equivalent=bool(data.get("equivalent", False)),
        grade=grade, 
        notes=data.get("notes", "").strip(),
# ===========================
# 3. Proof AutoGrader
# ===========================
ProofLabel = Literal["Incorrect", "Partial", "Almost", "Correct"]
PROOF_SYSTEM_PROMPT = """
You are an IMO-style proof grader.
You receive:
- A math problem statement.
- A candidate solution (informal proof).
- A reference solution.
- Short grading guidelines describing what a complete solution must show.
- Judge the mathematical correctness and completeness of the candidate solution.
- Ignore style/formatting.
- Use four categories:
  - "Correct": fully correct, rigorous, complete.
  - "Almost": essentially correct with minor gaps or small errors, but fixable without new ideas.
  - "Partial": some relevant ideas or partial progress, but not close to full solution.
  - "Incorrect": mostly or entirely wrong / irrelevant.
Output a JSON object:
- "label": "Correct" | "Almost" | "Partial" | "Incorrect"
- "score": integer in {0, 1, 6, 7} using:
    - Correct  -> 7
    - Almost   -> 6
    - Partial  -> 1
    - Incorrect-> 0
- "rationale": 2–5 sentences explaining your grade.
class ProofGradeResult:
    label: ProofLabel
    score: int
    rationale: str
def grade_proof(
    llm: LLMClient,
    problem: str,
    candidate_solution: str,
    reference_solution: str,
    guidelines: str,
) -> ProofGradeResult:
    payload = {
        "problem": problem,
        "candidate_solution": candidate_solution,
        "reference_solution": reference_solution,
        "grading_guidelines": guidelines,
    user_prompt = json_dumps(payload)
    raw = llm.chat(PROOF_SYSTEM_PROMPT, user_prompt, json_mode=True)
        data = json.loads(raw)
    except Exception as e:
        return ProofGradeResult(
            label="Incorrect",
            score=0,
            rationale=f"Failed to parse grader JSON: {e}",
    label = data.get("label", "Incorrect")
    if label not in ["Incorrect", "Partial", "Almost", "Correct"]:
        label = "Incorrect"
    score_map = {"Correct": 7, "Almost": 6, "Partial": 1, "Incorrect": 0}
    score = score_map.get(label, 0)
    return ProofGradeResult(
        label=label,
        score=score,
        rationale=data.get("rationale", "").strip(),
# ===========================
# 4. Benchmark I/O + main eval loop
# ===========================
Expected JSONL format (one object per line):
For short-answer problems ("kind": "answer"):
  "id": "P1",
  "kind": "answer",
  "problem": "Compute 1+2+...+100.",
  "golden_answer": "5050"
For proof problems ("kind": "proof"):
  "id": "P2",
  "kind": "proof",
  "problem": "Let n be a positive integer ...",
  "reference_solution": "High-level description or full solution.",
  "guidelines": "Bullet points on what a full solution must do."
class EvalConfig:
    input_jsonl: str
    output_csv: str
    solver_model: str = "gpt-5.1-mini"
    grader_model: str = "gpt-5.1"  # you can use a stronger model for grading
def load_jsonl(path: str) -> List[Dict[str, Any]]:
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows
def run_eval(cfg: EvalConfig):
    solver_llm = LLMClient(LLMConfig(model=cfg.solver_model))
    grader_llm = LLMClient(LLMConfig(model=cfg.grader_model, temperature=0.0))
    problems = load_jsonl(cfg.input_jsonl)
    results_rows: List[Dict[str, Any]] = []
    for i, item in enumerate(problems):
        pid = item.get("id", f"prob_{i}")
        kind = item["kind"]
        problem = item["problem"]
        print(f"[{i+1}/{len(problems)}] Solving {pid} ({kind})")
        # 1. solve
        solution = solve_problem(solver_llm, problem)
        # 2. grade
        if kind == "answer":
            golden_answer = item["golden_answer"]
            ans_grade = grade_answer(grader_llm, problem, solution, golden_answer)
            results_rows.append({
                "id": pid,
                "kind": kind,
                "problem": problem,
                "solver_output": solution,
                "golden_answer": golden_answer,
                "extracted_answer": ans_grade.extracted_answer,
                "equivalent": int(ans_grade.equivalent),
                "grade_label": ans_grade.grade,
                "grade_score": 1 if ans_grade.grade == "Correct" else 0,
                "notes": ans_grade.notes,
        elif kind == "proof":
            ref_sol = item.get("reference_solution", "")
            guidelines = item.get("guidelines", "")
            proof_grade = grade_proof(grader_llm, problem, solution, ref_sol, guidelines)
            results_rows.append({
                "id": pid,
                "kind": kind,
                "problem": problem,
                "solver_output": solution,
                "reference_solution": ref_sol,
                "grade_label": proof_grade.label,
                "grade_score": proof_grade.score,
                "rationale": proof_grade.rationale,
        else:
            raise ValueError(f"Unknown kind: {kind}")
    # 3. write CSV
    fieldnames = sorted({k for row in results_rows for k in row.keys()})
    with open(cfg.output_csv, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in results_rows:
            writer.writerow(row)
    # 4. simple stats
    answer_correct = [
        r for r in results_rows if r["kind"] == "answer" and r.get("grade_score", 0) == 1
    answer_total = len([r for r in results_rows if r["kind"] == "answer"])
    proof_scores = [r["grade_score"] for r in results_rows if r["kind"] == "proof"]
    proof_total = len(proof_scores)
    print("\n=== Summary ===")
    if answer_total > 0:
        print(f"Short-answer accuracy: {len(answer_correct)}/{answer_total} "
              f"({len(answer_correct)/answer_total:.3f})")
    if proof_total > 0:
        print(f"Proof average score (0/1/6/7): {sum(proof_scores)/proof_total:.3f} over {proof_total} problems")
if __name__ == "__main__":
    # example usage
    cfg = EvalConfig(
        input_jsonl="imo_benchmark_example.jsonl",
        output_csv="imo_results.csv",
        solver_model="gpt-5.1-mini",
        grader_model="gpt-5.1",
    run_eval(cfg)
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

imo_eval.py

Latest commit

History

imo_eval.py

File metadata and controls