"""
Check quality of named threads against the original mission.
Usage:
modal run insights_first/check_thread_quality.py --input insights_first/data/named_threads_20260120_033944.json
"""
import modal
import json
from datetime import datetime
app = modal.App("thread-quality-checker")
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
model_volume = modal.Volume.from_name("qwen-model-cache", create_if_missing=True)
vllm_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("vllm>=0.6.0", "torch", "transformers", "huggingface_hub")
)
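# The volume persists downloaded weights across runs, so only the first cold
# start pays the Hugging Face download (roughly 15 GB for a 7B model in bf16).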
CHECKER_PROMPT = """You are evaluating whether a "thread" of insights is genuinely valuable.
A good thread should:
1. Have a SPECIFIC, non-obvious connecting theme (not generic business advice)
2. Contain insights that ACTUALLY support the claimed theme
3. Provide value someone couldn't get from reading a business book
## Thread to evaluate:
THREAD NAME: {thread_name}
CORE CLAIM: {core_claim}
INSIGHTS IN THIS THREAD:
{insights}
## Evaluate on these dimensions (be harsh - most threads are mediocre):
1. THEME_SPECIFICITY (1-10): Is this a specific, testable claim or a vague category?
   - 1-4: Generic category ("leadership", "growth", "culture")
   - 5-7: Somewhat specific direction but not actionable
   - 8-10: Specific claim you could argue for/against
2. INSIGHT_ALIGNMENT (1-10): Do the insights ACTUALLY support the claimed theme?
   - 1-4: Insights are loosely related at best, shoehorned in
   - 5-7: Insights touch on the theme but don't strongly support it
   - 8-10: Each insight clearly demonstrates the thread's claim
3. NOVELTY (1-10): Would a well-read person already know this?
   - 1-4: Common knowledge dressed up
   - 5-7: Interesting angle on a known topic
   - 8-10: Genuinely surprising connection
4. ACTIONABILITY (1-10): Can someone DO something with this?
   - 1-4: Just an observation
   - 5-7: Directionally useful
   - 8-10: Clear, specific actions

## Respond with JSON only:
{{
  "theme_specificity": N,
  "insight_alignment": N,
  "novelty": N,
  "actionability": N,
  "average_score": N.N,
  "verdict": "STRONG" | "MODERATE" | "WEAK" | "REJECT",
  "one_line_summary": "What this thread actually offers in plain language"
}}"""
@app.cls(
    gpu="A10G",
    image=vllm_image,
    volumes={"/model-cache": model_volume},
    timeout=600,
    scaledown_window=300,
)
class ThreadChecker:
    @modal.enter()
    def load_model(self):
        """Load the model once per container start; reused across calls."""
        from vllm import LLM, SamplingParams

        self.llm = LLM(
            model=MODEL_ID,
            download_dir="/model-cache",
            trust_remote_code=True,
            max_model_len=4096,
            gpu_memory_utilization=0.9,
        )
        # Low temperature keeps the scoring relatively consistent across runs.
        self.sampling_params = SamplingParams(
            temperature=0.3,
            max_tokens=500,
        )
    @modal.method()
    def check_thread(self, thread_name: str, core_claim: str, insights: list[str]) -> dict:
        """Check quality of a single thread."""
        import re

        insights_text = "\n".join(f"{i+1}. {ins}" for i, ins in enumerate(insights))
        prompt = CHECKER_PROMPT.format(
            thread_name=thread_name,
            core_claim=core_claim,
            insights=insights_text,
        )
        outputs = self.llm.generate([prompt], self.sampling_params)
        response = outputs[0].outputs[0].text.strip()
        # Extract the first brace-delimited block; the expected JSON is flat
        # (no nested objects), so [^{}]* is sufficient.
        try:
            json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
            if json_match:
                result = json.loads(json_match.group())
            else:
                result = {"error": "No JSON found", "raw": response}
        except json.JSONDecodeError:
            result = {"error": "JSON parse failed", "raw": response}
        result["thread_name"] = thread_name
        return result
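# A successful result, assuming the model honors the prompt's schema, looks like:
#   {"theme_specificity": 7, "insight_alignment": 6, "novelty": 5,
#    "actionability": 4, "average_score": 5.5, "verdict": "MODERATE",
#    "one_line_summary": "...", "thread_name": "..."}
# Parse failures carry "error" and "raw" keys instead, which is why the
# entrypoint below filters on the presence of score fields.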
@app.local_entrypoint()
def main(input: str, output: str = None, min_episodes: int = 3):
    """Check quality of all clear threads."""
    print(f"Loading threads from {input}...")
    with open(input, 'r', encoding='utf-8') as f:
        data = json.load(f)
    threads = data['threads']

    # Keep only named threads, and only those spanning at least min_episodes
    # episodes - a thread drawn from a single source isn't an "invisible" one.
    clear_threads = []
    skipped_single_source = 0
    for name, t in threads.items():
        if not t.get('thread_name') or t['thread_name'] == 'NO_CLEAR_THREAD':
            continue
        num_episodes = t.get('num_episodes', len(t.get('episodes', [])))
        if num_episodes < min_episodes:
            skipped_single_source += 1
            print(f" SKIP: {t['thread_name'][:40]} (only {num_episodes} episode(s))")
            continue
        clear_threads.append((name, t))

    print(f"Found {len(clear_threads)} clear threads to check (>={min_episodes} episodes)")
    if skipped_single_source > 0:
        print(f" Skipped {skipped_single_source} threads with fewer than {min_episodes} episodes")

    # Prepare inputs (cap at 10 insights per thread to stay within the
    # 4096-token context window).
    thread_inputs = []
    for name, t in clear_threads:
        insights = [i['insight_text'] for i in t['insights'][:10]]
        thread_inputs.append((t['thread_name'], t.get('core_claim', ''), insights))

    # Fan the checks out across containers; starmap yields results in input
    # order, so zipping against thread_inputs is safe.
    checker = ThreadChecker()
    import time
    start = time.time()
    results = []
    for (thread_name, core_claim, insights), result in zip(
        thread_inputs,
        checker.check_thread.starmap(thread_inputs),
    ):
        results.append(result)
        verdict = result.get('verdict', 'UNKNOWN')
        avg = result.get('average_score', 0)
        print(f" {thread_name[:40]:40} → {verdict} (avg: {avg})")
    elapsed = time.time() - start
    print(f"\nChecking complete in {elapsed:.1f}s")

    # Summary stats
    verdicts = [r.get('verdict', 'UNKNOWN') for r in results]
    print(f"\n{'='*60}")
    print("QUALITY SUMMARY")
    print(f"{'='*60}")
    print(f"STRONG: {verdicts.count('STRONG')}")
    print(f"MODERATE: {verdicts.count('MODERATE')}")
    print(f"WEAK: {verdicts.count('WEAK')}")
    print(f"REJECT: {verdicts.count('REJECT')}")

    # Average scores (only over results that parsed successfully)
    valid_results = [r for r in results if 'novelty' in r]
    if valid_results:
        avg_novelty = sum(r['novelty'] for r in valid_results) / len(valid_results)
        avg_specificity = sum(r['theme_specificity'] for r in valid_results) / len(valid_results)
        avg_alignment = sum(r['insight_alignment'] for r in valid_results) / len(valid_results)
        avg_action = sum(r['actionability'] for r in valid_results) / len(valid_results)
        print(f"\nAverage scores across {len(valid_results)} threads:")
        print(f" Theme Specificity: {avg_specificity:.1f}/10")
        print(f" Insight Alignment: {avg_alignment:.1f}/10")
        print(f" Novelty: {avg_novelty:.1f}/10")
        print(f" Actionability: {avg_action:.1f}/10")

    # Save results next to the input file by default
    if output is None:
        output = input.replace("named_threads_", "quality_check_")
    output_data = {
        'metadata': {
            'timestamp': datetime.now().isoformat(),
            'input_file': input,
            'threads_checked': len(results),
            'min_episodes_filter': min_episodes,
            'skipped_single_source': skipped_single_source,
        },
        'summary': {
            'strong': verdicts.count('STRONG'),
            'moderate': verdicts.count('MODERATE'),
            'weak': verdicts.count('WEAK'),
            'reject': verdicts.count('REJECT'),
        },
        'results': results,
    }
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {output}")

    # Show best threads
    if valid_results:
        print(f"\n{'='*60}")
        print("TOP THREADS (by average score)")
        print(f"{'='*60}")
        sorted_results = sorted(valid_results, key=lambda r: r.get('average_score', 0), reverse=True)
        for r in sorted_results[:5]:
            print(f"\n{r['thread_name']}")
            print(f" Scores: Spec={r['theme_specificity']} Align={r['insight_alignment']} Nov={r['novelty']} Act={r['actionability']}")
            print(f" Summary: {r.get('one_line_summary', 'N/A')}")