-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfix_pg_threads.py
More file actions
213 lines (167 loc) · 6.58 KB
/
fix_pg_threads.py
File metadata and controls
213 lines (167 loc) · 6.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
Fix Paul Graham threads that have NO_CLEAR_THREAD or missing names/descriptions.
Usage:
modal run insights_first/fix_pg_threads.py
"""
import modal
import json
from datetime import datetime
app = modal.App("pg-thread-fixer")
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
model_volume = modal.Volume.from_name("qwen-model-cache", create_if_missing=True)
vllm_image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install("vllm>=0.6.0", "torch", "transformers", "huggingface_hub")
)
# Prompt to generate name and description from insights.
# Placeholders filled by ThreadFixer.fix_thread: {topics}, {num_essays}, {insights}.
# The model is expected to answer with THREAD_NAME: / CORE_CLAIM: / WHY_CONNECTED:
# lines, which fix_thread parses back out.
NAMING_PROMPT = """You are analyzing an "invisible thread" - a connection between insights from Paul Graham's essays that share a common underlying idea or principle.
## Topic hints from clustering:
{topics}
## The insights in this thread (from {num_essays} different essays):
{insights}
## Your task:
Generate a clear, specific name and description for this thread.
## Rules:
1. NO grandiose or fluffy language ("Unlocking", "Mastering", "The Power of", "Strategic Excellence")
2. Name should be LITERAL - 3-7 words describing the actual shared idea
3. Core claim should be the specific principle these insights share (15-25 words)
4. Connection should explain in 10 words or less what links them
GOOD thread names (specific, concrete):
- "Writing generates ideas, not just records them"
- "Startups fail from overextension"
- "Ignorance enables innovation"
BAD thread names (vague, fluffy):
- "Strategic Thinking in Business"
- "Keys to Success"
- "NO_CLEAR_THREAD"
## Respond with:
THREAD_NAME: [3-7 words, specific and concrete]
CORE_CLAIM: [15-25 words explaining the shared principle]
WHY_CONNECTED: [10 words or less on what links these insights]"""
@app.cls(
    gpu="A10G",
    image=vllm_image,
    volumes={"/model-cache": model_volume},
    timeout=600,
    scaledown_window=300,
)
class ThreadFixer:
    """Modal service that regenerates a name/claim/connection for a thread."""

    @modal.enter()
    def load_model(self):
        """Load the vLLM engine once per container start (weights cached on volume)."""
        from vllm import LLM, SamplingParams

        self.llm = LLM(
            model=MODEL_ID,
            download_dir="/model-cache",  # persisted via the mounted volume
            trust_remote_code=True,
            max_model_len=4096,
            gpu_memory_utilization=0.9,
        )
        # Moderate temperature: some variety, but names should stay literal.
        self.sampling_params = SamplingParams(
            temperature=0.5,
            max_tokens=300,
        )

    @modal.method()
    def fix_thread(self, thread_key: str, topics: list, insights: list, num_essays: int) -> dict:
        """Generate proper name and description for a thread.

        Args:
            thread_key: Identifier of the thread being fixed (echoed back).
            topics: Clustering topic hints; only the first 4 are used.
            insights: Insight dicts with 'essay_title'/'insight_text' keys.
            num_essays: Number of distinct essays the insights came from.

        Returns:
            Dict with thread_key, thread_name, core_claim, why_connected and
            the raw model response (kept for debugging bad generations).
        """
        # Format insights (limit to 8 to avoid context overflow); join once
        # instead of quadratic string += in the loop.
        parts = []
        for i, ins in enumerate(insights[:8], 1):
            essay = ins.get('essay_title', 'Unknown Essay')[:50]
            text = ins.get('insight_text', '')[:200]
            parts.append(f"\nInsight {i} (from \"{essay}\"):\n\"{text}\"\n")
        insights_text = "".join(parts)

        topics_text = ", ".join(topics[:4]) if topics else "No topic hints"
        prompt = NAMING_PROMPT.format(
            topics=topics_text,
            insights=insights_text,
            num_essays=num_essays
        )
        outputs = self.llm.generate([prompt], self.sampling_params)
        response = outputs[0].outputs[0].text.strip()

        # Parse the labeled response lines. Use len(label) rather than the
        # original hard-coded slice offsets (12/11/14), which silently break
        # if a label ever changes. Case-insensitive; last occurrence wins.
        fields = {"THREAD_NAME:": "", "CORE_CLAIM:": "", "WHY_CONNECTED:": ""}
        for line in response.split("\n"):
            line = line.strip()
            upper = line.upper()
            for label in fields:
                if upper.startswith(label):
                    fields[label] = line[len(label):].strip()
                    break
        return {
            "thread_key": thread_key,
            "thread_name": fields["THREAD_NAME:"],
            "core_claim": fields["CORE_CLAIM:"],
            "why_connected": fields["WHY_CONNECTED:"],
            "raw_response": response,
        }
@app.local_entrypoint()
def main(
    input_file: str = "insights_first/data/named_threads_v2_20260123_061212.json",
    output_file: str | None = None,
):
    """Fix threads with NO_CLEAR_THREAD or missing names.

    Args:
        input_file: JSON produced by the naming pipeline; must contain a
            'threads' dict and a 'metadata' dict.
        output_file: Destination path; defaults to overwriting input_file.
    """
    import time

    # Load threads
    print(f"Loading threads from {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    threads = data['threads']

    # Find threads whose name/claim shows one of the known failure modes.
    threads_to_fix = []
    for key, t in threads.items():
        thread_name = t.get('thread_name', '')
        core_claim = t.get('core_claim', '')
        # Check for bad formatting
        has_underscore = '_' in thread_name
        has_allcaps = any(word.isupper() and len(word) > 3 for word in thread_name.split())
        needs_fix = (
            thread_name == 'NO_CLEAR_THREAD' or
            not thread_name or
            thread_name.endswith(' of') or  # Truncated names
            'too diverse' in core_claim.lower() or
            has_underscore or  # snake_case names
            has_allcaps  # ALL_CAPS names
        )
        if needs_fix:
            # Tuple order must match ThreadFixer.fix_thread's parameters
            # because starmap unpacks positionally.
            threads_to_fix.append((
                key,
                t.get('representative_topics', []),
                t.get('insights', []),
                t.get('num_episodes', 1)
            ))
    print(f"Found {len(threads_to_fix)} threads needing fixes")
    if not threads_to_fix:
        print("No threads to fix!")
        return

    # Fan the fixes out to remote GPU workers.
    fixer = ThreadFixer()
    start = time.time()
    results = list(fixer.fix_thread.starmap(threads_to_fix))
    elapsed = time.time() - start
    print(f"\nGenerated {len(results)} fixes in {elapsed:.1f}s")

    # Apply fixes onto the in-memory thread dict, logging old vs new.
    for r in results:
        key = r['thread_key']
        if key in threads:
            old_name = threads[key].get('thread_name', '')
            threads[key]['thread_name'] = r['thread_name']
            threads[key]['core_claim'] = r['core_claim']
            threads[key]['why_connected'] = r['why_connected']
            print(f"\n--- {key} ---")
            print(f"OLD: \"{old_name}\"")
            print(f"NEW: \"{r['thread_name']}\"")
            print(f"Core: {r['core_claim']}")
            print(f"Why: {r['why_connected']}")

    # Update metadata
    data['metadata']['fixed_threads'] = datetime.now().isoformat()
    data['metadata']['threads_fixed'] = len(results)

    # Save
    if output_file is None:
        output_file = input_file  # Overwrite
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {output_file}")


if __name__ == "__main__":
    main()