-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfix_pg_threads.py
More file actions
213 lines (167 loc) · 6.58 KB
/
fix_pg_threads.py
File metadata and controls
213 lines (167 loc) · 6.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
Fix Paul Graham threads that have NO_CLEAR_THREAD or missing names/descriptions.
Usage:
modal run insights_first/fix_pg_threads.py
"""
import modal
import json
from datetime import datetime
app = modal.App("pg-thread-fixer")
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
model_volume = modal.Volume.from_name("qwen-model-cache", create_if_missing=True)
vllm_image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install("vllm>=0.6.0", "torch", "transformers", "huggingface_hub")
)
# Prompt to generate name and description from insights.
# Placeholders filled by ThreadFixer.fix_thread: {topics}, {num_essays}, {insights}.
# The model is expected to answer with THREAD_NAME: / CORE_CLAIM: / WHY_CONNECTED:
# lines, which fix_thread parses back out.
NAMING_PROMPT = """You are analyzing an "invisible thread" - a connection between insights from Paul Graham's essays that share a common underlying idea or principle.
## Topic hints from clustering:
{topics}
## The insights in this thread (from {num_essays} different essays):
{insights}
## Your task:
Generate a clear, specific name and description for this thread.
## Rules:
1. NO grandiose or fluffy language ("Unlocking", "Mastering", "The Power of", "Strategic Excellence")
2. Name should be LITERAL - 3-7 words describing the actual shared idea
3. Core claim should be the specific principle these insights share (15-25 words)
4. Connection should explain in 10 words or less what links them
GOOD thread names (specific, concrete):
- "Writing generates ideas, not just records them"
- "Startups fail from overextension"
- "Ignorance enables innovation"
BAD thread names (vague, fluffy):
- "Strategic Thinking in Business"
- "Keys to Success"
- "NO_CLEAR_THREAD"
## Respond with:
THREAD_NAME: [3-7 words, specific and concrete]
CORE_CLAIM: [15-25 words explaining the shared principle]
WHY_CONNECTED: [10 words or less on what links these insights]"""
@app.cls(
    gpu="A10G",
    image=vllm_image,
    volumes={"/model-cache": model_volume},
    timeout=600,
    scaledown_window=300,
)
class ThreadFixer:
    """Modal service that regenerates a name/claim/connection for a thread."""

    @modal.enter()
    def load_model(self):
        """Load the vLLM engine once per container start (weights cached on volume)."""
        from vllm import LLM, SamplingParams

        self.llm = LLM(
            model=MODEL_ID,
            download_dir="/model-cache",  # persisted via the mounted volume
            trust_remote_code=True,
            max_model_len=4096,
            gpu_memory_utilization=0.9,
        )
        # Moderate temperature: some variety, but names should stay literal.
        self.sampling_params = SamplingParams(
            temperature=0.5,
            max_tokens=300,
        )

    @modal.method()
    def fix_thread(self, thread_key: str, topics: list, insights: list, num_essays: int) -> dict:
        """Generate proper name and description for a thread.

        Args:
            thread_key: Identifier of the thread being fixed (echoed back).
            topics: Clustering topic hints; only the first 4 are used.
            insights: Insight dicts with 'essay_title'/'insight_text' keys.
            num_essays: Number of distinct essays the insights came from.

        Returns:
            Dict with thread_key, thread_name, core_claim, why_connected and
            the raw model response (kept for debugging bad generations).
        """
        # Format insights (limit to 8 to avoid context overflow); join once
        # instead of quadratic string += in the loop.
        parts = []
        for i, ins in enumerate(insights[:8], 1):
            essay = ins.get('essay_title', 'Unknown Essay')[:50]
            text = ins.get('insight_text', '')[:200]
            parts.append(f"\nInsight {i} (from \"{essay}\"):\n\"{text}\"\n")
        insights_text = "".join(parts)

        topics_text = ", ".join(topics[:4]) if topics else "No topic hints"
        prompt = NAMING_PROMPT.format(
            topics=topics_text,
            insights=insights_text,
            num_essays=num_essays
        )
        outputs = self.llm.generate([prompt], self.sampling_params)
        response = outputs[0].outputs[0].text.strip()

        # Parse the labeled response lines. Use len(label) rather than the
        # original hard-coded slice offsets (12/11/14), which silently break
        # if a label ever changes. Case-insensitive; last occurrence wins.
        fields = {"THREAD_NAME:": "", "CORE_CLAIM:": "", "WHY_CONNECTED:": ""}
        for line in response.split("\n"):
            line = line.strip()
            upper = line.upper()
            for label in fields:
                if upper.startswith(label):
                    fields[label] = line[len(label):].strip()
                    break
        return {
            "thread_key": thread_key,
            "thread_name": fields["THREAD_NAME:"],
            "core_claim": fields["CORE_CLAIM:"],
            "why_connected": fields["WHY_CONNECTED:"],
            "raw_response": response,
        }
@app.local_entrypoint()
def main(
    input_file: str = "insights_first/data/named_threads_v2_20260123_061212.json",
    output_file: str | None = None,
):
    """Fix threads with NO_CLEAR_THREAD or missing names.

    Args:
        input_file: JSON produced by the naming pipeline; must contain a
            'threads' dict and a 'metadata' dict.
        output_file: Destination path; defaults to overwriting input_file.
    """
    import time

    # Load threads
    print(f"Loading threads from {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    threads = data['threads']

    # Find threads whose name/claim shows one of the known failure modes.
    threads_to_fix = []
    for key, t in threads.items():
        thread_name = t.get('thread_name', '')
        core_claim = t.get('core_claim', '')
        # Check for bad formatting
        has_underscore = '_' in thread_name
        has_allcaps = any(word.isupper() and len(word) > 3 for word in thread_name.split())
        needs_fix = (
            thread_name == 'NO_CLEAR_THREAD' or
            not thread_name or
            thread_name.endswith(' of') or  # Truncated names
            'too diverse' in core_claim.lower() or
            has_underscore or  # snake_case names
            has_allcaps  # ALL_CAPS names
        )
        if needs_fix:
            # Tuple order must match ThreadFixer.fix_thread's parameters
            # because starmap unpacks positionally.
            threads_to_fix.append((
                key,
                t.get('representative_topics', []),
                t.get('insights', []),
                t.get('num_episodes', 1)
            ))
    print(f"Found {len(threads_to_fix)} threads needing fixes")
    if not threads_to_fix:
        print("No threads to fix!")
        return

    # Fan the fixes out to remote GPU workers.
    fixer = ThreadFixer()
    start = time.time()
    results = list(fixer.fix_thread.starmap(threads_to_fix))
    elapsed = time.time() - start
    print(f"\nGenerated {len(results)} fixes in {elapsed:.1f}s")

    # Apply fixes onto the in-memory thread dict, logging old vs new.
    for r in results:
        key = r['thread_key']
        if key in threads:
            old_name = threads[key].get('thread_name', '')
            threads[key]['thread_name'] = r['thread_name']
            threads[key]['core_claim'] = r['core_claim']
            threads[key]['why_connected'] = r['why_connected']
            print(f"\n--- {key} ---")
            print(f"OLD: \"{old_name}\"")
            print(f"NEW: \"{r['thread_name']}\"")
            print(f"Core: {r['core_claim']}")
            print(f"Why: {r['why_connected']}")

    # Update metadata
    data['metadata']['fixed_threads'] = datetime.now().isoformat()
    data['metadata']['threads_fixed'] = len(results)

    # Save
    if output_file is None:
        output_file = input_file  # Overwrite
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {output_file}")


if __name__ == "__main__":
    main()