-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsummarize_posts.py
More file actions
284 lines (257 loc) · 10.5 KB
/
summarize_posts.py
File metadata and controls
284 lines (257 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/env python3
import os
import psycopg2
import json
import time
import logging
from typing import List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from openai import OpenAI
logger = logging.getLogger(__name__)
model_name = "gpt-4.1-mini"
summarization_instructions = """
You are an expert analyst with a warm tone. You specialize in extracting high-signal insights from recent forum-style content, where ideas can be messy, fast-moving, and conversational. Your goal is to distill the essential themes into a summary.
Given the set of messages, produce a JSON object containing:
1. "topic"
→ A concise thematic synthesis (maximum 3 sentences) written in a professional but easily digestible and simple form.
→ The summary must begin with a short introductory phrase. Here are some examples for inspiration but ***ENSURE YOU NEVER USE THEM VERBATIM***:
- "Recent discussions centered on..."
- "The community is currently exploring..."
- "A key theme emerging lately involves..."
- "Much of the recent dialogue highlights..."
- "There is growing interest regarding..."
→ Focus entirely on the ideas:
- key motivations, questions, or pain points
- notable nuances, tradeoffs, or emerging themes
- relationships or tensions between subtopics
→ Keep it approachable but still thoughtful and insight-rich.
2. "few_words"
→ 3–7 punchy keywords or short phrases that capture the core ideas.
→ Avoid generic filler unless essential.
3. "one_sentence"
→ One friendly, clear sentence that expresses the core insight.
Requirements:
- Maintain a warm, casual, friendly tone while keeping the ideas sharp and useful.
- Capture not just what is being explored, but why it matters and what tensions or possibilities exist.
- Include references to people, projects, or mechanisms only when relevant to the ideas themselves.
- Write cleanly and simply; aim for clarity and usefulness over formality.
Return only the JSON object, nothing else.
"""
def fetch_all_channel_ids(db_url: str) -> List[int]:
conn = psycopg2.connect(db_url)
try:
with conn.cursor() as cur:
cur.execute("SELECT DISTINCT channel_id FROM trank.messages;")
rows = cur.fetchall()
return [r[0] for r in rows]
finally:
conn.close()
def get_top_messages(db_url: str, channel_id: int, limit: int) -> List[tuple]:
conn = psycopg2.connect(db_url)
try:
with conn.cursor() as cur:
query = """
WITH latest_messages AS (
SELECT
id,
channel_id,
date,
from_id,
message
FROM trank.messages
WHERE channel_id = %s
AND message IS NOT NULL
ORDER BY date DESC, id DESC
LIMIT 1000
),
interaction AS (
SELECT
parent.channel_id,
parent.id AS message_id,
m.from_id AS user_id,
'reply' AS interaction_type
FROM trank.messages m
JOIN latest_messages parent
ON parent.channel_id = m.channel_id
AND m.reply_to_msg_id = parent.id
UNION ALL
SELECT
r.channel_id,
r.message_id,
r.user_id,
'reaction' AS interaction_type
FROM trank.message_reactions r
JOIN latest_messages lm
ON lm.channel_id = r.channel_id
AND lm.id = r.message_id
),
interaction_scores AS (
SELECT
i.message_id,
i.interaction_type,
s.user_id,
s.value AS user_score
FROM interaction i
JOIN trank.scores s
ON s.channel_id = i.channel_id
AND s.user_id = i.user_id
),
weighted AS (
SELECT
message_id,
SUM(
user_score *
CASE interaction_type
WHEN 'reply' THEN 10
WHEN 'reaction' THEN 20
END
) AS score
FROM interaction_scores
GROUP BY message_id
)
SELECT
lm.id,
lm.channel_id,
lm.date,
lm.from_id,
lm.message,
COALESCE(w.score, 0) AS score
FROM latest_messages lm
LEFT JOIN weighted w
ON w.message_id = lm.id
ORDER BY score DESC
LIMIT %s
"""
cur.execute(query, (channel_id, limit))
return cur.fetchall()
finally:
conn.close()
def summarize_with_openai(messages: List[str], client: OpenAI, max_retries: int = 3, base_delay: float = 1.0) -> Dict[str, Any]:
valid = [m for m in messages if m and len(m.strip()) > 5]
if not valid:
return None
payload = json.dumps(valid, ensure_ascii=False)
prompt = "Conversation:\n" + payload
last_error = None
for attempt in range(max_retries):
try:
resp = client.responses.create(
model=model_name,
input=prompt,
temperature=0.6,
instructions=summarization_instructions,
text={
"format": {
"type": "json_schema",
"name": "channel_summary",
"schema": {
"type": "object",
"properties": {
"topic": {"type": "string"},
"few_words": {"type": "string"},
"one_sentence": {"type": "string"}
},
"required": ["topic", "few_words", "one_sentence"],
"additionalProperties": False
},
"strict": True
}
},
)
return json.loads(resp.output_text.strip())
except Exception as e:
last_error = e
if attempt < max_retries - 1:
time.sleep(base_delay * (2**attempt))
return {
"topic": None,
"few_words": None,
"one_sentence": None,
"error": f"Failed after {max_retries} attempts: {last_error}"
}
def process_channel(db_url: str, channel_id: int, limit: int, client: OpenAI, max_retries: int = 2) -> Dict[str, Any]:
last_error = None
for attempt in range(max_retries):
try:
rows = get_top_messages(db_url, channel_id, limit)
msgs = [r[4] for r in rows if r[4]]
summary = summarize_with_openai(msgs, client)
return {"channel": channel_id, "summary": summary}
except Exception as e:
last_error = e
if attempt < max_retries - 1:
time.sleep(1.0 * (2**attempt))
return {"channel": channel_id, "error": str(last_error)}
def save_summaries(db_url: str, results: List[Dict[str, Any]], limit: int, model: str):
conn = psycopg2.connect(db_url)
try:
with conn:
with conn.cursor() as cur:
for item in results:
s = item.get("summary")
if not s:
continue
cur.execute(
"""
INSERT INTO trank.channel_summaries (
channel_id,
summary,
topic,
few_words,
one_sentence,
error,
model
)
VALUES (%s, %s::jsonb, %s, %s, %s, %s, %s)
ON CONFLICT (channel_id) DO UPDATE SET
summary = EXCLUDED.summary,
topic = EXCLUDED.topic,
few_words = EXCLUDED.few_words,
one_sentence= EXCLUDED.one_sentence,
error = EXCLUDED.error,
model = EXCLUDED.model,
created_at = NOW();
""",
(
str(item["channel"]),
json.dumps(s, ensure_ascii=False),
s.get("topic"),
s.get("few_words"),
s.get("one_sentence"),
s.get("error"),
model,
),
)
finally:
conn.close()
def process_channels_concurrently(db_url: str, channel_ids: List[int], limit: int, client: OpenAI, max_workers: int = 10) -> List[Dict[str, Any]]:
results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
fut = {
executor.submit(process_channel, db_url, cid, limit, client): cid
for cid in channel_ids
}
for f in as_completed(fut):
try:
results.append(f.result())
except Exception as e:
results.append({"channel": fut[f], "error": str(e)})
save_summaries(db_url, results, limit, model_name)
return results
def main() -> None:
logging.basicConfig(
level=os.getenv("LOG_LEVEL", "INFO"),
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
url = os.getenv("DATABASE_URL")
if not url:
raise RuntimeError("DATABASE_URL environment variable is required")
limit = 50
max_workers = 5
client = OpenAI()
channel_ids = fetch_all_channel_ids(url)
logger.info(f"Processing summaries for: {channel_ids}")
if channel_ids:
process_channels_concurrently(url, channel_ids, limit, client, max_workers)
if __name__ == "__main__":
main()