trank/summarize_posts.py at main · openrankprotocol/trank

284 lines (257 loc) · 10.5 KB
#!/usr/bin/env python3
import psycopg2
import json
import time
import logging
from typing import List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from openai import OpenAI
logger = logging.getLogger(__name__)
model_name = "gpt-4.1-mini"
summarization_instructions = """
You are an expert analyst with a warm tone. You specialize in extracting high-signal insights from recent forum-style content, where ideas can be messy, fast-moving, and conversational. Your goal is to distill the essential themes into a summary.
Given the set of messages, produce a JSON object containing:
   → A concise thematic synthesis (maximum 3 sentences) written in a professional but easily digestible and simple form.
   → The summary must begin with a short introductory phrase. Here are some examples for inspiration but ***ENSURE YOU NEVER USE THEM VERBATIM***:
        - "Recent discussions centered on..."
        - "The community is currently exploring..."
        - "A key theme emerging lately involves..."
        - "Much of the recent dialogue highlights..."
        - "There is growing interest regarding..."
   → Focus entirely on the ideas:
        - key motivations, questions, or pain points
        - notable nuances, tradeoffs, or emerging themes
        - relationships or tensions between subtopics
   → Keep it approachable but still thoughtful and insight-rich.
2. "few_words"
   → 3–7 punchy keywords or short phrases that capture the core ideas.
   → Avoid generic filler unless essential.
3. "one_sentence"
   → One friendly, clear sentence that expresses the core insight.
Requirements:
- Maintain a warm, casual, friendly tone while keeping the ideas sharp and useful.
- Capture not just what is being explored, but why it matters and what tensions or possibilities exist.
- Include references to people, projects, or mechanisms only when relevant to the ideas themselves.
- Write cleanly and simply; aim for clarity and usefulness over formality.
Return only the JSON object, nothing else.
def fetch_all_channel_ids(db_url: str) -> List[int]:
    conn = psycopg2.connect(db_url)
        with conn.cursor() as cur:
            cur.execute("SELECT DISTINCT channel_id FROM trank.messages;")
            rows = cur.fetchall()
            return [r[0] for r in rows]
    finally:
        conn.close()
def get_top_messages(db_url: str, channel_id: int, limit: int) -> List[tuple]:
    conn = psycopg2.connect(db_url)
        with conn.cursor() as cur:
            query = """
                WITH latest_messages AS (
                    SELECT
                        channel_id,
                        from_id,
                        message
                    FROM trank.messages
                    WHERE channel_id = %s
                      AND message IS NOT NULL
                    ORDER BY date DESC, id DESC
                interaction AS (
                    SELECT
                        parent.channel_id,
                        parent.id AS message_id,
                        m.from_id AS user_id,
                        'reply' AS interaction_type
                    FROM trank.messages m
                    JOIN latest_messages parent
                      ON parent.channel_id = m.channel_id
                     AND m.reply_to_msg_id = parent.id
                    UNION ALL
                    SELECT
                        r.channel_id,
                        r.message_id,
                        r.user_id,
                        'reaction' AS interaction_type
                    FROM trank.message_reactions r
                    JOIN latest_messages lm
                      ON lm.channel_id = r.channel_id
                     AND lm.id = r.message_id
                interaction_scores AS (
                    SELECT
                        i.message_id,
                        i.interaction_type,
                        s.user_id,
                        s.value AS user_score
                    FROM interaction i
                    JOIN trank.scores s
                      ON s.channel_id = i.channel_id
                     AND s.user_id = i.user_id
                weighted AS (
                    SELECT
                        message_id,
                            user_score *
                            CASE interaction_type
                                WHEN 'reply' THEN 10
                                WHEN 'reaction' THEN 20
                        ) AS score
                    FROM interaction_scores
                    GROUP BY message_id
                SELECT
                    lm.id,
                    lm.channel_id,
                    lm.date,
                    lm.from_id,
                    lm.message,
                    COALESCE(w.score, 0) AS score
                FROM latest_messages lm
                LEFT JOIN weighted w
                  ON w.message_id = lm.id
                ORDER BY score DESC
                LIMIT %s
            cur.execute(query, (channel_id, limit))
            return cur.fetchall()
    finally:
        conn.close()
def summarize_with_openai(messages: List[str], client: OpenAI, max_retries: int = 3, base_delay: float = 1.0) -> Dict[str, Any]:
    valid = [m for m in messages if m and len(m.strip()) > 5]
    if not valid:
        return None
    payload = json.dumps(valid, ensure_ascii=False)
    prompt = "Conversation:\n" + payload
    last_error = None
    for attempt in range(max_retries):
        try:
            resp = client.responses.create(
                model=model_name,
                input=prompt,
                temperature=0.6,
                instructions=summarization_instructions,
                text={
                    "format": {
                        "type": "json_schema",
                        "name": "channel_summary",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "topic": {"type": "string"},
                                "few_words": {"type": "string"},
                                "one_sentence": {"type": "string"}
                            "required": ["topic", "few_words", "one_sentence"],
                            "additionalProperties": False
                        "strict": True
            return json.loads(resp.output_text.strip())
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(base_delay * (2**attempt))
    return {
        "topic": None,
        "few_words": None,
        "one_sentence": None,
        "error": f"Failed after {max_retries} attempts: {last_error}"
def process_channel(db_url: str, channel_id: int, limit: int, client: OpenAI, max_retries: int = 2) -> Dict[str, Any]:
    last_error = None
    for attempt in range(max_retries):
        try:
            rows = get_top_messages(db_url, channel_id, limit)
            msgs = [r[4] for r in rows if r[4]]
            summary = summarize_with_openai(msgs, client)
            return {"channel": channel_id, "summary": summary}
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(1.0 * (2**attempt))
    return {"channel": channel_id, "error": str(last_error)}
def save_summaries(db_url: str, results: List[Dict[str, Any]], limit: int, model: str):
    conn = psycopg2.connect(db_url)
        with conn:
            with conn.cursor() as cur:
                for item in results:
                    s = item.get("summary")
                    if not s:
                        continue
                    cur.execute(
                        INSERT INTO trank.channel_summaries (
                            channel_id,
                            summary,
                            few_words,
                            one_sentence,
                        VALUES (%s, %s::jsonb, %s, %s, %s, %s, %s)
                        ON CONFLICT (channel_id) DO UPDATE SET
                            summary     = EXCLUDED.summary,
                            topic       = EXCLUDED.topic,
                            few_words   = EXCLUDED.few_words,
                            one_sentence= EXCLUDED.one_sentence,
                            error       = EXCLUDED.error,
                            model       = EXCLUDED.model,
                            created_at  = NOW();
                            str(item["channel"]),
                            json.dumps(s, ensure_ascii=False),
                            s.get("topic"),
                            s.get("few_words"),
                            s.get("one_sentence"),
                            s.get("error"),
    finally:
        conn.close()
def process_channels_concurrently(db_url: str, channel_ids: List[int], limit: int, client: OpenAI, max_workers: int = 10) -> List[Dict[str, Any]]:
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        fut = {
            executor.submit(process_channel, db_url, cid, limit, client): cid
            for cid in channel_ids
        for f in as_completed(fut):
            try:
                results.append(f.result())
            except Exception as e:
                results.append({"channel": fut[f], "error": str(e)})
    save_summaries(db_url, results, limit, model_name)
    return results
def main() -> None:
    logging.basicConfig(
        level=os.getenv("LOG_LEVEL", "INFO"),
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    url = os.getenv("DATABASE_URL")
    if not url:
        raise RuntimeError("DATABASE_URL environment variable is required")
    limit = 50
    max_workers = 5
    client = OpenAI()
    channel_ids = fetch_all_channel_ids(url)
    logger.info(f"Processing summaries for: {channel_ids}")
    if channel_ids:
        process_channels_concurrently(url, channel_ids, limit, client, max_workers)
if __name__ == "__main__":
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

summarize_posts.py

Latest commit

History

summarize_posts.py

File metadata and controls