"""
Find natural insight threads using graph-based clustering.
This creates a similarity graph where insights are nodes and edges connect
insights that are semantically similar. Connected components become threads.
Usage:
python insights_first/find_threads.py --input insights_first/data/modal_extraction_20260120_024600.json
"""
import argparse
import json
from collections import defaultdict
from datetime import datetime

import networkx as nx
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def load_insights(input_file: str) -> list[dict]:
    """Load insights from extraction output."""
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    insights = [r['insight'] for r in data['results'] if r['has_insight'] and r['insight']]
    return insights
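
# Each element of data['results'] is assumed to look roughly like this
# (inferred from the field accesses in this script, not from a documented schema):
#   {"has_insight": true,
#    "insight": {"insight_text": ..., "episode_title": ...,
#                "novelty_score": ..., "specificity_score": ...}}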


def build_similarity_graph(embeddings: np.ndarray, threshold: float = 0.65) -> nx.Graph:
    """
    Build a graph where edges connect insights with similarity >= threshold.
    """
    print("Computing pairwise similarities...")
    sim_matrix = cosine_similarity(embeddings)

    print(f"Building graph (threshold={threshold})...")
    G = nx.Graph()
    n = len(embeddings)
    G.add_nodes_from(range(n))

    edge_count = 0
    for i in range(n):
        for j in range(i + 1, n):
            if sim_matrix[i, j] >= threshold:
                G.add_edge(i, j, weight=sim_matrix[i, j])
                edge_count += 1

    print(f"Graph: {n} nodes, {edge_count} edges")
    return G


def find_threads_components(G: nx.Graph, min_size: int = 3) -> list[set]:
    """
    Find threads as connected components with at least min_size nodes.
    """
    components = list(nx.connected_components(G))
    threads = [c for c in components if len(c) >= min_size]
    threads = sorted(threads, key=len, reverse=True)

    total_in_threads = sum(len(t) for t in threads)
    isolated = len([c for c in components if len(c) == 1])
    small_groups = len([c for c in components if 1 < len(c) < min_size])

    print(f"Found {len(threads)} threads (min_size={min_size})")
    print(f"  - {total_in_threads} insights in threads")
    print(f"  - {isolated} isolated insights")
    print(f"  - {small_groups} small groups (2-{min_size-1} insights)")
    return threads


def find_threads_louvain(G: nx.Graph, min_size: int = 3, resolution: float = 1.5) -> list[set]:
    """
    Find threads using Louvain community detection.

    This finds dense subgroups rather than just connected components.
    Higher resolution = smaller, tighter communities.
    """
    import community as community_louvain

    # Only consider nodes that have at least one edge
    nodes_with_edges = [n for n in G.nodes() if G.degree(n) > 0]
    subgraph = G.subgraph(nodes_with_edges)
    if len(subgraph) == 0:
        print("No connected nodes found")
        return []

    # Detect communities
    partition = community_louvain.best_partition(subgraph, resolution=resolution, random_state=42)
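    # Fixing random_state pins Louvain's randomized node ordering, so repeated
    # runs on the same graph return the same partition.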

    # Group nodes by community
    communities = defaultdict(set)
    for node, comm_id in partition.items():
        communities[comm_id].add(node)

    # Filter by min_size
    threads = [c for c in communities.values() if len(c) >= min_size]
    threads = sorted(threads, key=len, reverse=True)

    total_in_threads = sum(len(t) for t in threads)
    small = sum(1 for c in communities.values() if len(c) < min_size)
    isolated = len(G) - len(nodes_with_edges)

    print(f"Louvain found {len(communities)} communities, {len(threads)} meet min_size={min_size}")
    print(f"  - {total_in_threads} insights in threads")
    print(f"  - {isolated} isolated insights (no edges)")
    print(f"  - {small} small communities (<{min_size} insights)")
    return threads


def create_thread_summary(thread_indices: set, insights: list[dict], embeddings: np.ndarray) -> dict:
    """Create a summary for a thread."""
    thread_insights = [insights[i] for i in thread_indices]

    # Get unique episodes
    episodes = list(set(i['episode_title'] for i in thread_insights))

    # Calculate average scores
    avg_novelty = sum(i['novelty_score'] for i in thread_insights) / len(thread_insights)
    avg_specificity = sum(i['specificity_score'] for i in thread_insights) / len(thread_insights)

    # Sort by quality
    sorted_insights = sorted(
        thread_insights,
        key=lambda x: x['novelty_score'] + x['specificity_score'],
        reverse=True
    )

    # Calculate thread coherence (average pairwise similarity)
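    # A thread of k insights has k*(k-1)/2 unordered pairs; np.triu_indices(k, k=1)
    # selects exactly those entries, and coherence is their mean cosine similarity.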
    thread_embeddings = embeddings[list(thread_indices)]
    if len(thread_embeddings) > 1:
        sim_matrix = cosine_similarity(thread_embeddings)
        # Get upper triangle (excluding diagonal)
        upper_tri = sim_matrix[np.triu_indices(len(thread_embeddings), k=1)]
        coherence = float(np.mean(upper_tri))
    else:
        coherence = 1.0

    return {
        'size': len(thread_insights),
        'num_episodes': len(episodes),
        'episodes': episodes,
        'avg_novelty': round(avg_novelty, 1),
        'avg_specificity': round(avg_specificity, 1),
        'coherence': round(coherence, 3),
        'insights': sorted_insights,
        'insight_indices': list(thread_indices),
    }


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input JSON from extraction')
    parser.add_argument('--threshold', type=float, default=0.55, help='Similarity threshold for edges')
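    # This 0.55 default overrides build_similarity_graph's own 0.65 default;
    # main() always passes args.threshold, so the CLI value is what takes effect.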
    parser.add_argument('--min-size', type=int, default=3, help='Minimum thread size')
    parser.add_argument('--min-episodes', type=int, default=3, help='Minimum episodes per thread (filters single-source threads)')
    parser.add_argument('--method', choices=['components', 'louvain'], default='louvain', help='Thread detection method')
    parser.add_argument('--resolution', type=float, default=2.0, help='Louvain resolution (higher = smaller threads)')
    parser.add_argument('--output', default=None, help='Output file path')
    args = parser.parse_args()

    # Load insights
    print(f"Loading insights from {args.input}...")
    insights = load_insights(args.input)
    print(f"Loaded {len(insights)} insights")

    # Embed
    print("\nLoading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    texts = [i['insight_text'] for i in insights]
    print(f"Embedding {len(texts)} insights...")
    embeddings = model.encode(texts, show_progress_bar=True)
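    # encode() returns one 384-dimensional vector per insight for this model;
    # cosine_similarity normalizes internally, so no manual normalization is needed.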

    # Build graph and find threads
    print()
    G = build_similarity_graph(embeddings, threshold=args.threshold)
    if args.method == 'louvain':
        threads = find_threads_louvain(G, min_size=args.min_size, resolution=args.resolution)
    else:
        threads = find_threads_components(G, min_size=args.min_size)

    # Create thread summaries and filter by min_episodes
    thread_summaries = []
    skipped_single_source = 0
    for i, thread_indices in enumerate(threads):
        summary = create_thread_summary(thread_indices, insights, embeddings)
        summary['thread_id'] = i
        # Filter: must span multiple episodes to be a true "invisible thread"
        if summary['num_episodes'] < args.min_episodes:
            skipped_single_source += 1
            continue
        thread_summaries.append(summary)
    if skipped_single_source > 0:
        print(f"  - Skipped {skipped_single_source} threads with <{args.min_episodes} episodes (single-source)")

    # Collect unthreaded insights. Iterate over the kept summaries rather than
    # the raw threads, so insights from skipped single-source threads count as
    # unthreaded and the list matches the metadata counts below.
    threaded_indices = set()
    for t in thread_summaries:
        threaded_indices.update(t['insight_indices'])
    unthreaded_indices = set(range(len(insights))) - threaded_indices
    unthreaded_insights = [insights[i] for i in unthreaded_indices]

    # Print summary
    print("\n" + "=" * 70)
    print("THREADS DISCOVERED")
    print("=" * 70)
    for i, thread in enumerate(thread_summaries[:15]):  # Show top 15
        print(f"\n🧵 Thread {i+1}: {thread['size']} insights from {thread['num_episodes']} episodes")
        print(f"   Coherence: {thread['coherence']:.2f} | Avg Novelty: {thread['avg_novelty']} | Avg Spec: {thread['avg_specificity']}")
        print("   Sample insights:")
        for j, ins in enumerate(thread['insights'][:3], 1):
            print(f"     {j}. {ins['insight_text'][:90]}...")
        print(f"   Episodes: {', '.join(e[:30] + '...' for e in thread['episodes'][:3])}")
    if len(thread_summaries) > 15:
        print(f"\n... and {len(thread_summaries) - 15} more threads")

    print(f"\n📊 Summary: {len(thread_summaries)} threads, {len(unthreaded_insights)} unthreaded insights")

    # Save output
    if args.output is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        args.output = f"insights_first/data/threads_{timestamp}.json"

    # Calculate actual insights in valid threads
    valid_threaded_count = sum(t['size'] for t in thread_summaries)

    output_data = {
        'metadata': {
            'timestamp': datetime.now().isoformat(),
            'input_file': args.input,
            'total_insights': len(insights),
            'similarity_threshold': args.threshold,
            'min_thread_size': args.min_size,
            'min_episodes': args.min_episodes,
            'num_threads': len(thread_summaries),
            'skipped_single_source': skipped_single_source,
            'insights_in_threads': valid_threaded_count,
            'unthreaded_insights': len(insights) - valid_threaded_count,
        },
        'threads': thread_summaries,
        'unthreaded': unthreaded_insights,
    }
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {args.output}")


if __name__ == "__main__":
    main()