-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreate_clean_threads_v2.py
More file actions
124 lines (105 loc) · 4.69 KB
/
create_clean_threads_v2.py
File metadata and controls
124 lines (105 loc) · 4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""Create cleaned thread export - remove NO_CLEAR_THREAD and deduplicate."""
import json
from datetime import datetime
from collections import defaultdict
with open('insights_first/data/named_threads_v2_20260122_234458.json', 'r', encoding='utf-8') as f:
data = json.load(f)
print("=" * 70)
print("CREATING CLEAN THREAD EXPORT")
print("=" * 70)
threads = list(data['threads'].values())
clean_threads = []
skipped = {'no_clear': 0, 'invalid_name': 0, 'too_small': 0}
def _quality(ins):
    """Ranking key for an insight: combined novelty + specificity score.

    Used both to pick the best insight per episode during dedup and to
    order the surviving insights within a thread. (The original computed
    this sum inline in two places.)
    """
    return ins.get('novelty_score', 0) + ins.get('specificity_score', 0)


for t in threads:
    thread_name = t.get('thread_name', 'UNNAMED')

    # Skip NO_CLEAR_THREAD and invalid names.
    if 'NO_CLEAR_THREAD' in thread_name.upper():
        skipped['no_clear'] += 1
        print(f"SKIP (NO_CLEAR): Thread {t['thread_id']} - {t['size']} insights, {t['num_episodes']} episodes")
        continue
    if len(thread_name) < 3 or thread_name in ['OR', '', ' ']:
        skipped['invalid_name'] += 1
        print(f"SKIP (INVALID_NAME): Thread {t['thread_id']} - \"{thread_name}\"")
        continue

    # Group this thread's insights by source episode (document).
    insights_by_episode = defaultdict(list)
    for ins in t['insights']:
        insights_by_episode[ins['document_id']].append(ins)

    # Deduplicate: keep only the highest-quality insight per episode.
    deduped_insights = []
    for doc_id, ins_list in insights_by_episode.items():
        if len(ins_list) == 1:
            deduped_insights.append(ins_list[0])
        else:
            deduped_insights.append(max(ins_list, key=_quality))
            print(f" DEDUP: {thread_name} - kept 1/{len(ins_list)} from {doc_id}")

    # Episode set computed once (the original recomputed it twice).
    # Sorted so the exported 'episodes' list is deterministic across runs;
    # bare set() order varies with string-hash randomization.
    episode_ids = sorted(set(ins['document_id'] for ins in deduped_insights))
    unique_episodes = len(episode_ids)

    # After dedup, a thread must still have >=3 insights across >=3 episodes.
    if len(deduped_insights) < 3 or unique_episodes < 3:
        skipped['too_small'] += 1
        print(f"SKIP (TOO_SMALL): {thread_name} - {len(deduped_insights)} insights, {unique_episodes} episodes after dedup")
        continue

    # Rebuild the thread record with re-numbered sequential ids.
    clean_thread = {
        'thread_id': len(clean_threads),
        'thread_name': thread_name,
        'size': len(deduped_insights),
        'num_episodes': unique_episodes,
        'episodes': episode_ids,
        'category': t.get('category', 'other'),
        'coherence': t.get('coherence', 0),
        'core_claim': t.get('core_claim', ''),
        'why_connected': t.get('why_connected', ''),
        'representative_topics': t.get('representative_topics', []),
        'insights': sorted(deduped_insights, key=_quality, reverse=True),
    }
    clean_threads.append(clean_thread)
    print(f"KEEP: {thread_name} - {clean_thread['size']} insights, {clean_thread['num_episodes']} episodes")
# Rank threads by breadth first (episode count), then by size.
clean_threads.sort(key=lambda th: (th['num_episodes'], th['size']), reverse=True)

sep = '=' * 70
print(f"\n{sep}")
print("SUMMARY")
print(sep)
print(f"Original threads: {len(threads)}")
print(f" Skipped NO_CLEAR_THREAD: {skipped['no_clear']}")
print(f" Skipped invalid names: {skipped['invalid_name']}")
print(f" Skipped too small after dedup: {skipped['too_small']}")
print(f"Clean threads: {len(clean_threads)}")

total_insights = sum(th['size'] for th in clean_threads)
print(f"\nTotal insights in clean threads: {total_insights}")
print(f"Coverage: {total_insights}/465 ({total_insights/465*100:.0f}%)")

# Per-thread listing of the surviving threads.
print(f"\n{sep}")
print("CLEAN THREADS")
print(sep)
for th in clean_threads:
    print(f"\n{th['thread_name']}")
    print(f" {th['size']} insights, {th['num_episodes']} episodes, [{th['category']}]")
    print(f" Coherence: {th['coherence']:.2f}")
# Assemble the export payload. Dict insertion order is preserved by
# json.dump, so 'metadata' stays first in the written file.
metadata = {
    'timestamp': datetime.now().isoformat(),
    'source': 'named_threads_v2_20260122_234458.json',
    'approach': 'topic-based with deduplication',
    'filters_applied': [
        'Removed NO_CLEAR_THREAD',
        'Removed invalid names',
        'Deduplicated insights per episode (kept highest quality)',
        'Filtered threads with <3 insights or <3 episodes after dedup'
    ],
    'threads_found': len(clean_threads),
    'total_insights': total_insights,
    'coverage_pct': round(total_insights/465*100, 1),
}
thread_map = {f"Thread {t['thread_id']}": t for t in clean_threads}
output = {'metadata': metadata, 'threads': thread_map}

output_file = 'insights_first/data/threads_v2_clean.json'
with open(output_file, 'w', encoding='utf-8') as out:
    json.dump(output, out, indent=2, ensure_ascii=False)
print(f"\nSaved to: {output_file}")