-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreate_clean_threads_v2.py
More file actions
124 lines (105 loc) · 4.69 KB
/
create_clean_threads_v2.py
File metadata and controls
124 lines (105 loc) · 4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""Create cleaned thread export - remove NO_CLEAR_THREAD and deduplicate."""
import json
from datetime import datetime
from collections import defaultdict
with open('insights_first/data/named_threads_v2_20260122_234458.json', 'r', encoding='utf-8') as f:
data = json.load(f)
print("=" * 70)
print("CREATING CLEAN THREAD EXPORT")
print("=" * 70)
threads = list(data['threads'].values())
clean_threads = []
skipped = {'no_clear': 0, 'invalid_name': 0, 'too_small': 0}
def _quality(ins):
    """Ranking key for an insight: combined novelty + specificity score.

    Used both to pick the best insight per episode during dedup and to
    order the surviving insights within a thread. (The original computed
    this sum inline in two places.)
    """
    return ins.get('novelty_score', 0) + ins.get('specificity_score', 0)


for t in threads:
    thread_name = t.get('thread_name', 'UNNAMED')

    # Skip NO_CLEAR_THREAD and invalid names.
    if 'NO_CLEAR_THREAD' in thread_name.upper():
        skipped['no_clear'] += 1
        print(f"SKIP (NO_CLEAR): Thread {t['thread_id']} - {t['size']} insights, {t['num_episodes']} episodes")
        continue
    if len(thread_name) < 3 or thread_name in ['OR', '', ' ']:
        skipped['invalid_name'] += 1
        print(f"SKIP (INVALID_NAME): Thread {t['thread_id']} - \"{thread_name}\"")
        continue

    # Group this thread's insights by source episode (document).
    insights_by_episode = defaultdict(list)
    for ins in t['insights']:
        insights_by_episode[ins['document_id']].append(ins)

    # Deduplicate: keep only the highest-quality insight per episode.
    deduped_insights = []
    for doc_id, ins_list in insights_by_episode.items():
        if len(ins_list) == 1:
            deduped_insights.append(ins_list[0])
        else:
            deduped_insights.append(max(ins_list, key=_quality))
            print(f" DEDUP: {thread_name} - kept 1/{len(ins_list)} from {doc_id}")

    # Episode set computed once (the original recomputed it twice).
    # Sorted so the exported 'episodes' list is deterministic across runs;
    # bare set() order varies with string-hash randomization.
    episode_ids = sorted(set(ins['document_id'] for ins in deduped_insights))
    unique_episodes = len(episode_ids)

    # After dedup, a thread must still have >=3 insights across >=3 episodes.
    if len(deduped_insights) < 3 or unique_episodes < 3:
        skipped['too_small'] += 1
        print(f"SKIP (TOO_SMALL): {thread_name} - {len(deduped_insights)} insights, {unique_episodes} episodes after dedup")
        continue

    # Rebuild the thread record with re-numbered sequential ids.
    clean_thread = {
        'thread_id': len(clean_threads),
        'thread_name': thread_name,
        'size': len(deduped_insights),
        'num_episodes': unique_episodes,
        'episodes': episode_ids,
        'category': t.get('category', 'other'),
        'coherence': t.get('coherence', 0),
        'core_claim': t.get('core_claim', ''),
        'why_connected': t.get('why_connected', ''),
        'representative_topics': t.get('representative_topics', []),
        'insights': sorted(deduped_insights, key=_quality, reverse=True),
    }
    clean_threads.append(clean_thread)
    print(f"KEEP: {thread_name} - {clean_thread['size']} insights, {clean_thread['num_episodes']} episodes")
# Rank threads by breadth first (episode count), then by size.
clean_threads.sort(key=lambda th: (th['num_episodes'], th['size']), reverse=True)

sep = '=' * 70
print(f"\n{sep}")
print("SUMMARY")
print(sep)
print(f"Original threads: {len(threads)}")
print(f" Skipped NO_CLEAR_THREAD: {skipped['no_clear']}")
print(f" Skipped invalid names: {skipped['invalid_name']}")
print(f" Skipped too small after dedup: {skipped['too_small']}")
print(f"Clean threads: {len(clean_threads)}")

total_insights = sum(th['size'] for th in clean_threads)
print(f"\nTotal insights in clean threads: {total_insights}")
print(f"Coverage: {total_insights}/465 ({total_insights/465*100:.0f}%)")

# Per-thread listing of the surviving threads.
print(f"\n{sep}")
print("CLEAN THREADS")
print(sep)
for th in clean_threads:
    print(f"\n{th['thread_name']}")
    print(f" {th['size']} insights, {th['num_episodes']} episodes, [{th['category']}]")
    print(f" Coherence: {th['coherence']:.2f}")
# Assemble the export payload. Dict insertion order is preserved by
# json.dump, so 'metadata' stays first in the written file.
metadata = {
    'timestamp': datetime.now().isoformat(),
    'source': 'named_threads_v2_20260122_234458.json',
    'approach': 'topic-based with deduplication',
    'filters_applied': [
        'Removed NO_CLEAR_THREAD',
        'Removed invalid names',
        'Deduplicated insights per episode (kept highest quality)',
        'Filtered threads with <3 insights or <3 episodes after dedup'
    ],
    'threads_found': len(clean_threads),
    'total_insights': total_insights,
    'coverage_pct': round(total_insights/465*100, 1),
}
thread_map = {f"Thread {t['thread_id']}": t for t in clean_threads}
output = {'metadata': metadata, 'threads': thread_map}

output_file = 'insights_first/data/threads_v2_clean.json'
with open(output_file, 'w', encoding='utf-8') as out:
    json.dump(output, out, indent=2, ensure_ascii=False)
print(f"\nSaved to: {output_file}")