-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreate_final_export.py
More file actions
125 lines (111 loc) · 5.33 KB
/
create_final_export.py
File metadata and controls
125 lines (111 loc) · 5.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""Create final thread export with manually curated names and descriptions."""
import json
from datetime import datetime
# Read the cleaned thread JSON into memory; everything below works off `data`.
with open('insights_first/data/threads_v2_clean.json', 'r', encoding='utf-8') as source:
    data = json.loads(source.read())
# Hand-written curation table.
#
# Each key is compared against an input thread's `thread_name`; the value
# supplies the fields copied into the export:
#   name          - curated display title for the thread
#   core_claim    - one-sentence statement of the thread's thesis
#   why_connected - short rationale for grouping the insights together
THREAD_NAMES = {
    "FOCUS_ON_CUSTOMER_SUCCESS": {
        "name": "Treat customers as partners, not transactions",
        "core_claim": "Companies that prioritize genuine customer relationships and invest in customer success outperform those focused on short-term sales.",
        "why_connected": "All insights emphasize personal engagement and long-term customer value over transactional relationships",
    },
    "PROJECT_TIMELINE_RESTRICTIONS": {
        "name": "Small teams with hard deadlines ship better",
        "core_claim": "Constraining team size and imposing strict time limits forces prioritization and prevents scope creep.",
        "why_connected": "Each insight shows how constraints (time, team size, roles) improve outcomes",
    },
    "Effective Communication Across Contexts": {
        "name": "Communication quality determines career impact",
        "core_claim": "Your ability to communicate clearly—whether giving feedback, pitching ideas, or managing up—is the primary driver of your professional effectiveness.",
        "why_connected": "All insights connect communication skills to measurable career and organizational outcomes",
    },
    "BIG_IMPACT_GOALS": {
        "name": "Strategy is about what you won't do",
        "core_claim": "Effective product strategy requires ruthless prioritization—defining impact through what you explicitly choose not to pursue.",
        "why_connected": "Each insight emphasizes prioritization and saying no as the key to strategic clarity",
    },
    "FOCUS_ON_HIGH_IMPACT_FEATURES": {
        "name": "Fewer features, higher quality",
        "core_claim": "Building 5 excellent features beats building 15 mediocre ones—users value depth over breadth.",
        "why_connected": "All insights advocate for quality and focus over feature quantity",
    },
    "DATA_OVEREMPHASIS_AND_GUT_FEELING": {
        "name": "Data informs but judgment decides",
        "core_claim": "Over-reliance on quantitative data leads to poor decisions; the best leaders balance data with qualitative insight and informed intuition.",
        "why_connected": "Each insight challenges data-only decision making",
    },
    "Mission-driven product strategy": {
        "name": "Mission clarity drives hard tradeoffs",
        "core_claim": "When mission is clear, difficult product decisions become obvious—even when they mean leaving opportunities on the table.",
        "why_connected": "All insights show mission driving non-obvious strategic choices",
    },
    "Productivity Improvements vs. Constant Connectivity": {
        "name": "Real productivity requires disconnection",
        "core_claim": "Constant connectivity and smartphone dependence reduce rather than enhance meaningful output.",
        "why_connected": "All insights argue against always-on work culture",
    },
}
# Build the export list: one entry per input thread that has a curated name.
# Threads without a matching curation are reported and left out of the export.
threads = list(data['threads'].values())
final_threads = []
for t in threads:
    # `or ''` covers both a missing key and an explicit JSON null.
    old_name = t.get('thread_name') or ''

    # Find matching curation: exact key first, then a substring match in
    # either direction (input names may be longer or shorter than the key).
    # An empty name is never substring-matched: '' is contained in every
    # string, so it would otherwise silently pick up the first curation.
    curation = THREAD_NAMES.get(old_name)
    if curation is None and old_name:
        curation = next(
            (
                value
                for key, value in THREAD_NAMES.items()
                if key in old_name or old_name in key
            ),
            None,
        )

    if curation is None:
        print(f"WARNING: No curation found for: {old_name}")
        continue

    final_thread = {
        # Provisional id (insertion order); ids are reassigned after sorting.
        'thread_id': len(final_threads),
        # Curated text replaces the original thread name/description.
        'thread_name': curation['name'],
        'core_claim': curation['core_claim'],
        'why_connected': curation['why_connected'],
        # Required fields are copied through as-is; a missing one raises KeyError.
        'size': t['size'],
        'num_episodes': t['num_episodes'],
        'episodes': t['episodes'],
        # Optional fields fall back to neutral defaults.
        'category': t.get('category', 'other'),
        'coherence': t.get('coherence', 0),
        'insights': t['insights'],
    }
    final_threads.append(final_thread)
    print(f"✓ {old_name} → \"{curation['name']}\"")
# Order threads by how many episodes they span (most first); threads with the
# same episode count are ordered by number of insights, also descending.
final_threads.sort(
    key=lambda thread: (thread['num_episodes'], thread['size']),
    reverse=True,
)

# Reassign ids so that thread_id equals the thread's position after sorting.
for position, thread in enumerate(final_threads):
    thread['thread_id'] = position
# Total number of insights across all exported threads (also used in the
# export metadata further down).
total_insights = sum(t['size'] for t in final_threads)

# Human-readable summary of the final thread list.
print(f"\n{'='*70}")
print("FINAL THREADS")
print('='*70)
for t in final_threads:
    print(f"\n{t['thread_name']}")
    print(f" {t['size']} insights from {t['num_episodes']} episodes")
    # Show at most 80 characters of the claim; only add the ellipsis when
    # text was actually cut off, so short claims are not shown as truncated.
    claim = t['core_claim']
    preview = claim if len(claim) <= 80 else f"{claim[:80]}..."
    print(f" Core claim: {preview}")
# Write the curated threads plus run metadata to the final export file.

# Denominator for coverage_pct.
# NOTE(review): presumably the total number of insights in the source corpus;
# it is hard-coded, so confirm it still matches the input data.
CORPUS_INSIGHT_COUNT = 465

export_metadata = {
    'timestamp': datetime.now().isoformat(),
    'approach': 'topic-based threading with manual curation',
    'threads_found': len(final_threads),
    'total_insights': total_insights,
    'coverage_pct': round(total_insights/CORPUS_INSIGHT_COUNT*100, 1),
    # NOTE(review): these notes are static text; nothing in this script
    # checks them against the data being exported.
    'quality_notes': [
        'All insights have novelty >= 7/10',
        'All insights deduplicated (1 per episode per thread)',
        'Thread names manually curated for clarity',
        'All threads span 3+ different episodes',
    ],
}
output = {
    'metadata': export_metadata,
    'threads': final_threads,
}

output_file = 'insights_first/data/threads_final.json'
with open(output_file, 'w', encoding='utf-8') as sink:
    # ensure_ascii=False keeps non-ASCII characters (e.g. em-dashes) readable.
    json.dump(output, sink, indent=2, ensure_ascii=False)

separator = '=' * 70
print(f"\n{separator}")
print(f"Saved {len(final_threads)} threads with {total_insights} insights to:")
print(f" {output_file}")