-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_seed.py
More file actions
329 lines (267 loc) · 11.4 KB
/
process_seed.py
File metadata and controls
329 lines (267 loc) · 11.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#!/usr/bin/env python3
"""
General purpose script to process seed score CSV files.
Loads a CSV file, identifies tier sections, assigns weighted scores, and creates a backup.
"""
import pandas as pd
import os
import sys
import argparse
from pathlib import Path
from datetime import datetime
import shutil
def create_backup(csv_path):
"""
Create a backup of the original CSV file with timestamp.
Args:
csv_path (Path): Path to the original CSV file
Returns:
Path: Path to the backup file
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = (
csv_path.parent / f"{csv_path.stem}_backup_{timestamp}{csv_path.suffix}"
)
try:
shutil.copy2(csv_path, backup_path)
print(f"🔄 Created backup: {backup_path}")
return backup_path
except Exception as e:
print(f"❌ ERROR: Failed to create backup: {e}")
sys.exit(1)
def process_seed_csv(csv_path, tier_weights=None):
"""
Process a seed scores CSV file to assign tier-based scores.
Args:
csv_path (str or Path): Path to the CSV file to process
tier_weights (list): List of weights for each tier (default: [0.6, 0.2, 0.2])
If [1.0] is provided or there are no empty lines,
distributes seed trust equally among all peers.
Expected format:
- Multi-tier: Tier sections separated by empty lines
- Single-tier: All repositories with equal weight distribution
"""
if tier_weights is None:
tier_weights = [0.6, 0.2, 0.2]
csv_path = Path(csv_path)
if not csv_path.exists():
print(f"❌ ERROR: File {csv_path} not found!")
sys.exit(1)
print(f"📂 Loading {csv_path}...")
# Create backup before processing
backup_path = create_backup(csv_path)
try:
# First, read the raw file to map original positions to DataFrame indices
with open(csv_path, "r", encoding="utf-8") as f:
lines = f.readlines()
# Map original line numbers to DataFrame row indices
df_row_to_original_line = {}
original_line_to_df_row = {}
df_row = 0
for line_num, line in enumerate(lines):
if line_num == 0: # Skip header
continue
if line.strip() != "": # Non-empty line becomes a DataFrame row
df_row_to_original_line[df_row] = line_num
original_line_to_df_row[line_num] = df_row
df_row += 1
# Find empty lines in original file
empty_lines = []
for line_num, line in enumerate(lines):
if line_num > 0 and line.strip() == "": # Skip header, find empty lines
empty_lines.append(line_num)
# Read the CSV file
df = pd.read_csv(csv_path)
print(f"✅ Loaded {len(df)} rows")
# Check if this is a single-tier case
is_single_tier = len(tier_weights) == 1 or (
len(tier_weights) == 1 and tier_weights[0] == 1.0
)
# Initialize tier_boundaries list
tier_boundaries = []
# Handle single-tier case (no tiers, equal distribution)
if is_single_tier or len(empty_lines) == 0:
if is_single_tier:
print(
f"🔍 Single-tier mode: distributing seed trust equally among all peers"
)
else:
print(
f"🔍 Found 0 separator rows - distributing seed trust equally among all peers"
)
# Single tier with equal distribution
tier_boundaries = [(0, len(df))]
tier_weights = [1.0] # Use full weight for single tier
# Convert empty line positions to DataFrame indices for tier boundaries
elif len(empty_lines) >= len(tier_weights) - 1:
print(
f"🔍 Found {len(empty_lines)} separator rows at original lines: {empty_lines}"
)
# First tier: from start to first empty line
first_empty = empty_lines[0]
tier1_end = 0
for df_idx, orig_line in df_row_to_original_line.items():
if orig_line >= first_empty:
break
tier1_end = df_idx + 1
tier_boundaries.append((0, tier1_end))
# Middle tiers: between consecutive empty lines
for i in range(len(tier_weights) - 2):
if i + 1 < len(empty_lines):
start_line = empty_lines[i]
end_line = empty_lines[i + 1]
tier_start = None
tier_end = None
for df_idx, orig_line in df_row_to_original_line.items():
if tier_start is None and orig_line > start_line:
tier_start = df_idx
if orig_line >= end_line:
tier_end = df_idx
break
if tier_start is not None:
if tier_end is None:
tier_end = len(df)
tier_boundaries.append((tier_start, tier_end))
# Last tier: from last empty line to end
last_empty = empty_lines[-1]
last_tier_start = None
for df_idx, orig_line in df_row_to_original_line.items():
if orig_line > last_empty:
last_tier_start = df_idx
break
if last_tier_start is not None:
tier_boundaries.append((last_tier_start, len(df)))
elif len(empty_lines) > 0 and not is_single_tier:
print(
f"🔍 Found {len(empty_lines)} separator rows - using equal division for remaining tiers"
)
# Some separators, but not enough - distribute remaining rows
first_empty = empty_lines[0]
tier1_end = 0
for df_idx, orig_line in df_row_to_original_line.items():
if orig_line >= first_empty:
break
tier1_end = df_idx + 1
tier_boundaries.append((0, tier1_end))
remaining_start = tier1_end
remaining_rows = len(df) - remaining_start
remaining_tiers = len(tier_weights) - 1
rows_per_tier = remaining_rows // remaining_tiers
for i in range(remaining_tiers - 1):
start_idx = remaining_start + i * rows_per_tier
end_idx = remaining_start + (i + 1) * rows_per_tier
tier_boundaries.append((start_idx, end_idx))
# Last tier gets remaining rows
tier_boundaries.append(
(remaining_start + (remaining_tiers - 1) * rows_per_tier, len(df))
)
else:
# This case should not be reached due to earlier single-tier handling
print(f"🔍 Using equal division into {len(tier_weights)} tiers")
# No separators, divide equally into tiers
rows_per_tier = len(df) // len(tier_weights)
for i in range(len(tier_weights) - 1):
start_idx = i * rows_per_tier
end_idx = (i + 1) * rows_per_tier
tier_boundaries.append((start_idx, end_idx))
# Last tier gets remaining rows
tier_boundaries.append(((len(tier_weights) - 1) * rows_per_tier, len(df)))
# Initialize or reset the 'v' column (seed score column)
if "v" not in df.columns:
df["v"] = 0.0
else:
df["v"] = 0.0
# Count repositories in each tier and assign scores
tier_info = []
for tier_idx, (start, end) in enumerate(tier_boundaries):
if tier_idx >= len(tier_weights):
break
# Count actual repositories (non-empty rows) in this tier
tier_repos = 0
for i in range(start, end):
if i < len(df) and not df.iloc[i].isnull().all():
tier_repos += 1
tier_weight = tier_weights[tier_idx]
tier_score = tier_weight / tier_repos if tier_repos > 0 else 0
tier_info.append(
{
"tier": tier_idx + 1,
"repos": tier_repos,
"weight": tier_weight,
"score_per_repo": tier_score,
"start": start,
"end": end,
}
)
# Display tier distribution
print(f"📊 Tier distribution:")
for info in tier_info:
print(
f" Tier {info['tier']}: {info['repos']} repositories ({info['weight'] * 100:.1f}% weight)"
)
print(f"💯 Score per repository:")
for info in tier_info:
print(f" Tier {info['tier']}: {info['score_per_repo']:.6f}")
# Assign scores to each tier
for info in tier_info:
for i in range(info["start"], info["end"]):
if i < len(df) and not df.iloc[i].isnull().all():
df.iloc[i, df.columns.get_loc("v")] = info["score_per_repo"]
# Verify total score
total_score = df["v"].sum()
expected_total = sum(tier_weights)
print(f"🎯 Total score: {total_score:.6f} (expected: {expected_total:.6f})")
# Save back to the original file
df.to_csv(csv_path, index=False)
print(f"💾 Saved updated file to {csv_path}")
# Show sample of results
print(f"\n📋 Sample results:")
non_zero_scores = df[df["v"] > 0].head(10)
if len(non_zero_scores) > 0:
first_col = df.columns[0]
for i, row in non_zero_scores.iterrows():
print(f" {row[first_col]} -> {row['v']:.6f}")
else:
print(" No non-zero scores found")
except Exception as e:
print(f"❌ ERROR: Failed to process file: {e}")
print(f"🔄 Backup file preserved at: {backup_path}")
import traceback
traceback.print_exc()
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Process seed score CSV files with tier-based weighting",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python process_seed.py seed_scores.csv
python process_seed.py /path/to/scores.csv --weights 0.5 0.3 0.2
python process_seed.py data.csv --weights 0.4 0.4 0.2
The script expects CSV files with tier sections separated by empty rows.
A backup file will be created automatically before processing.
""",
)
parser.add_argument("csv_path", help="Path to the CSV file to process")
parser.add_argument(
"--weights",
"-w",
nargs="+",
type=float,
default=[0.6, 0.2, 0.2],
help="Weights for each tier (default: 0.6 0.2 0.2). Use '1.0' for single-tier equal distribution.",
)
args = parser.parse_args()
# Validate weights sum to 1.0 (with small tolerance for float precision)
weight_sum = sum(args.weights)
if abs(weight_sum - 1.0) > 0.001:
print(f"❌ ERROR: Weights must sum to 1.0, got {weight_sum:.6f}")
print(f"Current weights: {args.weights}")
sys.exit(1)
print("🚀 Processing seed scores CSV...")
print(f"📁 File: {args.csv_path}")
print(f"⚖️ Weights: {args.weights}")
process_seed_csv(args.csv_path, args.weights)
print("✅ Processing complete!")
if __name__ == "__main__":
main()