-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathsort_csv_chunked.py
More file actions
73 lines (60 loc) · 2.63 KB
/
sort_csv_chunked.py
File metadata and controls
73 lines (60 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import csv
import heapq
import os
import shutil
import tempfile

# Default configuration used when the script is run directly.
CHUNK_SIZE = 400000  # Max data rows per chunk; tune to available memory.
INPUT_FILE = r'./data/mobility/bs.csv'
OUTPUT_FILE = r'./data/mobility/bs_sorted.csv'
PRIMARY_SORT_COLUMN = 2    # 0-based index of the primary sort key.
SECONDARY_SORT_COLUMN = 3  # 0-based index of the secondary sort key.


def _write_sorted_chunk(temp_dir, index, rows, sort_key):
    """Sort *rows* in memory, write them to a numbered chunk file, return its path."""
    rows.sort(key=sort_key)
    chunk_file = os.path.join(temp_dir, f'chunk_{index}.csv')
    with open(chunk_file, 'w', newline='') as chunk:
        csv.writer(chunk).writerows(rows)
    return chunk_file


def sort_csv_by_columns(input_file, output_file,
                        primary_sort_column=PRIMARY_SORT_COLUMN,
                        secondary_sort_column=SECONDARY_SORT_COLUMN,
                        chunk_size=CHUNK_SIZE):
    """External-sort a large CSV by two columns (lexicographic, stable).

    The input is split into chunks of at most ``chunk_size`` data rows,
    each chunk is sorted in memory and spilled to a temporary file, and
    the sorted chunks are streamed through a k-way ``heapq.merge`` into
    ``output_file`` — so peak memory is proportional to ``chunk_size``,
    not to the size of the input file.

    The header row is preserved as the first line of the output and is
    never mixed into the sorted data. Keys compare as strings, matching
    the original script's behavior.

    Raises:
        FileNotFoundError: if ``input_file`` does not exist.
        StopIteration: if ``input_file`` is completely empty (no header).
    """
    def sort_key(row):
        return (row[primary_sort_column], row[secondary_sort_column])

    temp_dir = tempfile.mkdtemp()
    try:
        # Phase 1: split into individually sorted chunk files.
        chunk_files = []
        with open(input_file, newline='') as infile:
            reader = csv.reader(infile)
            header = next(reader)  # Keep the header out of the sorted data.
            current_chunk = []
            for row in reader:
                current_chunk.append(row)
                if len(current_chunk) >= chunk_size:
                    chunk_files.append(_write_sorted_chunk(
                        temp_dir, len(chunk_files), current_chunk, sort_key))
                    current_chunk = []
            # Flush the final, partial chunk (safe even if there were
            # zero data rows — nothing is written in that case).
            if current_chunk:
                chunk_files.append(_write_sorted_chunk(
                    temp_dir, len(chunk_files), current_chunk, sort_key))

        # Phase 2: stream-merge the sorted chunks into the output file.
        # All chunk readers must stay open for the duration of the merge.
        readers = [open(f, newline='') for f in chunk_files]
        try:
            with open(output_file, 'w', newline='') as outfile:
                writer = csv.writer(outfile)
                writer.writerow(header)  # Header written exactly once.
                writer.writerows(
                    heapq.merge(*(csv.reader(r) for r in readers),
                                key=sort_key))
        finally:
            for r in readers:
                r.close()
    finally:
        # Always remove the temporary chunk directory.
        shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == '__main__':
    sort_csv_by_columns(INPUT_FILE, OUTPUT_FILE,
                        PRIMARY_SORT_COLUMN, SECONDARY_SORT_COLUMN,
                        CHUNK_SIZE)
    print(
        f'Huge CSV file sorted by column {PRIMARY_SORT_COLUMN} (primary) and {SECONDARY_SORT_COLUMN} (secondary) and saved as {OUTPUT_FILE}')