-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_dummy_data.py
More file actions
309 lines (255 loc) · 13.4 KB
/
generate_dummy_data.py
File metadata and controls
309 lines (255 loc) · 13.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""
Generate dummy data for testing the split_data project.
Creates 1000+ entries across groups, members, expenses, and splits.
"""
from database import SplitDataDB
import random
from datetime import datetime, timedelta
from decimal import Decimal
# Sample data for realistic generation
EXPENSE_TITLES = {
'food': [
'McDonald', 'Starbucks', 'Pizza Hut', 'Subway', 'KFC', 'Burger King',
'Taco Bell', 'Domino\'s Pizza', 'Chipotle', 'Panera Bread', 'Dunkin\'',
'Olive Garden', 'Red Lobster', 'Outback Steakhouse', 'Buffalo Wild Wings',
'Restaurant dinner', 'Lunch meeting', 'Breakfast', 'Coffee shop', 'Food truck'
],
'travel': [
'Uber ride', 'Lyft ride', 'Taxi', 'Train ticket', 'Bus fare', 'Flight ticket',
'Hotel booking', 'Airbnb', 'Car rental', 'Gas station', 'Parking fee',
'Toll fee', 'Metro card', 'Bike rental', 'Scooter rental'
],
'entertainment': [
'Movie tickets', 'Concert', 'Theater show', 'Museum entry', 'Amusement park',
'Bowling', 'Karaoke', 'Escape room', 'Arcade', 'Sports game', 'Comedy show',
'Music festival', 'Festival tickets', 'Club entry', 'Bar drinks'
],
'shopping': [
'Grocery shopping', 'Target', 'Walmart', 'Amazon order', 'Clothing store',
'Electronics store', 'Bookstore', 'Pharmacy', 'Home Depot', 'Costco',
'Online purchase', 'Gift shop', 'Convenience store'
],
'utilities': [
'Electricity bill', 'Water bill', 'Internet bill', 'Phone bill', 'Gas bill',
'Cable bill', 'Streaming service', 'Insurance', 'Rent', 'Mortgage'
],
'other': [
'Gym membership', 'Gym class', 'Yoga class', 'Personal trainer', 'Doctor visit',
'Pharmacy', 'Dry cleaning', 'Laundry', 'Haircut', 'Spa', 'Massage',
'Car repair', 'Car wash', 'Pet supplies', 'Veterinary', 'Donation'
]
}
GROUP_NAMES = [
'Foodies', 'Travel Buddies', 'Roommates', 'Weekend Warriors', 'Study Group',
'Work Team', 'Family', 'Friends', 'Gym Squad', 'Concert Crew', 'Movie Night',
'Dinner Club', 'Road Trip', 'Vacation', 'Birthday Party', 'Wedding Party',
'Office Lunch', 'Happy Hour', 'Brunch Club', 'Coffee Group', 'Shopping Spree',
'Game Night', 'Hiking Group', 'Beach Day', 'City Explorers', 'Adventure Seekers',
'Food Tour', 'Wine Tasting', 'Book Club', 'Fitness Group'
]
FIRST_NAMES = [
'Alex', 'Jordan', 'Taylor', 'Casey', 'Morgan', 'Riley', 'Avery', 'Quinn',
'Sage', 'River', 'Skyler', 'Phoenix', 'Blake', 'Cameron', 'Dakota',
'Esther', 'Finan', 'Kartik', 'Sarah', 'Michael', 'Emily', 'David', 'Jessica',
'James', 'Emma', 'Robert', 'Olivia', 'William', 'Sophia', 'Richard', 'Isabella',
'Joseph', 'Ava', 'Thomas', 'Mia', 'Charles', 'Charlotte', 'Christopher', 'Amelia',
'Daniel', 'Harper', 'Matthew', 'Evelyn', 'Anthony', 'Abigail', 'Mark', 'Elizabeth',
'Donald', 'Sofia', 'Steven', 'Avery', 'Paul', 'Ella', 'Andrew', 'Scarlett',
'Joshua', 'Grace', 'Kenneth', 'Victoria', 'Kevin', 'Aria', 'Brian', 'Chloe'
]
LAST_NAMES = [
'Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis',
'Rodriguez', 'Martinez', 'Hernandez', 'Lopez', 'Wilson', 'Anderson', 'Thomas',
'Taylor', 'Moore', 'Jackson', 'Martin', 'Lee', 'Thompson', 'White', 'Harris',
'Sanchez', 'Clark', 'Ramirez', 'Lewis', 'Robinson', 'Walker', 'Young', 'Allen',
'King', 'Wright', 'Scott', 'Torres', 'Nguyen', 'Hill', 'Flores', 'Green', 'Adams',
'Nelson', 'Baker', 'Hall', 'Rivera', 'Campbell', 'Mitchell', 'Carter', 'Roberts'
]
def generate_user_ids(num_users=50):
"""Generate a list of user IDs."""
return list(range(1, num_users + 1))
def generate_group_description(member_ids):
"""Generate a description from member names."""
names = random.sample([f"{random.choice(FIRST_NAMES)} {random.choice(LAST_NAMES)}"
for _ in range(len(member_ids))], len(member_ids))
return ", ".join(names)
def generate_dummy_data(db, num_groups=100, num_expenses_per_group=10):
"""Generate dummy data for the database."""
print("="*60)
print("GENERATING DUMMY DATA")
print("="*60)
if not db.connect():
print("Failed to connect to database")
return
try:
# Generate user IDs (we'll use IDs 1-50)
user_ids = generate_user_ids(50)
all_expense_ids = []
# Generate groups
print(f"\n1. Generating {num_groups} groups...")
group_ids = []
for i in range(num_groups):
group_name = random.choice(GROUP_NAMES)
if i < len(GROUP_NAMES):
group_name = GROUP_NAMES[i % len(GROUP_NAMES)]
else:
group_name = f"{group_name} {i // len(GROUP_NAMES) + 1}"
# Random number of members (2-8)
num_members = random.randint(2, 8)
member_list = random.sample(user_ids, num_members)
description = generate_group_description(member_list)
creator_id = random.choice(member_list)
# Random creation date within last 6 months
days_ago = random.randint(0, 180)
created_date = datetime.now() - timedelta(days=days_ago)
query = """
INSERT INTO `group` (group_name, description, user_id, created_dt, update_dt)
VALUES (%s, %s, %s, %s, %s)
"""
db.execute_update(query, (group_name, description, creator_id, created_date, created_date))
# Get the inserted group ID
result = db.execute_query("SELECT LAST_INSERT_ID() as id")
group_id = result[0]['id']
group_ids.append(group_id)
# Add members
for member_id in member_list:
member_query = """
INSERT INTO member (group_id, member_id)
VALUES (%s, %s)
ON DUPLICATE KEY UPDATE group_id=group_id
"""
db.execute_update(member_query, (group_id, member_id))
if (i + 1) % 20 == 0:
print(f" Created {i + 1} groups...")
print(f" ✓ Created {len(group_ids)} groups")
# Generate expenses
print(f"\n2. Generating expenses (target: ~{num_groups * num_expenses_per_group})...")
expense_count = 0
for group_id in group_ids:
# Get members of this group
members_query = "SELECT member_id FROM member WHERE group_id = %s"
members_result = db.execute_query(members_query, (group_id,))
if not members_result:
continue
member_ids = [m['member_id'] for m in members_result]
num_expenses = random.randint(5, num_expenses_per_group)
for _ in range(num_expenses):
# Choose random tag and title
tag = random.choice(list(EXPENSE_TITLES.keys()))
title = random.choice(EXPENSE_TITLES[tag])
# Random amount between $5 and $500
total_amount = Decimal(str(round(random.uniform(5, 500), 2)))
# Random payer
payer_id = random.choice(member_ids)
# Random creation date (within last 6 months, after group creation)
days_ago = random.randint(0, 180)
created_date = datetime.now() - timedelta(days=days_ago)
# Random settlement status (70% settled, 30% not settled)
is_settled = random.random() > 0.3
# Description (sometimes empty, sometimes has text)
description = "" if random.random() > 0.3 else random.choice([
"Team lunch", "Shared expense", "Group activity",
"Monthly bill", "One-time payment", ""
])
# Insert expense
expense_query = """
INSERT INTO expense (title, tag, description, is_settled, group_id, user_id, created_dt, updated_dt)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
"""
db.execute_update(expense_query, (
title, tag, description, is_settled, group_id, payer_id, created_date, created_date
))
# Get expense ID
result = db.execute_query("SELECT LAST_INSERT_ID() as id")
expense_id = result[0]['id']
all_expense_ids.append(expense_id)
# Generate splits
# Decide who actually owes (not everyone might owe)
# 60% chance that everyone splits, 40% chance only some people split
if random.random() < 0.6 or len(member_ids) <= 2:
# Everyone splits equally
split_members = member_ids
else:
# Only some members split (at least 2, but not everyone)
# Ensure we have at least 2 splitters and at most all members - 1
max_splitters = len(member_ids) - 1
if max_splitters >= 2:
num_splitters = random.randint(2, max_splitters)
split_members = random.sample(member_ids, num_splitters)
else:
# Fallback: everyone splits if we can't have partial split
split_members = member_ids
# Calculate split amount per person
split_amount = total_amount / len(split_members)
# Create splits
for member_id in split_members:
if member_id == payer_id:
# Payer: positive amount, paid_or_not = NULL
split_query = """
INSERT INTO split (expense_id, group_id, user_id, amount, paid_or_not)
VALUES (%s, %s, %s, %s, %s)
"""
db.execute_update(split_query, (
expense_id, group_id, member_id, total_amount, None
))
else:
# Owe: negative amount
# paid_or_not: NULL if not settled, TRUE/FALSE if settled
if is_settled:
paid_status = True # If expense is settled, all debts are paid
else:
# Random: 60% paid, 40% not paid
paid_status = True if random.random() > 0.4 else False
split_query = """
INSERT INTO split (expense_id, group_id, user_id, amount, paid_or_not)
VALUES (%s, %s, %s, %s, %s)
"""
db.execute_update(split_query, (
expense_id, group_id, member_id, -split_amount, paid_status
))
expense_count += 1
if expense_count % 100 == 0:
print(f" Created {expense_count} expenses...")
print(f" ✓ Created {expense_count} expenses")
# Update settlement status based on actual split data
print(f"\n3. Updating settlement status based on split data...")
updated = db.update_settled_expenses()
print(f" ✓ Updated {updated} expenses to settled status")
# Print summary statistics
print(f"\n4. Summary Statistics:")
print("-" * 60)
groups_count = db.execute_query("SELECT COUNT(*) as count FROM `group`")[0]['count']
members_count = db.execute_query("SELECT COUNT(*) as count FROM member")[0]['count']
expenses_count = db.execute_query("SELECT COUNT(*) as count FROM expense")[0]['count']
splits_count = db.execute_query("SELECT COUNT(*) as count FROM split")[0]['count']
settled_count = db.execute_query("SELECT COUNT(*) as count FROM expense WHERE is_settled = TRUE")[0]['count']
unsettled_count = db.execute_query("SELECT COUNT(*) as count FROM expense WHERE is_settled = FALSE")[0]['count']
print(f" Total Groups: {groups_count}")
print(f" Total Memberships: {members_count}")
print(f" Total Expenses: {expenses_count}")
print(f" Total Splits: {splits_count}")
print(f" Settled Expenses: {settled_count}")
print(f" Unsettled Expenses: {unsettled_count}")
print("\n" + "="*60)
print("DUMMY DATA GENERATION COMPLETE!")
print("="*60)
except Exception as e:
print(f"\nError generating data: {e}")
import traceback
traceback.print_exc()
finally:
db.disconnect()
def main():
"""Main function to generate dummy data."""
db = SplitDataDB()
# Ask user for confirmation
print("\nThis will generate dummy data in your database.")
print("This will add data to your existing tables.")
response = input("Continue? (yes/no): ").strip().lower()
if response in ['yes', 'y']:
# Generate data: 100 groups, ~10 expenses per group = ~1000 expenses
generate_dummy_data(db, num_groups=100, num_expenses_per_group=10)
else:
print("Cancelled.")
if __name__ == "__main__":
main()