forked from naeron75/sql-database
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleaning.py
More file actions
162 lines (128 loc) · 6.63 KB
/
cleaning.py
File metadata and controls
162 lines (128 loc) · 6.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def merge_and_clean():
""" Takes data from 3 raw csv files.
Cleans, merges and outputs dataframe to work with"""
import pandas as pd
#Import the csv files as dataframes
medals_df = pd.read_csv('../data/raw/olympic_medals.csv')
hosts_df = pd.read_csv('../data/raw/olympic_hosts.csv')
paris_df = pd.read_csv('../data/raw/paris_medallists.csv')
# Removes duplicated event wins for team events to return one athlete per medal (bronze, silver, gold) per event
medals_df = clean_medal_duplicates(medals_df)
paris_df = clean_paris_duplicates(paris_df)
# Removes unnecessary columns from medals table
columns_to_drop = ['participant_title', 'participant_type', 'event_gender', 'athlete_url', 'athlete_full_name', 'country_code', 'country_3_letter_code']
medals_df.drop(columns_to_drop, axis=1, inplace=True)
# Filters host countries to just include Summer games
summer_hosts = hosts_df[hosts_df['game_season'] == 'Summer']
# Gets host country, and year for each game and filters to >1920
df = medals_df.copy()
df['host_country'] = medals_df['slug_game'].apply(create_country_column)
df['game_year'] = medals_df['slug_game'].apply(create_year_column)
df['game_season'] = medals_df['slug_game'].apply(create_season_column)
df = df[df['game_season'] == 'Summer']
df = df[df['game_year'] >= 1920]
df.reset_index(drop=True, inplace=True)
df.drop(['slug_game', 'game_season'], axis=1, inplace=True)
# Removes unnecessary columns from paris dataset
tiny_paris_df = paris_df.drop(columns=['medal_date', 'medal_code', 'name', 'gender',
'country_code', 'country_long', 'nationality_code',
'nationality', 'nationality_long', 'team', 'team_gender',
'event_type', 'url_event', 'birth_date', 'code_athlete',
'code_team', 'is_medallist'])
def standardize_medal_type(medal):
return medal.lower().strip().replace(' medal', '')
# Tidies and adds Paris games columns
tiny_paris_df['medal_type'] = tiny_paris_df['medal_type'].apply(standardize_medal_type)
tiny_paris_df['host_country'] = 'France'
tiny_paris_df['game_year'] = 2024
def standardize_string(string):
return string.lower().strip()
# Standardizes columns in paris dataframes
tiny_paris_df['country'] = tiny_paris_df['country'].apply(standardize_string)
tiny_paris_df['host_country'] = tiny_paris_df['host_country'].apply(standardize_string)
tiny_paris_df['discipline'] = tiny_paris_df['discipline'].apply(standardize_string)
tiny_paris_df['event'] = tiny_paris_df['event'].apply(standardize_string)
tiny_paris_df.rename(columns={'country': 'athlete_country'}, inplace=True)
# Standardizes columns in medals dataframes
df['medal_type'] = df['medal_type'].apply(standardize_string)
df['host_country'] = df['host_country'].apply(standardize_string)
df['country_name'] = df['country_name'].apply(standardize_string)
df['discipline_title'] = df['discipline_title'].apply(standardize_string)
df['event_title'] = df['event_title'].apply(standardize_string)
df.rename(columns={'country_name': 'athlete_country'}, inplace=True)
df.rename(columns={'discipline_title': 'discipline'}, inplace=True)
df.rename(columns={'event_title': 'event'}, inplace=True)
# Combines dataframes
full_df = pd.concat([df, tiny_paris_df], ignore_index=True)
# Clean country names
full_df = clean_country_names(full_df)
# Updates event column to include discipline
event_df = full_df.copy()
event_df['event'] = event_df['discipline'] + ' - ' + event_df['event']
# Get only specific columns
event_df = event_df[['host_country', 'game_year', 'event', 'medal_type', 'athlete_country']]
'event_df.to_csv('../data/clean/clean_summer_df_events.csv')
return(event_df)
def clean_paris_duplicates(paris_df):
"""Returns the first medal winner from each team event in the paris 2024 dataset"""
condition = paris_df['event_type'].isin(['TEAM', 'HTEAM', 'HCOUP', 'COUP'])
#Gets all team events and drops duplicates
paris_df_filtered = paris_df[condition].drop_duplicates(
subset=['medal_date', 'medal_type', 'medal_code', 'country_code', 'country',
'country_long', 'nationality_code', 'nationality', 'nationality_long'],
keep='first'
)
# Combines filtered team events with individual events
paris_df_update = pd.concat([paris_df[~condition], paris_df_filtered])
return(paris_df_update)
def clean_medal_duplicates(medals_df):
"""Returns the first medal winner from each team event in the 1896-2020 dataset"""
condition = medals_df['participant_type'].isin(['GameTeam'])
#Gets all team events and drops duplicates
medals_df_filtered = medals_df[condition].drop_duplicates(
subset=['discipline_title', 'slug_game', 'event_title', 'event_gender',
'medal_type'],
keep='first'
)
# Combines filtered team events with individual events
medals_df_update = pd.concat([medals_df[~condition], medals_df_filtered])
return(medals_df_update)
def create_country_column(slug):
for i, host_slug in enumerate(hosts_df['game_slug']):
if slug == host_slug:
return hosts_df['game_location'].iloc[i]
def create_year_column(slug):
for i, host_slug in enumerate(hosts_df['game_slug']):
if slug == host_slug:
return hosts_df['game_year'].iloc[i]
def create_season_column(slug):
for i, host_slug in enumerate(hosts_df['game_slug']):
if slug == host_slug:
return hosts_df['game_season'].iloc[i]
def clean_country_names(df):
""" Takes 'athlete_country' column and replaces countries with consistent names """"
# Dictionary of country names to change
country_mapping = {
'ir iran': 'iran',
'islamic republic of iran': 'iran',
'korea': 'south korea',
'republic of korea': 'south korea',
'united states': 'usa',
'united states of america': 'usa',
"democratic people's republic of korea": 'north korea',
'dpr korea': 'north korea',
'ain': 'individual neutral athletes',
"people's republic of china": 'china',
'czech republic': 'czechia',
'türkiye': 'turkey',
}
# Replaces the names in the dataframe
df['athlete_country'] = df['athlete_country'].replace(country_mapping)
# Dictionary of country names to change
host_country_mapping = {
'republic of korea': 'south korea',
'united states': 'usa',
'australia, sweden' : 'australia'
}
df['host_country'] = df['host_country'].replace(host_country_mapping)
return df