-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpart2-4.py
More file actions
79 lines (64 loc) · 2.87 KB
/
part2-4.py
File metadata and controls
79 lines (64 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
import warnings
from sklearn.impute import SimpleImputer
# Suppress every warning for the rest of the run and cap pandas' printed
# output at 50 rows.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 50)

# Load the two Play Store datasets from the working directory.
data = pd.read_csv('Playstore_final.csv')
data2 = pd.read_csv('GooglePlay.csv')

# merge two datasets together
# The "App" column of GooglePlay.csv is renamed to "App Name" so both frames
# share a join key; the inner join keeps only apps present in both files.
# GooglePlay.csv is the left frame, so its overlapping columns receive the
# default '_x' suffix and those of Playstore_final.csv receive '_y'.
google_play = data2.rename(columns={"App": "App Name"})
merged_data = pd.merge(left=google_play, right=data, how="inner", on="App Name")

# Discard every column whose name starts with 'Unnamed'.
unnamed_columns = [name for name in merged_data.columns if name.startswith('Unnamed')]
merged_data = merged_data.drop(columns=unnamed_columns)
# match similar columns in each row of dataset and finding inconsistencies
# Keys are the columns that are kept; each value is the counterpart column
# whose contents overwrite the key column. With pd.merge's default suffixes,
# '_x' columns come from the left frame and '_y' columns from the right frame.
# NOTE(review): the unsuffixed pairs ('Type'/'Free', 'Last Updated'/'Last update',
# ...) presumably follow the same left/right split -- confirm against the CSVs.
match_columns = {
    'Category_x': 'Category_y',
    'Rating_x': 'Rating_y',
    'Reviews_x': 'Reviews_y',
    'Size_x': 'Size_y',
    'Installs_x': 'Installs_y',
    'Type': 'Free',
    'Price_x': 'Price_y',
    'Content Rating_x': 'Content Rating_y',
    'Genres': 'Category_y',  # 'Genres' is overwritten with the category as well
    'Last Updated': 'Last update',
    'Current Ver': 'Version',
    'Android Ver': 'Android version Text',
}

# One message per row whose 'Type' label contradicts its 'Free' flag,
# collected in row order.
inconsistencies = []

# Work column-by-column rather than cell-by-cell: iterating every row with
# iterrows() and writing single cells through .at costs rows x columns Python
# operations and subjects each write to per-cell dtype coercion. Copying a
# whole column yields the same values in one step.
for col, counterpart in match_columns.items():
    if col not in merged_data.columns:
        # Only columns that actually exist after the merge are reconciled.
        continue
    if col == 'Type':
        # 'Type' is never overwritten; it is only cross-checked against the
        # counterpart flag: 'Free' paired with False, or 'Paid' paired with
        # True, is a contradiction.
        type_label = merged_data[col]
        free_flag = merged_data[counterpart]
        contradicts = (
            ((type_label == 'Free') & free_flag.eq(False))
            | ((type_label == 'Paid') & free_flag.eq(True))
        )
        # The f-string keeps the original message text for string names and
        # also tolerates a non-string app name (e.g. NaN), which plain '+'
        # concatenation would reject with a TypeError.
        for index, app_name in merged_data.loc[contradicts, 'App Name'].items():
            inconsistencies.append(f'app: {app_name} row: {index}')
    else:
        merged_data[col] = merged_data[counterpart]
# Final names for the kept columns: the '_x' merge suffix is removed.
new_column_names = {
    'Category_x': 'Category',
    'Rating_x': 'Rating',
    'Reviews_x': 'Reviews',
    'Size_x': 'Size',
    'Price_x': 'Price',
    'Installs_x': 'Installs',
    'Content Rating_x': 'Content Rating',
}
merged_data = merged_data.rename(columns=new_column_names)

# Remove every column that appears as a counterpart (a value) in match_columns.
counterpart_columns = set(match_columns.values())
merged_data = merged_data.drop(
    columns=[name for name in merged_data.columns if name in counterpart_columns]
)
# combine columns together
# Four text columns are derived, each joining two existing columns with a
# separator. Inputs that may not already hold text are cast with astype(str)
# before the join.

# "<price> <currency>"
price_text = merged_data['Price'].astype(str)
currency = merged_data['Currency']
merged_data['price_currency'] = price_text.str.cat(currency, sep=" ")

# "<rating> - <rating count>"
rating_text = merged_data['Rating'].astype(str)
rating_count_text = merged_data['Rating Count'].astype(str)
merged_data['rating_and_count'] = rating_text.str.cat(rating_count_text, sep=" - ")

# "<app name> - <category>"; both inputs are joined without a cast
app_name = merged_data['App Name']
category = merged_data['Category']
merged_data['name_and_category'] = app_name.str.cat(category, sep=" - ")

# "<installs> - <editor choice>"
editor_choice_text = merged_data['Editor Choice'].astype(str)
installs_text = merged_data['Installs'].astype(str)
merged_data['installs_and_editor_choice'] = installs_text.str.cat(editor_choice_text, sep=" - ")
# handle missing values
# Each listed column has its missing entries replaced by that column's mean.
# NOTE(review): this runs after the derived text columns were built, so those
# columns were formed from the un-imputed values -- confirm that is intended.
numeric_columns = ['Rating', 'Rating Count', 'Minimum Installs', 'Price']
mean_imputer = SimpleImputer(strategy='mean')
for column_name in numeric_columns:
    # The imputer expects 2-D input, hence the single-column DataFrame; it is
    # re-fitted per column so every column is filled with its own mean.
    column_frame = merged_data[[column_name]]
    mean_imputer.fit(column_frame)
    merged_data[column_name] = mean_imputer.transform(column_frame)