forked from azk0019/CourseProject
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathscraper.py
More file actions
executable file
·211 lines (168 loc) · 7.45 KB
/
scraper.py
File metadata and controls
executable file
·211 lines (168 loc) · 7.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
this script designed to scrape off information from two major movie rating websites: IMDB and Rotten Tomatoes
twelve movie ratings are obtained from each source.
"""
from bs4 import BeautifulSoup as SOUP
import re
import requests
from math import log
def locate_url(user_emotion):
    """Return [IMDB_url, RT_url] for every emotion found in *user_emotion*.

    The files url/IMDB.txt and url/RT.txt hold one URL per line, aligned by
    index with the ``emotions`` list below. Each emotion is matched as a
    substring of *user_emotion*; every match appends its IMDB URL followed by
    its Rotten Tomatoes URL. No match yields an empty list.

    :param user_emotion: free-form text naming the user's mood
    :return: flat list [IMDB_url, RT_url, ...] for all matched emotions
    """
    file_path = "url/"
    emotions = ["Happy", "Sad", "Satisfying", "Angry",
                "Peaceful", "Fearful", "Excited", "Depressed",
                "Content", "Sorrowful"]
    url_lst = []
    with open(file_path + "IMDB.txt") as f1, open(file_path + "RT.txt") as f2:
        imdb_urls = f1.read().splitlines()
        rt_urls = f2.read().splitlines()
    # Walk the three parallel sequences together instead of indexing by hand.
    for emotion, imdb_url, rt_url in zip(emotions, imdb_urls, rt_urls):
        if emotion in user_emotion:
            url_lst.append(imdb_url)
            url_lst.append(rt_url)
    return url_lst
"""sort movies based on their rating, from high to low
through this function, the recommender is able to select the movie with highest weighted rating
"""
def rank_movies(movie_dict):
ranked_dict = {}
rating = []
for movie_info in movie_dict.values():
if type(movie_info[-1]) == float:
rating.append(movie_info[-1])
rating = sorted(rating, reverse = True)
for r in rating:
for k in movie_dict.keys():
if movie_dict[k][-1] == r:
ranked_dict[k] = movie_dict[k]
return ranked_dict
# IMDB should be a single link
def scrape_IMDB(IMDB, num, folder_path=None):
    """Scrape the top *num* movies from an IMDB genre "lister" page.

    For each movie we collect summary, grading (certificate), runtime, and a
    weighted rating, then rank by that rating and keep the top *num*.

    :param IMDB: URL of an IMDB lister page (top 50 of a genre)
    :param num: number of top-ranked movies to return
    :param folder_path: folder for storing movie summaries; defaults to
        "movie_summary/". You only need it when you store summaries
        (currently unused by this function).
    :return: dict mapping title -> [summary, grading, runtime, weighted_rating],
        highest weighted rating first
    """
    # BUG FIX: the parameter used to be overwritten unconditionally, silently
    # ignoring any caller-supplied value. Only apply the default when absent.
    if folder_path is None:
        folder_path = "movie_summary/"
    response = requests.get(IMDB, timeout=30)  # don't hang forever on a dead host
    data = SOUP(response.text, 'lxml')
    # we hope to have movie's name, grading, runtime, and rating
    IMDB_dict = {}
    title_lst = []
    num_reviews = []
    # IMDB lists top 50 from each genre
    for movie in data.findAll('div', class_="lister-item-content"):
        # title
        title = movie.find("a", attrs={"href": re.compile(r'\/title\/tt+\d*\/')})
        title = str(title).split('">')[1].split('</')[0]
        IMDB_dict[title] = []
        title_lst.append(title)
        # movie summary — findAll returns a list (never None), so the old
        # "!= None" test was always true and an empty result crashed on the
        # split below; test for a non-empty list instead.
        summary = movie.findAll('p', {'class': 'text-muted'})
        if summary:
            # clean the summary text
            summary = str(summary).split(', <p class="text-muted">')[1].replace("\n", "").replace("</p>]", "")
            IMDB_dict[title].append(summary)
        # grading
        grading = movie.find('span', class_="certificate")
        if grading is not None:
            grading = str(grading).split('">')[1].split('</')[0]
        else:
            grading = "Not Found"
        IMDB_dict[title].append(grading)
        # runtime
        length = movie.find('span', class_="runtime")
        if length is not None:
            length = str(length).split('">')[1].split('</')[0]
        else:
            length = "Not Found"
        IMDB_dict[title].append(length)
    # No. of reviewers
    for title, movie in zip(title_lst, data.findAll('p', class_="sort-num_votes-visible")):
        numRater = int(re.sub("[^0-9]", "", movie.text))
        num_reviews.append(numRater)
    # rating
    for review, title, movie in zip(num_reviews, title_lst, data.findAll('div', class_="ratings-bar")):
        rating = movie.find('div', class_="inline-block ratings-imdb-rating")
        try:
            # ratings normally look like "7.4"
            rating = float(re.search(r'[\d]*[.][\d]+', str(rating).split(' ')[3]).group())
        except AttributeError:
            # fall back for whole-number ratings such as "7"
            rating = float(re.search(r'\d+', str(rating).split(' ')[3]).group())
        # score adjustments based on number of reviewers through logistic regression
        weightedRating = rating * log(log(review, 5), 10)
        weightedRating = round(weightedRating, 1)
        IMDB_dict[title].append(weightedRating)
    ranked_dict = rank_movies(IMDB_dict)
    ranked_dict = dict(list(ranked_dict.items())[0: num])
    return ranked_dict
# RT should be a single link
def scrape_rt(RT, num):
    """Scrape the top *num* movies from a Rotten Tomatoes genre listing.

    As with :func:`scrape_IMDB`, we collect name, summary, grading, runtime,
    and a weighted rating. The listing page gives title, profile link, review
    count, and Tomatometer score; grading/runtime/summary come from each
    movie's profile page, fetched only for the *num* top-ranked movies to
    keep the script efficient.

    :param RT: URL of a Rotten Tomatoes top-100 genre page
    :param num: number of top-ranked movies to return
    :return: dict mapping title -> [summary, rating_info, runtime_info,
        weighted_rating], highest weighted rating first
    """
    response = requests.get(RT, timeout=30)  # don't hang forever on a dead host
    data = SOUP(response.text, 'lxml')
    RT_dict = {}
    title_lst = []
    rel_lst = []
    reviews_lst = []
    # Rotten Tomatoes lists top 100 from each genre
    for movie in data.findAll('tr'):
        # title
        title = movie.find("a", class_="unstyled articleLink")
        if title is not None:
            cleanTitle = str(title).split('">')[1].split(" (")[0].strip('\n').strip()
            RT_dict[cleanTitle] = []
            title_lst.append(cleanTitle)  # 100
            # link to movie profile
            rel_link = str(title).split('href="')[1].split('">\n')[0]
            link = "https://www.rottentomatoes.com/" + rel_link
            RT_dict[cleanTitle].append(link)
            # numbers of reviews:
            num_reviews = movie.find('td', class_="right hidden-xs")
            if num_reviews is not None:
                num_reviews = int(str(num_reviews).split('">')[1].split('</')[0])  # 100
                # collect number of reviewers for later movie score adjustments
                # NOTE(review): appended only when the cell exists, otherwise
                # log(None, ...) would crash below — confirm every listed row
                # carries this cell so the zip stays aligned with title_lst.
                reviews_lst.append(num_reviews)
    # rating
    for review, title, movie in zip(reviews_lst, title_lst, data.findAll('span', class_='tMeterIcon tiny')):
        rating = movie.find('span', class_="tMeterScore")
        rating = str(rating).split('">\xa0')[1].split('%</')[0]
        # transform RT rating into the same scale as IMDB rating (out of 10)
        weightedRating = int(rating) / 10
        # score adjustments
        weightedRating = weightedRating * log(log(review, 4), 5)
        weightedRating = round(weightedRating, 1)
        RT_dict[title].append(weightedRating)
    # to increase the efficiency of the script,
    # we are going to rank movies based on rating
    # and only look up movie profiles of top-ranked movies
    ranked_dict = rank_movies(RT_dict)
    ranked_dict = dict(list(ranked_dict.items())[0: num])
    for value in ranked_dict.values():
        rel_lst.append(value[0])
        value.pop(0)
    new_title_lst = list(ranked_dict.keys())
    # grading and runtime information are inside movie profile links
    for title, link in zip(new_title_lst, rel_lst):
        response = requests.get(link, timeout=30)
        data_1 = SOUP(response.text, 'lxml')
        # movie summary
        for div_tag in data_1.findAll('div', {'class': 'movie_synopsis clamp clamp-6 js-clamp'}):
            summary = str(div_tag.text).replace("\n", "")
            ranked_dict[title].insert(0, summary)
        for div_tag in data_1.findAll('li', {'class': 'meta-row clearfix'}):
            movie_label = div_tag.find('div', {'class': 'meta-label subtle'}).text
            if movie_label == "Rating:":
                rating_info = div_tag.find('div', {'class': 'meta-value'}).text
                rating_info = rating_info.replace("\n", "").replace(" ", "")
                ranked_dict[title].insert(1, rating_info)
            elif movie_label == "Runtime:":
                runtime_info = div_tag.find('div', {'class': 'meta-value'}).text
                runtime_info = runtime_info.replace("\n", "").replace(" ", "")
                ranked_dict[title].insert(2, runtime_info)
    return ranked_dict
## For Testing Purpose:
# BUG FIX: this demo code used to run (with live network requests) on every
# import of the module, despite the "(DO NOT RUN)" comment. Guarding it with
# __name__ means it only runs when the script is executed directly.
if __name__ == "__main__":
    user_emotion = 'Happy'
    # look up both URLs in a single call instead of reading the files twice
    urls = locate_url(user_emotion)
    IMDB_url = urls[0]
    RT_url = urls[1]
    movie_dict = {}
    movie_dict.update(scrape_rt(RT_url, 12))
    movie_dict.update(scrape_IMDB(IMDB_url, 12))
    movie_dict = rank_movies(movie_dict)