# scraper.py
#############################
#############################
#############################
#### START CODE ####
#### Requirement:
# beautifulsoup4 <= 4.9.3
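# A minimal setup sketch (assuming pip; the version pin matches the requirement above):
#   pip install requests "beautifulsoup4<=4.9.3"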
import requests
from bs4 import BeautifulSoup
import re

def get_links(base_url, faculty_extension):
    """Collect faculty/staff profile links from a department directory page."""
    reqs = requests.get(base_url + faculty_extension)
    soup = BeautifulSoup(reqs.text, 'html.parser')  # 'lxml' also works
    links = []
    for link in soup.find_all('a'):
        links.append(link.get('href'))
    print('Total URLs - ' + str(len(links)))
    links = [str(x).lower() for x in links]
    links = list(set(links))
    # Drop phone/e-mail links, then normalize relative URLs to absolute ones
    links = [x for x in links if not x.startswith('tel:')]
    links = [x for x in links if not x.startswith('mailto:')]
    links1 = [x for x in links if x.startswith('http')]
    links2 = [base_url + '/' + x for x in links if not x.startswith('http') and not x.startswith('/')]
    links3 = [base_url + x for x in links if not x.startswith('http') and x.startswith('/')]
    links = list(set(links1 + links2 + links3))
    print('Useful URLs - ' + str(len(links)))
    #####
    # Keep only URLs that look like faculty/staff/bio pages
    work_links = []
    for i in links:
        if ('bio' in i or 'faculty' in i or 'staff' in i or 'people' in i
                or 'facultystaff' in i or 'directory' in i):
            work_links.append(i)
    print('Faculty URLs - ' + str(len(work_links)))
    assert len(work_links) > 0, 'No faculty links found. Kindly enter another link!'
    print('\n-> Got all links!')
    return work_links
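# Usage sketch for get_links (the counts shown are hypothetical; actual output
# depends on the live page at the time of the request):
#   >>> work_links = get_links('https://cs.illinois.edu', '/about/people/all-faculty')
#   Total URLs - 412
#   Useful URLs - 287
#   Faculty URLs - 96
#   -> Got all links!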

def get_text(work_links):
    """Scrape paragraph text from every profile page in work_links."""
    # Example profile page: https://cs.illinois.edu/about/people/all-faculty/zilles
    required_text = []
    profile_text = []
    count = 0
    for url in work_links:
        count += 1
        print('Fetched link - ' + str(count))
        # For quick tests, stop early: if count > 5: break
        reqs = requests.get(url)
        soup = BeautifulSoup(reqs.text, 'html.parser')  # 'lxml' also works
        table = soup.find_all('p')  # e.g. attrs={"class": "directory-profile maxwidth800"}
        for x in table:
            try:
                sentence = str(x.text)
                sentence = sentence.replace('\n', ' ')
                # Skip paragraphs containing an e-mail address (contact blocks, not bios)
                match = re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', sentence)
                if len(match) == 0:
                    profile_text.append(sentence)
            except Exception:
                continue
        # Keep only substantial paragraphs (> 200 characters) and deduplicate
        profile_text = [x for x in profile_text if len(x) > 200]
        profile_text = list(set(profile_text))
        required_text = list(set(required_text + profile_text))
    print('\n-> Scraped all the text!')
    return required_text
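# Quick sanity check for the e-mail pattern used in get_text (standalone sketch;
# the sample string is made up for illustration):
#   >>> import re
#   >>> re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', 'Office: 3230 Siebel, jdoe@illinois.edu')
#   ['jdoe@illinois.edu']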

def save_files(required_text):
    """Write each scraped bio to its own numbered .txt file."""
    c = 1
    for i in required_text:
        print('Saved file ' + str(c))
        with open('data/complied_bios/' + str(c) + '.txt', 'a') as f:
            f.write(i.strip())
        c += 1
    print('\n-> Saved all files!')
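# save_files assumes the data/complied_bios/ directory already exists; a defensive
# sketch (standard library only, not in the original) would create it first:
#   import os
#   os.makedirs('data/complied_bios', exist_ok=True)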

# Example 1
base_url = 'https://cs.illinois.edu'  # 'https://illinois.edu'
faculty_extension = '/about/people/all-faculty'  # '/'
# Example 2
#base_url = 'https://www.stern.nyu.edu'
#faculty_extension = '/faculty/search_name_form'
# Example 3
#base_url = 'https://history.uchicago.edu'
#faculty_extension = '/directories/full/current-faculty'

#### Fetch links
work_links = get_links(base_url, faculty_extension)
#### Fetch texts
required_text = get_text(work_links)
#### Save texts (to the path used in save_files)
save_files(required_text)
##--##--##--##--##--##--##--##--##--##--##--##
### End of code