-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlink_crawler2.py
More file actions
35 lines (29 loc) · 1.12 KB
/
link_crawler2.py
File metadata and controls
35 lines (29 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
import urlparse
from common import download
def link_crawler(seed_url, link_regex):
    """Crawl outward from seed_url, following only links that match link_regex.

    Pages are fetched with download() and their links extracted with
    get_links(). Nothing is returned.
    """
    pending = [seed_url]
    # every URL that has ever been queued, so that no page is queued twice
    visited = set(pending)
    while pending:
        # take the most recently queued URL (last in, first out)
        page_url = pending.pop()
        page_html = download(page_url)
        for href in get_links(page_html):
            # skip links that do not match the expected pattern
            if not re.match(link_regex, href):
                continue
            # resolve the link against the seed URL to get an absolute URL
            absolute = urlparse.urljoin(seed_url, href)
            # skip links that were already queued earlier
            if absolute in visited:
                continue
            visited.add(absolute)
            pending.append(absolute)
def get_links(html):
    """Return a list of the href values of all <a> tags found in html.

    html is the page source as a string. None or an empty string yields an
    empty list instead of raising, so a page with no content simply
    contributes no links.
    """
    if not html:
        return []
    # The closing quote must be the same character as the opening quote
    # (backreference \1), so an href such as "javascript:go('x')" is captured
    # whole instead of being cut off at the first embedded quote.
    webpage_regex = re.compile(r'<a[^>]+href=(["\'])(.*?)\1', re.IGNORECASE)
    # group 1 is the quote character, group 2 is the link itself
    return [found.group(2) for found in webpage_regex.finditer(html)]
if __name__ == '__main__':
    # When run as a script, crawl the example site and follow only links
    # whose path starts with /index or /view.
    link_crawler(
        seed_url='http://example.webscraping.com',
        link_regex='/(index|view)',
    )