-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlink_crawler1.py
More file actions
29 lines (23 loc) · 931 Bytes
/
link_crawler1.py
File metadata and controls
29 lines (23 loc) · 931 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import re
from common import download
def link_crawler(seed_url, link_regex):
"""Crawl from the given seed URL following links matched by link_regex
"""
crawl_queue = [seed_url] # the queue of URL's to download
while crawl_queue:
url = crawl_queue.pop()
html = download(url)
# filter for links matching our regular expression
for link in get_links(html):
if re.match(link_regex, link):
# add this link to the crawl queue
crawl_queue.append(link)
def get_links(html):
"""Return a list of links from html
"""
# a regular expression to extract all links from the webpage
webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
# list of all links from the webpage
return webpage_regex.findall(html)
if __name__ == '__main__':
link_crawler('http://example.webscraping.com', '/(index|view)')