python_LEARN/python/chapter01/link_crawler2.py at master · LemonLighter/python_LEARN

35 lines (29 loc) · 1.12 KB

import re
import urlparse

from common import download
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex.

    Each queued URL is fetched with ``download``; the links extracted from the
    page by ``get_links`` are tested with ``re.match(link_regex, link)``.
    Matching links are made absolute against ``seed_url`` and queued exactly
    once. The loop ends when the queue is empty.

    Args:
        seed_url: URL the crawl starts from; also the base against which
            extracted links are resolved.
        link_regex: pattern applied with ``re.match`` to each link as it
            appears in the page (before it is made absolute).

    Returns:
        None. The function is run for the ``download`` calls it makes.
    """
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen so none is queued twice
    seen = set(crawl_queue)
    while crawl_queue:
        # list.pop() takes the most recently queued URL (last in, first out)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # skip links that do not match the expected regex
            if not re.match(link_regex, link):
                continue
            # form absolute link
            link = urlparse.urljoin(seed_url, link)
            # skip links that were already queued earlier in the crawl
            if link in seen:
                continue
            seen.add(link)
            crawl_queue.append(link)
def get_links(html):
    """Return a list of links from html.

    Args:
        html: markup of a page as a string. ``None`` or an empty string is
            treated as a page without links.

    Returns:
        A list with the ``href`` value of every ``<a ...>`` tag found, in
        document order. Values are returned exactly as written in the
        markup (they are not made absolute or de-duplicated).
    """
    # nothing to scan: avoid passing None to the regex engine (TypeError)
    if not html:
        return []
    # a regular expression to extract all links from the webpage;
    # the lazy group stops at the first closing quote after href=
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
if __name__ == '__main__':
    # When run as a script, crawl the example site and follow only links
    # whose path starts with /index or /view.
    link_crawler(
        seed_url='http://example.webscraping.com',
        link_regex='/(index|view)',
    )

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

link_crawler2.py

Latest commit

History

link_crawler2.py

File metadata and controls