python_LEARN/python/chapter02/scrape_callback2.py at master · LemonLighter/python_LEARN

27 lines (20 loc) · 940 Bytes

# -*- coding: utf-8 -*-
import csv
import re
import urlparse

import lxml.html

from link_crawler import link_crawler
class ScrapeCallback:
    """Crawler callback that saves country details to ``countries.csv``.

    Creating an instance opens ``countries.csv`` in the current working
    directory for writing and emits a header row made of ``self.fields``.
    Calling the instance with a page URL and its HTML appends one row per
    page whose URL contains ``/view/``.
    """

    def __init__(self):
        # Keep a reference to the file object so that it can be flushed
        # after each row and closed explicitly via close().
        self._file = open('countries.csv', 'w')
        self.writer = csv.writer(self._file)
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)
        self._file.flush()

    def __call__(self, url, html):
        """Extract the fields from *html* and append them as a CSV row.

        Args:
            url: Address of the downloaded page; only URLs containing
                ``/view/`` are processed.
            html: Markup of the page, parsed with ``lxml.html.fromstring``.

        Returns:
            None. URLs without ``/view/`` and pages in which none of the
            expected table rows are present are skipped; a field that is
            missing from an otherwise valid page is written as an empty
            cell instead of raising ``IndexError``.
        """
        if not re.search(r'/view/', url):
            return
        tree = lxml.html.fromstring(html)
        row = []
        found_any = False
        for field in self.fields:
            cells = tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))
            if cells:
                found_any = True
                row.append(cells[0].text_content())
            else:
                row.append('')
        if not found_any:
            # The URL contains /view/ but the page has no matching rows at
            # all, so there is nothing meaningful to record.
            return
        self.writer.writerow(row)
        # Flush per row so already-scraped data survives an aborted crawl.
        self._file.flush()

    def close(self):
        """Close the underlying CSV file; safe to call more than once."""
        self._file.close()
if __name__ == '__main__':
    # Script entry point: hand the start URL, the link pattern and a fresh
    # ScrapeCallback to link_crawler (imported from the link_crawler module).
    seed_url = 'http://example.webscraping.com/'
    link_regex = '/(index|view)'
    link_crawler(seed_url, link_regex, scrape_callback=ScrapeCallback())

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

scrape_callback2.py

Latest commit

History

scrape_callback2.py

File metadata and controls