forked from PythonCHB/PythonCertSpring
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathredis_demo_bs_spider.py
More file actions
88 lines (80 loc) · 3.87 KB
/
redis_demo_bs_spider.py
File metadata and controls
88 lines (80 loc) · 3.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#! /usr/bin/env python
"""This software traverses the web and loads the data into a redis database
The key to the database is a URL, the values of the keys are a list of
outbound urls. If the url has already been visited, then it is skipped"""
from bs4 import BeautifulSoup
import redis
import time
import urllib2
import socket # because we might handle a socket timeout exception
import pydot # python library to generate graphs
import sys
# Crawl budget: the first command-line argument when one is supplied,
# otherwise a reasonable default of 100 pages.
max_num_of_links = int(sys.argv[1]) if len(sys.argv) > 1 else 100
# Redis key names used by this spider; namespaced with the script name so
# they do not collide with other demos sharing the same database.
VISITED_LIST = "redis_demo_bs_spider_visited_list" # keyname of the visited list
TO_VISIT_SET = "redis_demo_bs_spider_to_visit_set" # keyname of the set of pages
# to visit
# Open a connection to the database.
# NOTE(review): host/port/db are hard-coded; the commented-out line below
# suggests this previously pointed at a remote server.
# r_server = redis.Redis("108.59.89.216", port=6379, db=0)
r_server = redis.Redis("localhost", port=6379, db=0)
# Clear out the keys in the database that hold the list of sites that we have visited
# and the set of sites to visit, so every run starts from a clean slate.
r_server.delete(VISITED_LIST )
r_server.delete(TO_VISIT_SET )
# Seed the frontier. sadd creates the set if the key does not exist and is
# a no-op if the member is already present. Start at commercialventvac.com,
# which is well connected.
r_server.sadd(TO_VISIT_SET, "http://www.commercialventvac.com")
start_time = time.time()
for link_cnt in range(0,max_num_of_links):
# Removes and returns a random element from the set value stored at key.
this_page = r_server.spop(TO_VISIT_SET)
print "**** Crawling %s visited %d " % ( this_page, r_server.llen( VISITED_LIST ))
try:
response = urllib2.urlopen(this_page, timeout=15) # timeout is in seconds
except (urllib2.URLError, socket.timeout, urllib2.HTTPError, ValueError),e:
print "The URL %s failed due to %s - skipping" % ( this_page, e)
continue
html_doc = response.read()
# Push this_page to the end of the visited list
r_server.rpush(VISITED_LIST, this_page)
soup = BeautifulSoup(html_doc, "html5lib")
link_list = soup.find_all('a')
for link in link_list:
if 'href' in link.attrs :
url = link.attrs['href']
# This is necessary because if this key already exists and it is not a set
# type, then the sadd command throws a
# "Operation against a key holding the wrong kind of value"
# This is a leftover from previous runs of this software, which used a different
# data structure
if r_server.exists(url) :
# Returns the string representation of the type of the value stored at key. The
# different types that can be returned are: string, list, set, zset and hash.
if r_server.type(url) != "set" :
r_server.delete(url)
# Add the url to the set of urls that we need to visit. If the URL is already
# in the set, then this doesn't do anything.
r_server.sadd(TO_VISIT_SET, url)
r_server.sadd(this_page, url)
end_time = time.time()
g = pydot.Dot(graph_type='digraph') # digraph => directed graph
while r_server.llen(VISITED_LIST) > 0:
# Get the next URL in the list of URLs that we visited
url = r_server.lpop(VISITED_LIST)
source_node = pydot.Node(url)
g.add_node( source_node )
# is the set of links that URL points to empty?
while r_server.scard(url) > 0 :
link = r_server.spop(url)
print "%s => %s"% ( url, link )
destination_node = pydot.Node(link)
g.add_node(destination_node)
g.add_edge(pydot.Edge(source_node, destination_node))
g.write_png('example2_graph.png')
g.write_dot('example2_graph.dot')
execution_time = end_time - start_time
print "Execution time was %f to collect %d links or %f links/second" % (execution_time,
max_num_of_links, max_num_of_links/execution_time)