Skip to content

Commit 216d803

Browse files
committed
NoSQL database demo
1 parent 38c9e3d commit 216d803

2 files changed

Lines changed: 92 additions & 0 deletions

File tree

week-06/placeholder.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Slides are at
2+
https://docs.google.com/presentation/d/1n94UhTE7LNwbKiroracByxqGigm8QBrz-W46KZ00rjU/pub?start=false&loop=false&delayms=3000
3+
4+

week-06/redis_demo_bs_spider.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#! /usr/bin/env python
"""Traverse the web and load link data into a redis database.

Each key in the database is a URL; the value stored under that key is the
set of outbound urls found on that page.  Pages already visited are not
fetched again.
"""

# Standard library
import socket       # because we might handle a socket timeout exception
import sys
import time
import urllib2

# Third party
import pydot        # python library to generate graphs
import redis
from bs4 import BeautifulSoup

# How many pages to crawl: first command line argument, or a reasonable
# default when none is given.
max_num_of_links = int(sys.argv[1]) if len(sys.argv) > 1 else 100

# Redis key names used by this demo.
VISITED_LIST = "redis_demo_bs_spider_visited_list"  # keyname of the visited list
TO_VISIT_SET = "redis_demo_bs_spider_to_visit_set"  # keyname of the set of pages
                                                    # to visit

# Open a connection to the database
# r_server = redis.Redis("108.59.89.216", port=6379, db=0)
r_server = redis.Redis("localhost", port=6379, db=0)
25+
26+
27+
# Clear out the keys in the database that hold the list of sites that we have visited
28+
# and the set of sites to visit.
29+
r_server.delete(VISITED_LIST )
30+
r_server.delete(TO_VISIT_SET )
31+
# Insert all the specified values at the tail of the list stored at key. If key
32+
# does not exist, it is created as empty list before performing the push
33+
# operation. Start at commercialventvac.com, which is well connected.
34+
r_server.sadd(TO_VISIT_SET, "http://www.commercialventvac.com")
35+
start_time = time.time()
36+
for link_cnt in range(0,max_num_of_links):
37+
# Removes and returns a random element from the set value stored at key.
38+
this_page = r_server.spop(TO_VISIT_SET)
39+
print "**** Crawling %s visited %d " % ( this_page, r_server.llen( VISITED_LIST ))
40+
try:
41+
response = urllib2.urlopen(this_page, timeout=15) # timeout is in seconds
42+
except (urllib2.URLError, socket.timeout, urllib2.HTTPError, ValueError),e:
43+
print "The URL %s failed due to %s - skipping" % ( this_page, e)
44+
continue
45+
46+
html_doc = response.read()
47+
# Push this_page to the end of the visited list
48+
r_server.rpush(VISITED_LIST, this_page)
49+
soup = BeautifulSoup(html_doc, "html5lib")
50+
link_list = soup.find_all('a')
51+
for link in link_list:
52+
if 'href' in link.attrs :
53+
url = link.attrs['href']
54+
55+
# This is necessary because if this key already exists and it is not a set
56+
# type, then the sadd command throws a
57+
# "Operation against a key holding the wrong kind of value"
58+
# This is a leftover from previous runs of this software, which used a different
59+
# data structure
60+
if r_server.exists(url) :
61+
# Returns the string representation of the type of the value stored at key. The
62+
# different types that can be returned are: string, list, set, zset and hash.
63+
if r_server.type(url) != "set" :
64+
r_server.delete(url)
65+
# Add the url to the set of urls that we need to visit. If the URL is already
66+
# in the set, then this doesn't do anything.
67+
r_server.sadd(TO_VISIT_SET, url)
68+
r_server.sadd(this_page, url)
69+
70+
end_time = time.time()
71+
g = pydot.Dot(graph_type='digraph') # digraph => directed graph
72+
while r_server.llen(VISITED_LIST) > 0:
73+
# Get the next URL in the list of URLs that we visited
74+
url = r_server.lpop(VISITED_LIST)
75+
source_node = pydot.Node(url)
76+
g.add_node( source_node )
77+
# is the set of links that URL points to empty?
78+
while r_server.scard(url) > 0 :
79+
link = r_server.spop(url)
80+
print "%s => %s"% ( url, link )
81+
destination_node = pydot.Node(link)
82+
g.add_node(destination_node)
83+
g.add_edge(pydot.Edge(source_node, destination_node))
84+
g.write_png('example2_graph.png')
85+
g.write_dot('example2_graph.dot')
86+
execution_time = end_time - start_time
87+
print "Execution time was %f to collect %d links or %f links/second" % (execution_time,
88+
max_num_of_links, max_num_of_links/execution_time)

0 commit comments

Comments
 (0)