Non-SQL database demo

jeffsilverm · jeffsilverm · commit 216d80333187 · 2013-03-23T21:57:07.000-07:00
diff --git a/week-06/placeholder.txt b/week-06/placeholder.txt
@@ -0,0 +1,4 @@
+Slides are at
+https://docs.google.com/presentation/d/1n94UhTE7LNwbKiroracByxqGigm8QBrz-W46KZ00rjU/pub?start=false&loop=false&delayms=3000
+
+
diff --git a/week-06/redis_demo_bs_spider.py b/week-06/redis_demo_bs_spider.py
@@ -0,0 +1,88 @@
+#! /usr/bin/env python
+"""This software traverses the web and loads the data into a redis database
+Each key in the database is the URL of a page; its value is the set of
+outbound URLs found on that page.  If a URL has already been visited, it is
+skipped."""
+
+from bs4 import BeautifulSoup
+import redis
+import time
+import urllib2
+import socket   # because we might handle a socket timeout exception
+import pydot    # python library to generate graphs
+import sys
+
+if len(sys.argv) > 1 :
+    max_num_of_links = int(sys.argv[1])
+else:
+    max_num_of_links = 100   # reasonable default value
+
+VISITED_LIST = "redis_demo_bs_spider_visited_list"  # keyname of the visited list
+TO_VISIT_SET = "redis_demo_bs_spider_to_visit_set"  # keyname of the set of pages
+                                                    # to visit
+# Open a connection to the database
+# r_server = redis.Redis("108.59.89.216", port=6379, db=0)
+r_server = redis.Redis("localhost", port=6379, db=0)
+
+
+# Clear out the keys in the database that hold the list of sites that we have visited
+# and the set of sites to visit.
+r_server.delete(VISITED_LIST )
+r_server.delete(TO_VISIT_SET )
+# Insert all the specified values at the tail of the list stored at key. If key
+# does not exist, it is created as empty list before performing the push
+# operation.  Start at commercialventvac.com, which is well connected.
+r_server.sadd(TO_VISIT_SET, "http://www.commercialventvac.com")
+start_time = time.time()
+for link_cnt in range(0,max_num_of_links):
+# Removes and returns a random element from the set value stored at key.
+    this_page = r_server.spop(TO_VISIT_SET)
+    print "**** Crawling %s visited %d " % ( this_page, r_server.llen( VISITED_LIST ))
+    try:
+        response = urllib2.urlopen(this_page, timeout=15)    # timeout is in seconds
+    except (urllib2.URLError, socket.timeout, urllib2.HTTPError, ValueError),e:
+        print "The URL %s failed due to %s - skipping" % ( this_page, e)
+        continue
+    
+    html_doc = response.read()
+# Push this_page to the end of the visited list    
+    r_server.rpush(VISITED_LIST, this_page)
+    soup = BeautifulSoup(html_doc, "html5lib")
+    link_list = soup.find_all('a')
+    for link in link_list:
+        if 'href' in link.attrs :
+            url = link.attrs['href']
+        
+    # This is necessary because if this key already exists and it is not a set
+    # type, then the sadd command throws a
+    # "Operation against a key holding the wrong kind of value"
+    # This is a leftover from previous runs of this software, which used a different
+    # data structure
+            if r_server.exists(url) :
+    # Returns the string representation of the type of the value stored at key. The
+    # different types that can be returned are: string, list, set, zset and hash.            
+                if r_server.type(url) != "set" :
+                    r_server.delete(url)
+    # Add the url to the set of urls that we need to visit.  If the URL is already
+    # in the set, then this doesn't do anything.
+            r_server.sadd(TO_VISIT_SET, url)
+            r_server.sadd(this_page, url)
+            
+end_time = time.time()
+g = pydot.Dot(graph_type='digraph')        # digraph => directed graph
+while r_server.llen(VISITED_LIST) > 0:
+# Get the next URL in the list of URLs that we visited  
+    url = r_server.lpop(VISITED_LIST)
+    source_node = pydot.Node(url)
+    g.add_node( source_node )
+# is the set of links that URL points to empty?
+    while r_server.scard(url) > 0 :
+        link = r_server.spop(url)
+        print "%s => %s"% ( url, link )
+        destination_node = pydot.Node(link)
+        g.add_node(destination_node)
+        g.add_edge(pydot.Edge(source_node, destination_node))
+g.write_png('example2_graph.png')
+g.write_dot('example2_graph.dot')
+execution_time = end_time - start_time
+print "Execution time was %f to collect %d links or %f links/second" % (execution_time,
+                max_num_of_links, max_num_of_links/execution_time)

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +Slides are at
 +https://docs.google.com/presentation/d/1n94UhTE7LNwbKiroracByxqGigm8QBrz-W46KZ00rjU/pub?start=false&loop=false&delayms=3000
++
++