Commit 4e7be83

parita pooj committed
creates .xls database with contact details of companies of a particular category
1 parent 6a9ea71 commit 4e7be83

6 files changed

Lines changed: 68 additions & 3479 deletions

web-crawler/README

Lines changed: 10 additions & 3 deletions
@@ -1,3 +1,10 @@
-Contains web-crawler code to fetch the contact information of all the companies belonging to a category.
-This code has been tried only for the site:"http://www.justdial.com".
-
+Contains web-crawler code to make an Excel database from the contact information of all the companies belonging to a category.
+This code has been tested only for the site: "http://www.justdial.com".
+
+Test:
+Input:
+url=www.justdial.com
+where?mumbai
+what?cable-dealers (or escalator-dealers)
+
+Output files: cable-dealers.xls, escalator-dealers.xls
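
The test above should leave one workbook per category with NAME, PHONE, and ADDRESS columns. As a quick read-back check (not part of this commit, and assuming the xlrd package, which the repo itself does not use):

# Minimal read-back sketch, assuming xlrd is installed and the
# README test has already produced cable-dealers.xls.
import xlrd

book = xlrd.open_workbook('cable-dealers.xls')
sheet = book.sheet_by_index(0)
print sheet.row_values(0)          # header row: NAME, PHONE, ADDRESS
for i in range(1, sheet.nrows):    # one row per company
    print sheet.row_values(i)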

web-crawler/cable-dealers.xls

108 KB
Binary file not shown.

web-crawler/crawl.py

Lines changed: 58 additions & 83 deletions
@@ -1,6 +1,7 @@
 import re
 import urllib2
 from BeautifulSoup import BeautifulSoup
+import xlwt

 def get_start_url(listing):
     if(listing.endswith('/')):
@@ -12,116 +13,90 @@ def get_start_url(listing):
     list="http://"+listing+'/'+where.capitalize()+'/'+what.capitalize()
     return list

-
-keywordregex=re.compile('<span\sclass=["\']Ctitle["\']><a\shref=[\'"](.*?)[\'"]>(.*?)</a></span>.*?')
-
 def get_html(url):
     msg=urllib2.urlopen(url).read()
     return msg

-def get_links(tocrawl, crawled):
-    links=[]
-    while tocrawl:
-        print "crawling", tocrawl
-        msg = get_html(tocrawl)
-        keywordlist = keywordregex.findall(msg)
-        """
-        link={}
-        for i in range(0, len(keywordlist)):
-            link[i]=keywordlist[i]
-        print link
-        """
-        #links=linkregex.findall(msg)
-        crawled.append(tocrawl)
-        links=[x[0] for x in keywordlist]
-        tocrawl=''
-    return links
-
 def get_next_link(msg):
     class_regex=re.compile('<div\sclass=[\'"]pagination[\'"]>(.*?)</div>')
     a_list=class_regex.findall(msg)
-    a=a_list[0].split('</a>')
-    a=[i for i in a if 'next' in i]
-    link_regex=re.compile('<a\shref=[\'"](.*?)[\'"]>next\s')
-    next_link=link_regex.findall(a[0])
-    if len(next_link)>0:
-        return next_link[0]
+    if len(a_list)>0:
+        a=a_list[0].split('</a>')
+        a=[i for i in a if 'next' in i]
+        link_regex=re.compile('<a\shref=[\'"](.*?)[\'"]>next\s')
+        next_link=link_regex.findall(a[0])
+        if len(next_link)>0:
+            return next_link[0]
+        else:
+            return ''
     else:
         return ''

-def all_links(url,company_links):
-    while 'http://' in url:
-        company_links.append(get_links(url,crawled))
-        #print company_links
-        #url=get_next_link(get_html(url))
-        url=''
-        if url:
-            if (' ' in url):
-                url=url.replace(' ','%20')
-
-    return company_links
-
 def get_company_name(soup):
-    cname=str(soup.find(id="cn"))
-    cregex=re.compile('value=[\'"](.*?)[\'"]')
-    c_name=cregex.findall(cname)
-    if c_name:
-        return c_name[0]
-    else:
-        return '-'
-
-def get_person_name(soup):
-    pname=soup.find(id="more").p.text
-    if pname:
-        return pname
-    else:
-        return '-'
-
+    c_name=soup.findAll(attrs={"class":"Ctitle"})
+    cname=[]
+    for i in c_name:
+        cname.append(i.text)
+    return cname

 def get_phone(soup):
-    phone=soup.find(id="more").findAll('p')[1].text
-    if phone:
-        return phone
-    else:
-        return '-'
+    divtag=soup.findAll(attrs={"class":"logoDesc"})
+    phone=[i.p.text.replace('Call:','') for i in divtag]
+    return phone

 def get_add(soup):
-    addr=str(soup.find(id="add"))
-    aregex=re.compile('value=[\'"](.*?)[\'"]')
-    add=aregex.findall(addr)
-    if add:
-        return add[0]
-    else:
-        return '-'
+    add_list=soup.findAll(attrs={"class":"logoDesc"})
+    add=[i.text.split('|')[0].replace('\t','') for i in add_list]
+    return add

-
 if __name__=='__main__':
     listing=raw_input("url:")
     where=raw_input("where?")
     what=raw_input("what?")
     url=get_start_url(listing)
-    crawled=[]
-    company_links=[]
-
-    company_links=all_links(url,company_links)[0]
-    #print company_links
-
+
     name=[]
     person=[]
     phone=[]
     add=[]
-
-    while len(company_links)>0:
-        soup=BeautifulSoup(get_html(company_links.pop(0)))
-        name.append(get_company_name(soup))
-        person.append(get_person_name(soup))
-        phone.append(get_phone(soup))
-        add.append(get_add(soup))
+
+    while url:
+        if ' ' in url:
+            url=url.replace(' ','%20')
+        print url
+        msg=get_html(url)
+        print url
+        soup=BeautifulSoup(msg)
+        for i in get_company_name(soup):
+            name.append(i)
+        for i in get_phone(soup):
+            phone.append(i)
+        for i in get_add(soup):
+            add.append(i)
+        url=get_next_link(msg)
+    """
     print "names:",name
     print "person:",person
     print "phone:",phone
     print "add:",add
-
-
+    """
+
+    book=xlwt.Workbook()
+    sheet1=book.add_sheet('sheet1')
+    index=['NAME','PHONE','ADDRESS']
+    style=xlwt.XFStyle()
+    font=xlwt.Font()
+    font.name='Times New Roman'
+    font.bold=True
+    style.font=font

-
+    for n in range(0,3):
+        sheet1.write(0,n,index[n].upper(),style)
+
+    for i in range(0,len(name)):
+        sheet1.write(i+1,0,name[i])
+        sheet1.write(i+1,1,phone[i])
+        sheet1.write(i+1,2,add[i])
+
+    book.save(what+'.xls')
+
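
The rewritten parsers no longer visit each company's detail page; they pull everything from the listing page itself via class lookups. A minimal sketch of that extraction on a hand-written snippet (the markup below is invented for illustration; live justdial.com pages may differ):

# Exercise the same class-based lookups the new helpers use.
from BeautifulSoup import BeautifulSoup

sample = ('<div class="logoDesc">'
          '<span class="Ctitle"><a href="/c1">Acme Cables</a></span>'
          'Shop 5, Linking Road, Mumbai|more'
          '<p>Call:+91-22-5550100</p></div>')

soup = BeautifulSoup(sample)
print [i.text for i in soup.findAll(attrs={"class": "Ctitle"})]
# [u'Acme Cables']
divs = soup.findAll(attrs={"class": "logoDesc"})
print [i.p.text.replace('Call:', '') for i in divs]
# [u'+91-22-5550100']
print [i.text.split('|')[0].replace('\t', '') for i in divs]
# company name plus address: everything before the first '|'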

web-crawler/escalator-dealers.xls

15.5 KB
Binary file not shown.
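
The crawl loop in crawl.py keeps fetching pages until get_next_link returns an empty string. A small sketch of that termination behavior, applying the same two regular expressions get_next_link compiles to invented pagination snippets (real markup may differ):

# A page with a "next" link yields the next URL; a last page
# without one yields nothing, which is what ends the crawl.
import re

class_regex = re.compile('<div\sclass=[\'"]pagination[\'"]>(.*?)</div>')
link_regex = re.compile('<a\shref=[\'"](.*?)[\'"]>next\s')

inner = '<div class="pagination"><a href="/Mumbai/Cable-Dealers/2">next page</a></div>'
last = '<div class="pagination"><a href="/Mumbai/Cable-Dealers/1">prev</a></div>'

for page in (inner, last):
    found = class_regex.findall(page)
    anchors = [a for a in found[0].split('</a>') if 'next' in a] if found else []
    if anchors:
        print link_regex.findall(anchors[0])[0]   # '/Mumbai/Cable-Dealers/2'
    else:
        print 'no next link, crawl stops'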
