Commit 4e7be83

parita pooj committed
creates .xls database with contact details of companies of a particular category
1 parent 6a9ea71 commit 4e7be83

6 files changed

Lines changed: 68 additions & 3479 deletions

web-crawler/README

Lines changed: 10 additions & 3 deletions
@@ -1,3 +1,10 @@
-Contains web-crawler code to fetch the contact information of all the companies belonging to a category.
-This code has been tried only for the site:"http://www.justdial.com".
-
+Contains web-crawler code to make an Excel database from the contact information of all the companies belonging to a category.
+This code has been tested only for the site: "http://www.justdial.com".
+
+Test:
+Input:
+url=www.justdial.com
+where?mumbai
+what?cable-dealers (or escalator-dealers)
+
+Output files: cable-dealers.xls, escalator-dealers.xls
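
The test above should leave one workbook per category with NAME, PHONE, and ADDRESS columns. As a quick read-back check (not part of this commit, and assuming the xlrd package, which the repo itself does not use):

# Minimal read-back sketch, assuming xlrd is installed and the
# README test has already produced cable-dealers.xls.
import xlrd

book = xlrd.open_workbook('cable-dealers.xls')
sheet = book.sheet_by_index(0)
print sheet.row_values(0)          # header row: NAME, PHONE, ADDRESS
for i in range(1, sheet.nrows):    # one row per company
    print sheet.row_values(i)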

web-crawler/cable-dealers.xls

108 KB
Binary file not shown.

web-crawler/crawl.py

Lines changed: 58 additions & 83 deletions
@@ -1,6 +1,7 @@
 import re
 import urllib2
 from BeautifulSoup import BeautifulSoup
+import xlwt

 def get_start_url(listing):
     if(listing.endswith('/')):
@@ -12,116 +13,90 @@ def get_start_url(listing):
     list="http://"+listing+'/'+where.capitalize()+'/'+what.capitalize()
     return list

-
-keywordregex=re.compile('<span\sclass=["\']Ctitle["\']><a\shref=[\'"](.*?)[\'"]>(.*?)</a></span>.*?')
-
 def get_html(url):
     msg=urllib2.urlopen(url).read()
     return msg

-def get_links(tocrawl, crawled):
-    links=[]
-    while tocrawl:
-        print "crawling", tocrawl
-        msg = get_html(tocrawl)
-        keywordlist = keywordregex.findall(msg)
-        """
-        link={}
-        for i in range(0, len(keywordlist)):
-            link[i]=keywordlist[i]
-        print link
-        """
-        #links=linkregex.findall(msg)
-        crawled.append(tocrawl)
-        links=[x[0] for x in keywordlist]
-        tocrawl=''
-    return links
-
 def get_next_link(msg):
     class_regex=re.compile('<div\sclass=[\'"]pagination[\'"]>(.*?)</div>')
     a_list=class_regex.findall(msg)
-    a=a_list[0].split('</a>')
-    a=[i for i in a if 'next' in i]
-    link_regex=re.compile('<a\shref=[\'"](.*?)[\'"]>next\s')
-    next_link=link_regex.findall(a[0])
-    if len(next_link)>0:
-        return next_link[0]
+    if len(a_list)>0:
+        a=a_list[0].split('</a>')
+        a=[i for i in a if 'next' in i]
+        link_regex=re.compile('<a\shref=[\'"](.*?)[\'"]>next\s')
+        next_link=link_regex.findall(a[0])
+        if len(next_link)>0:
+            return next_link[0]
+        else:
+            return ''
     else:
         return ''

-def all_links(url,company_links):
-    while 'http://' in url:
-        company_links.append(get_links(url,crawled))
-        #print company_links
-        #url=get_next_link(get_html(url))
-        url=''
-        if url:
-            if (' ' in url):
-                url=url.replace(' ','%20')
-
-    return company_links
-
 def get_company_name(soup):
-    cname=str(soup.find(id="cn"))
-    cregex=re.compile('value=[\'"](.*?)[\'"]')
-    c_name=cregex.findall(cname)
-    if c_name:
-        return c_name[0]
-    else:
-        return '-'
-
-def get_person_name(soup):
-    pname=soup.find(id="more").p.text
-    if pname:
-        return pname
-    else:
-        return '-'
-
+    c_name=soup.findAll(attrs={"class":"Ctitle"})
+    cname=[]
+    for i in c_name:
+        cname.append(i.text)
+    return cname

 def get_phone(soup):
-    phone=soup.find(id="more").findAll('p')[1].text
-    if phone:
-        return phone
-    else:
-        return '-'
+    divtag=soup.findAll(attrs={"class":"logoDesc"})
+    phone=[i.p.text.replace('Call:','') for i in divtag]
+    return phone

 def get_add(soup):
-    addr=str(soup.find(id="add"))
-    aregex=re.compile('value=[\'"](.*?)[\'"]')
-    add=aregex.findall(addr)
-    if add:
-        return add[0]
-    else:
-        return '-'
+    add_list=soup.findAll(attrs={"class":"logoDesc"})
+    add=[i.text.split('|')[0].replace('\t','') for i in add_list]
+    return add

-
 if __name__=='__main__':
     listing=raw_input("url:")
     where=raw_input("where?")
     what=raw_input("what?")
     url=get_start_url(listing)
-    crawled=[]
-    company_links=[]
-
-    company_links=all_links(url,company_links)[0]
-    #print company_links
-
+
     name=[]
     person=[]
     phone=[]
     add=[]
-
-    while len(company_links)>0:
-        soup=BeautifulSoup(get_html(company_links.pop(0)))
-        name.append(get_company_name(soup))
-        person.append(get_person_name(soup))
-        phone.append(get_phone(soup))
-        add.append(get_add(soup))
+
+    while url:
+        if ' ' in url:
+            url=url.replace(' ','%20')
+        print url
+        msg=get_html(url)
+        print url
+        soup=BeautifulSoup(msg)
+        for i in get_company_name(soup):
+            name.append(i)
+        for i in get_phone(soup):
+            phone.append(i)
+        for i in get_add(soup):
+            add.append(i)
+        url=get_next_link(msg)
+    """
     print "names:",name
     print "person:",person
     print "phone:",phone
     print "add:",add
-
-
+    """
+
+    book=xlwt.Workbook()
+    sheet1=book.add_sheet('sheet1')
+    index=['NAME','PHONE','ADDRESS']
+    style=xlwt.XFStyle()
+    font=xlwt.Font()
+    font.name='Times New Roman'
+    font.bold=True
+    style.font=font

-
+    for n in range(0,3):
+        sheet1.write(0,n,index[n].upper(),style)
+
+    for i in range(0,len(name)):
+        sheet1.write(i+1,0,name[i])
+        sheet1.write(i+1,1,phone[i])
+        sheet1.write(i+1,2,add[i])
+
+    book.save(what+'.xls')
+
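
The rewritten parsers no longer visit each company's detail page; they pull everything from the listing page itself via class lookups. A minimal sketch of that extraction on a hand-written snippet (the markup below is invented for illustration; live justdial.com pages may differ):

# Exercise the same class-based lookups the new helpers use.
from BeautifulSoup import BeautifulSoup

sample = ('<div class="logoDesc">'
          '<span class="Ctitle"><a href="/c1">Acme Cables</a></span>'
          'Shop 5, Linking Road, Mumbai|more'
          '<p>Call:+91-22-5550100</p></div>')

soup = BeautifulSoup(sample)
print [i.text for i in soup.findAll(attrs={"class": "Ctitle"})]
# [u'Acme Cables']
divs = soup.findAll(attrs={"class": "logoDesc"})
print [i.p.text.replace('Call:', '') for i in divs]
# [u'+91-22-5550100']
print [i.text.split('|')[0].replace('\t', '') for i in divs]
# company name plus address: everything before the first '|'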

web-crawler/escalator-dealers.xls

15.5 KB
Binary file not shown.
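
The crawl loop in crawl.py keeps fetching pages until get_next_link returns an empty string. A small sketch of that termination behavior, applying the same two regular expressions get_next_link compiles to invented pagination snippets (real markup may differ):

# A page with a "next" link yields the next URL; a last page
# without one yields nothing, which is what ends the crawl.
import re

class_regex = re.compile('<div\sclass=[\'"]pagination[\'"]>(.*?)</div>')
link_regex = re.compile('<a\shref=[\'"](.*?)[\'"]>next\s')

inner = '<div class="pagination"><a href="/Mumbai/Cable-Dealers/2">next page</a></div>'
last = '<div class="pagination"><a href="/Mumbai/Cable-Dealers/1">prev</a></div>'

for page in (inner, last):
    found = class_regex.findall(page)
    anchors = [a for a in found[0].split('</a>') if 'next' in a] if found else []
    if anchors:
        print link_regex.findall(anchors[0])[0]   # '/Mumbai/Cable-Dealers/2'
    else:
        print 'no next link, crawl stops'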
