11import re
22import urllib2
33from BeautifulSoup import BeautifulSoup
4+ import xlwt
45
56def get_start_url (listing ):
67 if (listing .endswith ('/' )):
@@ -12,116 +13,90 @@ def get_start_url(listing):
1213 list = "http://" + listing + '/' + where .capitalize ()+ '/' + what .capitalize ()
1314 return list
1415
15-
16- keywordregex = re .compile ('<span\sclass=["\' ]Ctitle["\' ]><a\shref=[\' "](.*?)[\' "]>(.*?)</a></span>.*?' )
17-
def get_html(url):
    """Fetch *url* over HTTP and return the raw response body as a string."""
    return urllib2.urlopen(url).read()
2119
22- def get_links (tocrawl , crawled ):
23- links = []
24- while tocrawl :
25- print "crawling" , tocrawl
26- msg = get_html (tocrawl )
27- keywordlist = keywordregex .findall (msg )
28- """
29- link={}
30- for i in range(0, len(keywordlist)):
31- link[i]=keywordlist[i]
32- print link
33- """
34- #links=linkregex.findall(msg)
35- crawled .append (tocrawl )
36- links = [x [0 ] for x in keywordlist ]
37- tocrawl = ''
38- return links
39-
def get_next_link(msg):
    """Return the href of the 'next' pagination link found in *msg*.

    Looks for a ``<div class="pagination">`` block in the raw HTML, then for
    an anchor whose text starts with 'next'.  Returns '' when there is no
    pagination block, no 'next' anchor, or no parseable href.
    """
    # re.S lets the pagination div span multiple lines in real-world HTML.
    class_regex = re.compile(r'<div\sclass=[\'"]pagination[\'"]>(.*?)</div>', re.S)
    a_list = class_regex.findall(msg)
    if not a_list:
        return ''
    # Split the div content into anchor fragments and keep those marked 'next'.
    anchors = [piece for piece in a_list[0].split('</a>') if 'next' in piece]
    if not anchors:
        # BUG FIX: the original indexed the filtered list unconditionally and
        # raised IndexError when the pagination div had no 'next' link
        # (e.g. on the last results page).
        return ''
    link_regex = re.compile(r'<a\shref=[\'"](.*?)[\'"]>next\s')
    next_link = link_regex.findall(anchors[0])
    return next_link[0] if next_link else ''
5134
52- def all_links (url ,company_links ):
53- while 'http://' in url :
54- company_links .append (get_links (url ,crawled ))
55- #print company_links
56- #url=get_next_link(get_html(url))
57- url = ''
58- if url :
59- if (' ' in url ):
60- url = url .replace (' ' ,'%20' )
61-
62- return company_links
63-
def get_company_name(soup):
    """Return the text of every element tagged class='Ctitle' in *soup*.

    Each 'Ctitle' element holds one company name on the listing page; the
    result is a list of those names in page order (empty when none match).
    """
    # Idiom fix: the original built the list with a manual for-and-append
    # loop; a comprehension is the direct equivalent.
    return [tag.text for tag in soup.findAll(attrs={"class": "Ctitle"})]
8041
def get_phone(soup):
    """Collect the phone number from every class='logoDesc' block in *soup*.

    The first <p> inside each block carries text like 'Call:555-1234'; the
    'Call:' prefix is stripped before the value is returned.  Returns a list
    in page order (empty when no blocks match).
    """
    phones = []
    for block in soup.findAll(attrs={"class": "logoDesc"}):
        phones.append(block.p.text.replace('Call:', ''))
    return phones
8746
def get_add(soup):
    """Collect the address from every class='logoDesc' block in *soup*.

    The address is the text before the first '|' separator in each block,
    with tab runs removed.  Returns a list in page order (empty when no
    blocks match).
    """
    addresses = []
    for block in soup.findAll(attrs={"class": "logoDesc"}):
        before_pipe = block.text.split('|')[0]
        addresses.append(before_pipe.replace('\t ', ''))
    return addresses
9651
97-
9852if __name__ == '__main__' :
9953 listing = raw_input ("url:" )
10054 where = raw_input ("where?" )
10155 what = raw_input ("what?" )
10256 url = get_start_url (listing )
103- crawled = []
104- company_links = []
105-
106- company_links = all_links (url ,company_links )[0 ]
107- #print company_links
108-
57+
10958 name = []
11059 person = []
11160 phone = []
11261 add = []
113-
114- while len (company_links )> 0 :
115- soup = BeautifulSoup (get_html (company_links .pop (0 )))
116- name .append (get_company_name (soup ))
117- person .append (get_person_name (soup ))
118- phone .append (get_phone (soup ))
119- add .append (get_add (soup ))
62+
63+ while url :
64+ if ' ' in url :
65+ url = url .replace (' ' ,'%20' )
66+ print url
67+ msg = get_html (url )
68+ print url
69+ soup = BeautifulSoup (msg )
70+ for i in get_company_name (soup ):
71+ name .append (i )
72+ for i in get_phone (soup ):
73+ phone .append (i )
74+ for i in get_add (soup ):
75+ add .append (i )
76+ url = get_next_link (msg )
77+ """
12078 print "names:",name
12179 print "person:",person
12280 print "phone:",phone
12381 print "add:",add
124-
125-
82+ """
83+
84+ book = xlwt .Workbook ()
85+ sheet1 = book .add_sheet ('sheet1' )
86+ index = ['NAME' ,'PHONE' ,'ADDRESS' ]
87+ style = xlwt .XFStyle ()
88+ font = xlwt .Font ()
89+ font .name = 'Times New Roman'
90+ font .bold = True
91+ style .font = font
12692
127-
93+ for n in range (0 ,3 ):
94+ sheet1 .write (0 ,n ,index [n ].upper (),style )
95+
96+ for i in range (0 ,len (name )):
97+ sheet1 .write (i + 1 ,0 ,name [i ])
98+ sheet1 .write (i + 1 ,1 ,phone [i ])
99+ sheet1 .write (i + 1 ,2 ,add [i ])
100+
101+ book .save (what + '.xls' )
102+
0 commit comments