-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHouseCrawler.py
More file actions
22 lines (20 loc) · 916 Bytes
/
HouseCrawler.py
File metadata and controls
22 lines (20 loc) · 916 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import requests, re
# Patterns are raw strings so that ``\d`` reaches the regex engine verbatim
# instead of relying on Python passing an unrecognised string escape through.
# They are compiled once here rather than re-specified for every listing.
_ITEM_RE = re.compile(r'<li class="clear LOGCLICKDATA" >(.*?)</li>', re.S)
_LINK_RE = re.compile(r'<div class="title"><a class="" href="(.*?)"', re.S)
_TOTAL_PRICE_RE = re.compile(
    r'<div class="totalPrice"><span>(.*?)</span>', re.S
)
_UNIT_PRICE_RE = re.compile(
    r'<div class="unitPrice" data-hid="\d+" data-rid="\d+" data-price="\d+">'
    r'<span>(.*?)</span>',
    re.S,
)


def _first_group(pattern, text):
    """Return the first capture of *pattern* in *text*, or None if absent."""
    match = pattern.search(text)
    return match.group(1) if match else None


def _parse_listings(page_html):
    """Return one dict per listing ``<li>`` element found in *page_html*.

    Each dict has the keys ``totalprice``, ``unitprice`` and ``link``. A
    field whose markup is not found in the listing is stored as ``None``
    rather than raising ``IndexError``.
    """
    return [
        {
            "totalprice": _first_group(_TOTAL_PRICE_RE, item),
            "unitprice": _first_group(_UNIT_PRICE_RE, item),
            "link": _first_group(_LINK_RE, item),
        }
        for item in _ITEM_RE.findall(page_html)
    ]


def readPages(start, end, timeout=10):
    """Fetch listing pages ``start``..``end`` (inclusive) and print each one.

    For every page number, the page at
    ``https://sh.lianjia.com/ershoufang/pg<page>/`` is downloaded, its
    listing entries are extracted with regular expressions, and the
    resulting list of dicts is printed (one list per page).

    Args:
        start: First page number to fetch.
        end: Last page number to fetch (inclusive). If ``end < start``
            nothing is fetched.
        timeout: Seconds to wait for the server before giving up; passed
            to ``requests.get`` so a stalled connection cannot hang the
            crawl indefinitely.

    Returns:
        None. Results are written to standard output.
    """
    for page in range(start, end + 1):
        url = "https://sh.lianjia.com/ershoufang/pg" + str(page) + "/"
        response = requests.get(url, timeout=timeout)
        # NOTE(review): the original source lost its indentation; printing
        # once per page is assumed to be the intended behaviour.
        print(_parse_listings(response.text))
if __name__ == "__main__":
    # Script entry point: crawl only the first listing page.
    readPages(start=1, end=1)