Skip to content

Commit 2d1397d

Browse files
authored
Add files via upload
1 parent ff0b18c commit 2d1397d

1 file changed

Lines changed: 45 additions & 0 deletions

File tree

Python/reptile/qiushibaike.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# -*- coding: utf-8 -*-
2+
# @Time : 2018/5/5 19:08
3+
# @Author : Ape Code
4+
# @FileName: qiushibaike.py
5+
# @Software: PyCharm
6+
# @Blog :https://www.liuyangxiong.cn
7+
8+
import requests
9+
from bs4 import BeautifulSoup
10+
11+
12+
class Qiushibaike:
    """Scraper for qiushibaike.com: collects article URLs from a listing page."""

    # Initialization
    def __init__(self):
        """Set up request headers, the site base URL and the result container."""
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/66.0.3359.139 Safari/537.36"}
        self.url = "https://www.qiushibaike.com"
        self.articleList = []  # article URL paths collected by returnUrl()

    # Return the URL of each item on a listing page
    def returnUrl(self, content):  # parameter: / hot imgrank text history pic textnew
        """Return the article URL paths found on one listing page.

        content: URL path suffix selecting the listing, e.g. "/" or a
            "/8hr/page/N/" style path (alternatives per the site:
            hot, imgrank, text, history, pic, textnew).

        Fix: previously every call kept appending to self.articleList,
        so repeated calls returned accumulated duplicates; the list is
        now rebuilt per call. Also guards against a missing listing
        container (changed page layout / blocked request) instead of
        raising AttributeError on a None result from find().
        """
        homeUrlResponse = requests.get(self.url + content, headers=self.headers).text
        homebsoup = BeautifulSoup(homeUrlResponse, 'lxml')
        self.articleList = []  # rebuild so results reflect only this page
        container = homebsoup.find('div', class_="col1")
        if container is not None:
            # collect every article link on the page
            for all_href in container.find_all('a', class_="contentHerf"):
                self.articleList.append(all_href['href'])
        return self.articleList

    # Fetch the content posted by the user
    def getContent(self, url):  # parameter: a path returned by returnUrl()
        """Fetch one article's user-posted content. Not implemented yet."""
        pass

    # Run
    def main(self):
        """Entry point: scrape page 1 of the front page and print its article URLs."""
        # Defaults to the front page; other listings: hot imgrank text history pic textnew
        spiderContent = "/8hr/page/{}/".format(1)
        print(self.returnUrl(spiderContent))
41+
42+
43+
# Script entry point: build the scraper and run it.
if __name__ == '__main__':
    Qiushibaike().main()

0 commit comments

Comments
 (0)