Skip to content

Commit 5634ba4

Browse files
author
mohyeah
committed
24/12/17 learning BASIC Web crawler~
1 parent 5be13b3 commit 5634ba4

30 files changed

Lines changed: 718 additions & 0 deletions

File tree

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Import the requests module
2+
import requests
3+
# Send the request with requests.get()
4+
# data holds the returned Response object (not the HTML itself; the body bytes are read through .content)
5+
url = "https://movie.douban.com/"
6+
headers = {
7+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
8+
}
9+
10+
data = requests.get(url, headers=headers, timeout=10)  # timeout: fail instead of hanging if the server stalls
11+
# Decode the raw bytes in data.content to get the HTML text
12+
data = data.content.decode("utf-8")
13+
print(data)

AI Learn/Stage 2 Python Advanced/Chapter 2 Python Web Crawler Combat/source/html/gdp.html

Lines changed: 337 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tr>
10+
<td><img src="./images/1.jpg" width="184" height="122" alt=""></td>
11+
<td><img src="./images/2.jpg" width="184" height="122" alt=""></td>
12+
<td><img src="./images/3.jpg" width="184" height="122" alt=""></td>
13+
</tr>
14+
</table>
15+
</body>
16+
</html>
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
<!DOCTYPE html>
2+
<html lang="zh-CN">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>传智一下 你就高薪</title>
6+
7+
<style type="text/css">
8+
ul {list-style-type: none;}
9+
li {float: left; display: inline; margin-left: 10px;}
10+
.body { width: 980px; height: 800px; margin: 0 auto;}
11+
table{width: 100%;}
12+
.left {width: 60%; float: left; text-align: center;margin-top:10px}
13+
.right {width: 40%; float: left; }
14+
img {border-radius: 20px;}
15+
16+
17+
body{
18+
font-family:arial;
19+
font-size:12px;
20+
}
21+
22+
23+
#topbar{/*id选择器*/
24+
text-align:right;
25+
font-size:13px;
26+
margin-top:18px;
27+
padding-right:5px;
28+
}
29+
30+
#topbar1 a{
31+
color:#999;
32+
margin-left:25px;
33+
margin-top:18px
34+
}
35+
36+
37+
#topbar2 a{
38+
color:#999;
39+
}
40+
41+
42+
#topbar2 span{
43+
color:#999;
44+
}
45+
46+
#topbar a{
47+
color:#333;
48+
margin-left:20px;
49+
}
50+
51+
.bola{
52+
font-weight:700;
53+
}
54+
55+
#topbar #morepro{
56+
background:#2d78f4;
57+
color:#fff;
58+
padding:3px;
59+
}
60+
61+
.centerdiv{
62+
text-align:center;
63+
}
64+
65+
#searchText{
66+
width:540px;
67+
height:32px;
68+
border-radius:9px;
69+
border:2px solid #2d78f4;
70+
}
71+
72+
#submitBtn{
73+
width:85px;
74+
height:32px;
75+
background:#3385ff;
76+
color:#fff;
77+
border-radius:9px;
78+
border:3px solid #3385ff;
79+
text-align: center;}
80+
</style>
81+
82+
</head>
83+
84+
<body>
85+
86+
<div id="topbar">
87+
<a class="bola">黑马</a>
88+
<a class="bola">博学谷</a>
89+
<a class="bola">Python</a>
90+
<a class="bola">Java</a>
91+
<a class="bola">大数据</a>
92+
<a class="bola">数据分析</a>
93+
<a >登录</a>
94+
<a >设置</a>
95+
<a id="morepro" href="传智一下,你就知道.htm" >更多产品</a>
96+
</div>
97+
98+
<div class="centerdiv">
99+
<img src="./images/0.jpg" width="350" height="160" style="margin-top:100px" alt="传智">
100+
</div>
101+
102+
<div class="centerdiv" >
103+
<form >
104+
105+
<input id="searchText" type="text" aria-label="搜索">
106+
<input id="submitBtn" type="button" value="传智一下" style="margin-top:10px" onclick="window.location.href='./gdp.html';">
107+
108+
</form>
109+
</div>
110+
111+
<div class="body" >
112+
113+
<div class="left">
114+
<p> </p>
115+
116+
<a style="display: block;text-align:left; color:gray"><h3>推荐 导航 关注</h3></a>
117+
<table style = "border-collapse:separate; border-spacing:10px;">
118+
<tr>
119+
<td colspan="3" style="text-align:left;"><font size="4" face="Arial,sans-serif">惊爆 亚洲舞王卫冕成功</font></td>
120+
</tr>
121+
122+
<tr>
123+
<td><img src="./images/1.jpg" width="184" height="122" alt=""></td>
124+
<td><img src="./images/2.jpg" width="184" height="122" alt=""></td>
125+
<td><img src="./images/3.jpg" width="184" height="122" alt=""></td>
126+
</tr>
127+
128+
<tr>
129+
<td colspan="3" style="text-align:left;"><font size="4" face="Arial,sans-serif">两少年人穷志不短 仅着内裤环游世界</font></td>
130+
</tr>
131+
132+
<tr>
133+
<td><img src="./images/4.jpg" width="184" height="122" alt=""></td>
134+
<td><img src="./images/5.jpg" width="184" height="122" alt=""></td>
135+
<td><img src="./images/6.jpg" width="184" height="122" alt=""></td>
136+
</tr>
137+
</table>
138+
139+
</div>
140+
141+
142+
<div class="right">
143+
144+
</div>
145+
146+
</div>
147+
148+
<div class="centerdiv">
149+
<span style="color:#666;">传智</span>
150+
</div>
151+
152+
<div id="topbar1" class="centerdiv" style="margin-top:0px">
153+
<a>把传智设置为主页 </a>
154+
<a>关于传智 </a>
155+
<a>AboutBaidu </a>
156+
<a>传智推广</a>
157+
</div>
158+
159+
<div id="topbar2" class="centerdiv" style="margin-top:0px">
160+
<span>@2018Baidu</span>
161+
<a>使用传智前必读 </a>
162+
<a>意见反馈 </a>
163+
<span>京ICP证030173号</span>
164+
<span class="iconCls0"></span>
165+
<a style="margin-left:18px">京公安网备号11000002000001 </a>
166+
<span class="iconCls"></span>
167+
</div>
168+
169+
</body>
170+
</html>
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>index1</title>
6+
</head>
7+
<body>
8+
<h1>index1.html</h1>
9+
</body>
10+
</html>
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>index2</title>
6+
</head>
7+
<body>
8+
<h1>index2.html</h1>
9+
</body>
10+
</html>
98.4 KB
Loading
311 KB
Loading
156 KB
Loading
141 KB
Loading

0 commit comments

Comments
 (0)