-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: HTML5PythonCrawler.py
More file actions
153 lines (134 loc) · 4.48 KB
/
HTML5PythonCrawler.py
File metadata and controls
153 lines (134 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# coding=utf-8
import logging
import os
import re
import time
from urllib.parse import urlparse # py3
import pdfkit
import requests
from bs4 import BeautifulSoup
# Minimal HTML5 page shell; {content} is replaced with a scraped article body
# before the page is handed to pdfkit for PDF conversion.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
class Crawler(object):
    """Base crawler: fetches a table of contents, saves each chapter as an
    HTML file, converts the set to a single PDF, then removes the HTML files.

    Subclasses must implement :meth:`parse_menu` and :meth:`parse_body`.
    """
    # Output PDF base name; set per-instance in __init__.
    name = None

    def __init__(self, name, start_url):
        """
        :param name: base filename (without extension) for the generated PDF
        :param start_url: entry URL of the crawl (the table-of-contents page)
        """
        self.name = name
        self.start_url = start_url
        # e.g. "http://www.example.com" — used by subclasses to resolve
        # relative links into absolute ones.
        self.domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(self.start_url))

    @staticmethod
    def request(url, **kwargs):
        """Perform an HTTP GET and return the :class:`requests.Response`."""
        return requests.get(url, **kwargs)

    def parse_menu(self, response):
        """Yield every chapter URL parsed from the menu page.

        :param response: response object for ``start_url``
        """
        raise NotImplementedError

    def parse_body(self, response):
        """Return the processed article body as UTF-8 encoded bytes.

        :param response: response object for a single chapter page
        """
        raise NotImplementedError

    def run(self):
        """Crawl all chapters, write them to numbered HTML files, convert the
        files to ``<name>.pdf`` via pdfkit, and delete the intermediates."""
        start = time.time()
        print("Start!")
        # wkhtmltopdf rendering options passed through pdfkit.
        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ],
            'cookie': [
                ('cookie-name1', 'cookie-value1'),
                ('cookie-name2', 'cookie-value2'),
            ],
            'outline-depth': 10,
        }
        htmls = []
        for index, url in enumerate(self.parse_menu(self.request(self.start_url))):
            html = self.parse_body(self.request(url))
            if html is None:
                # parse_body failed (and already logged the error) — skip the
                # page instead of crashing on f.write(None).
                continue
            f_name = "{}.html".format(index)
            with open(f_name, 'wb') as f:
                # index is 0-based; report pages 1-based, as before.
                print("正在爬取第 %d 页......" % (index + 1))
                f.write(html)
            htmls.append(f_name)
        print("HTML文件下载完成,开始转换PDF")
        pdfkit.from_file(htmls, self.name + ".pdf", options=options)
        print("PDF转换完成,开始清除无用HTML文件")
        for html in htmls:
            os.remove(html)
        total_time = time.time() - start
        print(u"完成!总共耗时:%f 秒" % total_time)
class LiaoxuefengPythonCrawler(Crawler):
    """Crawler for a chapter-indexed tutorial site: the menu lives in the
    element with id ``course`` and each article body in id ``maincontent``.
    """

    def parse_menu(self, response):
        """Yield the absolute URL of every chapter linked from the #course menu.

        :param response: response for the table-of-contents page
        """
        soup = BeautifulSoup(response.content, "html.parser")
        menu_tag = soup.find(id="course")  # find() instead of find_all()[0]
        for li in menu_tag.find_all("li"):
            url = li.a.get("href")
            if not url.startswith("http"):
                url = "".join([self.domain, url])  # resolve relative link
            yield url

    def parse_body(self, response):
        """Extract one chapter's body, re-centre its title, absolutize image
        links, and wrap it in ``html_template``.

        :param response: response for a single chapter page
        :return: UTF-8 encoded HTML bytes, or ``None`` if parsing failed
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Lift the title out so it can be re-inserted centred.
            title = soup.find('h1').get_text()
            # Bug fix: pages without a #tpn navigation bar used to raise
            # AttributeError here, discarding the whole page.
            tpn = soup.find(id='tpn')
            if tpn is not None:
                tpn.extract()
            soup.find('h1').extract()
            body = soup.find(id="maincontent")
            center_tag = soup.new_tag("center")
            title_tag = soup.new_tag('h1')
            title_tag.string = title
            center_tag.insert(1, title_tag)
            body.insert(1, center_tag)
            html = str(body)
            # Rewrite relative <img src="..."> paths into absolute URLs so the
            # images resolve when pdfkit renders the saved file.
            pattern = r'(<img .*?src=")(.*?)(")'
            def absolutize(m):
                src = m.group(2)
                if not src.startswith("http"):
                    src = self.domain + src
                return "".join([m.group(1), src, m.group(3)])
            html = re.sub(pattern, absolutize, html)
            html = html_template.format(content=html)
            return html.encode("utf-8")
        except Exception:
            # Best-effort: log and return None; Crawler.run skips None pages.
            logging.error("解析错误", exc_info=True)
            return None
if __name__ == '__main__':
    # Crawl the HTML5 tutorial starting from its index page and save it as a PDF.
    entry_url = "http://www.w3school.com.cn/html5/index.asp"
    LiaoxuefengPythonCrawler("HTML5教程", entry_url).run()