PythonSnippet/exportDoulistAsMarkdown.py at master · Maginx/PythonSnippet

History

executable file

208 lines (180 loc) · 6.82 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

#!/usr/bin/env python

#! encoding=utf-8

# Author : [email protected]

# Blog : http://kesalin.github.io

# Date : 2016/07/12

# Description : 将豆列导出为 Markdown 文件.

# Version : 1.0.0.0

# Python Version: Python 2.7.3

import os

import threading

import time

import datetime

import re

import string

import urllib2

from bs4 import BeautifulSoup

gHeader = {"User-Agent": "Mozilla-Firefox5.0"}

# 书籍信息类

class BookInfo:

name = ''

url = ''

icon = ''

ratingNums = ''

ratingPeople = ''

comment = ''

def __init__(self, name, url, icon, nums, people, comment):

self.name = name

self.url = url

self.icon = icon

self.ratingNums = nums

self.ratingPeople = people

self.comment = comment

# 获取 url 内容

def getHtml(url):

try :

request = urllib2.Request(url, None, gHeader)

response = urllib2.urlopen(request)

data = response.read().decode('utf-8')

except urllib2.URLError, e :

if hasattr(e, "code"):

print "The server couldn't fulfill the request: " + url

print "Error code: %s" % e.code

elif hasattr(e, "reason"):

print "We failed to reach a server. Please check your url: " + url + ", and read the Reason."

print "Reason: %s" % e.reason

return data

# 转换 html 转义字符

def decodeHtmlSpecialCharacter(htmlStr):

specChars = {"&ensp;" : "", \

"&emsp;" : "", \

" " : "", \

"<" : "<", \

"&gt" : ">", \

"&" : "&", \

""" : "\"", \

"©" : "®", \

"×" : "×", \

"÷" : "÷", \

}

for key in specChars.keys():

htmlStr = htmlStr.replace(key, specChars[key])

return htmlStr

# 导出为 Markdown 格式文件

def exportToMarkdown(doulistTile, doulistAbout, bookInfos):

path = "{0}.md".format(doulistTile)

if(os.path.isfile(path)):

os.remove(path)

today = datetime.datetime.now()

todayStr = today.strftime('%Y-%m-%d %H:%M:%S %z')

file = open(path, 'a')

file.write('## {0}\n'.format(doulistTile))

file.write('{0}\n'.format(doulistAbout))

file.write('## 图书列表\n')

file.write('### 总计 {0} 本，更新时间：{1}\n'.format(len(bookInfos), todayStr))

i = 0

for book in bookInfos:

file.write('\n### No.{0:d} {1}\n'.format(i + 1, book.name))

file.write(' > **图书名称**：[{0}]({1}) \n'.format(book.name, book.url))

file.write(' > **豆瓣链接**：[{0}]({1}) \n'.format(book.url, book.url))

file.write(' > **豆瓣评分**：{0} \n'.format(book.ratingNums))

file.write(' > **评分人数**：{0} \n'.format(book.ratingPeople))

file.write(' > **我的评论**：{0} \n'.format(book.comment))

i = i + 1

file.close()

# 解析图书信息

def parseItemInfo(page, bookInfos):

soup = BeautifulSoup(page, 'html.parser')

items = soup.find_all("div", "doulist-item")

for item in items:

#itemStr = item.prettify().encode('utf-8')

#print itemStr

# get book name

content = item.find("div", "title").contents[1]

bookName = content.string.strip().encode('utf-8')

# get book url and icon

contents = item.find("div", "post")

hrefStr = contents.find('a').prettify().encode('utf-8')

#print hrefStr

bookUrl = ''

pattern = re.compile(r'(<a href=\")(.*)(\" target=)')

match = pattern.search(hrefStr)

if match:

bookUrl = match.group(2)

bookIcon = ''

pattern = re.compile(r'(img src=\")(.*)(\" width=)')

match = pattern.search(hrefStr)

if match:

bookIcon = match.group(2)

#print " > Book {0} : {1}, {2}".format(bookName, bookUrl, bookIcon)

# get rating

ratingNums = ''

ratingPeople = ''

contents = item.find("div", "rating")

for content in contents:

if content.name != None and content.string != None:

if content.get("class") != None:

ratingNums = content.string.encode('utf-8')

else:

ratingPeople = content.string.encode('utf-8')

pattern = re.compile(r'(\()(.*)(\))')

match = pattern.search(ratingPeople)

if match:

ratingPeople = match.group(2)

#print " RatingNums: {0}, ratingPeople: {1}".format(ratingNums, ratingPeople)

# get comment

comment = ''

contents = item.find_all("blockquote", "comment")

comment = contents[0].contents[2].encode('utf-8')

#print " Comment: {0}".format(comment)

# add book info to list

bookInfo = BookInfo(bookName, bookUrl, bookIcon, ratingNums, ratingPeople, comment)

bookInfos.append(bookInfo)

# 解析豆列 url

def parse(url):

start = datetime.datetime.now()

page = getHtml(url)

soup = BeautifulSoup(page, 'html.parser')

# get doulist title

doulistTile = soup.html.head.title.string.encode('utf-8')

print " > 获取豆列：" + doulistTile

# get doulist about

content = soup.find("div", "doulist-about")

#print content.prettify().encode('utf-8')

doulistAbout = ''

for child in content.children:

if child.string != None:

htmlContent = child.string.strip().encode('utf-8')

doulistAbout = "{0}\n{1}".format(doulistAbout, htmlContent)

#print "doulist about:" + doulistAbout

# get page urls

pageUrls = []

content = soup.find("div", "paginator")

for child in content.children:

childStr = "{0}".format(child)

if childStr.startswith('<a href=') == True:

childStr = decodeHtmlSpecialCharacter(childStr)

pattern = re.compile(r'(<a href=")(.*)(=">)(\d*)(</a>)')

match = pattern.search(childStr)

if match:

hrefStr = match.group(2)

pageUrls.append(hrefStr)

bookInfos = []

# get books from current page

#print " scan page : {0}".format(url)

parseItemInfo(page, bookInfos)

# get books from follow pages

for pageUrl in pageUrls:

#print " scan page : {0}".format(hrefStr)

page = getHtml(pageUrl)

parseItemInfo(page, bookInfos)

exportToMarkdown(doulistTile, doulistAbout, bookInfos)

total = len(bookInfos)

end = datetime.datetime.now()

cost = end - start

print " > 共获取 {0} 本图书信息，耗时 {1} 秒 {2} 毫秒".format(total,

cost.microseconds/1000000, (cost.microseconds%1000000)/1000)

#=============================================================================

# 程序入口：解析指定豆列

parse("https://www.douban.com/doulist/1133232/")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

exportDoulistAsMarkdown.py

Latest commit

History

exportDoulistAsMarkdown.py

File metadata and controls