#! encoding=utf-8
# Author : kesalin@gmail.com
# Blog : http://kesalin.github.io
# Date : 2014/10/18
# Description : Export CSND blog articles to Markdown files.
# Version : 1.0.0.0
# Python Version: Python 2.7.3
#
import urllib2
import re
import os
import sys
import datetime
import time
import traceback
import codecs
from bs4 import BeautifulSoup
#===========================================================================
# set your CSDN username
__username__ = "kesalin"
# set output dir
__output__ = "C:/Code/Python"
enableLog = True
# for test
#__testArticleUrl__ = "http://blog.csdn.net/kesalin/article/details/5414998"
#===========================================================================
# 尝试获取资源次数
gRetryCount = 5
header = {"User-Agent": "Mozilla-Firefox5.0"}
def log(str):
if enableLog:
print str
newFile = open('log.txt', 'a+')
newFile.write(str + '\n')
newFile.close()
def decodeHtmlSpecialCharacter(htmlStr):
specChars = {" " : "", \
" " : "", \
" " : "", \
"<" : "<", \
">" : ">", \
"&" : "&", \
""" : "\"", \
"©" : "®", \
"×" : "×", \
"÷" : "÷", \
}
for key in specChars.keys():
htmlStr = htmlStr.replace(key, specChars[key])
return htmlStr
def repalceInvalidCharInFilename(filename):
specChars = {"\\" : "", \
"/" : "", \
":" : "", \
"*" : "", \
"?" : "", \
"\"" : "", \
"<" : "小于", \
">" : "大于", \
"|" : " and ", \
"&" :" or ", \
}
for key in specChars.keys():
filename = filename.replace(key, specChars[key])
return filename
# process html content to markdown content
def htmlContent2String(contentStr):
patternImg = re.compile(r'()')
patternHref = re.compile(r'()(.+?)()')
patternRemoveHtml = re.compile(r'?[^>]+>')
resultContent = patternImg.sub(r'', contentStr)
resultContent = patternHref.sub(r'[\4](\2)', resultContent)
resultContent = re.sub(patternRemoveHtml, r'', resultContent)
resultContent = decodeHtmlSpecialCharacter(resultContent)
return resultContent
def exportToMarkdown(exportDir, postdate, categories, title, content):
titleDate = postdate.strftime('%Y-%m-%d')
contentDate = postdate.strftime('%Y-%m-%d %H:%M:%S %z')
filename = titleDate + '-' + title
filename = repalceInvalidCharInFilename(filename)
filepath = exportDir + '/' + filename + '.markdown'
log(" >> save as " + filename)
newFile = open(unicode(filepath, "utf8"), 'w')
newFile.write('---' + '\n')
newFile.write('layout: post' + '\n')
newFile.write('title: \"' + title + '\"\n')
newFile.write('date: ' + contentDate + '\n')
newFile.write('comments: true' + '\n')
newFile.write('categories: [' + categories + ']' + '\n')
newFile.write('tags: [' + categories + ']' + '\n')
newFile.write('description: \"' + title + '\"\n')
newFile.write('keywords: ' + categories + '\n')
newFile.write('---' + '\n\n')
newFile.write(content)
newFile.write('\n')
newFile.close()
def download(url, output):
# 下载文章,并保存为 markdown 格式
log(" >> download: " + url)
data = None
title = ""
categories = ""
content = ""
postDate = datetime.datetime.now()
global gRetryCount
count = 0
while True:
if count >= gRetryCount:
break
count = count + 1
try:
time.sleep(2.0) #访问太快会不响应
request = urllib2.Request(url, None, header)
response = urllib2.urlopen(request)
data = response.read().decode('UTF-8')
break
except Exception,e:
exstr = traceback.format_exc()
log(" >> failed to download " + url + ", retry: " + str(count) + ", error:" + exstr)
pass
if data == None:
log(" >> failed to download " + url)
return
#print data
soup = BeautifulSoup(data)
topTile = "[置顶]"
titleDocs = soup.find_all("div", "article_title")
for titleDoc in titleDocs:
titleStr = titleDoc.a.get_text().encode('UTF-8')
title = titleStr.replace(topTile, '').strip()
#log(" >> title: " + title)
manageDocs = soup.find_all("div", "article_manage")
for managerDoc in manageDocs:
categoryDoc = managerDoc.find_all("span", "link_categories")
if len(categoryDoc) > 0:
categories = categoryDoc[0].a.get_text().encode('UTF-8').strip()
postDateDoc = managerDoc.find_all("span", "link_postdate")
if len(postDateDoc) > 0:
postDateStr = postDateDoc[0].string.encode('UTF-8').strip()
postDate = datetime.datetime.strptime(postDateStr, '%Y-%m-%d %H:%M')
contentDocs = soup.find_all(id="article_content")
for contentDoc in contentDocs:
htmlContent = contentDoc.prettify().encode('UTF-8')
content = htmlContent2String(htmlContent)
exportToMarkdown(output, postDate, categories, title, content)
def getPageUrlList(url):
# 获取所有的页面的 url
request = urllib2.Request(url, None, header)
response = urllib2.urlopen(request)
data = response.read()
#print data
soup = BeautifulSoup(data)
lastArticleHref = None
pageListDocs = soup.find_all(id="papelist")
for pageList in pageListDocs:
hrefDocs = pageList.find_all("a")
if len(hrefDocs) > 0:
lastArticleHrefDoc = hrefDocs[len(hrefDocs) - 1]
lastArticleHref = lastArticleHrefDoc["href"].encode('UTF-8')
if lastArticleHref == None:
return []
print " > last page href:" + lastArticleHref
lastPageIndex = lastArticleHref.rfind("/")
lastPageNum = int(lastArticleHref[lastPageIndex+1:])
urlInfo = "http://blog.csdn.net" + lastArticleHref[0:lastPageIndex]
pageUrlList = []
for x in xrange(1, lastPageNum + 1):
pageUrl = urlInfo + "/" + str(x)
pageUrlList.append(pageUrl)
log(" > page " + str(x) + ": " + pageUrl)
log("total pages: " + str(len(pageUrlList)) + "\n")
return pageUrlList
def getArticleList(url):
# 获取所有的文章的 url/title
pageUrlList = getPageUrlList(url)
articleListDocs = []
strPage = " > parsing page {0}"
pageNum = 0
global gRetryCount
for pageUrl in pageUrlList:
retryCount = 0
pageNum = pageNum + 1
pageNumStr = strPage.format(pageNum)
print pageNumStr
while retryCount <= gRetryCount:
try:
retryCount = retryCount + 1
time.sleep(1.0) #访问太快会不响应
request = urllib2.Request(pageUrl, None, header)
response = urllib2.urlopen(request)
data = response.read().decode('UTF-8')
#print data
soup = BeautifulSoup(data)
topArticleDocs = soup.find_all(id="article_toplist")
articleDocs = soup.find_all(id="article_list")
articleListDocs = articleListDocs + topArticleDocs + articleDocs
break
except Exception, e:
print "getArticleList exception:%s, url:%s, retry count:%d" % (e, pageUrl, retryCount)
pass
artices = []
topTile = "[置顶]"
for articleListDoc in articleListDocs:
linkDocs = articleListDoc.find_all("span", "link_title")
for linkDoc in linkDocs:
#print linkDoc.prettify().encode('UTF-8')
link = linkDoc.a
url = link["href"].encode('UTF-8')
title = link.get_text().encode('UTF-8')
title = title.replace(topTile, '').strip()
oneHref = "http://blog.csdn.net" + url
#log(" > title:" + title + ", url:" + oneHref)
artices.append([oneHref, title])
log("total articles: " + str(len(artices)) + "\n")
return artices
def getHtmlName(url):
htmlNameIndex = url.rfind("/");
urlLen = len(url)
htmlName = ""
if htmlNameIndex + 1 == urlLen:
htmlNameIndex = url.rfind("/", 0, htmlNameIndex)
htmlName = url[htmlNameIndex + 1:urlLen - 1]
else:
htmlName = url[htmlNameIndex + 1:]
return htmlName
def exportBlog(username, output):
url = "http://blog.csdn.net/" + username
outputDir = output + "/" + username
log(" >> user name: " + username)
log(" >> output dir: " + outputDir)
log("start export...")
outputDir.replace("\\", "/")
if not os.path.exists(outputDir.decode("utf-8")):
os.makedirs(outputDir.decode("utf-8"))
articleList = getArticleList(url)
totalNum = len(articleList)
log("start downloading...")
currentNum = 0
strPage = "[{0}/{1}] ".decode("utf-8").encode("utf-8")
for article in articleList:
currentNum = currentNum + 1
strPageTemp = strPage.format(currentNum, totalNum)
strPageTemp = strPageTemp + article[1]
#log(strPageTemp)
download(article[0], username)
log("============================================================")
exportBlog(__username__, __output__)
#download(__testArticleUrl__, __output__)