python_learning/html_parser.py at master · BinWayne/python_learning

35 lines (29 loc) · 1.04 KB

import re

try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3: urljoin lives in urllib.parse

from bs4 import BeautifulSoup
class HtmlParser(object):
    """Pull follow-up links and lemma data out of a downloaded HTML page."""

    # Links worth following have an href containing "/view/<digits>.htm".
    # Compiled once here instead of on every call.
    _LEMMA_LINK = re.compile(r"/view/\d+\.htm")

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute lemma URLs linked from the page.

        Args:
            page_url: URL the page was fetched from; used as the base for
                resolving relative hrefs.
            soup: parsed document exposing ``find_all``.

        Returns:
            A set of absolute URL strings (duplicates collapse naturally).
        """
        links = soup.find_all('a', href=self._LEMMA_LINK)
        return {urlparse.urljoin(page_url, link['href']) for link in links}

    def _get_new_data(self, page_url, soup):
        """Collect the page URL, title and summary into a dict.

        Args:
            page_url: URL the page was fetched from.
            soup: parsed document exposing ``find``.

        Returns:
            A dict with the keys ``'url'``, ``'title'`` and ``'summary'``.

        Raises:
            AttributeError: if the title container, its ``<h1>``, or the
                summary node is absent, because ``find`` then yields None.
        """
        # Title markup: <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        # Summary markup: <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        return {
            'url': page_url,
            'title': title_node.get_text(),
            'summary': summary_node.get_text(),
        }

    def parse(self, page_url, html_cont):
        """Parse one downloaded page.

        Args:
            page_url: URL the content was fetched from.
            html_cont: raw HTML content of the page.

        Returns:
            ``None`` when either argument is None; otherwise a tuple
            ``(new_urls, new_data)`` as produced by ``_get_new_urls`` and
            ``_get_new_data``.
        """
        # Nothing to parse: bail out before handing None to BeautifulSoup.
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

html_parser.py

Latest commit

History

html_parser.py

File metadata and controls