Skip to content

Commit 200eaf7

Browse files
committed
Create 0008.py
1 parent 4224771 commit 200eaf7

1 file changed

Lines changed: 19 additions & 0 deletions

File tree

razzl/0008/0008.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from __future__ import division#division
2+
import re
3+
import urllib2
4+
5+
url = 'http://world.cankaoxiaoxi.com/2015/0404/730644.shtml'

# Patterns are compiled once at import time instead of on every loop pass.
# HTML tag names are case-insensitive, so <SCRIPT>/<STYLE> must match too.
_SCRIPT_RE = re.compile(r'<script[^>]*>[\s\S]*?</script[^>]*>', re.IGNORECASE)
_STYLE_RE = re.compile(r'<style[^>]*>[\s\S]*?</style[^>]*>', re.IGNORECASE)
_TAG_RE = re.compile(r'<[^>]*>')
_LINE_BREAKS_RE = re.compile(r'[\r\n]+')


def extract_text(html, min_density=0.5):
    """Return the text-dense lines of an HTML document.

    <script> and <style> elements are removed first.  The remaining markup
    is split into physical lines, and a line is kept when the characters
    left after stripping its tags make up at least ``min_density`` of the
    raw line's length.

    Args:
        html: The page markup as a string.
        min_density: Minimum ratio of tag-free characters to raw characters
            for a line to count as body text (default 0.5).

    Returns:
        A list of the kept lines, tags removed and surrounding whitespace
        stripped, in document order.
    """
    html = _SCRIPT_RE.sub('', html)
    html = _STYLE_RE.sub('', html)

    kept = []
    for line in _LINE_BREAKS_RE.split(html):
        if not line.strip():
            continue
        text = _TAG_RE.sub('', line)
        # Written as a multiplication so the comparison does not depend on
        # true division being enabled for this module.
        if len(text) < min_density * len(line):
            continue
        text = text.strip()
        if text:
            kept.append(text)
    return kept


def fetch_html(page_url):
    """Download ``page_url`` and return the response body."""
    response = urllib2.urlopen(page_url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()


def main():
    """Fetch the configured page and print its text-dense lines."""
    for text in extract_text(fetch_html(url)):
        # A parenthesised single argument prints the same whether ``print``
        # is the Python 2 statement or a function.
        print(text)


if __name__ == '__main__':
    main()
17+
18+
19+

0 commit comments

Comments
 (0)