-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser2.py
More file actions
136 lines (104 loc) · 2.63 KB
/
parser2.py
File metadata and controls
136 lines (104 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
#encoding:utf-8
from sgmllib import SGMLParser
import urllib2, re
# Scrape the cover image
# state: div 1
# div class = "boxdiv1" > img src
# Scrape the comic title
# state: div 2, li 1, a 1
# div class = "boxdiv1" > div class = "pictext" > ul > li > a text
# Scrape the author
# state: div 2, li 2, span 1
# div class = "boxdiv1" > div class = "pictext" > ul > li > span text
# Scrape the category
# state: div 2, li 3
# div class = "boxdiv1" > div class = "pictext" > ul > li text
# Scrape the latest update
# state: div 2, li 4, a 2
# div class = "boxdiv1" > div class = "pictext" > ul > li > a text
# Scrape the status
# state: div 2, li 5
# div class = "boxdiv1" > div class = "pictext" > ul > li > text
# Scrape the update time
# state: div 2, li 6, span 2
# div class = "boxdiv1" > div class = "pictext" > ul > li > span text
class Parser(SGMLParser):
    """Collect comic records from the "boxdiv1" blocks of an HTML page.

    The class is a small state machine driven by the ``start_<tag>`` /
    ``end_<tag>`` / ``handle_data`` callbacks that SGMLParser invokes while
    parsing.  Every finished record is appended to ``self.info`` as a
    7-tuple ``(img, name, author, category, update, status, time)``.

    State attributes:
        test_div     -- 0: idle; 1: a "boxdiv1" div has been opened;
                        2: its "pictext" div has been opened.
        test_li      -- number of <li> tags opened inside "pictext" so far
                        (wraps back to 0 when the sixth one closes).
        test_a       -- 1 inside an <a> of li 1 (name), 2 inside an <a> of
                        li 4 (update), otherwise 0.
        test_span    -- 1 inside a <span> of li 2 (author), 2 inside a
                        <span> of li 6 (time), otherwise 0.
        test_end_div -- number of </div> tags counted for the current record.
    """

    # A record is considered complete once this many </div> tags were seen.
    _DIVS_PER_RECORD = 3
    # The <li> whose closing tag wraps the li counter back to 0.
    _LAST_LI = 6
    # li index -> value stored in test_a / test_span when that tag opens.
    _A_STATE_BY_LI = {1: 1, 4: 2}
    _SPAN_STATE_BY_LI = {2: 1, 6: 2}

    def reset(self):
        """Reset the underlying parser, all scraping state and ``info``.

        SGMLParser calls this implicitly at instantiation time.
        """
        SGMLParser.reset(self)
        self.test_div = 0
        self.test_end_div = 0
        self._clear_record()
        self.info = []

    def _clear_record(self):
        """Blank the per-record fields and the li/a/span position state."""
        self._clear_position()
        self.img = ""
        self.name = ""
        self.author = ""
        self.category = ""
        self.update = ""
        self.status = ""
        self.time = ""

    def _clear_position(self):
        """Forget which <li>/<a>/<span> the parser is currently inside."""
        self.test_li = 0
        self.test_a = 0
        self.test_span = 0

    def start_div(self, attrs):
        """Track entry into the "boxdiv1" block and its "pictext" child."""
        for key, value in attrs:
            if key != "class":
                continue
            if value == "boxdiv1":
                # New record: drop everything left over from the previous
                # one so that a missing field stays empty instead of
                # silently inheriting the previous record's value.
                self._clear_record()
                self.test_div = 1
            elif value == "pictext" and self.test_div == 1:
                self.test_div = 2

    def end_div(self):
        """Count closing divs and emit the record after the third one."""
        if self.test_div == 0:
            return
        self.test_end_div += 1
        if self.test_end_div < self._DIVS_PER_RECORD:
            return
        self.info.append((self.img, self.name, self.author, self.category,
                          self.update, self.status, self.time))
        # Go idle.  Without this, every further group of three unrelated
        # </div> tags later in the page would append a duplicate record and
        # stray <li> tags would keep advancing the li counter.  The field
        # values themselves are kept until the next "boxdiv1" opens.
        self.test_end_div = 0
        self.test_div = 0
        self._clear_position()

    def start_img(self, attrs):
        """Remember the ``src`` of an image seen before "pictext" opens."""
        if self.test_div != 1:
            return
        for key, value in attrs:
            if key == "src":
                self.img = value

    def start_li(self, attrs):
        """Advance the li counter while inside "pictext"."""
        if self.test_div == 2:
            self.test_li += 1

    def end_li(self):
        """Wrap the li counter back to 0 after the last expected <li>."""
        if self.test_div == 2 and self.test_li == self._LAST_LI:
            self.test_li = 0

    def start_a(self, attrs):
        """Flag links inside li 1 (name) and li 4 (update)."""
        state = self._A_STATE_BY_LI.get(self.test_li)
        if state is not None:
            self.test_a = state

    def end_a(self):
        """Leave link context."""
        self.test_a = 0

    def handle_data(self, data):
        """Store non-blank text into the field selected by the current state.

        The checks are ordered: link/span context for name and author wins
        over the plain li-based category/status checks, which in turn are
        tested before the update link and the time span.
        """
        if not data.strip():
            return
        if self.test_a == 1:
            self.name = data
        elif self.test_span == 1:
            self.author = data
        elif self.test_li == 3:
            self.category = data
        elif self.test_li == 5:
            self.status = data
        elif self.test_a == 2:
            self.update = data
        elif self.test_span == 2:
            self.time = data

    def start_span(self, attrs):
        """Flag spans inside li 2 (author) and li 6 (time)."""
        state = self._SPAN_STATE_BY_LI.get(self.test_li)
        if state is not None:
            self.test_span = state

    def end_span(self):
        """Leave span context."""
        self.test_span = 0
# Script body: parse the HTML saved in input.txt and dump every scraped
# record to output.txt.
with open("input.txt") as f:
    content = f.read()

parser = Parser()
parser.feed(content)
# feed() may leave an incomplete trailing construct buffered; close() forces
# it to be processed as if it were followed by end-of-file.
parser.close()

# Output format: one field per line, with a blank line after each record.
with open("output.txt", "w") as f:
    for record in parser.info:
        f.writelines(field + "\n" for field in record)
        f.write("\n")