Skip to content

Commit 352bae8

Browse files
authored
Add files via upload
1 parent d5bd6db commit 352bae8

5 files changed

Lines changed: 264 additions & 0 deletions

File tree

百度采集器/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
## 百度 ##
2+
- [x] 支持自定义语法
3+
- [x] 取有关键字的链接
4+
- [x] 保存功能
191 Bytes
Binary file not shown.

百度采集器/config/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
SEARCH='inurl:admin.php' # search syntax: the query string submitted to Baidu
2+
GUANJIANZI='' # keyword a result must contain to be kept; leave empty to keep every result
3+
PAGE=14 # number of result pages to fetch

百度采集器/main.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# author:jiushi
2+
# time:2019/7/11
3+
# file:main.py
4+
5+
6+
from gevent import monkey;monkey.patch_all()
7+
from urllib.request import quote
8+
from multiprocessing import Process
9+
import sys
10+
import itertools
11+
import time
12+
import gevent
13+
import requests
14+
import json
15+
import config.config
16+
import re
17+
18+
class Request:
    """Collect Baidu search results and report the sites they link to.

    ``djc`` builds one Baidu result-page URL per configured page and hands
    them, in batches, to worker processes.  Each worker (``xc``) fetches its
    pages concurrently with gevent greenlets (``baidu_search``), then prints
    and saves the collected lines (``echo``).

    Every worker process operates on its own copy of this object, so the
    duplicate filter in ``baidu_search`` only applies within one worker.
    """

    # Number of result-page URLs handed to each worker process.
    BATCH_SIZE = 10
    # Seconds to wait for a Baidu result page.
    SEARCH_TIMEOUT = 10
    # Seconds to wait for each individual result link.
    RESULT_TIMEOUT = 3
    # One ``data-tools='{...}'`` attribute; non-greedy so that several
    # attributes on the same HTML line are matched separately.
    _DATA_TOOLS_RE = re.compile(r"data-tools='(\{.*?\})'")

    def __init__(self):
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        self.djcs = []  # result-page URLs waiting for the next worker process
        self.guol = []  # report lines collected but not yet echoed
        self.guanjianzhi = config.config.GUANJIANZI  # keyword filter ('' = keep all)
        self.xcs = []   # greenlets spawned by xc()
        self.url = []   # every report line seen so far (duplicate filter)
        self.calc = 0   # number of URLs currently queued in self.djcs
        self.kq = 0     # spinner frames already drawn by banner()

    def banner(self):
        """Draw a short spinner animation (30 frames, 0.1 s each) on stdout."""
        write, flush = sys.stdout.write, sys.stdout.flush
        for frame in itertools.cycle('|/- \\'):
            # ">=" rather than "==" so a counter that is already past the
            # limit terminates instead of spinning forever.
            if self.kq >= 30:
                flush()
                break
            text = 'Start Baidu Search:' + frame
            write(text)
            flush()
            time.sleep(.1)
            # Move the cursor back so the next frame overwrites this one.
            write('\x08' * len(text))
            self.kq += 1

    @classmethod
    def _extract_results(cls, html):
        """Return the ``data-tools`` JSON objects found in *html*.

        Only objects that decode as JSON dicts and carry both a ``url`` and
        a ``title`` key are returned; anything else is skipped.
        """
        results = []
        for raw in cls._DATA_TOOLS_RE.findall(html):
            try:
                entry = json.loads(raw)
            except ValueError:
                # Malformed attribute payload: ignore this entry only.
                continue
            if isinstance(entry, dict) and 'url' in entry and 'title' in entry:
                results.append(entry)
        return results

    @staticmethod
    def _page_url(search, page):
        """Build the Baidu URL for zero-based result page *page* of *search*."""
        return 'https://www.baidu.com/s?wd={}&pn={}&oq=1'.format(quote(search), page * 10)

    def baidu_search(self, url):
        """Fetch one Baidu result page and probe every result it lists.

        For each result the link is requested and a line of the form
        ``url:... title:... server:... x-power-by:...`` is appended to
        ``self.guol`` unless the same line was already recorded in
        ``self.url``.  Network failures are best-effort: a failing result is
        skipped, a failing result page is reported on stderr.

        :param url: Baidu result-page URL to fetch.
        """
        try:
            page = requests.get(url=url, headers=self.headers, timeout=self.SEARCH_TIMEOUT)
        except requests.RequestException as exc:
            print('search page failed: {} ({})'.format(url, exc), file=sys.stderr)
            return

        for entry in self._extract_results(page.text):
            try:
                resp = requests.get(url=entry['url'], headers=self.headers, timeout=self.RESULT_TIMEOUT)
            except (requests.RequestException, ValueError):
                # Unreachable or malformed target: skip it, keep going.
                continue

            server = resp.headers.get('Server', '')
            power = resp.headers.get('x-powered-by', '')
            line = 'url:{} title:{} server:{} x-power-by:{}'.format(resp.url, entry['title'], server, power)
            if line in self.url:
                continue
            self.url.append(line)
            self.guol.append(line)

    def echo(self):
        """Print the pending report lines and append them to ``save.txt``.

        When ``self.guanjianzhi`` is non-empty only lines containing it are
        emitted.  ``self.guol`` is emptied afterwards.  The file is opened
        only when there is something to write, and is closed on exit.
        """
        keyword = self.guanjianzhi
        selected = [c for c in self.guol if keyword == '' or keyword in str(c)]
        if selected:
            with open('save.txt', 'a', encoding='utf-8') as out:
                for c in selected:
                    print(c)
                    print(c, file=out)
        self.guol.clear()

    def xc(self, rw):
        """Fetch every result page in *rw* concurrently, then echo the output.

        :param rw: iterable of Baidu result-page URLs.
        """
        self.xcs.extend(gevent.spawn(self.baidu_search, page_url) for page_url in rw)
        gevent.joinall(self.xcs)
        self.echo()

    def _start_worker(self):
        """Start a process running ``xc`` on the queued URLs and reset the queue."""
        # Pass a copy so clearing the queue cannot affect the worker's batch.
        worker = Process(target=self.xc, args=(list(self.djcs),))
        worker.start()
        self.calc = 0
        self.djcs.clear()
        return worker

    def djc(self):
        """Queue one URL per configured page and dispatch them in batches.

        A worker process is started for every ``BATCH_SIZE`` URLs and one
        more for any remainder.  The workers are started but not joined here.
        """
        for page in range(config.config.PAGE):
            if self.calc == self.BATCH_SIZE:
                self._start_worker()
            self.djcs.append(self._page_url(config.config.SEARCH, page))
            self.calc += 1
        if self.djcs:
            self._start_worker()
102+
103+
if __name__ == '__main__':
    # Script entry point: show the start-up spinner, then dispatch the
    # paged Baidu search to the worker processes.
    collector = Request()
    collector.banner()
    collector.djc()

0 commit comments

Comments
 (0)