Skip to content

Commit 9d45318

Browse files
author
caocheng
committed
update code
1 parent c235046 commit 9d45318

File tree

4 files changed

+222
-8
lines changed

4 files changed

+222
-8
lines changed

eastmoney/eastmoney.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,13 @@ def run_detail1(code,name,url):
5454
else:
5555
content=tag.a.text
5656
code=re.findall(r'\d+',content)[0]
57-
#print(code)
57+
print(code)
5858
name=content.split(')')[1]
59-
#print(name)
59+
print(name)
6060
url=tag.a['href']
61-
#print(content)
61+
print(content)
6262
content_dict={'code':code,'name':name,'url':url}
63-
#print (content_dict)
63+
print (content_dict)
6464
col1.insert(content_dict)
6565
time.sleep(0.1)
6666
run_detail1(code,name,url)

eastmoney/fund_spider.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import requests
2+
import pandas as pd
3+
import re
4+
import json
5+
import time
6+
import random
7+
import math
8+
from bs4 import BeautifulSoup
9+
10+
11+
def get_fundcode():
    '''
    Fetch the master list of fund codes from eastmoney.

    :return: DataFrame with columns ['fundcode', 'name', 'category'],
             truncated to the first 101 funds (the to_csv persistence the
             original docstring mentioned is commented out below).
    '''
    url = 'http://fund.eastmoney.com/js/fundcode_search.js'
    r = requests.get(url)
    # The endpoint returns JavaScript like "var r = [[...],[...]];" --
    # extract the bracketed array literal and parse it as JSON.
    cont = re.findall(r'var r = (.*])', r.text)[0]
    ls = json.loads(cont)
    fundcode = pd.DataFrame(ls, columns=['fundcode', 'fundsx', 'name', 'category', 'fundpy'])
    # .loc slicing is end-inclusive: rows 0..100 -> 101 funds kept as a sample.
    fundcode = fundcode.loc[0:100, ['fundcode', 'name', 'category']]
    # fundcode.to_csv('./fundcode.csv', index=False, encoding='gbk')
    return fundcode
24+
25+
def get_fundjbgk():
    '''
    Crawl the "basic overview" (jbgk) page of every fund returned by
    get_fundcode() and store the scraped cells as ./fund_info.csv (gbk).
    '''
    fund_jbgk = []
    fund_list = get_fundcode()
    for i in fund_list['fundcode']:
        jbgk_addr = f'http://fundf10.eastmoney.com/jbgk_{i}.html'
        g = requests.get(jbgk_addr)
        # Let requests sniff the real charset (pages are not utf-8).
        g.encoding = g.apparent_encoding
        s = BeautifulSoup(g.text, 'html.parser')
        table = s.find('table', {'class': 'info w790'})
        if table is None:
            # Missing/re-designed fund page: the original crashed with
            # AttributeError here; skip the fund instead.
            print(f'no overview table for fund {i}, skipped')
            continue
        temp_jbgk = []
        for row in table.findAll('tr'):
            for col in row.findAll('td'):
                temp_jbgk.append(col.get_text())
        fund_jbgk.append(temp_jbgk)
        # Throttle between requests to avoid getting blocked.
        time.sleep(random.randint(1, 3))
    df_jbgk = pd.DataFrame(fund_jbgk, columns=['基金全称', '基金简称', '基金代码', '基金类型', '发行日期', '成立日期/规模', '资产规模', '份额规模', '基金管理人', '基金托管人', '基金经理人', '成立来分红', '管理费率', '托管费率', '销售服务费率', '最高认购费率'])
    df_jbgk.to_csv('./fund_info.csv', index=False, encoding='gbk')
42+
43+
44+
def get_one_page(fundcode, pageIndex=1):
    '''
    Fetch one page of a fund's historical net asset values (JSONP).

    :param fundcode: str, fund code
    :param pageIndex: int, 1-based page number
    :return: str response body on HTTP 200, otherwise None (including on
             any requests-level failure)
    '''
    url = 'http://api.fund.eastmoney.com/f10/lsjz'
    cookie = 'EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND8=null; EMFUND0=null; EMFUND9=01-24 17:11:50@#$%u957F%u4FE1%u5229%u5E7F%u6DF7%u5408A@%23%24519961; st_pvi=27838598767214; st_si=11887649835514'
    headers = {
        'Cookie': cookie,
        'Host': 'api.fund.eastmoney.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        # Referer mimics the fund detail page; presumably required by the
        # API -- TODO confirm.
        'Referer': 'http://fundf10.eastmoney.com/jjjz_%s.html' % fundcode,
    }
    params = {
        # JSONP callback name; parse_one_page strips this wrapper.
        'callback': 'jQuery18307633215694564663_1548321266367',
        'fundCode': fundcode,
        'pageIndex': pageIndex,
        'pageSize': 20,
    }
    try:
        r = requests.get(url=url, headers=headers, params=params)
        if r.status_code == 200:
            return r.text
        return None
    except requests.RequestException:
        # Fix: the original caught the bare name `RequestException`, which is
        # never imported in this module, so a failed request raised NameError
        # instead of returning None as intended.
        return None
72+
73+
74+
def parse_one_page(html):
    '''
    Parse one JSONP page of historical net asset values.

    :param html: str JSONP body (e.g. "jQuery...({...})"), or None
    :return: dict {'lsjz': DataFrame of the LSJZList rows,
                   'total_page': int, ceil(TotalCount / 20)},
             or None when html is None
    '''
    if html is None:
        return None
    # Strip the JSONP callback wrapper: grab the payload between the
    # first '(' and the next ')'.
    content = re.findall(r'\((.*?)\)', html)[0]
    # Fix: the original called json.loads(content) twice; parse once.
    data = json.loads(content)
    lsjz_list = data['Data']['LSJZList']
    total_count = data['TotalCount']
    total_page = math.ceil(total_count / 20)  # API serves 20 rows per page
    return {'lsjz': pd.DataFrame(lsjz_list),
            'total_page': total_page}
90+
91+
92+
def main(fundcode):
    '''
    Crawl every historical-NAV page of one fund into ./<fundcode>_lsjz.csv
    (gbk): first page written with header, later pages appended without.
    '''
    html = get_one_page(fundcode)
    info = parse_one_page(html)
    if info is None:
        # Fix: the original subscripted info unconditionally and crashed
        # with TypeError when the very first request failed.
        print('failed to fetch first page for %s' % fundcode)
        return
    total_page = info['total_page']
    lsjz = info['lsjz']
    lsjz.to_csv('./%s_lsjz.csv' % fundcode, index=False, encoding='gbk')
    page = 1
    while page < total_page:
        page += 1
        print(lsjz)  # progress/debug output: shows the page just stored
        html = get_one_page(fundcode, pageIndex=page)
        info = parse_one_page(html)
        if info is None:
            # Request failure mid-crawl: keep what was already stored.
            break
        lsjz = info['lsjz']
        # Append subsequent pages without repeating the header row.
        lsjz.to_csv('./%s_lsjz.csv' % fundcode, mode='a', index=False, header=False, encoding='gbk')
        # Throttle between pages to avoid getting blocked.
        time.sleep(random.randint(3, 5))
112+
113+
114+
if __name__ == '__main__':
    # Scrape the basic-overview table for every sampled fund code.
    get_fundjbgk()
    # # fundcode = '519961'
    # fundcodes = pd.read_csv('./fundcode.csv', converters={'fundcode': str})
    # # Fetch the NAV history of every fund.
    # for fundcode in fundcodes['fundcode']:
    #     print(fundcode)
    #     main(fundcode)
    #     time.sleep(random.randint(5, 10))

eastmoney/geturl.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,13 @@
55

66
# Pool of desktop browser User-Agent strings; one is picked at random per
# request to look less like a bot.
UA_LIST = [ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" ]
# Default request headers with a randomly chosen User-Agent.
header={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Connection': 'keep-alive','User-Agent': random.choice(UA_LIST) }
# Former proxy pool, disabled in this commit (presumably dead proxies --
# TODO confirm before re-enabling).
# proxies=['http://118.178.124.33:3128',
# 'http://139.129.166.68:3128',
# 'http://61.163.39.70:9999',
# 'http://61.143.228.162']

# Current HTTP proxy pool; a random entry is used per request downstream.
proxies=['http://61.135.217.7:80']
1215
def geturl_gbk(url):
1316
html=requests.get(url,headers=header,proxies={'http':random.choice(proxies)}).content.decode('gbk')
1417
soup=BeautifulSoup(html,'lxml')

eastmoney/test.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import requests
2+
import json
3+
from time import time
4+
from lxml import etree
5+
import re
6+
from random import sample
7+
from time import sleep
8+
9+
# Running counter of detail records fetched (used for progress printing).
np = 1

# Site root; each fund's detail page lives at <base><fundcode>.html.
base = "http://fund.eastmoney.com/"

# Index of the NEXT ranking page to request (page 1 comes from the
# initial url below).
acount = 2

# Headers sent with every request to the ranking endpoint.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4620.400 QQBrowser/9.7.13014.400'
}

# Ranking API url; ends in "v=0." so a timestamp can be appended as a
# cache-buster, and "&pi=1" is rewritten to page through results.
url = "http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft=all&rs=&gs=0&sc=zzf&st=desc&sd=2017-07-09&ed=2018-07-09&qdii=&tabSubtype=,,,,,&pi=1&pn=50&dx=1&v=0."
17+
18+
19+
def IndexSpider(url, headers):
    '''
    Fetch one ranking page.

    :param url: ranking-api url ending in "v=0." -- the current timestamp
                is appended as a crude cache-buster before requesting.
    :param headers: dict of HTTP headers to send
    :return: str, utf-8 decoded response body
    '''
    url = url + str(int(time()))  # becomes ...v=0.<timestamp>
    rsp = requests.get(url, headers=headers).content
    html = rsp.decode('utf-8')
    # Fix: the original then rebound the local `url` to strip the timestamp
    # again ("url = url[:-10]"); the new value never left the function, so
    # the dead assignment has been removed. Callers keep their own url.
    return html
25+
26+
27+
def ChangeUrl(url):
    '''
    Advance the ranking url to the next page (the "pi" query parameter is
    the page index) and bump the module-level page counter `acount`.

    Fix: the original replaced the literal "&pi=1", which only matches on
    the first call; once the url carried "&pi=2" the replacement silently
    did nothing and the same page was fetched for the rest of the run.
    Substitute whatever page index is currently present instead.

    :param url: current ranking url containing "&pi=<n>"
    :return: url with the page index set to `acount`
    '''
    global acount
    url = re.sub(r'&pi=\d+', '&pi=' + str(acount), url)
    acount = acount + 1
    return url
32+
33+
34+
def ChangeUrl_2(jijindaima):
    '''
    Build the detail-page url for one fund code.

    :param jijindaima: fund code string, possibly still wrapped in double
                       quotes left over from the crude response splitting
                       in __main__ -- they are stripped here.
    :return: str, e.g. "http://fund.eastmoney.com/000001.html"
    '''
    # `base` is only read, so the original's `global base` declaration was
    # unnecessary and has been dropped.
    jijindaima = jijindaima.replace('\"', '')
    url_2 = base + jijindaima + '.html'
    return url_2
39+
40+
41+
def DetailRequest(url):
    '''
    Fetch one fund's detail page and scrape three fields out of its HTML.

    Increments the module-level record counter `np` and prints progress.

    :param url: detail-page url (stray double quotes are stripped first)
    :return: tuple (manager, fund type, founding date)
    '''
    global np
    url = url.replace('\"', '')
    print(url)
    print("正在爬取第{0}条记录".format(np))
    np = np + 1
    page = requests.get(url, headers=headers).content.decode('utf-8')
    # Regexes target fixed markup fragments of the detail page.
    type_pattern = re.compile('基金类型(.*?)</a>')
    manager_pattern = re.compile('基金经理:<a href=(.*?)</a>')
    founded_pattern = re.compile('<td><span class="letterSpace01">成 立 日</span>:(.*?)</td>')
    leixing = type_pattern.findall(page)[0][-3:]  # keep trailing 3 chars of the match
    jingli = manager_pattern.findall(page)[0][-2:]  # keep trailing 2 chars of the match
    chengliri = founded_pattern.findall(page)[0]
    return jingli, leixing, chengliri
56+
57+
58+
if __name__ == '__main__':
    # NOTE(review): this block was recovered from a formatting-mangled diff;
    # the nesting below is a best-guess reconstruction -- verify against the
    # original file before relying on it.
    nw = 1                    # counter of records written to w.txt
    url2_detail = []          # (manager, type, founding date) tuples
    jijindaima_list = []      # fund codes scraped from the ranking pages
    detail_url_list = []      # detail-page urls derived from the codes
    # Write the tab-separated header line once.
    with open('w.txt', 'a', encoding='utf-8') as f:
        f.write("基金代码\t\t基金简称\t\t单位净值\t\t累计净值\t\t基金经理\t\t基金类型\t\t成立日\n")
    # Crawl ranking pages 1..31.
    for i in range(1, 32):
        html = IndexSpider(url, headers=headers)
        url = ChangeUrl(url)  # advance the "pi" page parameter for next round
        # Crude extraction: keep the text between the first '[' and ']'.
        right = html.find("]")
        left = html.find("[")
        html = html[left + 1:right]
        lists = html.split("\",\"")
        # First comma-separated field of each record is the fund code.
        # NOTE(review): `list` shadows the builtin; left untouched here.
        for list in lists:
            l = list.split(",")
            jijindaima_list.append(l[0])
        # NOTE(review): these lists are never cleared between pages, so each
        # outer iteration re-processes every previously seen fund -- confirm
        # whether that is intended.
        for i in jijindaima_list:
            detail_url_list.append(ChangeUrl_2(i))
        for i in detail_url_list:
            url2_detail.append(DetailRequest(i))
        with open('w.txt', 'a', encoding='utf-8') as f:
            # NOTE(review): `lists` only holds the CURRENT page while
            # url2_detail accumulates all pages; zip truncates to the shorter.
            for list, l2 in zip(lists, url2_detail):
                l = list.split(",")
                f.writelines(
                    l[0] + '\t\t' + l[1] + '\t\t' + l[4] + '\t\t' + l[5] + '\t\t' + l2[0] + '\t\t' + l2[1] + '\t\t' +
                    l2[2] + '\n')
                print('正在写入第{0}条记录……'.format(nw))
                nw = nw + 1
        print("5秒后爬取下一页……")
        sleep(5)  # throttle between ranking pages

0 commit comments

Comments
 (0)