参考文章:http://www.sohu.com/a/149042886_572440
如下筛选出最优基金“东吴阿尔法灵活配置混合”
[127 rows x 15 columns]
3年,2年,1年选取:
['华泰柏瑞创新动力混合', '申万菱信沪深300价值指数', '博时产业新动力混合', '农银策略价值混合', '交银消费新驱动股票', '海富通中证100', '富国天惠成长混合A', '银河蓝筹混合', '兴全沪深300指数(LOF)', '融通转型三动力灵活配置混合']
1年,半年,3月,1月选取:
{'诺德成长优势混合', '东吴阿尔法灵活配置混合', '前海开源工业革命4.0混合', '国联安鑫富混合C'}
3年,2年,1年,半年,3个月,1个月选取:
{'诺德成长优势混合', '东吴阿尔法灵活配置混合'}
3年,2年,1年,半年,3个月,1个月,一周选取:
{'东吴阿尔法灵活配置混合'}
Process finished with exit code 0
一.抓取天天基金首页的基金信息并存储到csv
def get_fund():  # fetch the fund ranking list
    """Download the fund ranking table, cache it in ``fund.csv`` and return it.

    The ranking URL is filled in with the maximum number of funds, today's
    date (``ed``) and the date ``5 * 365 + 1`` days ago (``sd``).  The response
    text is flattened into 25-field records, of which 16 columns are kept.

    Returns:
        pandas.DataFrame: one row per fund; every value is still the raw
        string taken from the response.

    Raises:
        ValueError: if the download failed, or the response contains no
            bracketed data array or no records.
    """
    # Assemble the request URL by filling in the placeholders.
    max_jj = '5000'  # use 5 while debugging, 5000 for a real run
    now = datetime.datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - datetime.timedelta(days=5 * 365 + 1)).strftime('%Y-%m-%d')
    url = "http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft=all&rs=&gs=0&sc=zzf&st=desc&sd=#custday&ed=#nowdate&qdii=&tabSubtype=,,,,,&pi=1&pn=#count&dx=1"
    url = url.replace('#count', max_jj)
    url = url.replace('#nowdate', end_date)
    url = url.replace('#custday', start_date)

    # Fetch the response text; get_html() returns "" when the request fails.
    s = get_html(url)

    # Cut out the payload between the first '[' and the last ']'.  The
    # original code sliced fixed offsets (s[22:-159]), which only works while
    # the wrapper around the array keeps exactly the same length.
    # NOTE(review): assumes the response holds a single [...] array of
    # records -- confirm against a live response.
    start = s.find('[')
    end = s.rfind(']')
    if start == -1 or end <= start:
        raise ValueError('fund ranking response is empty or has no data array')
    s = s[start + 1:end]
    for x in ['"', "'", ']', '[']:
        s = s.replace(x, '')
    if not s.strip():
        raise ValueError('fund ranking response contains no records')

    # Every record is 25 comma-separated fields; regroup the flat list.
    lst = s.split(',')
    lst = split_list(lst, 25)
    frame = pd.DataFrame(lst,
                         columns=['code', 'name', 'py', '3', '4', 'jz', 'day1', 'week1', 'month1', 'month3', 'month6',
                                  'year1', 'year2', 'year3', 'year0', 'yearall', 'fromdate', '17', 'year5', '19', '20',
                                  '21', '22', '23', '24'])
    # Keep the named columns only; the numerically named ones are dropped.
    frame = frame.iloc[:, [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18]]
    frame.to_csv('fund.csv')
    return frame
二.筛选数据
# Rank all funds by their one-year figure and start from the top X of them.
# (df_full and X are defined by the surrounding program.)
df_full = df_full.sort_values(by='year1', axis=0, ascending=False)
df = df_full.head(X)
# Keep only the funds that are also among the top X for every listed period.
# Joining on the 'code' column alone keeps the original column names; joining
# the full frames would duplicate every column with _x/_y suffixes.
for xx in ['year1', 'year2', 'year3', 'year5', 'month6']:
    tmp = df_full.sort_values(by=xx, axis=0, ascending=False).head(X)
    df = df.merge(tmp[['code']], on='code')
# Same positional selection as before: the first 15 columns of the table.
df = df.iloc[:, 0:15]
df.to_csv('result.csv')
print('1年选取:')
print(df)
print('')
完整代码
import pandas as pd
import requests
import datetime
def get_html(url):  # fetch the HTML text
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: address to request with HTTP GET.

    Returns:
        str: the response text, or ``""`` when the request fails (connection
        problem, timeout, invalid URL or an HTTP error status).
    """
    try:
        # A timeout keeps a stalled server from blocking the program forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures are turned into an empty result; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ""
    r.encoding = 'utf-8'
    return r.text
def split_list(datas, n):
    """Reshape a flat sequence into consecutive chunks of *n* items (1-D to 2-D).

    Args:
        datas: sliceable sequence to split.
        n: chunk size; must be a positive integer.

    Returns:
        list: the chunks in order.  The last chunk is shorter when
        ``len(datas)`` is not a multiple of *n*; an empty input gives ``[]``.

    Raises:
        ValueError: if *n* is not positive (a zero size used to fail with
            ZeroDivisionError and a negative one silently returned ``[]``).
    """
    if n <= 0:
        raise ValueError('chunk size n must be a positive integer')
    return [datas[start:start + n] for start in range(0, len(datas), n)]
def get_fund():  # fetch the fund ranking list
    """Download the fund ranking table, cache it in ``fund.csv`` and return it.

    The ranking URL is filled in with the maximum number of funds, today's
    date (``ed``) and the date ``5 * 365 + 1`` days ago (``sd``).  The response
    text is flattened into 25-field records, of which 16 columns are kept.

    Returns:
        pandas.DataFrame: one row per fund; every value is still the raw
        string taken from the response.

    Raises:
        ValueError: if the download failed, or the response contains no
            bracketed data array or no records.
    """
    # Assemble the request URL by filling in the placeholders.
    max_jj = '5000'  # use 5 while debugging, 5000 for a real run
    now = datetime.datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - datetime.timedelta(days=5 * 365 + 1)).strftime('%Y-%m-%d')
    url = "http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft=all&rs=&gs=0&sc=zzf&st=desc&sd=#custday&ed=#nowdate&qdii=&tabSubtype=,,,,,&pi=1&pn=#count&dx=1"
    url = url.replace('#count', max_jj)
    url = url.replace('#nowdate', end_date)
    url = url.replace('#custday', start_date)

    # Fetch the response text; get_html() returns "" when the request fails.
    s = get_html(url)

    # Cut out the payload between the first '[' and the last ']'.  The
    # original code sliced fixed offsets (s[22:-159]), which only works while
    # the wrapper around the array keeps exactly the same length.
    # NOTE(review): assumes the response holds a single [...] array of
    # records -- confirm against a live response.
    start = s.find('[')
    end = s.rfind(']')
    if start == -1 or end <= start:
        raise ValueError('fund ranking response is empty or has no data array')
    s = s[start + 1:end]
    for x in ['"', "'", ']', '[']:
        s = s.replace(x, '')
    if not s.strip():
        raise ValueError('fund ranking response contains no records')

    # Every record is 25 comma-separated fields; regroup the flat list.
    lst = s.split(',')
    lst = split_list(lst, 25)
    frame = pd.DataFrame(lst,
                         columns=['code', 'name', 'py', '3', '4', 'jz', 'day1', 'week1', 'month1', 'month3', 'month6',
                                  'year1', 'year2', 'year3', 'year0', 'yearall', 'fromdate', '17', 'year5', '19', '20',
                                  '21', '22', '23', '24'])
    # Keep the named columns only; the numerically named ones are dropped.
    frame = frame.iloc[:, [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18]]
    frame.to_csv('fund.csv')
    return frame
def _top_names(table, column, count):
    """Return the set of fund names in the *count* rows with the largest *column*."""
    ranked = table.sort_values(by=[column], ascending=False)
    return set(ranked.head(count)['name'])


def main():
    """Refresh the fund cache and print the funds that rank high in every period.

    Steps:
      1. Call get_fund() to rewrite ``fund.csv``, then load that file.
      2. Write to ``result.csv`` (and print) the funds that are in the top X
         for each of year1, year2, year3, year5 and month6.
      3. Print the intersections of the top-X fund-name sets for several
         combinations of periods.
    """
    get_fund()  # use this line when the latest data is needed on every run
    # Reading the cached file alone is enough when saving network traffic.
    # The code is read as text: a digits-only column would otherwise be
    # inferred as integers and lose any leading zeros.
    df_full = pd.read_csv('fund.csv', dtype={'code': str})
    X = 500  # how many top-ranked funds to take for each period

    df_full = df_full.sort_values(by='year1', axis=0, ascending=False)
    df = df_full.head(X)
    # Joining on the 'code' column alone keeps the original column names;
    # joining the full frames would duplicate every column with _x/_y suffixes.
    for xx in ['year1', 'year2', 'year3', 'year5', 'month6']:
        tmp = df_full.sort_values(by=xx, axis=0, ascending=False).head(X)
        df = df.merge(tmp[['code']], on='code')
    # Same positional selection as before: the first 15 columns of the table.
    df = df.iloc[:, 0:15]
    df.to_csv('result.csv')
    print('1年选取:')
    print(df)
    print('')

    # Names of the top X funds for each look-back period.
    y3_index_set = _top_names(df_full, 'year3', X)   # last three years
    y2_index_set = _top_names(df_full, 'year2', X)   # last two years
    y1_index_set = _top_names(df_full, 'year1', X)   # last year
    m6_index_set = _top_names(df_full, 'month6', X)  # last six months
    m3_index_set = _top_names(df_full, 'month3', X)  # last three months
    m1_index_set = _top_names(df_full, 'month1', X)  # last month
    w1_index_set = _top_names(df_full, 'week1', X)   # last week

    print('3年,2年,1年选取:')
    print(y3_index_set & y2_index_set & y1_index_set)
    print('')
    print('1年,半年,3月,1月选取:')
    # All four sets are intersected; the original passed the one-year set as
    # a separate print argument, so it was printed in full.
    print(y1_index_set & m6_index_set & m3_index_set & m1_index_set)
    print('')
    print('3年,2年,1年,半年,3个月,1个月选取:')
    print(y3_index_set & y2_index_set & y1_index_set & m6_index_set & m3_index_set & m1_index_set)
    print('')
    print('3年,2年,1年,半年,3个月,1个月,一周选取:')
    print(y3_index_set & y2_index_set & y1_index_set & m6_index_set & m3_index_set & m1_index_set & w1_index_set)


# Run only when executed as a script, so importing the module has no side effects.
if __name__ == '__main__':
    main()