LSTM预测多支股票的收盘价
之前对某支个股进行预测:LSTM估计股票收盘价
这次尝试建立适用于多支股票的模型。思路是使用多支股票训练一个模型,用前30个交易日的数据预测下一天的收盘价,每支股票只贡献一个样本。
先收集股票数据
import tushare as ts

# SECURITY NOTE(review): this API token is hard-coded and now public with the
# post — it should be revoked and loaded from an environment variable instead.
pro = ts.pro_api('f3bbc97d0ffbbed8666e6f7c82e712165950d048987f5d6cfbf1e0ce')

# Query all SSE stocks currently listed and trading.
code_list = pro.stock_basic(exchange='SSE', list_status='L', fields='ts_code,symbol,name')
print(code_list)

# Named constants instead of magic values. START_INDEX resumes a previous
# download run that already fetched the first 1400 stocks; the date range
# covers roughly 30 trading days of features plus 1 label day.
START_INDEX = 1400
START_DATE = '20180801'
END_DATE = '20180912'
OUT_DIR = 'C:/Users/Administrator/Desktop/data/'

for i in range(START_INDEX, len(code_list)):
    ts_code = code_list.iloc[i, 0]  # hoist the repeated iloc lookup
    print(ts_code)
    data = pro.daily(ts_code=ts_code, start_date=START_DATE, end_date=END_DATE)
    data.to_excel(OUT_DIR + ts_code + '.xlsx')  # one file per stock
然后预处理数据
import pandas as pd
import os

def read_xls(path):
    """Read every Excel file under *path* into a list of DataFrames.

    Each file holds one stock's daily bars. Only the OHLCV columns are
    kept, rows are reversed into chronological order (tushare returns
    newest-first), and the index is reset to 0, 1, 2, ...

    Parameters
    ----------
    path : str
        Directory whose files (walked recursively) are read as Excel.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, columns ['open', 'high', 'low', 'close', 'vol'].
    """
    data = []
    # os.walk yields (dirpath, dirnames, filenames); join dirpath with each
    # filename to get the full path of every file in the tree.
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = dirpath + "/" + filename
            # Pass the path directly and let pandas manage the file handle;
            # the original manual open()/close() pair leaked the handle
            # whenever read_excel raised.
            df = pd.read_excel(file_path)
            df1 = df[['open', 'high', 'low', 'close', 'vol']]  # keep needed features
            df1 = df1[::-1]  # reverse so earlier dates come first
            df1.reset_index(drop=True, inplace=True)  # reindex rows as 0, 1, 2, ...
            data.append(df1)  # accumulate this stock's frame
    return data
# Directory holding one .xlsx file per stock (written by the download loop).
xpath = r"C:\Users\Administrator\Desktop\data"
# List of per-stock DataFrames (OHLCV, chronological order).
data = read_xls(xpath)
import numpy as np

# Keep only stocks with a full 31 trading days in the window (30 feature
# days + 1 label day), i.e. drop any stock that was suspended.
# BUG FIX: the original deleted items from data_new while indexing it with
# the *original* list's range, which skips the element after every deletion;
# that is why it "mysteriously" needed two passes. Filtering into a new list
# does it correctly in a single pass.
data_new = [df for df in data if len(df) >= 31]

# Sanity check: nothing shorter than 31 days should remain (prints nothing).
for i in range(len(data_new)):
    if len(data_new[i]) < 31:
        print(i)

# Inspect the dimensions: (stocks, days, features).
print(len(data_new))
print(len(data_new[0]))
print(len(data_new[0].columns))

# Stack the per-stock DataFrames into one 3-D float array for fast
# numeric processing: shape (stocks, 31, 5).
raw_data = np.stack([df.values for df in data_new]).astype(float)
print(raw_data.shape)
from sklearn.preprocessing import MinMaxScaler

# 70/30 split between training and test sets (split by stock).
training_set_size = int(raw_data.shape[0] * 0.7)
test_set_size = raw_data.shape[0] - training_set_size

# Shuffle the stocks in place, then split.
np.random.shuffle(raw_data)
xy_train = raw_data[:training_set_size]
xy_test = raw_data[training_set_size:]

# Model dimensions.
seq_length = raw_data.shape[1] - 1  # first 30 days are features; day 31 is the label
data_dim = raw_data.shape[2]        # open, high, low, close, vol
output_dim = 1

# Scale features to [0, 1]. The scaler is fitted on the training set only
# and then applied unchanged to the test set, so no test information leaks
# into preprocessing. MinMaxScaler needs 2-D input, so flatten the
# (stocks, days) axes, transform, and restore the 3-D shape.
scaler = MinMaxScaler()
xy_train = scaler.fit_transform(xy_train.reshape((-1, data_dim)))
xy_train = xy_train.reshape((-1, seq_length + 1, data_dim))
xy_test = scaler.transform(xy_test.reshape((-1, data_dim)))
xy_test = xy_test.reshape((-1, seq_length + 1, data_dim))

# Split each stock's window into features (first 30 rows) and label
# (day-31 close, column 3, scaled up x10 to ease training).
# The original preallocated zero arrays for x_train/x_test and immediately
# rebound them (dead code), and filled the labels one element at a time in
# Python loops; plain NumPy slicing does both in one step.
x_train = xy_train[:, :-1]
y_train = xy_train[:, -1, 3] * 10
x_test = xy_test[:, :-1]
y_test = xy_test[:, -1, 3] * 10

# Expected shapes: (samples, 30, 5) for x, (samples,) for y.
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
现在可以开始搭建神经网络了
from keras.layers import Input, Dense, LSTM, Reshape
from keras.models import Model
from keras import regularizers

# Architecture: per-timestep Dense projection -> LSTM -> 3 Dense layers
# ending in a single-unit linear regression head.
rnn_units = 10

# shape excludes the batch dimension: each sample is (seq_length, data_dim).
inputs = Input(shape=(seq_length, data_dim), name='dense_input')

# Dense applied to a 3-D tensor acts per timestep, so its output is already
# (None, seq_length, rnn_units); the Reshape layers the original inserted
# before and after the LSTM were identity no-ops and have been removed.
dense1 = Dense(rnn_units, activation='relu',
               kernel_regularizer=regularizers.l2(0.001), name='dense1')(inputs)

# BUG FIX: the original passed dropout=1.0. Keras' dropout argument is the
# *fraction of input units to drop*, so 1.0 zeroes every input during
# training and makes the LSTM effectively untrainable. 0.2 is a
# conventional choice.
lstm_out = LSTM(rnn_units, activation='tanh', dropout=0.2, name='lstm')(dense1)

# Regression head.
dense2 = Dense(32, activation='relu',
               kernel_regularizer=regularizers.l2(0.001), name='dense2')(lstm_out)
dense3 = Dense(16, activation='relu',
               kernel_regularizer=regularizers.l2(0.001), name='dense3')(dense2)
predictions = Dense(output_dim, activation=None,
                    kernel_regularizer=regularizers.l2(0.001), name='dense4')(dense3)

model = Model(inputs=inputs, outputs=predictions)
# MSE loss for regression; MAE as a human-readable metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train; the last 10% of the training samples are held out for validation.
# verbose=2 prints one line per epoch.
history = model.fit(x_train, y_train, batch_size=256, epochs=300,
                    verbose=2, validation_split=0.1)
import matplotlib.pyplot as plt

# Plot training & validation loss values per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# BUG FIX: the second curve is the validation split (validation_split=0.1),
# not the test set; the original legend mislabelled it 'Test'.
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Predictions on the training set.
# BUG FIX: model.predict returns shape (n, 1). Subtracting the (n,) label
# vector from it later broadcasts to an (n, n) cross-difference matrix,
# which wildly inflates the printed error statistics; ravel() flattens the
# predictions to (n,) so all downstream arithmetic is elementwise.
trainPredict2 = model.predict(x_train).ravel()
# Undo the x10 label scaling and the MinMax transform on the 'close'
# column (index 3): x = scaled * data_range_ + data_min_.
trainPredict2_2 = trainPredict2 / 10 * scaler.data_range_[3] + scaler.data_min_[3]
trainY2 = y_train / 10 * scaler.data_range_[3] + scaler.data_min_[3]

plt.figure(figsize=(8, 8))  # canvas size
plt.xlim((0, 10))  # labels are scaled closes x10, so roughly 0..10
plt.ylim((0, 10))
plt.scatter(y_train, trainPredict2)  # ideally points hug the y = x line
plt.ylabel('prediction')
plt.xlabel('label')

# Mean / max / min relative error (%) on the training set.
# ravel() guards against model.predict's (n, 1) output shape: without it
# the subtraction against the (n,) labels broadcasts to an (n, n) matrix
# of cross-differences and the statistics are meaningless.
train_errors = (trainPredict2_2.ravel() - trainY2) / trainY2 * 100
print(np.mean(train_errors))
print(np.max(train_errors))
print(np.min(train_errors))
17.33178044611391
5398.242040701764
-98.09094687312943
# Percentage of training samples whose absolute relative error is <= 10%.
# Vectorised instead of the original element-by-element loop; ravel()
# guards against the (n, 1) prediction shape broadcasting to (n, n).
abs_err = np.abs(trainPredict2_2.ravel() - trainY2) / trainY2 * 100
count = np.mean(abs_err <= 10) * 100
print(count)
90.81419624217119
# Predictions on the test set.
# BUG FIX: ravel() flattens model.predict's (n, 1) output to (n,) so the
# arithmetic against the (n,) labels is elementwise rather than an (n, n)
# broadcast (same issue as the training-set evaluation).
testPredict2 = model.predict(x_test).ravel()
# Inverse of the x10 scaling and the MinMax transform on 'close' (col 3).
testPredict2_2 = testPredict2 / 10 * scaler.data_range_[3] + scaler.data_min_[3]
testY2 = y_test / 10 * scaler.data_range_[3] + scaler.data_min_[3]

plt.figure(figsize=(8, 8))  # canvas size
# NOTE(review): these (0, 1.5) axis limits look inconsistent with the
# training plot's (0, 10) — both plots show the same x10-scaled values.
# Left unchanged; confirm whether a zoomed view was intended.
plt.xlim((0, 1.5))
plt.ylim((0, 1.5))
plt.scatter(y_test, testPredict2)  # ideally points hug the y = x line
plt.ylabel('prediction')
plt.xlabel('label')

# Mean / max / min relative error (%) on the test set; ravel() avoids the
# (n, 1) vs (n,) broadcast that would otherwise yield an (n, n) matrix.
test_errors = (testPredict2_2.ravel() - testY2) / testY2 * 100
print(np.mean(test_errors))
print(np.max(test_errors))
print(np.min(test_errors))
33.73716795854983
5415.860408716808
-98.17271910887521
# Percentage of test samples whose absolute relative error is <= 10%,
# vectorised; ravel() guards against the (n, 1) prediction shape.
abs_err_test = np.abs(testPredict2_2.ravel() - testY2) / testY2 * 100
count = np.mean(abs_err_test <= 10) * 100
print(count)
90.77669902912622
代码和数据在GitHub里(原文此处为GitHub仓库链接,转载时链接丢失)。