LSTM预测多支股票的收盘价

之前对某支个股进行预测:LSTM估计股票收盘价
这次尝试建立适用于多支股票的模型。思路是使用多支股票训练一个模型,用前30个交易日的数据预测下一天的收盘价,每支股票只贡献一个样本。

先收集股票数据

import tushare as ts
import os

# SECURITY NOTE(review): the API token was hard-coded in the source; prefer
# supplying it via the TUSHARE_TOKEN environment variable. The literal is
# kept as a fallback so existing runs keep working.
pro = ts.pro_api(os.getenv(
    'TUSHARE_TOKEN',
    'f3bbc97d0ffbbed8666e6f7c82e712165950d048987f5d6cfbf1e0ce'))

# Fetch the list of all SSE stocks currently listed and trading.
code_list = pro.stock_basic(exchange='SSE', list_status='L', fields='ts_code,symbol,name')
print(code_list)

# NOTE(review): starting at index 1400 looks like it resumes a previously
# interrupted download run — confirm before re-running from scratch.
for i in range(1400, len(code_list)):
    ts_code = code_list.iloc[i, 0]  # hoist the repeated iloc lookup
    print(ts_code)
    # Daily OHLCV bars for the ~30-trading-day window used by the model.
    data = pro.daily(ts_code=ts_code, start_date='20180801', end_date='20180912')
    data.to_excel('C:/Users/Administrator/Desktop/data/' + ts_code + '.xlsx')

然后预处理数据

import pandas as pd
import os
def read_xls(path):
    """Read every Excel file under *path* (recursively) into a list of DataFrames.

    Each DataFrame keeps only the OHLCV columns ('open', 'high', 'low',
    'close', 'vol'), is reversed so the earliest date comes first, and has
    its index reset to 0..n-1.

    Args:
        path: root directory to walk for .xlsx files (one file per stock).

    Returns:
        list of pandas.DataFrame, one entry per file found.
    """
    data = []
    # os.walk yields (dirpath, dirnames, filenames) per directory; unpack
    # instead of indexing file[0]/file[2] as the original did.
    for dirpath, _dirnames, filenames in os.walk(path):
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            # pd.read_excel accepts a path directly — no need to open and
            # close a file handle by hand.
            df = pd.read_excel(file_path)
            df = df[['open', 'high', 'low', 'close', 'vol']]  # keep only the features we use
            df = df[::-1]  # tushare returns newest-first; reverse to chronological order
            df.reset_index(drop=True, inplace=True)  # re-index rows as 0, 1, 2, ...
            data.append(df)
    return data
# Root directory holding the per-stock .xlsx files downloaded earlier.
xpath = r"C:\Users\Administrator\Desktop\data"
data = read_xls(xpath)  # list of per-stock OHLCV DataFrames, earliest date first
import numpy as np

# Drop stocks that were suspended during the window and therefore have fewer
# than 31 trading days of data.
#
# BUG FIX: the original code did `del data_new[i]` while iterating over the
# indices of the list. Deleting an element shifts every later element one
# slot left, so the element that moves into slot i is never examined — which
# is exactly why the original needed two identical passes "and still didn't
# know why". A filtering comprehension removes everything in one correct
# pass and leaves `data` itself untouched, just like the original copy did.
data_new = [df for df in data if len(df) >= 31]

# Sanity check: every remaining stock should now have at least 31 rows,
# so this should print nothing.
for idx, df in enumerate(data_new):
    if len(df) < 31:
        print(idx)

# Inspect the dimensions of the cleaned data set.
print(len(data_new))
print(len(data_new[0]))
a = np.array(data_new[0])
print(len(a[0]))

# Pack the list of DataFrames into one 3-D float array shaped
# (stock, trading day, feature) for easier numeric work.
raw_data = np.zeros((len(data_new), len(data_new[0]), len(a[0])))
for idx, df in enumerate(data_new):
    raw_data[idx] = np.array(df.values)
print(raw_data.shape)
from sklearn.preprocessing import MinMaxScaler

# Split sizes: 70% of the stocks for training, the rest for testing.
training_set_size = int(raw_data.shape[0] * 0.7)
test_set_size = raw_data.shape[0] - training_set_size

# Shuffle the stocks in place, then split into train / test sets.
np.random.shuffle(raw_data)
xy_train = raw_data[:training_set_size]
xy_test = raw_data[training_set_size:]

# The first 30 days are features; the 31st (last) day supplies the label.
seq_length = raw_data.shape[1] - 1
data_dim = raw_data.shape[2]
output_dim = 1

# Normalize: fit the MinMaxScaler on the TRAINING set only, then apply the
# same transform to the test set (no information leakage). The scaler works
# on 2-D input, so flatten to (samples*days, features) and reshape back.
scaler = MinMaxScaler()
xy_train = scaler.fit_transform(
    xy_train.reshape((-1, data_dim))).reshape((-1, seq_length + 1, data_dim))
xy_test = scaler.transform(
    xy_test.reshape((-1, data_dim))).reshape((-1, seq_length + 1, data_dim))

# Features: all but the last timestep. Label: the close price (column 3) of
# the last timestep, scaled up by 10 to ease training.
# (The original pre-allocated x_train/x_test with np.zeros and then
# immediately rebound them — dead code removed — and filled the labels with
# per-element Python loops; slicing does the same thing vectorized.)
x_train = xy_train[:, :-1]
y_train = xy_train[:, -1, 3] * 10
x_test = xy_test[:, :-1]
y_test = xy_test[:, -1, 3] * 10

# Check the resulting shapes.
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

现在可以开始搭建神经网络了

from keras.layers import Input, Dense, LSTM
from keras.models import Model
from keras import regularizers

# Architecture: per-timestep Dense projection -> LSTM -> Dense(32) ->
# Dense(16) -> Dense(1) regression head.
rnn_units = 10

# `shape` excludes the batch dimension: each sample is (seq_length, data_dim).
net_input = Input(shape=(seq_length, data_dim), name='dense_input')

# Dense applies to the last axis of a 3-D tensor, so its output is already
# (None, seq_length, rnn_units) — the original's no-op Reshape layers
# (reshape2/reshape3, reshaping to the shape the tensor already had) are
# removed.
dense1 = Dense(rnn_units, activation='relu',
               kernel_regularizer=regularizers.l2(0.001), name='dense1')(net_input)

# BUG FIX: the original passed dropout=1.0, but Keras `dropout` is the
# fraction of input units to DROP, so 1.0 zeroes every LSTM input during
# training. 0.2 is a conventional rate.
lstm_out = LSTM(rnn_units, activation='tanh', dropout=0.2, name='lstm')(dense1)

# LSTM (without return_sequences) already outputs (None, rnn_units); feed it
# straight into the fully-connected head.
dense2 = Dense(32, activation='relu',
               kernel_regularizer=regularizers.l2(0.001), name='dense2')(lstm_out)
dense3 = Dense(16, activation='relu',
               kernel_regularizer=regularizers.l2(0.001), name='dense3')(dense2)
predictions = Dense(output_dim, activation=None,
                    kernel_regularizer=regularizers.l2(0.001), name='dense4')(dense3)

model = Model(inputs=net_input, outputs=predictions)
# Regression setup: MSE loss, MAE as a human-readable metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# verbose=2: one log line per epoch; 10% of the training data held out for validation.
history = model.fit(x_train, y_train, batch_size=256, epochs=300,
                    verbose=2, validation_split=0.1)

import matplotlib.pyplot as plt
# Plot training & validation loss curves.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# The second curve is the validation split, not the test set — relabeled.
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

训练过程中loss的变化

trainPredict2 = model.predict(x_train)
# Invert the x10 amplification and the MinMax scaling; column 3 is 'close'.
close_range = scaler.data_range_[3]
close_min = scaler.data_min_[3]
trainPredict2_2 = trainPredict2 / 10 * close_range + close_min
trainY2 = y_train / 10 * close_range + close_min

# Label-vs-prediction scatter: points should hug the slope-1 diagonal.
plt.figure(figsize=(8, 8))
plt.xlim((0, 10))
plt.ylim((0, 10))
plt.scatter(y_train, trainPredict2)
plt.xlabel('label')
plt.ylabel('prediction')

训练结果

# Mean / max / min percentage error on the training set.
# BUG FIX: model.predict returns shape (n, 1) while trainY2 is (n,); the
# original subtraction broadcast the pair to an (n, n) matrix, so the
# statistics were taken over every prediction/label PAIR instead of
# element-wise errors (hence the absurd 5398% maximum). Raveling the
# predictions to (n,) restores element-wise arithmetic.
train_err_pct = (np.ravel(trainPredict2_2) - trainY2) / trainY2 * 100
print(np.mean(train_err_pct))
print(np.max(train_err_pct))
print(np.min(train_err_pct))

17.33178044611391
5398.242040701764
-98.09094687312943

# Percentage of training samples whose absolute error is within 10%.
# Vectorized replacement for the original element-wise Python loop;
# np.ravel flattens the (n, 1) predictions to match trainY2's (n,) shape.
count = np.mean(np.abs(np.ravel(trainPredict2_2) - trainY2) / trainY2 * 100 <= 10) * 100
print(count)

90.81419624217119

testPredict2 = model.predict(x_test)
# Invert the x10 amplification and the MinMax scaling; column 3 is 'close'.
close_range = scaler.data_range_[3]
close_min = scaler.data_min_[3]
testPredict2_2 = testPredict2 / 10 * close_range + close_min
testY2 = y_test / 10 * close_range + close_min

# Label-vs-prediction scatter for the test set: ideally on the slope-1 line.
plt.figure(figsize=(8, 8))
plt.xlim((0, 1.5))
plt.ylim((0, 1.5))
plt.scatter(y_test, testPredict2)
plt.xlabel('label')
plt.ylabel('prediction')

测试结果
#打印平均、最大、最小误差

# Mean / max / min percentage error on the test set.
# BUG FIX: same broadcasting bug as the training stats — (n, 1) predictions
# minus (n,) labels broadcast to an (n, n) matrix, so the original numbers
# covered every prediction/label pair. Ravel restores element-wise errors.
test_err_pct = (np.ravel(testPredict2_2) - testY2) / testY2 * 100
print(np.mean(test_err_pct))
print(np.max(test_err_pct))
print(np.min(test_err_pct))

33.73716795854983
5415.860408716808
-98.17271910887521

# Percentage of test samples whose absolute error is within 10%.
# Vectorized replacement for the original element-wise Python loop;
# np.ravel flattens the (n, 1) predictions to match testY2's (n,) shape.
count = np.mean(np.abs(np.ravel(testPredict2_2) - testY2) / testY2 * 100 <= 10) * 100
print(count)

90.77669902912622

代码和数据在GitHub里:GitHub