import numpy as np
import pandas as pd
import math
import sklearn
import sklearn.preprocessing
import datetime
import os
import matplotlib.pyplot as plt
import tensorflow as tf


# Hold out 5% of the samples for validation and 5% for testing;
# the remaining 90% is used for training.
valid_set_size_percentage = 5
test_set_size_percentage = 5

# Show the contents of the parent directory, then of the working directory.
for _directory in (os.path.dirname(os.getcwd()), os.getcwd()):
    print(_directory + ':', os.listdir(_directory))

C:\Users\albim\Desktop: ['.ipynb_checkpoints', 'dashdub-master', 'desktop.ini', 'FORMATION MSDP100.mp4', 'Hate Speech App', 'MSc in Business Analytics and Artificial Intelligence.lnk', 'New folder', 'NY Stock Price Prediction.ipynb']
C:\Users\albim\Desktop\New folder: ['.ipynb_checkpoints', 'datasets', 'NY Stock Price Prediction.ipynb']


# Load the split-adjusted daily prices of every stock in the dataset.
df = pd.read_csv("datasets/prices-split-adjusted.csv")
df.info()
df.head()

# Report how many distinct ticker symbols exist and show ten of them.
# The set is built once and reused for both prints.
unique_symbols = list(set(df.symbol))
print('\nnumber of different stocks: ', len(unique_symbols))
print(unique_symbols[:10])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851264 entries, 0 to 851263
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    851264 non-null  object 
 1   symbol  851264 non-null  object 
 2   open    851264 non-null  float64
 3   close   851264 non-null  float64
 4   low     851264 non-null  float64
 5   high    851264 non-null  float64
 6   volume  851264 non-null  float64
dtypes: float64(5), object(2)
memory usage: 45.5+ MB

number of different stocks:  501
['URI', 'LMT', 'MJN', 'RTN', 'TEL', 'NLSN', 'NEM', 'LOW', 'WYNN', 'HBI']


# Parse the 'YYYY-MM-DD' date strings into a datetime column used for plotting.
df['date_time'] = pd.to_datetime(df.date, format='%Y-%m-%d')


# Notebook-style inspection: last rows, summary statistics, then the schema.
df.tail()


df.describe()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851264 entries, 0 to 851263
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   date       851264 non-null  object        
 1   symbol     851264 non-null  object        
 2   open       851264 non-null  float64       
 3   close      851264 non-null  float64       
 4   low        851264 non-null  float64       
 5   high       851264 non-null  float64       
 6   volume     851264 non-null  float64       
 7   date_time  851264 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 52.0+ MB


# function for min-max normalization of stock
def normalize_data(df):
    """Min-max scale the four price columns of *df* in place and return it.

    ``open``, ``high``, ``low`` and ``close`` are each fitted and transformed
    separately, so every column is scaled by its own minimum and maximum.
    The frame passed in is modified; pass a copy to keep the raw prices.
    """
    scaler = sklearn.preprocessing.MinMaxScaler()
    for column in ('open', 'high', 'low', 'close'):
        # The scaler works on 2-D input, hence the single-column reshape.
        column_values = df[column].values.reshape(-1, 1)
        df[column] = scaler.fit_transform(column_values)
    return df

# function to create train, validation, test data given stock data and sequence length
def load_data(stock, seq_len, valid_pct=None, test_pct=None):
    """Cut *stock* into sliding windows and split them chronologically.

    Every window holds ``seq_len`` consecutive rows: the first
    ``seq_len - 1`` rows are the model input and the last row is the target.
    Windows are kept in their original order; the earliest go to the training
    set, followed by the validation set and finally the test set.

    Args:
        stock: 2-D table of shape (n_rows, n_features); a DataFrame or
            anything else ``np.asarray`` can convert.
        seq_len: number of rows per window (input steps plus one target row).
        valid_pct: percentage of windows used for validation. Defaults to the
            module-level ``valid_set_size_percentage``.
        test_pct: percentage of windows used for testing. Defaults to the
            module-level ``test_set_size_percentage``.

    Returns:
        The list ``[x_train, y_train, x_valid, y_valid, x_test, y_test]``.
        Each ``x_*`` has shape (n_windows, seq_len - 1, n_features) and each
        ``y_*`` has shape (n_windows, n_features).

    Raises:
        ValueError: if *stock* is not 2-D, or if it has fewer than ``seq_len``
            rows so that not a single window can be built.
    """
    if valid_pct is None:
        valid_pct = valid_set_size_percentage
    if test_pct is None:
        test_pct = test_set_size_percentage

    # Convert once up front so every window is a cheap ndarray slice rather
    # than a DataFrame slice that has to be converted window by window.
    data_raw = np.asarray(stock)
    if data_raw.ndim != 2:
        raise ValueError(
            'stock must be 2-D (rows x features), got %d dimension(s)' % data_raw.ndim)

    # A table of n rows contains n - seq_len + 1 windows; the "+ 1" keeps the
    # final window, whose target is the most recent row.
    n_windows = data_raw.shape[0] - seq_len + 1
    if seq_len < 1 or n_windows < 1:
        raise ValueError(
            'cannot build windows of length %r from %d rows' % (seq_len, data_raw.shape[0]))

    # create all possible sequences of length seq_len
    data = np.stack([data_raw[start:start + seq_len] for start in range(n_windows)])

    valid_set_size = int(np.round(valid_pct / 100 * n_windows))
    test_set_size = int(np.round(test_pct / 100 * n_windows))
    train_set_size = n_windows - (valid_set_size + test_set_size)
    valid_end = train_set_size + valid_set_size

    x_train = data[:train_set_size, :-1, :]
    y_train = data[:train_set_size, -1, :]

    x_valid = data[train_set_size:valid_end, :-1, :]
    y_valid = data[train_set_size:valid_end, -1, :]

    x_test = data[valid_end:, :-1, :]
    y_test = data[valid_end:, -1, :]

    return [x_train, y_train, x_valid, y_valid, x_test, y_test]


from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates


# Plot the four daily Netflix price series against the trading date.
# The Netflix rows are selected once and reused for every line.
nflx_prices = df[df.symbol == 'NFLX']
plt.figure(figsize=(22, 8))
for column, colour in (('open', 'red'), ('close', 'green'),
                       ('low', 'blue'), ('high', 'black')):
    plt.plot(nflx_prices.date_time, nflx_prices[column].values, color=colour, label=column)

plt.title('Stock Price Netflix')
plt.xlabel('Date')
plt.ylabel('price')
plt.legend(loc='best');


# Plot the daily Netflix trading volume against the trading date.
nflx_volume = df[df.symbol == 'NFLX']
plt.figure(figsize=(22, 8))
plt.plot(nflx_volume.date_time, nflx_volume.volume.values, color='black', label='volume')
plt.title('Stock Volume Netflix')
plt.xlabel('Date')
plt.ylabel('volume')
plt.legend(loc='best');


# Keep only the price columns of the Netflix rows; the identifier, volume and
# date columns are not model features.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
df_stock = df[df.symbol == 'NFLX'].drop(columns=['symbol', 'volume', 'date', 'date_time'])

cols = list(df_stock.columns.values)
print('df_stock.columns.values = ', cols)

# normalize_data() modifies its argument, so hand it a copy and keep the raw
# prices available in df_stock.
df_stock_norm = normalize_data(df_stock.copy())

print(df_stock_norm)

df_stock.columns.values =  ['open', 'close', 'low', 'high']
            open     close       low      high
555     0.007820  0.005015  0.005310  0.006209
1023    0.005577  0.002744  0.002739  0.003795
1491    0.003231  0.004831  0.002224  0.003920
1959    0.006210  0.003770  0.004449  0.004589
2427    0.004335  0.004808  0.004473  0.004475
...          ...       ...       ...       ...
849088  0.955083  0.956905  0.989034  0.944564
849588  0.960155  0.979179  0.994224  0.966532
850088  0.982935  0.959326  0.994726  0.966691
850588  0.957820  0.954806  0.981667  0.943454
851088  0.960155  0.942459  0.976645  0.946864

[1762 rows x 4 columns]


# create train, validation and test data
seq_len = 20  # rows per window: seq_len - 1 input steps followed by one target row
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data(df_stock_norm, seq_len)

# Report the shape of every split.
for _label, _split in (('x_train.shape = ', x_train), ('y_train.shape = ', y_train),
                       ('x_valid.shape = ', x_valid), ('y_valid.shape = ', y_valid),
                       ('x_test.shape = ', x_test), ('y_test.shape = ', y_test)):
    print(_label, _split.shape)

x_train.shape =  (1568, 19, 4)
y_train.shape =  (1568, 4)
x_valid.shape =  (87, 19, 4)
y_valid.shape =  (87, 4)
x_test.shape =  (87, 19, 4)
y_test.shape =  (87, 4)


# Plot the min-max normalized Netflix prices against the trading date.
# The date column is selected once and reused for every line.
nflx_dates = df[df.symbol == 'NFLX'].date_time
plt.figure(figsize=(22, 8));
# Each series is labelled with its own column name (the close series used to
# be labelled 'low', giving the legend two 'low' entries).
for column, colour in (('open', 'red'), ('close', 'green'),
                       ('low', 'blue'), ('high', 'black')):
    plt.plot(nflx_dates, df_stock_norm[column].values, color=colour, label=column)

plt.title('Normalized Netflix stock')
plt.xlabel('Date')
# Only prices are plotted here; volume is not part of df_stock_norm.
plt.ylabel('normalized price')
plt.legend(loc='best');


## Basic Cell RNN in tensorflow

# Mini-batch sampling state read and updated by get_next_batch():
# a shuffled array of training-sample indices and the read position in it.
perm_array = np.arange(x_train.shape[0])
np.random.shuffle(perm_array)
index_in_epoch = 0

# function to get the next batch
def get_next_batch(batch_size):
    """Return the next ``(x, y)`` mini-batch of the shuffled training set.

    Samples are drawn through the module-level ``perm_array`` starting at
    ``index_in_epoch``.  When the requested batch would run past the end of
    the training set, the permutation is reshuffled in place and the batch is
    taken from the start of the new order; the unread tail of the old order
    is skipped.
    """
    global index_in_epoch, x_train, perm_array
    first = index_in_epoch
    last = first + batch_size

    if last > x_train.shape[0]:
        # Not enough unread samples left: begin a new epoch.
        np.random.shuffle(perm_array)
        first, last = 0, batch_size

    index_in_epoch = last
    picked = perm_array[first:last]
    return x_train[picked], y_train[picked]

# parameters
n_steps = seq_len - 1        # input time steps per sample (a window minus its target row)
n_inputs = n_outputs = 4     # features read per step / values predicted per sample
n_neurons = 200              # units in each recurrent layer
n_layers = 2                 # number of stacked recurrent layers
learning_rate = 0.001
batch_size = 100
n_epochs = 100
train_set_size, test_set_size = x_train.shape[0], x_test.shape[0]

# Start from an empty default graph and use TF1-style graph execution.
tf.compat.v1.reset_default_graph()
tf.compat.v1.disable_eager_execution()


# Graph inputs: a batch of input windows and the matching target rows.
X = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, n_steps, n_inputs])
y = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, n_outputs])


# Silence every Python warning raised from here on (the TF1 compatibility
# API used below is deprecated).
import warnings 
warnings.filterwarnings("ignore")

# One BasicRNNCell with ELU activation per layer, n_neurons units each.
layers = [tf.compat.v1.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.elu)
          for layer in range(n_layers)]

# Stack the cells and run them over the input placeholder X.
multi_layer_cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(layers)
rnn_outputs, states = tf.compat.v1.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

# Flatten to rows of n_neurons values so one dense layer maps every time step
# to n_outputs values, then restore the per-step layout.
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons]) 
stacked_outputs = tf.compat.v1.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])
outputs = outputs[:,n_steps-1,:] # keep only last output of sequence

# Train by minimizing the mean squared error against the target placeholder y
# with the Adam optimizer.
loss = tf.reduce_mean(input_tensor=tf.square(outputs - y)) # loss function = mean squared error 
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate) 
training_op = optimizer.minimize(loss)
                                              
# run graph
with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())

    total_iterations = int(n_epochs*train_set_size/batch_size)
    for iteration in range(total_iterations):
        # One optimizer step on the next training batch.
        x_batch, y_batch = get_next_batch(batch_size)
        sess.run(training_op, feed_dict={X: x_batch, y: y_batch})

        # Only report on every int(5 * train_set_size / batch_size)-th iteration.
        if iteration % int(5*train_set_size/batch_size) != 0:
            continue
        mse_train = sess.run(loss, feed_dict={X: x_train, y: y_train})
        mse_valid = sess.run(loss, feed_dict={X: x_valid, y: y_valid})
        epochs_done = iteration*batch_size/train_set_size
        print('%.2f epochs: MSE train/valid = %.6f/%.6f'%(
            epochs_done, mse_train, mse_valid))

    # Predictions of the trained network for every split, computed while the
    # session (and therefore the trained weights) is still open.
    y_train_pred, y_valid_pred, y_test_pred = (
        sess.run(outputs, feed_dict={X: split})
        for split in (x_train, x_valid, x_test))

WARNING:tensorflow:`tf.nn.rnn_cell.MultiRNNCell` is deprecated. This class is equivalent as `tf.keras.layers.StackedRNNCells`, and will be replaced by that in Tensorflow 2.0.
WARNING:tensorflow:From <ipython-input-17-975f7dd785f0>:8: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:From C:\Users\albim\anaconda3\lib\site-packages\tensorflow\python\keras\layers\legacy_rnn\rnn_cell_impl.py:465: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
0.00 epochs: MSE train/valid = 0.760673/1.796394
4.97 epochs: MSE train/valid = 0.000418/0.000763
9.95 epochs: MSE train/valid = 0.000347/0.000665
14.92 epochs: MSE train/valid = 0.000355/0.000678
19.90 epochs: MSE train/valid = 0.000362/0.000700
24.87 epochs: MSE train/valid = 0.000267/0.000467
29.85 epochs: MSE train/valid = 0.000222/0.000396
34.82 epochs: MSE train/valid = 0.000234/0.000433
39.80 epochs: MSE train/valid = 0.000238/0.000449
44.77 epochs: MSE train/valid = 0.000229/0.000506
49.74 epochs: MSE train/valid = 0.000223/0.000506
54.72 epochs: MSE train/valid = 0.000193/0.000399
59.69 epochs: MSE train/valid = 0.000190/0.000407
64.67 epochs: MSE train/valid = 0.000179/0.000371
69.64 epochs: MSE train/valid = 0.000199/0.000426
74.62 epochs: MSE train/valid = 0.000166/0.000363
79.59 epochs: MSE train/valid = 0.000167/0.000339
84.57 epochs: MSE train/valid = 0.000231/0.000573
89.54 epochs: MSE train/valid = 0.000162/0.000337
94.52 epochs: MSE train/valid = 0.000163/0.000392
99.49 epochs: MSE train/valid = 0.000160/0.000368


# Notebook cell echo: display the shape of the training targets.
y_train.shape

(1568, 4)


# Column of the target arrays to plot; the order follows df_stock's columns.
ft = 0 # 0 = open, 1 = close, 2 = low, 3 = high

## show predictions
plt.figure(figsize=(22, 8));

# Day index of every split on one shared axis: training samples first, then
# validation, then test.  The test range must end at train + valid + test;
# adding the test size twice only works while valid and test are equally long.
n_train, n_valid, n_test = y_train.shape[0], y_valid.shape[0], y_test.shape[0]
train_days = np.arange(n_train)
valid_days = np.arange(n_train, n_train + n_valid)
test_days = np.arange(n_train + n_valid, n_train + n_valid + n_test)

plt.plot(train_days, y_train[:,ft], color='blue', label='train target')
plt.plot(valid_days, y_valid[:,ft], color='gray', label='valid target')
plt.plot(test_days, y_test[:,ft], color='black', label='test target')

# Predictions hold one row per target row, so they share the same day indices.
plt.plot(train_days, y_train_pred[:,ft], color='red', label='train prediction')
plt.plot(valid_days, y_valid_pred[:,ft], color='orange', label='valid prediction')
plt.plot(test_days, y_test_pred[:,ft], color='green', label='test prediction')

plt.title('past and future stock prices')
plt.xlabel('time [days]')
plt.ylabel('normalized price')
plt.legend(loc='best');


# Zoom in on the test period only.
plt.figure(figsize=(22, 8));

# The test samples come after both the training and the validation samples,
# so their day index starts at train + valid (not at train alone).
test_start = y_train.shape[0] + y_valid.shape[0]
test_days = np.arange(test_start, test_start + y_test.shape[0])

plt.plot(test_days, y_test[:,ft], color='black', label='test target')
plt.plot(test_days, y_test_pred[:,ft], color='green', label='test prediction')

plt.title('future stock prices')
plt.xlabel('time [days]')
plt.ylabel('normalized price')
plt.legend(loc='best');


def _direction_hit_rate(targets, predictions):
    """Return the share of rows whose predicted sign(close - open) matches the target's.

    Column 0 is the open price and column 1 the close price.
    """
    actual_move = np.sign(targets[:,1] - targets[:,0])
    predicted_move = np.sign(predictions[:,1] - predictions[:,0])
    matches = np.equal(actual_move, predicted_move).astype(int)
    return np.sum(matches) / targets.shape[0]


corr_price_development_train = _direction_hit_rate(y_train, y_train_pred)
corr_price_development_valid = _direction_hit_rate(y_valid, y_valid_pred)
corr_price_development_test = _direction_hit_rate(y_test, y_test_pred)

print('Correct sign prediction for close - open price for train/valid/test: %.2f/%.2f/%.2f'%(
    corr_price_development_train, corr_price_development_valid, corr_price_development_test))

Correct sign prediction for close - open price for train/valid/test: 0.53/0.55/0.59

	date	symbol	open	close	low	high	volume	date_time
851259	2016-12-30	ZBH	103.309998	103.199997	102.849998	103.930000	973800.0	2016-12-30
851260	2016-12-30	ZION	43.070000	43.040001	42.689999	43.310001	1938100.0	2016-12-30
851261	2016-12-30	ZTS	53.639999	53.529999	53.270000	53.740002	1701200.0	2016-12-30
851262	2016-12-30	AIV	44.730000	45.450001	44.410000	45.590000	1380900.0	2016-12-30
851263	2016-12-30	FTV	54.200001	53.630001	53.389999	54.480000	705100.0	2016-12-30

	open	close	low	high	volume
count	851264.000000	851264.000000	851264.000000	851264.000000	8.512640e+05
mean	64.993618	65.011913	64.336541	65.639748	5.415113e+06
std	75.203893	75.201216	74.459518	75.906861	1.249468e+07
min	1.660000	1.590000	1.500000	1.810000	0.000000e+00
25%	31.270000	31.292776	30.940001	31.620001	1.221500e+06
50%	48.459999	48.480000	47.970001	48.959999	2.476250e+06
75%	75.120003	75.139999	74.400002	75.849998	5.222500e+06
max	1584.439941	1578.130005	1549.939941	1600.930054	8.596434e+08

New York Stock Exchange price prediction

1. Choosing the data

2. Choosing the model

3. Building the model

Netflix prediction