-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMainAnalysisPythonCode.py
More file actions
320 lines (256 loc) · 12.5 KB
/
Copy pathMainAnalysisPythonCode.py
File metadata and controls
320 lines (256 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# -*- coding: utf-8 -*-
"""ERC20_Machine_Learning.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1u65OMZCRYt2_aEZIwBOrSDjdKvHAWq3-
"""
# importing required libraries
import sys
import numpy as numpy
from pandas import read_csv, DataFrame
import matplotlib.pyplot as pyplot
! pip install scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
# libraries for ARIMA model
! pip install pmdarima
import pmdarima as pmdarima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
# libraries for LSTM model
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping
"""# Importing and pre-processing data"""
# lag options (window lengths) that will be analysed
lags_arr = [24, 48, 72]
# symbols of the tokens that will be analysed; a '<symbol>.csv' file is read for each
token_symbols = [
    'UNI', 'LINK', 'AAVE', 'MKR', 'LEO', 'COMP', 'GRT', 'HT', 'CEL',
    'CHZ', 'TEL', 'YFI', 'HOT', 'ENJ', 'MANA', 'QNT', 'BAT', 'SNX', 'NEXO',
    'BNT', 'CRV', 'CHSB', 'KCS', 'ZRX', 'UMA', 'ANKR', 'VGX', '1INCH',
]
number_of_tokens = len(token_symbols)
# loading one csv file per token (the csv data files must be uploaded beforehand)
raw_datasets = [read_csv('{}.csv'.format(symbol)) for symbol in token_symbols]
# share of every dataset (taken from its beginning) that is used for training
training_set_ratio = 0.80
# value columns kept for modelling; every other column of the csv file is ignored
data_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Trades']
# per-token results, stored in the same order as token_symbols
scalers = []
training_datasets, test_datasets = [], []
for raw_dataset in raw_datasets:
    # keeping only the value columns and converting them to float
    values = raw_dataset[data_columns].astype('float')
    # index of the first row that belongs to the test dataset
    split_index = int(len(raw_dataset) * training_set_ratio)
    # the scaler is fitted on the training rows only and then applied to all rows
    scaler = MinMaxScaler()
    scaler.fit(values[:split_index])
    scaled_dataset = DataFrame(scaler.transform(values), columns=data_columns)
    scalers.append(scaler)
    # splitting the scaled dataset into training and test datasets
    training_datasets.append(scaled_dataset[:split_index])
    test_datasets.append(scaled_dataset[split_index:])
"""#ARIMA"""
# searching for the ARIMA order (p, d, q) of every token's scaled closing prices
ARIMA_params = []
for symbol, training_dataset in zip(token_symbols, training_datasets):
    fitted_model = pmdarima.auto_arima(
        training_dataset['Close'],
        test='adf',             # Augmented Dickey-Fuller (ADF) test is used to find 'd'
        d=None,                 # 'd' is left for the search to determine
        start_p=1, max_p=3,     # range of candidate 'p' parameters
        start_q=1, max_q=3,     # range of candidate 'q' parameters
        error_action='ignore',  # errors should not be printed
        suppress_warnings=True, # warnings should not be printed
    )
    order = fitted_model.order
    # recording the selected order
    ARIMA_params.append(order)
    # printing out the order for visibility
    print(symbol + ' - ' + str(order))
# from this point on, warnings are silenced for the whole process
import warnings
warnings.filterwarnings("ignore")
# one-step-ahead ARIMA forecasts, indexed as [lag option][token][test row]
ARIMA_predictions_for_all_lags = []
# for each lag option (i.e. for 24 lags, 48 lags, and 72 lags)
for lags in lags_arr:
    print('{} lags:'.format(lags))
    ARIMA_predictions = []
    # for each token
    for i in range(number_of_tokens):
        symbol = token_symbols[i]
        # scaled closing prices of the training part followed by the test part
        train_close = training_datasets[i]['Close'].to_numpy().tolist()
        test_close = test_datasets[i]['Close'].to_numpy().tolist()
        history = train_close + test_close
        first_test_position = len(train_close)
        number_of_test_rows = len(test_close)
        curr_predictions = []
        for j in range(number_of_test_rows):
            sys.stdout.write('\r' + '{} - {}/{}'.format(symbol, j, number_of_test_rows))
            sys.stdout.flush()
            # the window holds the (up to) `lags` prices that precede test row j;
            # for the first rows it reaches back into the end of the training data
            window_end = first_test_position + j
            window = history[max(0, window_end - lags):window_end]
            # a fresh model is fitted on every window and asked for the next value
            forecast = ARIMA(window, order=ARIMA_params[i]).fit().forecast()
            curr_predictions.append(forecast[0])
        # recording predictions for a token
        ARIMA_predictions.append(curr_predictions)
        print(' - done\n')
    # recording predictions for all tokens
    ARIMA_predictions_for_all_lags.append(ARIMA_predictions)
"""#LSTM"""
# building LSTM inputs; every list below is indexed as [lag option][token]
training_y_arr, training_X_arr, test_X_arr = [], [], []
for lags in lags_arr:
    # windows for training and testing, and the training targets, for this lag option
    training_X, test_X, training_y = [], [], []
    for i in range(number_of_tokens):
        train_values = training_datasets[i].to_numpy()
        test_values = test_datasets[i].to_numpy()
        number_of_train_rows = len(train_values)
        # target of a training window is the 'Close' value of the row right after it
        training_y.append(training_datasets[i]['Close'].to_numpy()[lags:])
        # training windows: every run of `lags` consecutive training rows that
        # still has a following row to serve as the target
        training_X.append(numpy.array(
            [train_values[start:start + lags].tolist()
             for start in range(number_of_train_rows - lags)]))
        # test windows: the (up to) `lags` rows preceding each test row; the first
        # windows reach back into the end of the training data
        history = numpy.concatenate((train_values, test_values))
        curr_test_X = []
        for j in range(len(test_values)):
            window_end = number_of_train_rows + j
            curr_test_X.append(history[max(0, window_end - lags):window_end].tolist())
        test_X.append(numpy.array(curr_test_X))
    # recording the training targets
    training_y_arr.append(training_y)
    # recording training and testing windows
    training_X_arr.append(training_X)
    test_X_arr.append(test_X)
# LSTM predictions, indexed as [lag option][token]
LSTM_predictions_for_all_lags = []
# Fraction of each token's training windows that Keras sets aside as validation
# data for early stopping. The original value of 0.9 held out 90% of the windows
# and trained on only 10% of them, which looks like an inverted fraction.
# NOTE(review): this changes the trained models and the reported metrics -
# confirm against the intended experimental setup.
validation_fraction = 0.1
# for each lag option
for lag_index, lags in enumerate(lags_arr):
    LSTM_predictions = []
    print('{} lags:'.format(lags))
    # for each token: build an LSTM model, train it, and predict the test values
    for i in range(number_of_tokens):
        # building an LSTM model whose input shape is taken from the training
        # windows (second and third dimension of the array)
        training_data = training_X_arr[lag_index][i]
        model = Sequential()
        model.add(LSTM(64, input_shape=(training_data.shape[1], training_data.shape[2]), return_sequences=False))
        model.add(Dense(1))
        model.compile(loss='mae', optimizer='adam')
        # training on the training windows and their targets; training stops early
        # once the validation loss has not improved for 50 epochs
        earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
        model.fit(training_data, training_y_arr[lag_index][i], epochs=1000, batch_size=64,
                  validation_split=validation_fraction, verbose=2, shuffle=False,
                  callbacks=[earlyStopping])
        # predicting the test values from the test windows
        yhat = model.predict(test_X_arr[lag_index][i])
        # recording predictions as a flat array
        LSTM_predictions.append(yhat.flatten('C'))
        print('{} - done\n\n'.format(token_symbols[i]))
    # recording all predictions for this lag option
    LSTM_predictions_for_all_lags.append(LSTM_predictions)
"""#Calculate performance of ARIMA and LSTM"""
def mean_absolute_percentage_error(test_data, predicted_data):
    """Return the mean absolute percentage error (MAPE), in percent.

    Each error is taken relative to the matching value of ``test_data``, so a
    zero in ``test_data`` produces a division by zero for that element.
    """
    relative_errors = (test_data - predicted_data) / test_data
    absolute_relative_errors = numpy.abs(relative_errors)
    return numpy.mean(absolute_relative_errors) * 100
def _rescale_close_values(scaled_values, scaler):
    """Map scaled 'Close' values back to the original scale with ``scaler``.

    The scalers in this file are fitted on six columns, with 'Close' at
    position 3, so the values are replicated into six columns and only
    column 3 of the inverse transform is kept.
    """
    replicated = DataFrame({"col{}".format(k): scaled_values for k in range(1, 7)})
    return DataFrame(scaler.inverse_transform(replicated))[3].to_numpy()


def calculate_performance(predictions, test_data, scaler):
    """Return ``[RMSE, MAE, MAPE]`` of ``predictions`` against ``test_data``.

    Both inputs are scaled 'Close' values; they are rescaled back with
    ``scaler`` before the metrics are computed.
    """
    # rescaling predictions and test values back to normal scale
    predictions_scaled_back = _rescale_close_values(predictions, scaler)
    test_y_scaled_back = _rescale_close_values(test_data, scaler)
    # Root mean square error; the square root is taken explicitly because the
    # `squared=False` argument of mean_squared_error is deprecated/removed in
    # recent scikit-learn releases
    RMSE = numpy.sqrt(mean_squared_error(test_y_scaled_back, predictions_scaled_back))
    # Mean absolute error
    MAE = mean_absolute_error(test_y_scaled_back, predictions_scaled_back)
    # Mean absolute percentage error
    MAPE = mean_absolute_percentage_error(test_y_scaled_back, predictions_scaled_back)
    return [RMSE, MAE, MAPE]
# one results table per lag option, in the order of lags_arr
all_results = []
result_columns = ['Token', 'ARIMA_RMSE','LSTM_RMSE', 'ARIMA_MAE', 'LSTM_MAE', 'ARIMA_MAPE', 'LSTM_MAPE']
# calculating and saving performance results for each lag option
for lag_index, lags in enumerate(lags_arr):
    print('{} lags:'.format(lags))
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append is not available in pandas 2.x, and it copied the whole
    # frame on every call.
    rows = []
    # calculating performance results for each token
    for i in range(number_of_tokens):
        test_data = test_datasets[i]['Close'].to_numpy()
        ARIMA_performance = calculate_performance(ARIMA_predictions_for_all_lags[lag_index][i], test_data, scalers[i])
        LSTM_performance = calculate_performance(LSTM_predictions_for_all_lags[lag_index][i], test_data, scalers[i])
        # each performance list is ordered as [RMSE, MAE, MAPE]
        rows.append({'Token': token_symbols[i],
                     'ARIMA_RMSE': ARIMA_performance[0], 'LSTM_RMSE': LSTM_performance[0],
                     'ARIMA_MAE': ARIMA_performance[1], 'LSTM_MAE': LSTM_performance[1],
                     'ARIMA_MAPE': ARIMA_performance[2], 'LSTM_MAPE': LSTM_performance[2]})
    results = DataFrame(rows, columns=result_columns)
    # recording all results
    all_results.append(results)
    print(results)
    print()
# saving every results table as a csv file and downloading it (optional, Colab only)
from google.colab import drive
from google.colab import files
drive.mount('/drive')
for lag_option, lag_results in zip(lags_arr, all_results):
    # one file per lag option, named after the number of lags
    fileName = 'results_ARIMA_LSTM_{}_lags.csv'.format(lag_option)
    lag_results.to_csv(fileName)
    files.download(fileName)
# a method for plotting results
def plotting_results(token_symbols, ARIMA_series, LSTM_series, metrics_name, number_of_lags):
    """Show a grouped bar chart comparing ARIMA and LSTM values per token.

    Args:
        token_symbols: labels for the x axis, one per token.
        ARIMA_series: bar heights for ARIMA, one per token.
        LSTM_series: bar heights for LSTM, one per token.
        metrics_name: metric name used in the y-axis label and the title.
        number_of_lags: number of lags shown in the title.
    """
    # The figure size has to be configured before the figure is created;
    # rcParams only affect figures created afterwards, so setting it after
    # pyplot.subplots() left the first chart at the default size.
    pyplot.rcParams["figure.figsize"] = (12,3)
    # locations of the labels
    x = numpy.arange(len(token_symbols))
    # width of a bar
    width = 0.35
    # two bars per token, placed side by side around the label location
    _, ax = pyplot.subplots()
    ax.bar(x - width/2, ARIMA_series, width, label='ARIMA')
    ax.bar(x + width/2, LSTM_series, width, label='LSTM')
    # adding labels, title, and formatting
    ax.set_ylabel('Scaled logarithms of {}'.format(metrics_name))
    ax.set_title('{} results for {} lags'.format(metrics_name, number_of_lags))
    ax.set_xticks(x)
    ax.set_xticklabels(token_symbols)
    ax.legend()
    # the tick labels are rotated so they do not overlap
    pyplot.xticks(rotation = 90)
    pyplot.show()
# plotting charts with results for each lag option
metric_names = ['RMSE', 'MAE', 'MAPE']
for i, lag_results in enumerate(all_results):
    # selecting list of tokens from the results
    token_symbols = lag_results['Token']
    number_of_tokens = len(token_symbols)
    # stacking ARIMA rows on top of LSTM rows so both are scaled together
    arima = lag_results[['ARIMA_RMSE', 'ARIMA_MAE', 'ARIMA_MAPE']].to_numpy()
    lstm = lag_results[['LSTM_RMSE', 'LSTM_MAE', 'LSTM_MAPE']].to_numpy()
    stacked_results = DataFrame(numpy.concatenate((arima, lstm)), columns=['RMSE', 'MAE', 'MAPE'])
    # taking logarithms so the values of different tokens are less spread out,
    # then min-max scaling every metric so it can be shown on the same chart
    chart_scaler = MinMaxScaler()
    scaled_values = chart_scaler.fit_transform(numpy.log(stacked_results))
    scaledResults = DataFrame(scaled_values, columns=['RMSE', 'MAE', 'MAPE'])
    # the first number_of_tokens rows are ARIMA results, the rest are LSTM results
    ARIMA_scaled_results = scaledResults[:number_of_tokens]
    LSTM_scaled_results = scaledResults[number_of_tokens:]
    # one chart per metric
    print('Results for {} lags:'.format(lags_arr[i]))
    for metric_name in metric_names:
        plotting_results(token_symbols, ARIMA_scaled_results[metric_name],
                         LSTM_scaled_results[metric_name], metric_name, lags_arr[i])
    print('\n\n')