# Source: ERC20-Machine_Learning/MainAnalysisPythonCode.py at master · atesluks/ERC20-Machine_Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# -*- coding: utf-8 -*-
"""ERC20_Machine_Learning.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1u65OMZCRYt2_aEZIwBOrSDjdKvHAWq3-
"""

# importing required libraries
import sys
import numpy as numpy
from pandas import read_csv, DataFrame
import matplotlib.pyplot as pyplot
! pip install scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# libraries for ARIMA model
! pip install pmdarima
import pmdarima as pmdarima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

# libraries for LSTM model
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping

"""# Importing and pre-processing data"""

# window sizes (number of preceding observations given to the models) that will be analysed
lags_arr = [24, 48, 72]

# symbols of the tokens that will be analysed; a '<symbol>.csv' file is read for each of them
token_symbols = [
    'UNI', 'LINK', 'AAVE', 'MKR', 'LEO', 'COMP', 'GRT', 'HT', 'CEL',
    'CHZ', 'TEL', 'YFI', 'HOT', 'ENJ', 'MANA', 'QNT', 'BAT', 'SNX', 'NEXO',
    'BNT', 'CRV', 'CHSB', 'KCS', 'ZRX', 'UMA', 'ANKR', 'VGX', '1INCH',
]
number_of_tokens = len(token_symbols)

# reading one dataset per token, in the order of token_symbols
# (the csv data files have to be uploaded before this step is executed)
raw_datasets = [read_csv('{}.csv'.format(symbol)) for symbol in token_symbols]

# share of every dataset, counted from its first row, that forms the training set;
# the remaining rows form the test set
training_set_ratio = 0.80
data_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Trades']

# per-token results, all in the order of token_symbols
scalers = []
training_datasets, test_datasets = [], []

for raw_dataset in raw_datasets:
  # keeping only the value columns (this drops the dates) and converting them to float
  values = raw_dataset[data_columns].astype('float')

  # number of leading rows that belong to the training set
  split_at = int(len(raw_dataset) * training_set_ratio)

  # the scaler is fitted on the training rows only and then applied to the whole dataset
  scaler = MinMaxScaler().fit(values[:split_at])
  scaled = DataFrame(scaler.transform(values), columns=data_columns)
  scalers.append(scaler)

  # splitting the scaled dataset at the same row
  training_datasets.append(scaled[:split_at])
  test_datasets.append(scaled[split_at:])

"""#ARIMA"""

# searching the ARIMA order for the scaled closing prices of every training dataset
ARIMA_params = []
for symbol, training_dataset in zip(token_symbols, training_datasets):
  model = pmdarima.auto_arima(
      training_dataset['Close'],
      test='adf',              # Augmented Dickey-Fuller (ADF) test is used to find the optimal 'd'
      d=None,                  # 'd' is left for the search to determine
      start_p=1, max_p=3,      # range of candidate 'p' values
      start_q=1, max_q=3,      # range of candidate 'q' values
      error_action='ignore',   # errors are not printed
      suppress_warnings=True)  # warnings are not printed

  # storing the selected order and printing it for visibility
  order = model.order
  ARIMA_params.append(order)
  print(symbol + ' - ' + str(order))

# from here on no warnings are printed
import warnings
warnings.filterwarnings("ignore")

# ARIMA forecasts, stored as [lags option][token][test observation]
ARIMA_predictions_for_all_lags = []

# one pass per lags option (i.e. for 24 lags, 48 lags, and 72 lags)
for lags in lags_arr:
  print('{} lags:'.format(lags))
  ARIMA_predictions = []

  # one pass per token
  for symbol, training_dataset, test_dataset, order in zip(
      token_symbols, training_datasets, test_datasets, ARIMA_params):
    # scaled closing prices as plain lists
    train_close = training_dataset['Close'].to_numpy().tolist()
    test_close = test_dataset['Close'].to_numpy().tolist()
    test_size = len(test_close)

    # for every test observation: the window of closing prices that precede it
    windows = []
    for j in range(test_size):
      if j < lags:
        # not enough test observations yet, so the window starts with the tail of the training data
        window = train_close[-(lags - j):] + test_close[:j]
      else:
        window = test_close[j - lags:j]
      windows.append(window)

    # a new ARIMA model is fitted on every window and asked for the next value
    curr_predictions = []
    for j, window in enumerate(windows):
      sys.stdout.write('\r' + '{} - {}/{}'.format(symbol, j, test_size))
      sys.stdout.flush()
      forecast = ARIMA(window, order=order).fit().forecast()
      curr_predictions.append(forecast[0])

    # storing the predictions of this token
    ARIMA_predictions.append(curr_predictions)
    print(' - done\n')

  # storing the predictions of all tokens for this lags option
  ARIMA_predictions_for_all_lags.append(ARIMA_predictions)

"""#LSTM"""

# LSTM inputs, each stored as [lags option][token]
training_y_arr, training_X_arr, test_X_arr = [], [], []

for lags in lags_arr:
  # per-token training targets, training batches and test batches for this lags option
  training_y = []
  training_X, test_X = [], []

  for training_dataset, test_dataset in zip(training_datasets, test_datasets):
    # targets: the closing price that follows each training window
    training_y.append(training_dataset[lags:]['Close'].to_numpy())

    # all data columns as plain arrays, converted once per token
    train_values = training_dataset.to_numpy()
    test_values = test_dataset.to_numpy()

    # training batches: every run of `lags` consecutive training rows that still has a following row
    curr_training_X = [train_values[j:j + lags] for j in range(len(train_values) - lags)]
    training_X.append(numpy.array(curr_training_X))

    # test batches: for every test row, the rows that precede it
    curr_test_X = []
    for j in range(len(test_values)):
      if j < lags:
        # the window reaches back into the training data
        window = numpy.concatenate((train_values[-(lags - j):], test_values[:j]))
      else:
        window = test_values[j - lags:j]
      curr_test_X.append(window)
    test_X.append(numpy.array(curr_test_X))

  # storing the targets and the batches of this lags option
  training_y_arr.append(training_y)
  training_X_arr.append(training_X)
  test_X_arr.append(test_X)

# array for storing all LSTM predictions results, as [lags option][token]
LSTM_predictions_for_all_lags = []

# fraction of the training samples that Keras holds out for validation (it takes them from
# the end of the data). The previous value of 0.9 left only 10% of the samples for training.
validation_fraction = 0.1

# for each lag option
for lag_index, lags in enumerate(lags_arr):
  LSTM_predictions = []
  print('{} lags:'.format(lags))

  # for each token building an LSTM model and predicting the test values
  for i in range(number_of_tokens):
    # building an LSTM model whose input shape is (rows per batch, data columns)
    training_data = training_X_arr[lag_index][i]
    model = Sequential()
    model.add(LSTM(64, input_shape=(training_data.shape[1], training_data.shape[2]), return_sequences=False))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')

    # training the LSTM model with the training dataset (training batches and their
    # dependent variables); training stops early when the validation loss has not
    # improved for 50 epochs
    earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
    model.fit(training_data, training_y_arr[lag_index][i], epochs=1000, batch_size=64,
              validation_split=validation_fraction, verbose=2, shuffle=False,
              callbacks=[earlyStopping])

    # making predictions for the test batches by the LSTM model
    yhat = model.predict(test_X_arr[lag_index][i])

    # recording predictions as a flat array
    LSTM_predictions.append(yhat.flatten('C'))
    print('{} - done\n\n'.format(token_symbols[i]))

  # recording all predictions
  LSTM_predictions_for_all_lags.append(LSTM_predictions)

"""#Calculate performance of ARIMA and LSTM"""

def mean_absolute_percentage_error(test_data, predicted_data):
  """Return the mean absolute percentage error, in percent.

  Args:
    test_data: actual values (array or sequence of numbers).
    predicted_data: predicted values, same length as test_data.

  Returns:
    mean(|test_data - predicted_data| / |test_data|) * 100.
    A zero in test_data makes the result infinite or NaN.
  """
  test_data = numpy.asarray(test_data, dtype=float)
  predicted_data = numpy.asarray(predicted_data, dtype=float)
  return numpy.mean(numpy.abs((test_data - predicted_data) / test_data)) * 100


def _rescale_close_prices(scaled_values, scaler):
  """Return scaled closing prices converted back to the original scale."""
  # the scaler works on six columns, so the values are copied into each of them
  # and only the fourth column (position 3) is read back after the inverse transform
  frame = DataFrame({'col{}'.format(k): scaled_values for k in range(1, 7)})
  rescaled = scaler.inverse_transform(frame)
  return DataFrame(rescaled)[3].to_numpy()


def calculate_performance(predictions, test_data, scaler):
  """Return [RMSE, MAE, MAPE] of the predictions, measured on the original price scale.

  Args:
    predictions: scaled predicted closing prices (array or sequence).
    test_data: scaled actual closing prices, same length as predictions.
    scaler: object with an inverse_transform method for six-column data whose
      fourth column holds the closing price.

  Returns:
    List of root mean square error, mean absolute error and mean absolute
    percentage error (in percent).
  """
  # rescaling the predictions and the test dataset back to normal scale
  predictions_scaled_back = _rescale_close_prices(predictions, scaler)
  test_y_scaled_back = _rescale_close_prices(test_data, scaler)

  # calculating performance metrics; RMSE is computed directly because the
  # `squared=False` option of sklearn's mean_squared_error no longer exists in
  # current scikit-learn releases
  errors = test_y_scaled_back - predictions_scaled_back
  RMSE = numpy.sqrt(numpy.mean(errors ** 2))  # Root mean square error
  MAE = numpy.mean(numpy.abs(errors))  # Mean absolute error
  MAPE = mean_absolute_percentage_error(test_y_scaled_back, predictions_scaled_back)  # Mean absolute percentage error
  return [RMSE, MAE, MAPE]

# array holding all results: one DataFrame per lag option, in the order of lags_arr
all_results = []

result_columns = ['Token', 'ARIMA_RMSE', 'LSTM_RMSE', 'ARIMA_MAE', 'LSTM_MAE', 'ARIMA_MAPE', 'LSTM_MAPE']

# calculating and saving performance results for each lag option
for l, lags in enumerate(lags_arr):
  print('{} lags:'.format(lags))

  # rows are collected in a list and turned into a DataFrame once, because
  # DataFrame.append is not available in pandas 2.0 and later
  rows = []

  # calculating and saving performance results for each token
  for i in range(number_of_tokens):
    # calculating performance for a token
    test_data = test_datasets[i]['Close'].to_numpy()
    ARIMA_performance = calculate_performance(ARIMA_predictions_for_all_lags[l][i], test_data, scalers[i])
    LSTM_performance = calculate_performance(LSTM_predictions_for_all_lags[l][i], test_data, scalers[i])

    # recording results for a token
    rows.append({'Token': token_symbols[i],
                 'ARIMA_RMSE': ARIMA_performance[0], 'LSTM_RMSE': LSTM_performance[0],
                 'ARIMA_MAE': ARIMA_performance[1], 'LSTM_MAE': LSTM_performance[1],
                 'ARIMA_MAPE': ARIMA_performance[2], 'LSTM_MAPE': LSTM_performance[2]})

  results = DataFrame(rows, columns=result_columns)

  # recording all results
  all_results.append(results)
  print(results)
  print()

# saving every results table to a csv file and downloading it (optional; uses the google.colab package)
from google.colab import drive
from google.colab import files
drive.mount('/drive')

for lags_option, results_table in zip(lags_arr, all_results):
  fileName = 'results_ARIMA_LSTM_{}_lags.csv'.format(lags_option)
  results_table.to_csv(fileName)
  files.download(fileName)

# a method for plotting results
def plotting_results(token_symbols, ARIMA_series, LSTM_series, metrics_name, number_of_lags, figsize=(12, 3)):
  """Show a grouped bar chart comparing ARIMA and LSTM values for every token.

  Args:
    token_symbols: labels of the x axis, one per token.
    ARIMA_series: values of the ARIMA bars, one per token.
    LSTM_series: values of the LSTM bars, one per token.
    metrics_name: name of the metric, used in the y label and the title.
    number_of_lags: lags option, used in the title.
    figsize: (width, height) of the figure in inches.
  """
  # locations of the labels
  x = numpy.arange(len(token_symbols))

  # width of a bar
  width = 0.35

  # the size is passed to the figure that is being created: assigning
  # rcParams["figure.figsize"] after subplots() only affects figures created later
  fig, ax = pyplot.subplots(figsize=figsize)

  # ARIMA bars are drawn left of every label position, LSTM bars right of it
  ax.bar(x - width / 2, ARIMA_series, width, label='ARIMA')
  ax.bar(x + width / 2, LSTM_series, width, label='LSTM')

  # adding labels, title, and formatting
  ax.set_ylabel('Scaled logarithms of {}'.format(metrics_name))
  ax.set_title('{} results for {} lags'.format(metrics_name, number_of_lags))
  ax.set_xticks(x)
  ax.set_xticklabels(token_symbols, rotation=90)
  ax.legend()
  pyplot.show()

# names of the metrics, in the order in which their charts are drawn
metric_names = ['RMSE', 'MAE', 'MAPE']

# drawing the charts of every lag option
for lags_option, lag_results in zip(lags_arr, all_results):
  # tokens listed in the results
  token_symbols = lag_results['Token']
  number_of_tokens = len(token_symbols)

  # ARIMA rows are stacked on top of the LSTM rows so that both are scaled together
  arima = lag_results[['ARIMA_RMSE', 'ARIMA_MAE', 'ARIMA_MAPE']].to_numpy()
  lstm = lag_results[['LSTM_RMSE', 'LSTM_MAE', 'LSTM_MAPE']].to_numpy()
  stacked_results = DataFrame(numpy.vstack((arima, lstm)), columns=metric_names)

  # logarithms are taken so the values of different tokens are less spread out
  log_results = numpy.log(stacked_results)

  # every metric is scaled so the results can be visualised on the same chart
  scaler = MinMaxScaler()
  scaledResults = DataFrame(scaler.fit_transform(log_results), columns=metric_names)

  # the first number_of_tokens rows are the ARIMA results, the rest are the LSTM results
  ARIMA_scaled_results = scaledResults[:number_of_tokens]
  LSTM_scaled_results = scaledResults[number_of_tokens:]

  # one chart per metric
  print('Results for {} lags:'.format(lags_option))
  for metric_name in metric_names:
    plotting_results(token_symbols, ARIMA_scaled_results[metric_name],
                     LSTM_scaled_results[metric_name], metric_name, lags_option)
  print('\n\n')