from multiprocessing import Pool

import time
import timeit
from datetime import datetime, timedelta
import os, fnmatch
import psutil
import csv
import numpy as np
import itertools
import sys
import mysql.connector
import pandas as pd
import pymysql

from backtesting import Strategy
from backtesting import Backtest
from scipy.stats import linregress
from mk_test_file import mk_test

import warnings
from statistics import mean
import matplotlib.pyplot as plt  # load plot library


xs = np.array([1, 2, 3, 4, 5], dtype=np.float64)
ys = np.array([5, 4, 6, 5, 6], dtype=np.float64)

def best_fit_slope(xs, ys):
    m = ((mean(xs) * mean(ys)) - mean(xs * ys)) / ((mean(xs) ** 2) - mean(xs ** 2))
    return m
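
# Hedged sanity check for best_fit_slope, not part of the pipeline: for the
# sample xs/ys above the least-squares slope
#   m = (mean(x)*mean(y) - mean(x*y)) / (mean(x)^2 - mean(x^2))
# evaluates to (3*5.2 - 16.2) / (9 - 11) = 0.3 and should agree with numpy's
# polyfit. The RUN_SELFTEST environment-variable guard is an assumption added
# so the check never executes during normal invocations.
if os.environ.get("RUN_SELFTEST") == "1":
    assert abs(best_fit_slope(xs, ys) - 0.3) < 1e-9
    assert abs(np.polyfit(xs, ys, 1)[0] - 0.3) < 1e-9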


table = ""
postfix = "*export1.csv"  # appended to the regex when searching for Altreva report files

# maximum number of records to write into the database (saves time)
MaxResultRows2Database = 100
MaxPoolForModelsGroup = 15  # how many top models form the pool for group selection

# fixed columns in the database
f_date = ""
l_date = ""

# columns in the report file
fBar_col = 0
fDate_col = 1
fTime_col = 2
fPrice_col = 3
fSignal_col = 5
fFDA_col = 15  # (Trailing 1 bars; Range applied)
fFDS_col = 16  # (Trailing 100 bars; Range applied)

# this will hold [Open,High,Low,Close,Buy,Sell] values for every row of the quotes
data_ohlcbs = None
data_ohlcbs_a = None
data_ohlcbs_b = None

# we will need these to make the performance comparison in the database
slope_a = 0
slope_b = 0

# for the filename of the chart
chart_title = ""
chart_filename = ""


# columns in the OHLC HISTORY file
_d = 0
_t = 1
_o = 2
_h = 3
_l = 4
_c = 5
_b = 6
_s = 7
CHUNK_A = 0
CHUNK_B = 1


# NOTE: THE COLUMNS BELOW ARE NOT THERE FOR CURRENT CONFIGS!
fFD_AUC_col = 17  # (Trailing 1 bars; Range applied)
fFDA_All = 21

# columns in the numpy array
aDate_col = 0
aTime_col = 1
aPrice_col = 2
aSignal_col = 3

# model parameters
iPeriods = range(5, 10, 1)  # period values for FDA & FA_AUC calculations
FDA_threshold = 0.7
group_sizes = [1, 2, 4, 5, 6, 7, 8, 9]


# we will need the data below to fill the MySQL table
# integer bar numbers
start_a = 0
end_a = 0
start_b = 0
end_b = 0

# numpy arrays
price_a = 0
price_b = 0

# doubles: PnL benchmarks (whether a trade beats the long-term buy/sell strategy, and the best trade taken between the all-time chart extrema)
buy_hold_a = 0
max_min_a = 0
buy_hold_b = 0
max_min_b = 0

# dates
dt_a = 0
dt_b = 0

start_a_date = 0
end_a_date = 0
start_b_date = 0
end_b_date = 0

# actual trading days, used to compute Performance (avg daily PnL) and Turnover (lots) per period of given days
trading_days_a = 0
trading_days_b = 0

def identity(values):
    """
    Function used in the strategy to pass the signal values through unchanged.
    """
    return pd.Series(values)

class MyStrategy(Strategy):

    def init(self):
        self.signals = self.I(identity, self.data.Signals)
        self.files_period = self.data.files_period[0]

    def next(self):
        signal = self.signals[-1]
        if signal > 0:
            # let the strategy close any current position and use all available funds to buy the asset
            self.buy()
        elif signal < 0:
            # let the strategy close any current position and use all available funds to short-sell the asset
            self.sell()
        else:
            assert signal == 0
            self.position.close()

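
# Hedged usage sketch for MyStrategy, assuming a DataFrame with the OHLC
# columns backtesting.py expects plus the custom Signals / files_period
# columns this strategy reads; the tiny frame below is hypothetical and is
# not built anywhere in this script, so the block is guarded to stay inert.
if os.environ.get("RUN_SELFTEST") == "1":
    demo_quotes = pd.DataFrame({
        "Open":  [1.30, 1.31, 1.32, 1.31],
        "High":  [1.31, 1.32, 1.33, 1.32],
        "Low":   [1.29, 1.30, 1.31, 1.30],
        "Close": [1.31, 1.32, 1.31, 1.30],
        "Signals": [1, 1, -1, 0],      # digitized Long/Short/Cash signals
        "files_period": [7, 7, 7, 7],  # FDA period the signals were built with
    })
    print(Backtest(demo_quotes, MyStrategy, cash=10000, commission=0).run())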


def check_table(table):  # create the table if it does not exist; return silently if it does

    # todo: change logic
    # if it does not exist, create it; otherwise leave it alone

    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        passwd="xsjhk%123SD==-Ghk11",
        database="altreva"
    )

    cursor = mydb.cursor()

    stmt = "SHOW TABLES LIKE '" + table + "'"
    cursor.execute(stmt)
    result = cursor.fetchone()
    if result: return  # there is a table named "tableName"

    # there is no table named "tableName"

    # this should create the table OR exit with an error
    _SQL = "CREATE TABLE `" + table + "` ("
    _SQL += "`ts` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"

    _SQL += "`chunk` INT NULL,"
    _SQL += "`hist_bars` VARCHAR(45) NULL,"
    _SQL += "`length` INT NULL,"
    _SQL += "`ratio` FLOAT NULL,"
    _SQL += "`period_fda` INT NULL,"

    _SQL += "`pnl_a` DOUBLE NULL,"
    _SQL += "`dd_a` DOUBLE NULL,"
    _SQL += "`buy_hold_a` DOUBLE NULL,"
    _SQL += "`max_min_a` DOUBLE NULL,"

    _SQL += "`pnl_b` DOUBLE NULL,"
    _SQL += "`dd_b` DOUBLE NULL,"
    _SQL += "`buy_hold_b` DOUBLE NULL,"
    _SQL += "`max_min_b` DOUBLE NULL,"

    _SQL += "`trading_days_a` INT NULL,"
    _SQL += "`trading_days_b` INT NULL,"

    _SQL += "`trades_a` INT NULL,"
    _SQL += "`trades_b` INT NULL,"

    # these are calculated automatically by an expression like "field_name DOUBLE GENERATED ALWAYS AS (pnl_a/trading_days)"
    _SQL += "`avg_d_pnl_a` DOUBLE GENERATED ALWAYS AS (pnl_a/trading_days_a),"
    _SQL += "`avg_d_dd_a` DOUBLE GENERATED ALWAYS AS (dd_a/trading_days_a),"
    _SQL += "`avg_d_pnl_b` DOUBLE GENERATED ALWAYS AS (pnl_b/trading_days_b),"
    _SQL += "`avg_d_dd_b` DOUBLE GENERATED ALWAYS AS (dd_b/trading_days_b),"

    _SQL += "`avg_d_vol_a` DOUBLE GENERATED ALWAYS AS (trades_a/trading_days_a),"
    _SQL += "`avg_d_vol_b` DOUBLE GENERATED ALWAYS AS (trades_b/trading_days_b),"

    # log the dates as well
    _SQL += "`start_a` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
    _SQL += "`end_a` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
    _SQL += "`start_b` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
    _SQL += "`end_b` TIMESTAMP DEFAULT CURRENT_TIMESTAMP);"

    cursor.execute(_SQL)
    ts("Created table " + table)
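

# Hedged variant of the existence check above (a sketch, same connection
# assumptions): mysql.connector accepts placeholder parameters for the LIKE
# pattern, which avoids splicing the table name into the statement by hand.
# Guarded so it never runs during normal invocations.
if os.environ.get("RUN_SELFTEST") == "1":
    demo_db = mysql.connector.connect(host="localhost", user="root",
                                      passwd="xsjhk%123SD==-Ghk11", database="altreva")
    demo_cur = demo_db.cursor()
    demo_cur.execute("SHOW TABLES LIKE %s", (table,))
    print("table exists:", demo_cur.fetchone() is not None)
    demo_db.close()
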
219
220def write2mysql(vofv):
221
222 mydb = mysql.connector.connect(
223 host="localhost",
224 user="root",
225 passwd="xsjhk%123SD==-Ghk11",
226 database="altreva"
227 )
228
229 mycursor = mydb.cursor()
230
231 '''
232use altreva;
233CREATE TABLE `1K_take_1_usdcad_1440` (
234
235/* 0 */ `ts` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
236/* 1 */ `chunk` int(11) DEFAULT NULL,
237/* 2 */ `hist_bars` varchar(45) DEFAULT NULL,
238/* 3 */ `length` int(11) DEFAULT NULL,
239/* 4 */ `ratio` float DEFAULT NULL,
240/* 5 */ `period_fda` int(11) DEFAULT NULL,
241/* 6 */ `pnl_a` double DEFAULT NULL,
242/* 7 */ `dd_a` double DEFAULT NULL,
243/* 8 */ `buy_hold_a` double DEFAULT NULL,
244/* 9 */ `max_min_a` double DEFAULT NULL,
245/* 10 */ `pnl_b` double DEFAULT NULL,
246/* 11 */ `dd_b` double DEFAULT NULL,
247/* 12 */ `buy_hold_b` double DEFAULT NULL,
248/* 13 */ `max_min_b` double DEFAULT NULL,
249/* 14 */ `trading_days_a` int(11) DEFAULT NULL,
250/* 15 */ `trading_days_b` int(11) DEFAULT NULL,
251/* 16 */ `trades_a` int(11) DEFAULT NULL,
252/* 17 */ `trades_b` int(11) DEFAULT NULL,
253/* 18 */ `avg_d_pnl_a` double GENERATED ALWAYS AS ((`pnl_a` / `trading_days_a`)) VIRTUAL,
254/* 19 */ `avg_d_dd_a` double GENERATED ALWAYS AS ((`dd_a` / `trading_days_a`)) VIRTUAL,
255/* 20 */ `avg_d_pnl_b` double GENERATED ALWAYS AS ((`pnl_b` / `trading_days_b`)) VIRTUAL,
256/* 21 */ `avg_d_dd_b` double GENERATED ALWAYS AS ((`dd_b` / `trading_days_b`)) VIRTUAL,
257/* 22 */ `avg_d_vol_a` double GENERATED ALWAYS AS ((`trades_a` / `trading_days_a`)) VIRTUAL,
258/* 23 */ `avg_d_vol_b` double GENERATED ALWAYS AS ((`trades_b` / `trading_days_b`)) VIRTUAL,
259/* 24 */ `start_a` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
260/* 25 */ `end_a` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
261/* 26 */ `start_b` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
262/* 27 */ `end_b` timestamp NULL DEFAULT CURRENT_TIMESTAMP
263
264) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
265
266 '''
267
268 print(vofv)
269
270 # 0 1 2 3 4 5 6
271 #return [ str(group_str.count("/"))+" "+group_str, res["SQN"], res["# Trades"], res["Return [%]"], res["Max. Drawdown [%]"], res["Equity Final [$]"], res["Equity Peak [$]"] ]
272 group_str = vofv[0]
273
274 #### TODO: fill these
275 chunk = ""
276 ratio = 0
277 bars = 0
278 ####
279
280 sqn = vofv[1]
281 trades = vofv[2]
282 return_percent = vofv[3]
283 max_dd = vofv[4]
284 equity = vofv[5]
285 equity_peak = vofv[6]
286
287
288 '''
289 first = True
290 for index, row in top.iterrows():
291 sql =""
292 files_periods = row["files_periods"]
293 trades = row["trades"]
294 pnl = row["pnl"]
295 dd = row["dd"]
296
297 if (first):
298 gen_signals_file(files_periods)
299 first=False
300
301 sql += "INSERT INTO " + table + " (files_periods, trades, pnl, dd, openclose_range, max_range,trading_days, first_date, last_date) VALUES ("
302 sql += repr(files_periods) + "," + str(trades) + "," + str(pnl) + "," + str(dd) + "," + str(open_close_range) + "," + str(max_range) + "," + str(trading_days) + ","
303 sql += repr(f_date) + "," + repr(l_date)
304 sql += ");"
305 #print(sql)
306 mycursor.execute(sql)
307 mydb.commit()
308 mydb.close()
309 '''
310
311
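# Hedged sketch of what the INSERT could look like with placeholder
# parameters instead of the string concatenation in the commented block
# above; the column names follow the commented CREATE TABLE and the values
# are hypothetical. Guarded so it stays inert during normal runs.
if os.environ.get("RUN_SELFTEST") == "1":
    demo_db = mysql.connector.connect(host="localhost", user="root",
                                      passwd="xsjhk%123SD==-Ghk11", database="altreva")
    demo_cur = demo_db.cursor()
    demo_cur.execute(
        "INSERT INTO `" + table + "` (chunk, pnl_a, dd_a, trades_a) VALUES (%s, %s, %s, %s)",
        (1, 12.5, 3.2, 42))  # hypothetical values
    demo_db.commit()
    demo_db.close()
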
# we put 1 where the trailing average is at or above the threshold, so we can multiply signals by this mask later
def fda_avg(fda, period):
    rslt = np.zeros(len(fda), dtype=float, order='C')
    for r in range(len(fda)):
        if (r + 1 - period >= 0):
            rslt[r] = 1 if fda[r + 1 - period:r + 1].mean() >= FDA_threshold else 0

    return rslt
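
# Hedged vectorized alternative to fda_avg (a sketch with the same
# trailing-window semantics): one convolution computes every rolling mean in
# a single pass, which may matter once len(fda) * len(iPeriods) grows large.
# Positions with fewer than `period` samples stay 0, as in the loop above;
# results can differ from the slice means only by floating-point rounding.
def fda_avg_fast(fda, period):
    roll = np.convolve(fda, np.ones(period) / period, mode='valid')  # mean of each window of `period` values
    rslt = np.zeros(len(fda), dtype=float, order='C')
    rslt[period - 1:] = (roll >= FDA_threshold).astype(float)
    return rslt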

# File I/O function for multiprocessing
def files2sig_fda(filename):
    sig = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols=(fSignal_col))
    fda = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols=(fFDA_col))
    return [filename, sig, fda]

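# Hedged I/O sketch: the function above parses the same CSV twice. Reading
# both columns as strings in a single pass and casting the FDA column
# afterwards halves the parsing work; this hypothetical drop-in variant
# assumes the FDA column is fully populated (empty cells would break the
# float cast, which the two-pass version tolerates as NaN).
def files2sig_fda_single_pass(filename):
    raw = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=str,
                        skip_header=True, usecols=(fSignal_col, fFDA_col))
    return [filename, raw[:, 0], raw[:, 1].astype(float)]
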

def digitize(sig_fda, period=-1):  # used in multiprocessing mode

    # note: the I/O below was moved out to files2sig_fda so disk reads don't slow down multiprocessing
    #sig = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols=(fSignal_col))
    #fda = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols=(fFDA_col))

    filename = sig_fda[0]
    sig = sig_fda[1]
    fda = sig_fda[2]

    for r in range(0, len(sig), 1):
        if (r == 0 and sig[r] == ''):  # if the first row is empty, we don't know what signal was there, so we put 0 aka Cash
            sig[r] = 0
        else:
            if (sig[r] == ''):
                sig[r] = sig[r-1]  # set to the previous signal
            elif (sig[r] == 'Cash'):
                sig[r] = 0
            elif (sig[r] == 'Long'):
                sig[r] = 1
            elif (sig[r] == 'Short'):
                sig[r] = -1

    sig = sig.astype(float)  # original signals

    v = []  # as many rows as we have periods; each contains signals filtered by the FDA of the corresponding period
    if (period == -1):  # do all periods
        for p in iPeriods:
            v.append([filename, p, np.multiply(sig, fda_avg(fda, p))])  # elementwise multiplication of signal and the {1,0} FDA mask
    else:  # use the specific period (the mask must be built from `period`; the loop variable `p` is undefined in this branch)
        v.append([filename, period, np.multiply(sig, fda_avg(fda, period))])

    return v
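
# Hedged usage sketch chaining the reader and the digitizer for one report
# file (the filename is hypothetical). Each element of the result holds
# [filename, period, signals], where the signals are -1/0/1 values already
# gated by the FDA mask for that period. Guarded so it never runs normally.
if os.environ.get("RUN_SELFTEST") == "1":
    for demo_file, demo_period, demo_signals in digitize(files2sig_fda("some_report export1.csv")):
        print(demo_file, demo_period, demo_signals[:10])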

def drawdowns(equity_curve):
    i = np.argmax(np.maximum.accumulate(equity_curve) - equity_curve)  # end of the drawdown period
    if i == 0:  # guard: the curve never dropped below a previous peak, so there is no drawdown
        return 0.0
    j = np.argmax(equity_curve[:i])  # start of the period
    drawdown = abs(100.0 * (equity_curve[i] - equity_curve[j]))

    return drawdown
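
# Hedged sanity check: for the toy curve below the running peak is 3.0 at
# index 2 and the deepest point after it is 1.0 at index 4, so the result is
# abs(100 * (1 - 3)) = 200 - note this is 100x the absolute drop, not a true
# percentage, despite the "%" labels used by the callers. Guarded to stay inert.
if os.environ.get("RUN_SELFTEST") == "1":
    assert drawdowns(np.array([1.0, 2.0, 3.0, 2.0, 1.0, 2.5])) == 200.0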


# pnl_classic( [ filename, period, chunk_section, signals, calc_dd ] )

#
def pnl_classic(model_infos):

    BUY_COL = 4
    SELL_COL = 5

    filename = model_infos[0]
    period = model_infos[1]
    ohlcbs = model_infos[2]
    signals = model_infos[3]
    calc_dd = model_infos[4]


    #print(len(ohlcbs), len(signals))
    #exit()
    #print(ohlcbs)

    trades = 0
    pnl = 0
    max_dd_percent = 0

    dt_pnl = [0]  # the first element is 0

    for r in range(1, len(ohlcbs), 1):  # a signal on line N has to be traded on line N+1, thus no PnL on line 0
        signal = signals[r-1]  # TODO: are the signals themselves right? Check it out at some point!
        dpips = 0  # it's really important to reset this to 0 for rows that have no signal
        if (signal != 0):  # it was a valid signal and we execute it
            trades += 1
            if (signal > 0):
                dpips = ohlcbs[r][BUY_COL]
            else:
                dpips = ohlcbs[r][SELL_COL]
        pnl += dpips  # dpips was reset above, so it's OK to add it unconditionally

        # DEBUG - TEST THIS NEXT TODO!
        #if (filename == "54_16055-19155_3000_0.9_USDCAD_240 1075_export1.csv" and period == 8):
        #    # maybe add DATE/time for the signals and prices columns to make sure we can see precisely the right rows
        #    print("DEBUG:", ohlcbs[r][0], ohlcbs[r][1], ohlcbs[r][2], ohlcbs[r][3], ohlcbs[r][4], ohlcbs[r][5], signals[r], dpips, pnl)

        dt_pnl.append(pnl)  # this dataseries is used for the drawdown calculation and also returned as the PnL chart

    if pnl > 0:
        if (calc_dd == True):  # only do it here, to save resources
            max_dd_percent = drawdowns(np.array(dt_pnl))
        else:
            max_dd_percent = 0  # initially we skip this for the sake of CPU resources

        #print(filename, period, trades, pnl, max_dd_percent)
        label = filename + ", "
        if (period != 0): label += "p=" + str(period) + ", "
        label += "pnl=" + str(round(pnl, 5)) + ", dd=" + str(round(max_dd_percent, 2)) + "%, trades=" + str(trades) + ", bars=" + str(len(ohlcbs))  # the bar count, not the raw array, belongs in the label

        return [filename, period, trades, pnl, max_dd_percent, dt_pnl, label]
    else:
        return None

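# Hedged worked example: two bars, one Long signal on bar 0. The trade is
# executed on bar 1, where the Buy column (index 4) holds Close[1]-Close[0];
# with toy rows laid out as [Open, High, Low, Close, Buy, Sell] the PnL is
# 0.5 over 1 trade. Guarded so it stays inert during normal runs.
if os.environ.get("RUN_SELFTEST") == "1":
    toy_bars = np.array([[0, 0, 0, 1.0, 0.0, 0.0],
                         [0, 0, 0, 1.5, 0.5, -0.5]])
    toy = pnl_classic(["toy.csv", 0, toy_bars, np.array([1.0, 0.0]), True])
    assert toy[2] == 1 and toy[3] == 0.5  # trades, pnl
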
# chart_title should contain Gradient_A=... Gradient_B=... GradientA/GradientB=...
def save_chart(close_price, v_separator, \
               best_a, label_a, best_ga, label_ga, best_b, label_b, \
               best_gb, label_gb, best_ab, label_ab, best_gab, label_gab):

    # TODO: pips_per_week for each series, so that it differentiates between the timeframes of the report files, saved somewhere in a global constant

    chart_close, = plt.plot(close_price, label="Price Close")
    chart_best_a, = plt.plot(best_a, label=label_a)
    chart_best_ga, = plt.plot(best_ga, label=label_ga)
    chart_best_b, = plt.plot(best_b, label=label_b)
    chart_best_gb, = plt.plot(best_gb, label=label_gb)
    chart_best_ab, = plt.plot(best_ab, label=label_ab)
    chart_best_gab, = plt.plot(best_gab, label=label_gab)

    plt.axvline(x=v_separator)
    plt.legend([chart_close, chart_best_a, chart_best_ga, chart_best_b, chart_best_gb, chart_best_ab, chart_best_gab], loc='lower left')
    plt.title(label=chart_title, fontdict=None, loc='center')
    try:
        os.mkdir(os.path.dirname(chart_filename))  # the target folder comes from chart_filename; a bare `folder` name is not defined anywhere
    except FileExistsError:
        pass
    plt.savefig(chart_filename)


# do combinatorics and PnL calculations
def process_reports(ReportNames):  # we should probably pass the arrays in separately here

    # build numpy arrays with digitized signals adjusted by FDA; each entry corresponds to an FDA period from iPeriods
    ts("building digitized vectors of signals adjusted by FDA values")
    # step 1: multiprocess-read all files and return the vector of files
    with Pool() as p: sig_fda = p.map(files2sig_fda, ReportNames)
    # step 2: multiprocess-digitize the signals
    # npReports contains [filename, period, digitized_signals]
    with Pool() as p: dig = p.map(digitize, sig_fda)  # p.map returns data in consecutive order, this was tested

    npReports = []
    for d in dig:
        for e in d:
            npReports.append(e)
            #print(e)


    ''' we'll need the code below for tables of correlations of the models' signals
    header = ""
    for f in best:
        header += (f[0]).split(" ")[1] + "-" + str(f[1]) + "-" + str(f[2]) + ","
    print(header)
    for r in range(0, 2999, 1):
        sig = str(r) + ","
        for f in best:
            for e in npReports:
                if f[0] == e[0] and f[1] == e[1]:
                    sig += str(e[2][r]) + ","
        print(sig)

    exit()
    '''

    # models of size 1 on the chunk's section A
    ts(" -> calculating PnLs for period A")
    mp_a = []  # mp stands for Model Period
    mp_b = []
    #print(npReports)

    # these will be cut accordingly, like data_ohlcbs_a & data_ohlcbs_b
    signals_a = []
    signals_b = []
    for r in npReports:
        filename = r[0]
        period = r[1]
        ''' doing it like before:
        data_ohlcbs_a = data_ohlcbs[start_a:end_a]
        data_ohlcbs_b = data_ohlcbs[start_b:end_b]
        '''
        sig_all = r[2]
        signals_a = sig_all[start_a:end_a]
        signals_b = sig_all[start_b:end_b]
        # TODO: fix the below for this very format: pnl_classic( [ filename, period, chunk_section, signals, calc_dd ] )
        mp_a.append([filename, period, data_ohlcbs_a, signals_a, True])  # note: price_a is used for the data
        # need to cut the signals for the line below!
        mp_b.append([filename, period, data_ohlcbs_b, signals_b, True])  # note: price_a is used for the data



    #with Pool() as p: sqlLines = [l for l in p.map(pnl, mp) if l is not None]
    with Pool() as p: results_positive_pnl_a = [l for l in p.map(pnl_classic, mp_a) if l is not None]
    with Pool() as p: results_positive_pnl_b = [l for l in p.map(pnl_classic, mp_b) if l is not None]

    '''
    print("PnL results for A")
    for a in results_positive_pnl_a:
        print(a)
    print("\n")
    print("PnL results for B")
    for b in results_positive_pnl_b:
        print(b)
    '''

    # this will show the whole dataframe
    pd.set_option('display.max_columns', None)

    pnl_classic_cols = ["Filename", "Period", "# Trades", "PnL", "Max. Drawdown", "PnL Curve", "Label"]  # pnl_classic returns 7 fields, so the frame needs 7 column names
    df_a = pd.DataFrame(results_positive_pnl_a)
    df_a.columns = pnl_classic_cols
    print("TOP models with positive pnl_classic generated for A:", len(df_a))
    top_a = df_a.sort_values(by=["PnL"], ascending=False)[:MaxPoolForModelsGroup]  # keep the top MaxPoolForModelsGroup rows sorted desc by PnL
    print(top_a)

    df_b = pd.DataFrame(results_positive_pnl_b)
    df_b.columns = pnl_classic_cols
    print("TOP models with positive pnl_classic generated for B:", len(df_b))
    top_b = df_b.sort_values(by=['PnL'], ascending=False)[:MaxPoolForModelsGroup]  # keep the top MaxPoolForModelsGroup rows sorted desc by PnL
    print(top_b)

    # build a list consisting of our best models from the top
    mps = []  # model/period/signals
    #print("top length", len(top))
    for index, row in top_a.iterrows():
        f = row['Filename']
        p = row['Period']
        # find this entry in npReports:
        for npr in npReports:
            if (npr[0] == f and npr[1] == p):
                signals = npr[2]
                mps.append([f, p, signals])  # note: price_a is used for the data
                #print(f, p, "ADDED")

    #ts(" -> calculating PnLs for groups on period A")
    # construct the groups
    #print(mps)

    models = range(0, len(top_a), 1)
    groups = []  # array of combinations of models
    for n in group_sizes:
        comb = list(itertools.combinations(models, n))
        for c in comb:
            groups.append(c)  # adding the combination to the list of groups

    #print(groups)
    #exit()
    # fill in the groups with data
    #print("Total model groups: ", len(groups))
    mg = []
    for g in groups:  # for each group of model indices
        #print(":: group", g)
        group_str = ""  # string with the list of all models
        group_sig = np.zeros(len(data_ohlcbs_a), dtype=float, order='C')  # CHECK!!! is the size correct? YES - we just add ALL the signals
        for m_index in g:
            # m_index is the index of the model inside models
            model = mps[m_index]
            f = model[0]
            p = str(model[1])  # that's the integer period
            s = model[2]
            group_str += f + ":" + p + "/"  # just a long string listing model:period pairs separated by the '/' character
            #print('s:', len(s))
            #print('group_sig', len(group_sig))
            group_sig = np.add(group_sig, s[start_a:end_a])

        # mp_b.append( [ filename, period, data_ohlcbs_b, signals_b, True] )  # note: price_a is used for the data

        mg.append([group_str, 0, data_ohlcbs_a, group_sig, True])  # note: price_a is used for the data

    # execute pnl for each group in multiprocessing mode
    with Pool() as p: profitable_groups_a = [l for l in p.map(pnl_classic, mg) if l is not None]
    df = pd.DataFrame(profitable_groups_a)
    df.columns = pnl_classic_cols
    print("PART A: Total groups with positive PnL generated:", len(profitable_groups_a))
    top_g_a = df.sort_values(by=['PnL'], ascending=False)[:MaxPoolForModelsGroup]  # keep the top MaxPoolForModelsGroup rows sorted desc by PnL
    # show the best ones from the groups
    print(top_g_a)

    #exit()

    # TOFIX: all below is WRONG! because we need to take only the 1 best group from the top

    # take the best model group
    mg_b = []  # model/period/signals
    df = pd.DataFrame(profitable_groups_a)  # the groups computed above, in the pnl_classic layout
    df.columns = pnl_classic_cols
    # TODO: try sorting by SQN versus Return, see which one works better
    top_group = df.sort_values(by=['PnL'], ascending=False)[:MaxPoolForModelsGroup]  # keep the top MaxPoolForModelsGroup rows sorted desc by PnL
    #print("top length", len(top))
    for index, row in top_group.iterrows():
        group_str = row['Filename']  # the group string is stored in the Filename slot of the pnl_classic result

        # get this group's signal
        for m in mg:
            if (m[0] in group_str):  # group_str contains the number of models before it, so we look for mg[0] inside it, the original group_str
                #print(m[0], group_sig)
                group_sig = m[3]  # the signals sit at index 3 of the mg rows
                mg_b.append([m[0], group_sig, price, start_b, end_b])  # press it for period B
                #print(mg_b)
                break
        break  # this is exactly how we take only the 1 top group, as it's already sorted by PnL

    # TODO: calculate and print the performance results of individual groups on PART B - THIS IS THE BENCHMARK!

    with Pool() as p: profitable_groups_b = [l for l in p.map(pnl_group, mg_b) if l is not None]  # TOFIX: pnl_group is not defined in this script - see the GitHub link in the TODO below
    df = pd.DataFrame(profitable_groups_b)  #, columns=["Group","SQN","# Trades","Return [%]","Max. Drawdown [%]","Equity Final [$]","Equity Peak [$]"])
    print(df)
    #print("PART B: Total groups with positive Return [%] generated:", len(profitable_groups_b))
    #print(df)

    print("\nGroup,SQN,# Trades,Return [%],Max. Drawdown [%],Equity Final [$],Equity Peak [$],B Return [%],B Max. Drawdown [%],B Equity Final [$],B Equity Peak [$]")
    for index, row in df.iterrows():
        s = row['Group']
        sqn = row['SQN']
        trades = row['# Trades']
        ret = row['Return [%]']
        max_dd = row['Max. Drawdown [%]']
        eq = row['Equity Final [$]']
        ep = row['Equity Peak [$]']
        for iindex, b_row in top_b.iterrows():
            b_ret = b_row['Return [%]']
            b_max_dd = b_row['Max. Drawdown [%]']
            b_eq = b_row['Equity Final [$]']
            b_ep = b_row['Equity Peak [$]']
            print(s, ",", sqn, ",", trades, ",", ret, ",", max_dd, ",", eq, ",", ep, ",", b_ret, ",", b_max_dd, ",", b_eq, ",", b_ep)
            break
        break

    #print("actual group results:", profitable_groups_b)
    #print("best individual results:")
    #print(top_b)

    #write2mysql(profitable_groups_b)

    # TODO OPTIMIZATIONS: try signal thresholds when modelling


    '''
    TODO:
    1) get the best models by formula:
       algo: take the first 10, then keep adding more until 0.xx stays the same
       n = number of model:period pairs
    2) combinations of [5 to n-1] models
       IMPORTANT: make sure not to read the report files twice, use the existing report - SAVE TIME, RAM AND I/O RESOURCES
    3) write to table tblPrefix_groups_[the same as this table of individual models]

    Use code from here https://github.com/packandsell/AM/blob/master/gen-mt4-group-perm-sig.py
    TODO: a) check which PnL and digitization code is more optimal
          b) how are pnl and pnl2 different, and which one is better?

    4) generate a graph for every chunk:period and mark the chunks with vertical lines
    '''



# print something with a timestamp
def ts(s):
    now = datetime.now()
    print(now.strftime("%Y-%m-%d %H:%M:%S") + " // " + s)


if __name__ == "__main__":

    warnings.filterwarnings("ignore")  # the backtester raises warnings, we don't need them

    # python ts-combinator.py 1K_take_1 8_3253-4853_1500_0.7_USDCAD_1440 history_file_that_has_OHLC.csv
    start = time.time()  # timing
    #print(sys.argv, len(sys.argv))
    if (len(sys.argv) != 4):
        ts("ERR: wrong number of input parameters!")
        exit()

    #ts("________________________started________________________")
    tblPrefix = sys.argv[1]
    regex = sys.argv[2]
    DTOHLC_hist = sys.argv[3]

    #                         0       1        2     3      4       5
    arr = regex.split("_")  # 8 _ 3253-4853 _ 1500 _ 0.7 _ USDCAD _ 1440
    chunk = arr[0]
    history_bars = arr[1]
    length = int(arr[2])
    ratio = float(arr[3])
    tblCoreName = arr[4] + "_" + arr[5]
    table = tblPrefix + "_" + tblCoreName

    # TODO: uncomment the next line for the live version
    #check_table(table)  # will create the table or stop executing if the table exists


    ReportNames = []  # this will contain lists for every report, in-memory numpy
    listOfFiles = os.listdir('.')
    # build a list of report files
    for report in listOfFiles:
        if fnmatch.fnmatch(report, regex + postfix):
            ReportNames.append(report)



    # debugging
    #print(len(ReportNames), ReportNames)
    #exit()

    try:
        # we assume that all the reports are of equal length
        # date,time array
        dt = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols=(fDate_col, fTime_col))
        # date array
        datez = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols=(fDate_col))
        timez = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols=(fTime_col))
        # TODO: REMOVE, replace the uses with price_ohlc_bs
        # price array - WHERE ARE WE USING IT?
        price = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols=(fPrice_col))
        # we need these for TP/SL and other things...
        price_ohlc = np.genfromtxt(DTOHLC_hist, delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols=(_o, _h, _l, _c))
    except OSError as e:
        print(e, "- maybe this OHLC file was not found:", DTOHLC_hist)
        exit()

    #print(price_ohlc)
    #print(price_ohlc[0][1])
    #exit()

    # pre-calculate the results of buy & sell actions to save time in multiprocessing later
    OHLC_pnl = []
    line_to_pass = 100  # the initial .csv quote file contains 100 extra bars which are not in the model's report,
                        # because Altreva uses them for "warming up"; there is 0 PnL on the first row, because its signal is for the next row
    for r in range(line_to_pass, len(price_ohlc), 1):  # the signal is always for the next line, so we start from relative line 1, not line 0
        # keep OPEN HIGH LOW CLOSE, then BUY = Close-prev_Close / SELL = prev_Close-Close
        OHLC_pnl.append([price_ohlc[r][0], price_ohlc[r][1], price_ohlc[r][2], price_ohlc[r][3], price_ohlc[r][3] - price_ohlc[r-1][3], price_ohlc[r-1][3] - price_ohlc[r][3]])

    data_ohlcbs = np.array(OHLC_pnl)
    # DEBUG - OK GOOD
    np.set_printoptions(precision=5, suppress=True)  # print numpy floats without e-notation
    #for r in range(0, len(data_ohlcbs), 1):
    #    print(data_ohlcbs[r])
    #exit()
    # DEBUG


    # OK
    if (length != len(data_ohlcbs)):
        ts("ERR: Length=" + str(length) + " while " + str(len(data_ohlcbs)) + " rows in " + DTOHLC_hist)
        exit()
    else:
        ts("OK: PASSED CHECK, lines in filename string =" + str(length) + ", lines in " + DTOHLC_hist + " =" + str(len(data_ohlcbs)))

    #exit()
    # TODO: now double-check this for having all the quotes and correct cutting

    # calculate our limits for pnl_a and pnl_b
    start_a = 0
    end_a = int(length * ratio) - 1
    start_b = end_a - 1
    end_b = length + 1
    print("variables: start_a / end_a / start_b / end_b:", start_a, end_a, start_b, end_b)

    data_ohlcbs_a = data_ohlcbs[start_a:end_a]
    data_ohlcbs_b = data_ohlcbs[start_b:end_b]

    #print("price A")
    #print(data_ohlcbs_a)
    #print("price B")
    #print(data_ohlcbs_b)
    #exit()

    # calculate roughly by the 'Close' value, since Altreva also uses Close-only internally
    #print('data_ohlcbs_a[3:]')
    #print(data_ohlcbs_a[:,3])  # all rows, Close column

    A = data_ohlcbs_a[:, 3]
    B = data_ohlcbs_b[:, 3]

    # get the slopes as z values from the MK (Mann-Kendall) test
    trend, H, p, z_a = mk_test(A)
    trend, H, p, z_b = mk_test(B)

    xa = range(1, len(A) + 1, 1)
    xb = range(1, len(B) + 1, 1)
    # will need those slopes in the future
    #slope_a, intercept, r_value, p_value, std_err = linregress(xa, A)
    #slope_b, intercept, r_value, p_value, std_err = linregress(xb, B)

    # for the chart filename to save
    chart_title = "Chunk file: " + (ReportNames[0]).split(" ")[0] + ".csv\nMK A = " + f'{z_a:.4f}' + " | " + "MK B = " + f'{z_b:.4f}' + " | A/B MK Test ratio = " + f'{z_a/z_b:.4f}'
    chart_filename = str(length) + "/" + (ReportNames[0]).split(" ")[0] + ".png"

    #print(chart_title, " // ", chart_filename)

    '''
    def save_chart( close_price, v_separator, \
                    best_a, label_a, best_ga, label_ga, best_b, label_b, \
                    best_gb, label_gb, best_ab, label_ab, best_gab, label_gab):
    '''

    # Use the code below later in process_reports()
    #ab = np.append(A, B)
    #save_chart(ab, len(A), ab, "ab", ab, "ab", ab, "ab", ab, "ab", ab, "ab", ab, "ab")

    exit()  # NOTE: debug exit - everything below is currently unreachable

    buy_hold_a = abs(float(A[0]) - float(A[len(A) - 1]))
    max_min_a = np.max(A) - np.min(A)  # roughly by Close value
    #print(buy_hold_a, max_min_a, np.max(A), np.min(A))
    #exit()
    buy_hold_b = abs(float(B[0]) - float(B[len(B) - 1]))
    max_min_b = np.max(B) - np.min(B)  # roughly by Close value

    trading_days_a = len(set(datez[start_a:end_a]))
    trading_days_b = len(set(datez[start_b:end_b]))

    #print(buy_hold_a, buy_hold_b, max_min_a, max_min_b, trading_days_a, trading_days_b)
    # ** ALL GOOD AND TESTED UNTIL HERE
    #exit()

    dt_a = dt[start_a:end_a]
    dt_b = dt[start_b:end_b]

    # do we still need these date conversions?
    start_a_date = datetime.strptime(dt_a[0][0] + " " + dt_a[0][1], '%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
    end_a_date = datetime.strptime(dt_a[len(dt_a)-1][0] + " " + dt_a[len(dt_a)-1][1], '%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
    start_b_date = datetime.strptime(dt_b[0][0] + " " + dt_b[0][1], '%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
    end_b_date = datetime.strptime(dt_b[len(dt_b)-1][0] + " " + dt_b[len(dt_b)-1][1], '%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')

    #print(start_a_date, end_a_date, start_b_date, end_b_date)
    #exit()

    '''
    d1 = datetime.strptime(dt_a[0][0] + " " + dt_a[0][1], '%m/%d/%Y %H:%M:%S')
    d2 = datetime.strptime(dt[len(dt)-1][0] + " " + dt[len(dt)-1][1], '%m/%d/%Y %H:%M:%S')
    # date periods - that's in MySQL format
    f_date = d1.strftime('%Y-%m-%d %H:%M:%S')
    l_date = d2.strftime('%Y-%m-%d %H:%M:%S')

    print("// -- some dataset stats ---")
    print("Start Open to Last Close range = ", open_close_range)
    print("Max Price to Min Price range =", max_range)
    print("Number of trading days is", trading_days)
    print("// -------------------------")
    '''

    # def process_reports(ReportNames, dt, datez, price):
    process_reports(ReportNames)

    #ts("finished")
    t = time.time() - start
    print("time taken: ", t / 60, "minutes")
    #ts("_______________________________________________________")