6FkBaE8K

· 6 years ago · Jul 22, 2019, 06:56 AM
1from multiprocessing import Pool
2
3import time
4import timeit
5from datetime import datetime, timedelta
6import os, fnmatch
7import psutil
8import csv
9import numpy as np
10import itertools
11import sys
12import mysql.connector
13import pandas as pd
14import pymysql
15
16from backtesting import Strategy
17from backtesting import Backtest
18from scipy.stats import linregress
19from mk_test_file import mk_test
20
21import warnings
22from statistics import mean
23from matplotlib.pylab import plt #load plot library
24
25
26xs = np.array([1,2,3,4,5], dtype=np.float64)
27ys = np.array([5,4,6,5,6], dtype=np.float64)
28
29def best_fit_slope(xs,ys):
30	m = (((mean(xs)*mean(ys)) - mean(xs*ys)) / ((mean(xs)**2) - mean(xs**2)))
31	return(m)
32
33
34table = ""
35postfix = "*export1.csv" # that's added to regex to search for Altreva report files
36
37# max amount of records to write into the database (saves time)
38MaxResultRows2Database = 100
39MaxPoolForModelsGroup = 15 # from how many models shall we take the pool of group selection
40
41# fixed columns in the database
42f_date=""
43l_date=""
44
45# columns in file
46fBar_col = 0
47fDate_col = 1
48fTime_col = 2
49fPrice_col = 3
50fSignal_col = 5
51fFDA_col = 15 # (Trailing 1 bars; Range applied)
52fFDS_col = 16 # (Trailing 100 bars; Range applied)
53
54# this will hold [Open,High,Low,Close,Buy,Sell] values for every row of the quotes
55data_ohlcbs = None
56data_ohlcbs_a = None
57data_ohlcbs_b = None
58
59# will need those to make performance comparison in the database
60slope_a = 0
61slope_b = 0
62
63# for the filename of the chart
64chart_title = ""
65chart_filename = ""
66
67
68# columns in OHLC HISTORY
69_d=0
70_t=1
71_o=2
72_h=3
73_l=4
74_c=5
75_b=6
76_s=7
77CHUNK_A = 0
78CHUNK_B = 1
79
80
81# THIS BELOW IS NOT THERE FOR CURRENT CONFIGS!
82fFD_AUC_col = 17 # (Trailing 1 bars; Range applied)
83fFDA_All = 21
84
85# columns in numpy array
86aDate_col = 0
87aTime_col = 1
88aPrice_col = 2
89aSignal_col = 3
90
91# model's parameters
92iPeriods = range(5,10,1) 	# period values for FDA & FA_AUC calculations
93FDA_threshold = 0.7
94group_sizes = [1,2,4,5,6,7,8,9]
95
96
97# will need below data to fill MySQL table
98# integer bar number
99start_a = 0
100end_a = 0
101start_b = 0
102end_b = 0
103
104# numpy array
105price_a = 0
106price_b = 0
107
108# double - benchmark values for PnL (if a trade is better than longterm buy/sell strategy and best trade taken between all time chart extrenuma
109buy_hold_a= 0
110max_min_a = 0
111buy_hold_b= 0
112max_min_b= 0
113
114# dates
115dt_a = 0
116dt_b = 0
117
118start_a_date= 0
119end_a_date= 0
120start_b_date= 0
121end_b_date= 0
122
123# actual trading days to compute Performance (avg daily PnL) and Turnover (lots) per period of given days
124trading_days_a = 0
125trading_days_b = 0
126
127
128def identity(values):
129	"""
130	fubction is used in strategy to calculate signals
131	"""
132	return pd.Series(values)
133
134
135class MyStrategy(Strategy):
136
137	def init(self):
138		self.signals = self.I(identity, self.data.Signals)
139		self.files_period = self.data.files_period[0]
140
141	def next(self):
142		signal = self.signals[-1]
143		if signal > 0:
144			# Let the strategy close any current position and use all available funds to buy the asset
145			self.buy()
146		elif signal < 0:
147			# Let the strategy close any current position and use all available funds to short sell the asset
148			self.sell()
149		else:
150			assert signal == 0
151			self.position.close()
152
153
154
155def check_table(table): # will create table or exit the software is table exists
156
157	# todo: change logic
158	# if does not exit, then create it, otherwise leave it
159
160	mydb = mysql.connector.connect(
161		host="localhost",
162		user="root",
163		passwd="xsjhk%123SD==-Ghk11",
164		database="altreva"
165	)
166
167	cursor = mydb.cursor()
168
169	stmt = "SHOW TABLES LIKE '" + table + "'"
170	cursor.execute(stmt)
171	result = cursor.fetchone()
172	if result: return # there is a table named "tableName"
173
174    # there are no tables named "tableName"
175
176	# this should create the table OR exit with error
177	_SQL = "CREATE TABLE `" + table + "` ("
178	_SQL += "`ts` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
179
180	_SQL += "`chunk` INT NULL,"
181	_SQL += "`hist_bars` VARCHAR(45) NULL,"
182	_SQL += "`length` INT NULL,"
183	_SQL += "`ratio` FLOAT NULL,"
184	_SQL += "`period_fda` INT NULL,"
185
186	_SQL += "`pnl_a` DOUBLE NULL,"
187	_SQL += "`dd_a` DOUBLE NULL,"
188	_SQL += "`buy_hold_a` DOUBLE NULL,"
189	_SQL += "`max_min_a` DOUBLE NULL,"
190
191	_SQL += "`pnl_b` DOUBLE NULL,"
192	_SQL += "`dd_b` DOUBLE NULL,"
193	_SQL += "`buy_hold_b` DOUBLE NULL,"
194	_SQL += "`max_min_b` DOUBLE NULL,"
195
196	_SQL += "`trading_days_a` INT NULL,"
197	_SQL += "`trading_days_b` INT NULL,"
198
199	_SQL += "`trades_a` INT NULL,"
200	_SQL += "`trades_b` INT NULL,"
201
202	# this should be calculated automatically by an expression like "field_name double GENERATED ALWAYS AS (pnl_a/trading_days)"
203	_SQL += "`avg_d_pnl_a` DOUBLE GENERATED ALWAYS AS (pnl_a/trading_days_a),"
204	_SQL += "`avg_d_dd_a` DOUBLE GENERATED ALWAYS AS (dd_a/trading_days_a),"
205	_SQL += "`avg_d_pnl_b` DOUBLE GENERATED ALWAYS AS (pnl_b/trading_days_b),"
206	_SQL += "`avg_d_dd_b` DOUBLE GENERATED ALWAYS AS (dd_b/trading_days_b),"
207
208	_SQL += "`avg_d_vol_a` DOUBLE GENERATED ALWAYS AS (trades_a/trading_days_a),"
209	_SQL += "`avg_d_vol_b` DOUBLE GENERATED ALWAYS AS (trades_b/trading_days_b),"
210
211	# log dates as well
212	_SQL += "`start_a` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
213	_SQL += "`end_a` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
214	_SQL += "`start_b` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
215	_SQL += "`end_b` TIMESTAMP DEFAULT CURRENT_TIMESTAMP);"
216
217	cursor.execute(_SQL)
218	ts("Created table " + table)
219
220def write2mysql(vofv):
221
222	mydb = mysql.connector.connect(
223		host="localhost",
224		user="root",
225		passwd="xsjhk%123SD==-Ghk11",
226		database="altreva"
227	)
228
229	mycursor = mydb.cursor()
230
231	'''
232use altreva;
233CREATE TABLE `1K_take_1_usdcad_1440` (
234
235/* 0 */    `ts` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
236/* 1 */    `chunk` int(11) DEFAULT NULL,
237/* 2 */    `hist_bars` varchar(45) DEFAULT NULL,
238/* 3 */    `length` int(11) DEFAULT NULL,
239/* 4 */    `ratio` float DEFAULT NULL,
240/* 5 */    `period_fda` int(11) DEFAULT NULL,
241/* 6 */    `pnl_a` double DEFAULT NULL,
242/* 7 */    `dd_a` double DEFAULT NULL,
243/* 8 */    `buy_hold_a` double DEFAULT NULL,
244/* 9 */    `max_min_a` double DEFAULT NULL,
245/* 10 */    `pnl_b` double DEFAULT NULL,
246/* 11 */    `dd_b` double DEFAULT NULL,
247/* 12 */    `buy_hold_b` double DEFAULT NULL,
248/* 13 */    `max_min_b` double DEFAULT NULL,
249/* 14 */    `trading_days_a` int(11) DEFAULT NULL,
250/* 15 */    `trading_days_b` int(11) DEFAULT NULL,
251/* 16 */    `trades_a` int(11) DEFAULT NULL,
252/* 17 */    `trades_b` int(11) DEFAULT NULL,
253/* 18 */    `avg_d_pnl_a` double GENERATED ALWAYS AS ((`pnl_a` / `trading_days_a`)) VIRTUAL,
254/* 19 */    `avg_d_dd_a` double GENERATED ALWAYS AS ((`dd_a` / `trading_days_a`)) VIRTUAL,
255/* 20 */    `avg_d_pnl_b` double GENERATED ALWAYS AS ((`pnl_b` / `trading_days_b`)) VIRTUAL,
256/* 21 */    `avg_d_dd_b` double GENERATED ALWAYS AS ((`dd_b` / `trading_days_b`)) VIRTUAL,
257/* 22 */    `avg_d_vol_a` double GENERATED ALWAYS AS ((`trades_a` / `trading_days_a`)) VIRTUAL,
258/* 23 */    `avg_d_vol_b` double GENERATED ALWAYS AS ((`trades_b` / `trading_days_b`)) VIRTUAL,
259/* 24 */    `start_a` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
260/* 25 */    `end_a` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
261/* 26 */    `start_b` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
262/* 27 */    `end_b` timestamp NULL DEFAULT CURRENT_TIMESTAMP
263
264) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
265
266	'''
267
268	print(vofv)
269
270    #                                      0             1         2                3                  4                          5                       6
271	#return [ str(group_str.count("/"))+" "+group_str, res["SQN"], res["# Trades"], res["Return [%]"], res["Max. Drawdown [%]"], res["Equity Final [$]"], res["Equity Peak [$]"] ]
272	group_str = vofv[0]
273
274	#### TODO: fill these
275	chunk = ""
276	ratio = 0
277	bars = 0
278	####
279
280	sqn = vofv[1]
281	trades = vofv[2]
282	return_percent = vofv[3]
283	max_dd = vofv[4]
284	equity = vofv[5]
285	equity_peak = vofv[6]
286
287
288	'''
289	first = True
290	for index, row in top.iterrows():
291		sql =""
292		files_periods = row["files_periods"]
293		trades = row["trades"]
294		pnl = row["pnl"]
295		dd = row["dd"]
296
297		if (first):
298			gen_signals_file(files_periods)
299			first=False
300
301		sql += "INSERT INTO " + table + " (files_periods, trades, pnl, dd, openclose_range, max_range,trading_days, first_date, last_date) VALUES ("
302		sql += repr(files_periods) + "," + str(trades) + "," + str(pnl) + "," + str(dd) + "," + str(open_close_range) + "," + str(max_range) + "," + str(trading_days) + ","
303		sql += repr(f_date) + "," + repr(l_date)
304		sql += ");"
305		#print(sql)
306		mycursor.execute(sql)
307	mydb.commit()
308	mydb.close()
309	'''
310
311
312# we put 1 if avg is more than a threshold, so we can multiply by this number later
313def fda_avg(fda,period):
314	rslt = np.zeros(len(fda), dtype=float, order='C')
315	for r in range(len(fda)):
316		if (r+1-period >= 0):
317			rslt[r] = 1 if fda[r+1-period:r+1].mean() >= FDA_threshold else 0
318
319	return rslt
320
321# File I/O function for multiprocessing
322def files2sig_fda(filename):
323	sig = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols = (fSignal_col))
324	fda = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols = (fFDA_col))
325	return([filename, sig, fda])
326
327
328def digitize(sig_fda, period=-1): # used in miltiprocessing mode
329
330	# TODO: the below should be taken out for I/O not to slow down multiprocessing
331	#sig = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols = (fSignal_col))
332	#fda = np.genfromtxt(filename, delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols = (fFDA_col))
333
334	filename = sig_fda[0]
335	sig 	 = sig_fda[1]
336	fda 	 = sig_fda[2]
337
338	for r in range(0,len(sig),1):
339		if (r==0 and sig[r]==''): # if the first row is empy, we don't know what signal was there, so we put 0 aka Cash
340			sig[r]=0
341		else:
342			if (sig[r]==''):
343				sig[r]=sig[r-1] # set to previous signal
344			elif (sig[r]=='Cash'):
345				sig[r]=0
346			elif (sig[r]=='Long'):
347				sig[r]=1
348			elif (sig[r]=='Short'):
349				sig[r]=-1
350
351	sig = sig.astype(float)	# origianl signals
352
353	v=[] # as many columns as we have periods, each contains signals filtered by FDA of the corresponding period
354	if (period==-1): # do all periods
355		for p in iPeriods:
356			v.append ( [filename, p, np.multiply(sig, fda_avg(fda,p))] ) # scalar multiplication of signal and {1,0} based on fda
357	else: # use specific period
358		v.append ( [filename, period, np.multiply(sig, fda_avg(fda,p))] )
359
360	return(v)
361
362def drawdowns(equity_curve):
363	i = np.argmax(np.maximum.accumulate(equity_curve) - equity_curve) # end of the period
364	j = np.argmax(equity_curve[:i]) # start of period
365	drawdown=abs(100.0*(equity_curve[i]-equity_curve[j]))
366
367	return drawdown
368
369
370# pnl_classic( [ filename, period, chunk_section, signals, calc_dd ]  )
371
372#
373def pnl_classic(model_infos):
374
375	BUY_COL = 4
376	SELL_COL = 5
377
378	filename		= model_infos[0]
379	period			= model_infos[1]
380	ohlcbs 			= model_infos[2]
381	signals 		= model_infos[3]
382	calc_dd 		= model_infos[4]
383
384
385	#print(len(ohlcbs),len(signals))
386	#exit()
387	#print(ohlcbs)
388
389	trades = 0
390	pnl = 0
391	max_dd_percent = 0
392
393	dt_pnl = [0] # first element would be 0
394
395	for r in range(1,len(ohlcbs),1): # signal on line N has to be traded on line N+1, thus no PnL on line 0
396		signal = signals[r-1] # TODO: Are the signals themselves rigth? Check it out at some point!
397		dpips  = 0 # It's really importtant to get this one nulled for rows that have 0 signal
398		if ( signal != 0): # it was a valid signal and we execute it
399			trades += 1
400			if (signal>0):
401				dpips = ohlcbs[r][BUY_COL]
402			else:
403				dpips = ohlcbs[r][SELL_COL]
404			pnl += dpips # dpips was nulled, so it's OK to add it
405
406		# DEBUG - TEST THIS NEXT TODO!
407		#if (filename=="54_16055-19155_3000_0.9_USDCAD_240 1075_export1.csv" and period ==8):
408		#	# maybe add DATE/time for the signals and prices columns to make sure we can see precisely the right rows
409		#	print("DEBUG:", ohlcbs[r][0],ohlcbs[r][1],ohlcbs[r][2],ohlcbs[r][3], ohlcbs[r][4], ohlcbs[r][5], signals[r], dpips, pnl )
410
411		dt_pnl.append(pnl) # this dataseries will be used for Drawdown calculation and also to return the PnL chart
412
413	if pnl > 0:
414		if (calc_dd == True): # do it here to save resources
415			max_dd_percent = drawdowns(np.array(dt_pnl))
416		else:
417			max_dd_percent = 0 # initially we don't use that fot the sake of CPU resources
418
419		#print(filename, period, trades, pnl, max_dd_percent)
420		label = filename + ", "
421		if (period != 0): label += "p=" + str(period) + ", "
422		label += "pnl=" + str(round(pnl,5)) + ", dd=" + str(round(max_dd_percent,2)) + "%, trades=" + str(trades) + ", bars=" + str(ohlcbs)
423		
424		return [ filename, period, trades, pnl, max_dd_percent, dt_pnl, label ]
425	else:
426		return None
427
428# chart_title should contain  Gradient_A=... Gradient_B=...  GradientA/GradientB=...
429def save_chart( close_price, v_separator, \
430					best_a,  label_a,  best_ga, label_ga, best_b,   label_b, \
431					best_gb, label_gb, best_ab, label_ab, best_gab, label_gab):
432
433	# TODO:  pips_per_week for each series, such that in differentiates between the timeframe in the report file, saving it somewhere in global constant
434
435	chart_close,	=	plt.plot(close_price, label = "Price Close")
436	chart_best_a,	=	plt.plot(best_a,   label = label_a)
437	chart_best_ga,	=	plt.plot(best_ga,  label = label_ga)
438	chart_best_b,	=	plt.plot(best_b,   label = label_b)
439	chart_best_gb,	=	plt.plot(best_gb,  label = label_gb)
440	chart_best_ab,	=	plt.plot(best_ab,  label = label_ab)
441	chart_best_gab,	=	plt.plot(best_gab, label = label_gab)
442	
443	plt.axvline(x=v_separator)	
444	plt.legend([chart_close,chart_best_a, chart_best_ga, chart_best_b, chart_best_gb, chart_best_ab, chart_best_gab],loc='lower left',)
445	plt.title(label=chart_title, fontdict=None, loc='center')
446	try:	
447		os.mkdir(folder)
448	except Exception:
449		print("")
450	plt.savefig(chart_filename)
451
452
453# do combinatorics and PnL calculations
454def process_reports(ReportNames): # we probably should pass separately the arrays here.
455
456	# returns numpy arrays with digized signals adjusted by FDA, each column corresponds to an FDA value from iPeriods
457	ts("building digitized vectors of signals adjusted by FDA values")
458	# step 1: multiprocess-read all and return the vector of files
459	with Pool() as p: sig_fda = p.map(files2sig_fda,ReportNames)
460	# step 2: multiprocess-digitize the signals
461	# npReports contains [filename, period, digitized_signals]
462	with Pool() as p: dig = p.map(digitize, sig_fda) # p.map return data in consequitive order, this was tested
463
464	npReports = []
465	for d in dig:
466		for e in d:
467			npReports.append(e)
468			#print(e)
469
470
471	''' we'll need the code below for tables of correlations of model's signals
472	header = ""
473	for f in best:
474		header += (f[0]).split(" ")[1] + "-" + str(f[1]) + "-" + str(f[2]) + ","
475	print(header)
476	for r in range(0,2999,1):
477		sig = str(r)+","
478		for f in best:
479			for e in npReports:
480				if f[0]==e[0] and f[1]==e[1]:
481					sig += str(e[2][r]) + ","
482		print(sig)
483
484	exit()
485	'''
486
487	# MODELS OF 1 on Chunk's section A
488	ts(" -> calculating PnLs for period A")
489	mp_a = [] # mp stands for Model Period
490	mp_b = []
491	#print(npReports)
492
493    # these will be cut accordinly like data_ohlcbs_a & data_ohlcbs_b
494	signals_a = []
495	signals_b = []
496	for r in npReports:
497		filename = r[0]
498		period = r[1]
499		''' doing like before
500		data_ohlcbs_a = data_ohlcbs[start_a:end_a]
501		data_ohlcbs_b = data_ohlcbs[start_b:end_b]
502		'''
503		sig_all = r[2]
504		signals_a = sig_all[start_a:end_a]
505		signals_b = sig_all[start_b:end_b]
506		# TODO: fix the below for this very format: pnl_classic( [ filename, period, chunk_section, signals, calc_dd ]  )
507		mp_a.append( [ filename, period, data_ohlcbs_a, signals_a, True] ) # note: price_a is used for the data
508        # need to cut signals for below!
509		mp_b.append( [ filename, period, data_ohlcbs_b, signals_b, True] )  # note: price_a is used for the data
510
511	
512
513	#with Pool() as p: sqlLines = [l for l in p.map(pnl, mp) if l is not None]
514	with Pool() as p: results_positive_pnl_a = [l for l in p.map(pnl_classic, mp_a) if l is not None]
515	with Pool() as p: results_positive_pnl_b = [l for l in p.map(pnl_classic, mp_b) if l is not None]
516	
517	'''
518	print("PnL results for A")
519	for a in results_positive_pnl_a:
520		print(a)
521	print("\n")
522	print("PnL results for B")
523	for b in results_positive_pnl_b:
524		print(b)
525	'''
526	
527	# this will show all the dataframe completely
528	pd.set_option('display.max_columns', None)
529
530	pnl_classic_cols = ["Filename","Period","# Trades","PnL","Max. Drawdown"]
531	df_a = pd.DataFrame(results_positive_pnl_a)
532	df_a.columns = pnl_classic_cols
533	print("TOP models with positive pnl_classic generated for A:",len(df_a))
534	top_a = df_a.sort_values(by=["PnL"],ascending=False)[:MaxPoolForModelsGroup] # this will keep MaxResultRows2Database sorted desc by pnl (from 1 to avoid column names)
535	print(top_a)
536
537	df_b = pd.DataFrame(results_positive_pnl_b)
538	df_b.columns = pnl_classic_cols
539	print("TOP models with positive pnl_classic generated for B:",len(df_b))
540	top_b = df_b.sort_values(by=['PnL'],ascending=False)[:MaxPoolForModelsGroup] # this will keep MaxResultRows2Database sorted desc by pnl (from 1 to avoid column names)
541	print(top_b)
542
543	# build a list consistion of our best models from top
544	mps = [] # model/period/signals
545	#print("top length",len(top))
546	for index, row in top_a.iterrows():
547			f=row['Filename']
548			p=row['Period']
549			# this this in npReports:
550			for npr in npReports:
551				if (npr[0]==f and npr[1]==p):
552					signals = npr[2]
553					mps.append( [ f, p, signals] ) # note: price_a is used for the data
554					#print(f, p, "ADDED")
555
556	#ts(" -> calculating PnLs for groups on period A")
557	# construct the groups
558	#print( mps)
559
560	models = range(0,len(top_a),1)
561	groups = [] # array of comninations of models
562	for n in group_sizes:
563		comb = list(itertools.combinations( models, n))
564		for c in comb:
565			groups.append(c) # adding combination to the list of groups
566
567	#print(groups)
568	#exit()
569	# fill in the groups with data
570	#print("Total model groups: ",len(groups))
571	mg = []
572	for g in groups: # for group in groups
573		#print(":: group",g)
574		group_str = "" # string with the list of all modls
575		group_sig = np.zeros(len(data_ohlcbs_a), dtype=float, order='C') #  CHECK!!! is the size correct? YES - we just all ALL signals
576		for m_index in g:
577			# m is index of model inside models
578			model = mps[m_index]
579			f = model[0]
580			p = str(model[1]) # that's integer period
581			s = model[2]
582			group_str += f + ":" + p + "/" # just a long string with list of model:period separated by '/' character
583			#print('s:',len(s))
584			#print('group_sig',len(group_sig))
585			group_sig = np.add(group_sig, s[start_a:end_a])
586
587		# 	mp_b.append( [ filename, period, data_ohlcbs_b, signals_b, True] )  # note: price_a is used for the data
588
589		mg.append( [ group_str, 0, data_ohlcbs_a, group_sig, True] ) # note: price_a is used for the data
590
591	# execute pnl for each group in multiprocessing mode
592	with Pool() as p: profitable_groups_a = [l for l in p.map(pnl_classic, mg) if l is not None]
593	df = pd.DataFrame(profitable_groups_a)
594	df.columns = pnl_classic_cols
595	print("PART A: Total groups with positive PnL generated:",len(profitable_groups_a))
596	top_g_a = df.sort_values(by=['PnL'],ascending=False)[:MaxPoolForModelsGroup] # this will keep MaxResultRows2Database sorted desc by pnl (from 1 to avoid column names)
597	# show the best ones from the groups
598	print(top_g_a)
599
600	#exit()
601
602	# TOFIX: all below is WRONG! because we need to take only 1 best group from the top
603
604	# take the best model group
605	mg_b = [] # model/period/signals
606	df = pd.DataFrame(profitable_groups) #, columns=["Group","SQN","# Trades","Return [%]","Max. Drawdown [%]","Equity Final [$]","Equity Peak [$]"])
607	df.columns = pnl_classic_cols
608	# TODO: try sorting by SQN versus Return, see which one works better
609	top_group = df.sort_values(by=['PnL'],ascending=False)[:MaxPoolForModelsGroup] # this will keep MaxResultRows2Database sorted desc by pnl (from 1 to avoid column names)
610	#print("top length",len(top))
611	for index, row in top_group.iterrows():
612			group_str=row['Group']
613
614			# get this group signal
615			for m in mg:
616				if(m[0] in group_str): # group_str contains the number of models before it, so we're looking for mg[0] in it, the group_str original
617					#print(m[0],group_sig)
618					group_sig = m[1]
619					mg_b.append([m[0], group_sig, price, start_b, end_b]) # press it for the period B
620					#print(mg_b)
621					break
622			break # this is exactly how we take only 1 top group as it's already sorted by Pn
623
624	# TODO: calculate and print performance results of individual groups on PART B - THIS IS THE BENCHMARK!
625
626	with Pool() as p: profitable_groups_b = [l for l in p.map(pnl_group, mg_b) if l is not None]
627	df = pd.DataFrame(profitable_groups_b) #, columns=["Group","SQN","# Trades","Return [%]","Max. Drawdown [%]","Equity Final [$]","Equity Peak [$]"])
628	print(df)
629	#print("PART B: Total groups with positive Return [%] generated:",len(profitable_groups_b))
630	#print(df)
631
632	print("\nGroup,SQN,# Trades,Return [%],Max. Drawdown [%],Equity Final [$],Equity Peak [$],B Return [%],B Max. Drawdown [%], B Equity Final [$],B Equity Peak [$]")
633	for index, row in df.iterrows():
634		s=row['Group']
635		sqn=row['SQN']
636		trades=row['# Trades']
637		ret=row['Return [%]']
638		max_dd=row['Max. Drawdown [%]']
639		eq=row['Equity Final [$]']
640		ep=row['Equity Peak [$]']
641		for iindex, b_row in top_b.iterrows():
642			b_ret=b_row['Return [%]']
643			b_max_dd=b_row['Max. Drawdown [%]']
644			b_eq=b_row['Equity Final [$]']
645			b_ep=b_row['Equity Peak [$]']
646			print(s,",",sqn,",",trades,",",ret,",",max_dd,",",eq,",",ep,",",b_ret,",",b_max_dd,",",b_eq,",",b_ep)
647			break
648		break
649
650	#print("actual group results:",profitable_groups_b)
651	#print("best individual results:")
652	#print(top_b)
653
654	#write2mysql(profitable_groups_b)
655
656	# TODO OPTIMIZATIONS: try signal thresholds when modelling
657
658
659	'''
660	TODO:
661			1) get best models by formula:
662				algo:   the first 10 and then another more until 0.xx is the same
663						n = number of model:period
664			2) combinations of [5 to n-1] models
665				IMPORTANT: make sure not to read the report files twice, use existing report - SAVE TIME, RAM AND I/O RESOURCES
666			3) write to table tblPrefix_groups_[the same as this table of individual models]
667
668			Use code from here https://github.com/packandsell/AM/blob/master/gen-mt4-group-perm-sig.py
669			TODO: 	a) check which PnL and digitilization code is more optimal
670					b) how pnl vs pnl2 is different, which one is better?
671
672			4) generate graph for every chunks:period and mark chunks with vertical lines
673	'''
674
675
676
677
678
679# print something with timestamp
680def ts(s):
681	now = datetime.now()
682	print(now.strftime("%Y-%m-%d %H:%M:%S") + " // " + s)
683
684
685
686if __name__ == "__main__":
687
688	warnings.filterwarnings("ignore") # backtester rises warnings, we don't need them
689
690	# python ts-combinator.py 1K_take_1 8_3253-4853_1500_0.7_USDCAD_1440 history_file_that_has_OHLC.csv
691	start = time.time() # timing
692	#print(sys.argv, len(sys.argv))
693	if (len(sys.argv)!=4):
694		ts("ERR: wrong number of input parmeters!")
695		exit()
696
697	#ts("________________________started________________________")
698	tblPrefix = sys.argv[1]
699	regex = sys.argv[2]
700	DTOHLC_hist = sys.argv[3]
701
702	                       # 0       1        2      3     4        5
703	arr = regex.split("_") # 8 _ 3253-4853 _ 1500 _ 0.7 _ USDCAD _ 1440
704	chunk = arr[0]
705	history_bars = arr[1]
706	length = int(arr[2])
707	ratio = float(arr[3])
708	tblCoreName = arr[4] + "_" + arr[5]
709	table = tblPrefix + "_" + tblCoreName
710
711	# TODO remove comment from the previous line for live version
712	#check_table(table) # will create the table or stop executing if table exists
713
714
715	ReportNames = [] # this will contain lists for every report, in-memory numpy
716	listOfFiles = os.listdir('.')
717	# build a list of report files
718	for report in listOfFiles:
719		if fnmatch.fnmatch(report, regex+postfix ):
720			ReportNames.append(report)
721
722	
723	
724	#debugging
725	#print(len(ReportNames), ReportNames)
726	#exit()
727
728	try:
729		# we assume that all the reports are of equal length
730		# date,time array
731		dt = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols = (fDate_col, fTime_col))
732		# date array
733		datez = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols = (fDate_col))
734		timez = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=str, skip_header=True, usecols = (fTime_col))
735		# TODO: REMOVE, replace the uses for price_ohlc_bs
736		# price array - WHERE ARE WE USING IT?
737		price = np.genfromtxt(ReportNames[0], delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols = (fPrice_col))
738		# we need those for TP/SL and other things...
739		price_ohlc = np.genfromtxt(DTOHLC_hist, delimiter=',', encoding="utf-8", dtype=float, skip_header=True, usecols = (_o,_h,_l,_c))
740	except (OSError):
741		print(OSError, "maybe this OHLC files is not found",DTOHLC_hist)
742		exit()
743
744	#print(price_ohlc)
745	#print(price_ohlc[0][1])
746	#exit()
747
748	# pre-calculate the results of buy&sell actions to save speed on multiprocessing later
749	OHLC_pnl = []
750	line_to_pass = 100 # intial .csv quote file contains 100 more bars, which are not in the model's report,
751	# 0 PnL on the first row, because it's the signal for the next row because Altreva uses them for "warming up"
752	for r in range(line_to_pass,len(price_ohlc),1): # the signal is always for the next line, so we start from relative line 1, not line 0
753		# calculate          OPEN                 HIGH             LOW               CLOSE             BUY = Close-prev_Close         /       SELL = prevClose-Close
754		OHLC_pnl.append( [ price_ohlc[r][0], price_ohlc[r][1], price_ohlc[r][2], price_ohlc[r][3], price_ohlc[r][3]-price_ohlc[r-1][3] , price_ohlc[r-1][3]-price_ohlc[r][3] ] )
755
756	data_ohlcbs = np.array(OHLC_pnl)
757	# DEBUG - OK GOOD
758	np.set_printoptions(precision=5,suppress=True) # this is to print numpy floats without e-notation
759	#for r in range(0,len(data_ohlcbs),1):
760	#	print(data_ohlcbs[r])
761	#exit()
762	# DEBUG
763
764
765	# OK
766	if (length != len(data_ohlcbs)):
767		ts("ERR: Length=" + str(length) + " while " + str(len(data_ohlcbs)) + " rows in " + DTOHLC_hist )
768		exit()
769	else:
770		ts("OK: PASSED CHECK, lines in filename string =" +str(length) + ", lines in " + DTOHLC_hist + " =" + str(len(data_ohlcbs)))
771
772	#exit()
773	# TODO: now double check this for having all the quotes and correct cuttage
774
775	# calculate our limits for pnl_a and pnl_b
776	start_a = 0
777	end_a = int(length * ratio)-1
778	start_b = end_a-1
779	end_b = length+1
780	print("variables: start_a / end_a / start_b / end_b:", start_a, end_a, start_b, end_b)
781
782	data_ohlcbs_a = data_ohlcbs[start_a:end_a]
783	data_ohlcbs_b = data_ohlcbs[start_b:end_b]
784
785	#print("price A")
786	#print(data_ohlcbs_a)
787	#print("price B")
788	#print(data_ohlcbs_b)
789	#exit()
790
791	# calculate roughly by 'Close' value since Altreva also uses Close-only internally
792	#print('data_ohlcbs_a[3:]')
793	#print(data_ohlcbs_a[:,3]) # all rows, Close column
794
795	A = data_ohlcbs_a[:,3]
796	B = data_ohlcbs_b[:,3]
797	
798	# get slope as z values from MK Test
799	trend, H, p, z_a = mk_test(A)
800	trend, H, p, z_b = mk_test(B)
801
802	xa 		= range(1,len(A)+1,1)
803	xb 		= range(1,len(B)+1,1)
804	# will need those slopes in the future
805	#slope_a, intercept, r_value, p_value, std_err = linregress(xa, A)
806	#slope_b, intercept, r_value, p_value, std_err = linregress(xb, B)
807	
808	# for the chart filename to save
809	chart_title = "Chunk file: " + (ReportNames[0]).split(" ")[0] + ".csv\nMK A = " + f'{z_a:.4f}' + " | " + "MK B = " + f'{z_b:.4f}' + " | A/B MK Test ratio = " + f'{z_a/z_b:.4f}'
810	chart_filename = str(length)+"/" + (ReportNames[0]).split(" ")[0] + ".png"
811
812	#print(chart_title, " // ", chart_filename)
813	
814	'''
815	def save_chart( close_price, v_separator, \
816					best_a,  label_a,  best_ga, label_ga, best_b,   label_b, \
817					best_gb, label_gb, best_ab, label_ab, best_gab, label_gab):
818	
819	'''
820	
821	# Use below code later in process_reports()
822	#ab = np.append(A,B)
823	#save_chart ( ab, len(A), ab, "ab", ab, "ab", ab, "ab", ab, "ab", ab, "ab", ab, "ab" )
824	
825	exit()
826
827	buy_hold_a	 = abs( float(A[0]) - float(A[len(A)-1]) )
828	max_min_a 	 = np.max(A) - np.min(A) # roughly by Close value
829	#print(buy_hold_a,max_min_a,np.max(A), np.min(A))
830	#exit()
831	buy_hold_b	 = abs( float(B[0]) - float(B[len(B)-1]) )
832	max_min_b 	 = np.max(B) - np.min(B) # roughly by Close value
833
834	trading_days_a = len(set(datez[start_a:end_a]))
835	trading_days_b = len(set(datez[start_b:end_b]))
836
837	#print(buy_hold_a,buy_hold_b,max_min_a,max_min_b,trading_days_a, trading_days_b)
838	# ** ALL GOOD AND TESTED UNTIL HERE
839	#exit()
840
841	dt_a = dt[start_a:end_a]
842	dt_b = dt[start_b:end_b]
843
844	# do we need this date shit?
845	start_a_date	= datetime.strptime(dt_a[0][0]					+ " " + dt_a[0][1], 			'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
846	end_a_date		= datetime.strptime(dt_a[len(dt_a)-1][0]		+ " " + dt_a[len(dt_a)-1][1],	'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
847	start_b_date	= datetime.strptime(dt_b[0][0]					+ " " + dt_b[0][1], 			'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
848	end_b_date		= datetime.strptime(dt_b[len(dt_b)-1][0]		+ " " + dt_b[len(dt_b)-1][1],	'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
849
850	#print(start_a_date,end_a_date, start_b_date, end_b_date)
851	#exit()
852
853	'''
854	d1 = datetime.strptime(dt_a[0][0]				+ " " + dt_a[0][1], 			'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
855	d2 = datetime.strptime(dt[len(dt)-1][0]		+ " " + dt[len(dt)-1][1],	'%m/%d/%Y %H:%M:%S')
856	# date periods - that's in MySQL format
857	f_date = d1.strftime('%Y-%m-%d %H:%M:%S')
858	l_date = d2.strftime('%Y-%m-%d %H:%M:%S')
859
860	print("// -- some dataset stats ---")
861	print("Start Open to Last Close range = ",open_close_range)
862	print("Max Price to Min Price range =",max_range)
863	print("Number of trading days is",trading_days)
864	print("// -------------------------")
865	'''
866	
867	# def process_reports(ReportNames, dt, datez, price ):
868	process_reports( ReportNames )
869
870	#ts("finished")
871	t = time.time() - start
872	print("time taken: ", t/60, "minutes")
873	#ts("_______________________________________________________")