# Source snapshot: Apr 23, 2020, 05:02 AM
1import csv
2from datetime import datetime, timezone
3import httpx
4import logging
5import os
6import pandas as pd
7import pickle
8import pytz
9import shutil
10
11# why is there a module with a name very similar to this class? that is confusing, and confusing things cause
12# technical debt.
13from study_automation import configSettings
14
logger = logging.getLogger(__name__)

# Captured when the module is imported (NOT when the process started, so a late
# import shows up as an offset). All run-stamped file names derive from this.
startTimeStamp = datetime.now(timezone.utc)

# Filesystem-safe rendering of the UTC timestamp, used to suffix output files.
startTime = startTimeStamp.strftime('%Y-%m-%d_%H%M%S')

# Naive local-clock counterpart of startTimeStamp.
localStartTime = datetime.now()
25
26# looks like you've got a block meant to clean up the input data. i would argue that is inappropriate here. you should
27# have two separate classes - one that does nothing but clean up data, and one that processes cleaned data. that allows
28# you to have multiple pre-processors that can clean up data from different sources, or pass in data that is clean
29# from the start
30
# Maps the role descriptions from the project ("home page") API onto the
# normalized role codes used by the assignment API (the tasks tab in the UI).
KI_ROLE_LU = {
    # data management
    'Lead Clinical Data Manager': 'DM',
    'Backup Clinical Data Manager': 'DM',
    'Lead Clinical Data Analyst': 'DM',
    # clinical programming
    'Lead Clinical Programmer': 'CP',
    'Medical Coder': 'DM',
    # IRT
    'IRT Project Manager': 'IRT',
    # biostatistics
    'Lead Biostatistician': 'BS-BIOSTAT',
    'Lead Statistical Programmer': 'SAS-PRG',
    'Biostatistics QC': 'BS-QC',
    'Reviewing Biostatistician': 'BS-BIOSTAT',
    'Lead Validating Statistical Programmer': 'SAS-PRG-VAL',
    'Unmasked Biostatistician': 'BS-BIOSTAT',
    # project management and co-leads
    'Project Manager': 'PM',
    'Data Manager Co-Lead': 'DM',
    'IRT Project Manager Co-Lead': 'IRT',
    'Biostatistician Co-Lead': 'BS-BIOSTAT',
}
49
# These are the roles from the task page. Normalizing them a bit. Used for assignment API.
# Keys are the raw role codes seen on individual task assignments; values are the
# normalized role codes that KI_ROLE_LU also maps onto.
KI_TASK_ROLE_LU = {'BS-AS-DIR': 'BS-BIOSTAT',
                   'BS-BIOSTAT': 'BS-BIOSTAT',
                   'BS-BIOSTAT2': 'BS-BIOSTAT',
                   'BS-DIR': 'BS-BIOSTAT',
                   'BS-MGR': 'BS-BIOSTAT',
                   'BS-PRIN-BIOSTAT': 'BS-BIOSTAT',
                   'BS-PRIN-RS': 'BS-BIOSTAT',
                   'BS-QC': 'BS-QC',
                   'CP': 'CP',
                   'CP-MGR': 'CP',
                   'DM-CDA': 'DM',
                   'DM-CDM': 'DM',
                   'DM-DIR': 'DM',
                   'DM-MEDCOD': 'DM',
                   'DM-MEDCOD-MGR': 'DM',
                   'DM-MGR': 'DM',
                   'IRT-PM': 'IRT',
                   'IRT-SERV-MGR': 'IRT',
                   'PM': 'PM',
                   'SAS-PRG': 'SAS-PRG',
                   'SAS-PRG-MGR': 'SAS-PRG',
                   # BUG FIX: value was 'SAS-PRG=VAL' ('=' typo), which matched no real role
                   # code; corrected to 'SAS-PRG-VAL' so it agrees with KI_ROLE_LU above.
                   'SSC-PRG-VAL': 'SAS-PRG-VAL',
                   }
74
75# I don't understand this comment. It sounds like it is tightly associated with the KI_TASK_ROLE_LU dict.
76# could you have the dict value be a 2-tuple instead? this seems like technical debt, because a change to one
77# would trigger a bug if you don't remember to change both.
78
# Custom-field keys extracted from the project API that correspond to the roles
# in KI_ROLE_LU above. Key 218 is the edit-date field, which is handled specially.
role_key = [
    218,
    282, 283, 284,
    293, 294, 295, 296,
    297, 298, 299, 300, 301,
    326, 327, 328,
]
81
82# does this comment add information? ignoredFields seems like a straightforward variable name, and fluff comments
83# are a form of technical debt because you won't get compile or test errors if they are no longer accurate, but can
84# still mislead someone. or, worse yet, someone could cut/paste the code but forget the comment, leaving it hanging
85# inappropriately.
86
# Project-data keys that are skipped when diffing role assignments between runs
# (metadata rather than role -> employee mappings).
ignoredFields = [
    'startDate',
    'Last Edit Date',
    'newStudy',
    'description',
    'clientDescription',
    'duplicates',
    'employees',
]
90
91# i don't really like the name of this class. automation seems to be more about who is calling it, how often, and with
92# what data. and i don't know if study is a noun or a verb.
class StudyAutomation():
    """
    Monitors the KeyedIn API and automates study-setup processes.

    Expected call order (later steps read instance fields populated by
    earlier ones):
        sa = StudyAutomation()
        sa.call_api_get_data()
        sa.find_differences()
        sa.output_csv()
        sa.wrap_up()
    """

    # a class that doesn't take any parameters is a big red flag to me. what data is it operating on? where does it
    # get it's data from? what global state does it read from or write to? it is possible that the class is expected
    # to be built up after init, but this immediately caught my attention. if it requires lots of global state
    # then it will be untestable without very difficult and hard to maintain tests.
    def __init__(self):
        """
        Init reports locations of files and loads the most previous versions of resourceDict and projectData.
        """
        logger.info("Study Automation process begun.")

        # this whole block is odd. each time, you do basically the same thing - transfer a value from one collection
        # to a local field, log the value. what else is in configSettings? Does it need to be whitelisted like this
        # or could you do a full copy, then do a for loop to echo the values?

        # also configSettings should be passed in as a paramter. if this __init__ is any indication, the entire logic
        # of this class depends on it
        self.DATA_LOCATION = configSettings['DATA_LOC']
        logger.info('Previous data located at {0}'.format(self.DATA_LOCATION))

        self.PROJ_DATA_LOCATION = configSettings['DATA_LOC'] + 'projectData\\'
        logger.info('Previous project data located at {0}'.format(self.PROJ_DATA_LOCATION))

        self.RES_DATA_LOCATION = configSettings['DATA_LOC'] + 'resourceDict\\'
        logger.info('Previous resource data located at {0}'.format(self.RES_DATA_LOCATION))

        self.STATS_LOCATION = configSettings['STATS_LOC']
        logger.info('Stats drive root located at {0}'.format(self.STATS_LOCATION))

        self.DM_LOCATION = configSettings['DM_LOC']
        logger.info('DM drive root located at {0}'.format(self.DM_LOCATION))

        self.STATS_COPY_LOCATION = configSettings['STATS_COPY_LOC']
        logger.info('Stats copy drive root located at {0}'.format(self.STATS_COPY_LOCATION))

        self.DM_COPY_LOCATION = configSettings['DM_COPY_LOC']
        logger.info('DM copy drive root located at {0}'.format(self.DM_COPY_LOCATION))

        self.OUTPUT_LOCATION = configSettings['OUTPUT_LOC']
        logger.info('Output location located at {0}'.format(self.OUTPUT_LOCATION))

        # NOTE(review): both CSV reads resolve relative to the process CWD, not the module
        # location -- launching the script from another directory will fail here.
        self.project_mappings = pd.read_csv('project_mappings.csv', index_col='project')
        logger.info('Project mappings located at {}'.format(os.getcwd() + '\\project_mappings.csv'))

        self.permission_mappings = pd.read_csv('permission_mappings.csv')
        logger.info('Permission mappings located at {}'.format(os.getcwd() + '\\permission_mappings.csv'))

        # descriptions of projects new since the previous run; written out by output_csv()
        self.newProjectsDesc = []

        # after reviewing these three functions, i feel you have an aversion to both input parameters and return values
        # and rely very heavily on class fields. that makes testing harder, because those functions are dependent on
        # state, and a stateless function is a wonderful thing

        self._load_previous_resource_dict()
        self._load_previous_project_data()
        self._get_excel_serial_date(self.previousProjectTimestamp)

    # passing in a boolean to a function is a red flag. it typically tells me "this function does two things", and
    # functions should do 1 thing. if the caller can decide if the data should be refreshed, then the caller should
    # just refresh the data
    # i'm also not sure i like this function name. combined with the class name itself, i have absolutely no idea
    # what StudyAutomation.call_api_get_data(False) could possibly do.
    # it seems like both "call api" and "get data" are more about what it does internally, and not so much about
    # why it is doing it
    def call_api_get_data(self, refresh_data=False):
        """
        Collects the data needed to determine differences.

        Parameters:
            refresh_data (bool): Whether to completely refresh data from the API or use non changed values.

        """
        logger.info("Refresh data set to {0}".format(str(refresh_data)))

        # if it has been longer than a day since the resource dict has been refreshed, refresh it

        # NOTE(review): ".days > 1" only triggers once the previous dict is 2+ calendar days
        # old; if "longer than a day" is the intent, this should be ">= 1". Confirm intent.
        if (startTimeStamp.date() - self.previousResourceTimestamp.date()).days > 1:
            self._create_resource_dict(True)
            logger.info('Resource Dict refresh over a day old. Forcing Refresh')
        else:
            self._create_resource_dict(refresh_data)

        # the number of times i see a function take no arguments and return nothing is a red flag. that means this
        # class has to be tested as a whole, rather than being able to test smaller portions. smaller, stateless
        # functions are also really easily relocated into another class if you decide this one has grown monolithic
        # and certain aspects can be split off.
        # also, there is nothing to stop someone from moving this line to above that if/else, although i would guess
        # that the "correct" calling order is _create_resource_dict, then _get_project_data. by relying on inputs and
        # outputs, you can actually enforce that order by making the output of one be the required input of the next
        self._get_project_data()

    def _load_previous_resource_dict(self):
        # javadoc style comments are often unnecessary. the function description is identical to the function name
        # which means you add a bunch of fluff lines that don't provide any useful information
        """
        Loads previous resource dict.

        Creates:
            self.previousResourceTimestamp (datetime): Last time a resource dict was created. In UTC
            self.previousResourceDictLoc (str): Where the file was located
            self.previousResourceDict (dict): Keys are employee codes from KI. Fields include name, role, department
        """

        previousLoc, self.previousResourceTimestamp = self._find_most_recent_file('resourceDict')
        self.previousResourceDictLoc = self.RES_DATA_LOCATION + previousLoc
        # NOTE(review): file handle opened inline is never closed; prefer "with open(...)".
        self.previousResourceDict = pickle.load(open(self.previousResourceDictLoc, 'rb'))
        logger.info("Previous Resource dictionary loaded from {0}".format(self.previousResourceDictLoc))

    def _load_previous_project_data(self):
        # this function and the one above are so close that they really should be merged into a single function
        # it would take in the most recent file as a parameter, and return the found data/dataloc to be stored
        # by the calling function
        """
        Loads previous project data.

        Creates:
            self.previousProjectTimestamp (datetime): Last time project data was created. In UTC.
            self.previousProjectDataLoc (str): Where the file was located
            self.previousProjectData (dict): Project Data. Defined below.
        """
        previousLoc, self.previousProjectTimestamp = self._find_most_recent_file('projectData')
        self.previousProjectDataLoc = self.PROJ_DATA_LOCATION + previousLoc
        # NOTE(review): file handle opened inline is never closed; prefer "with open(...)".
        self.previousProjectData = pickle.load(open(self.previousProjectDataLoc, 'rb'))
        logger.info("Previous Project Data loaded from:{0}".format(self.previousProjectDataLoc))

    def _create_resource_dict(self, refresh_data=True):
        """
        This method creates the resource dict if refresh data is True. If it is false or there is some error, the most recent dictionary is loaded.

        Parameters:
            refresh_data (bool): Whether to completely refresh data from the API or use non changed values.

        Creates:
            self.resourceJson (dict): JSON returned from API. See URL definition below for filters and fields included.
            self.resourceDict (dict): Keys are employee codes from KI. Fields include name, role, department
        """

        if refresh_data:
            try:
                self.resourceJson = self._return_json_from_api('resource')
                self.resourceDict = {}
                # this could be converted to a map() call.
                # map, along with filter and reduce are some of the most powerful functions on collections in most
                # modern languages.
                for item in self.resourceJson:
                    self.resourceDict[item['code']] = {'name': item['name'], 'role': item['primaryRole'],
                                                       'dept': item['departmentDescription'], 'manager': item['lineManagerDescription']}

            # pokemon exceptions (gotta catch em all!) are generally frowned upon, although sometimes they are okay
            except:
                logger.error("Resource Connection failed. Using previous resource Dict")
                self.resourceDict = self.previousResourceDict

        else:
            self.resourceDict = self.previousResourceDict

        logger.info("Resource Dict: {0}".format(self.resourceDict))

    def _get_project_data(self):
        """
        This method calls the project API, determines which projects have changed and then calls the assignment API.

        Note that the employees attached have to come through two different calls. The project call (where their roles are standardized)
        and the assignment call which comes with those standardized names.

        Parameters:


        Creates:
            self.projectJson (dict): JSON returned from API. See URL definition below for filters and fields included.
            self.changedProjects (array): Array of project codes that have changed since last check or are new.
            self.projectData (dict): Has project codes, sponsors, projects, start date, last edit date and employees attached
            self.assignmentJson: JSON returned from API. See URL definition below for filters and fields included.

        """
        self.projectJson = self._return_json_from_api('project')

        self.projectData = {}

        # you can't iterate over JSON, which is a string. so at this point, it's not JSON anymore. it's probably
        # a dict. it's also bad advice to use the type in the variable name because it's often misleading or unneeded
        for item in self.projectJson:
            project_code = item['code']
            code_data = {}
            # emp match is to see if one employee has two different roles
            emp_match = {}
            code_data['description'] = item['description']
            code_data['clientDescription'] = item['clientDescription']
            code_data['startDate'] = item['startDate']

            # dont include None, only valid references
            if item.get('manager') is not None:
                code_data['PM'] = {item['manager']}
                emp_match[item['manager']] = {'PM'}

            for field in item['customFieldValues']:
                if field['key'] in role_key and len(field['value']) > 0:
                    # edit date key
                    if field['key'] == 218:
                        code_data[field['name']] = field['value']
                    else:
                        ki_role = KI_ROLE_LU[field['name']]
                        emp_id = field['value']
                        if ki_role in code_data.keys():
                            code_data[ki_role].add(emp_id)
                        else:
                            code_data[ki_role] = {emp_id}

                        if emp_id in emp_match.keys():
                            emp_match[emp_id].add(ki_role)
                        else:
                            emp_match[emp_id] = {ki_role}

            # check if any employees are in two different roles, alert and remove
            if len(emp_match) > 0:
                employees_with_mult_roles = [k for k, v in emp_match.items() if len(v) > 1]
                employees = list(emp_match.keys())
                projRoles = [k for k, v in code_data.items() if k not in ignoredFields]

                if len(employees_with_mult_roles) > 0:
                    for e in employees_with_mult_roles:
                        for role in projRoles:
                            assignedEmployees = code_data[role]
                            if e in assignedEmployees:
                                code_data[role].remove(e)
                                # NOTE(review): logger.warn is deprecated; prefer logger.warning
                                # (applies to every .warn call in this class).
                                logger.warn("Employee {0} with multiple roles in project {1}. Removing {2} role.".format(
                                    e, project_code, role))

            else:
                logger.info("No employees found for project {0}".format(project_code))

            # NOTE(review): when emp_match is empty, "employees" was never assigned this
            # iteration -- this reuses the previous iteration's list (or raises NameError
            # on the first project). Likely should be "employees = []" in the else branch.
            code_data['employees'] = employees
            self.projectData[project_code] = code_data

        self.assignmentJson = self._return_json_from_api('assignment')

        self.changedProjects = []

        # put the assignment data back into the projectData. However if the employee already has a home page role, do not add.
        for item in self.assignmentJson:
            mappedRole = KI_TASK_ROLE_LU.get(item['role'])

            if item['resource'] in self.projectData[item['project']]['employees']:
                # only log if employee is in different role from home page
                if mappedRole not in self.projectData[item['project']].keys() or item['resource'] not in self.projectData[item['project']][mappedRole]:
                    logger.info("Employee {0} already assigned to role in project {1} from home screen. Ignoring addl role {2} via {3}."
                                .format(item['resource'], item['project'], mappedRole, item['role']))

            else:

                if mappedRole in self.projectData[item['project']].keys():
                    self.projectData[item['project']][mappedRole].add(item['resource'])
                else:
                    self.projectData[item['project']][mappedRole] = {item['resource']}

        for key in self.projectData.keys():
            if key not in self.previousProjectData.keys():
                self.newProjectsDesc.append([self.projectData[key]['description']])
                self.changedProjects.append(key)
                self._create_folders(self.projectData[key]['description'], key)
                logger.info("Project new since last check: {0}".format(key))
            # have to subset the object for the keys we care about
            elif ({i: self.projectData[key][i] for i in self.projectData[key].keys() if i not in ignoredFields} !=
                  {i: self.previousProjectData[key][i] for i in self.previousProjectData[key].keys() if i not in ignoredFields}):
                self.changedProjects.append(key)
                logger.info("Project changed since last check: {0}".format(key))
            else:
                pass

        logger.info("List of changed projects:{0}".format(str(self.changedProjects)))

    # NOTE(review): this bare class-level string appears to be a stray docstring; it reads
    # like it belongs to find_differences / _create_permissions_row below.
    """
    Sometimes people have multiple roles within a project. This section prioritizes and dedupes them all.

    For instance, the validating programmer will also be a SAS-PRG. However, they should not have the same permissions
    as a standard SAS-PRG

    """

    def find_differences(self):
        """
        This finds differences between the current project data and most previous

        Creates:
            self.permissionChanges (array): Project, person, role and action to take for permissions changes

        """
        self.permissionChanges = []

        if len(self.changedProjects) == 0:
            logger.info("There are no changes between the new and most recent project data files.")

        else:
            logger.info("There are {0} changes between the new and most recent project data files.".format(
                str(len(self.changedProjects))))

        for code in self.changedProjects:
            logger.info("Starting {0}".format(code))
            newData = self.projectData[code]
            previousData = self.previousProjectData.get(code, None)

            # this means is a new project, all permissions need to be added after the folders are created
            if previousData is None:

                roles = newData.keys()

                roles = [r for r in roles if r not in ignoredFields]
                for role in roles:
                    for resource in newData[role]:
                        self._create_permissions_row(code, role, resource, 'allow')

            else:
                # check to see if sponsor or project have changed names
                # NOTE(review): format args are swapped in both warnings below -- "Previously"
                # receives newData and "now" receives previousData.
                if newData['clientDescription'] != previousData['clientDescription']:
                    logger.warn("Sponsor names have changed. Previously {0}, now {1}".format(
                        newData['clientDescription'], previousData['clientDescription']))
                if newData['description'] != previousData['description']:
                    logger.warn("Study names have changed. Previously {0}, now {1}".format(
                        newData['description'], previousData['description']))

                roles = list(set(list(newData.keys()) + list(previousData.keys())))
                roles = [f for f in roles if f not in ignoredFields]

                for role in roles:
                    if role not in previousData.keys():
                        for resource in newData[role]:
                            self._create_permissions_row(code, role, resource, 'allow')
                    elif role not in newData.keys():
                        for resource in previousData[role]:
                            self._create_permissions_row(code, role, resource, 'deny')
                    elif previousData[role] == newData[role]:
                        pass
                    else:
                        adds = newData[role] - previousData[role]
                        for resource in adds:
                            self._create_permissions_row(code, role, resource, 'allow')
                        rems = previousData[role] - newData[role]
                        for resource in rems:
                            self._create_permissions_row(code, role, resource, 'deny')

    def _find_most_recent_file(self, typ):
        """
        This finds the most recent file for a given type. Since files are all in the same folder it needs to determine which one

        Parameters:
            typ (string): Either projectData or resourceDict

        Returns:
            (most_recent_file, previous_timestamp): file name and its UTC timestamp parsed
            from the "<typ>_YYYY-mm-dd_HHMMSS.pkl" naming convention.
        """

        if typ == 'projectData':
            eligible_files = [f for f in os.listdir(self.PROJ_DATA_LOCATION)
                              if os.path.isfile(os.path.join(self.PROJ_DATA_LOCATION, f))]
        elif typ == 'resourceDict':
            eligible_files = [f for f in os.listdir(self.RES_DATA_LOCATION)
                              if os.path.isfile(os.path.join(self.RES_DATA_LOCATION, f))]
        else:
            raise ValueError("{0} not a valid type".format(typ))

        # NOTE(review): raises IndexError if the directory is empty (e.g. first run).
        most_recent_file = sorted(eligible_files, reverse=True)[0]
        previous_timestamp = datetime.strptime(most_recent_file.replace(
            typ + '_', '').replace('.pkl', ' +0000'), '%Y-%m-%d_%H%M%S %z')
        logger.info("Most recent {1} timestamp is {0}".format(previous_timestamp, typ))

        logger.info("Most recent file of type {0} is {1}".format(typ, most_recent_file))

        return(most_recent_file, previous_timestamp)

    # this should be pulled into a separate class and passed in with the other config parameters. using a hardcoded
    # external API makes mocks impossible, which is a big part of unit testing. by using dependency injection, you can
    # create a class that shares the same interface, but returns dummy data
    def _return_json_from_api(self, api_type, additional_info=False):
        """
        Returns a json from the api specificed.


        Parameters:
            api_type (string): One of the three APIs that will be called ('resource', 'project' or 'assignment').
            additional_info (bool): Currently unused.
        Returns:
            api_json (dict): The desired JSON
        """

        if api_type == 'resource':
            url = 'https://api.keyedinprojects.com/V3/api/search/resource?fields=code,name,department,primaryRole,lineManager&resultsPerPage=1000&criteria=active=True&pageNumber='
        elif api_type == 'project':
            url = 'https://api.keyedinprojects.com/V3/api/search/project?resultsPerPage=1000&criteria=code contains(PRJ) and active=-1&pageNumber='
        # this is a custom API call based on a report. Edits to the report on the UI would have to take place to change it
        elif api_type == 'assignment':
            url = 'https://api.keyedinprojects.com/V3/api/report?resultsPerPage=1000&key=277&pageNumber='
        else:
            # NOTE(review): execution continues past this branch with "url" unbound, so the
            # logger.info below raises NameError; this should raise or return instead.
            logger.error("Invalid api type: {0}".format(api_type))

        logger.info("API type of {0} with url {1} called.".format(api_type, url))

        complete = False
        api_json = []
        page_number = 1
        attempts = 1

        # the idea behind this is that for every page needed, it tries to obtain it three times before moving on
        # maybe should wait or something in case that is the issue.
        # needs further testing.

        # NOTE(review): if one page fails three times, the outer loop resets attempts and
        # retries the same page indefinitely -- it never actually "moves on".
        while not complete:
            attempts = 1
            while attempts <= 3:
                try:
                    call = url + str(page_number)
                    self.connection = httpx.get(call, auth=(configSettings['KI_USER'], configSettings['KI_PW']))
                    if self.connection.status_code == 200:
                        logger.info("{0} Connection {1} successful: {2}".format(api_type, str(page_number), call))
                        api_json += self.connection.json()['Data']
                        if self.connection.json()['PageNumber'] >= self.connection.json()['TotalPages']:
                            complete = True
                            return(api_json)
                        else:
                            page_number += 1
                    else:
                        # NOTE(review): format() gets one arg for two placeholders and "call"
                        # is passed to logger.error as an extra positional -- this line raises
                        # IndexError, which is swallowed by the bare except below.
                        logger.error("Connection failed with status code:{0}: {1}".format(
                            str(self.connection.status_code)), call)
                except:
                    logger.error("API return failure on attempt {0}.".format(attempts))
                    attempts += 1

    def _create_permissions_row(self, code, role, resource, typ):
        """
        Creates a row for a csv that will be acted on for permissions

        Parameters:
            code (string): Project ID
            role (string): The role of the person
            resource (string): Persons name
            typ (string): Either to be added or removed.

        Creates:
            self.permissionChanges (array): List of changes to be made.
        """

        # based on the role of the person need to determine location of permissions
        # directors and some departments have full access, this is where they are filtered.

        try:
            resourceInfo = self.resourceDict[resource]

        except KeyError:
            logger.warn("Resource not found in dict. {0}".format(resource))
            return
        # people with the following primary role are out of scope for this

        if resourceInfo['role'] in ['AD-EXEC', 'AD-FIN-HR', 'AD-IT', 'AD-IT-INT', 'AD-QA', 'BD', 'BS-AS-DIR', 'BS-DIR',
                                    'BS-PRIN-RS', 'DM-DIR', 'SSC-AI', 'SSC-CSO']:
            logger.info('{0} with primary role {1} out of scope.'.format(resourceInfo['name'], resourceInfo['role']))
        else:

            perChanges = (self.permission_mappings.copy()[(self.permission_mappings['role'] == role) &
                                                          (self.permission_mappings['action'] == typ)])
            for index, row in perChanges.iterrows():
                if code not in self.project_mappings.index:
                    # NOTE(review): message has no {0} placeholder, so .format(code) is a
                    # no-op and the offending code never appears in the log.
                    logger.warn('Project Code not in project mappings'.format(code))
                    return

                try:
                    folder = self.project_mappings.at[code, row['location']]
                except KeyError:
                    # NOTE(review): execution falls through to pd.isnull(folder) below; on the
                    # first iteration "folder" is unbound here and this raises NameError --
                    # this branch should "continue" after logging.
                    logger.warn("Folder not found. {0}".format(str([code, role, row['location']])))

                if pd.isnull(folder):
                    logger.warn("Folder not found. {0}".format(str([code, role, row['location']])))
                else:

                    newRow = [folder, row['role'], resourceInfo['name'], row['permissionChange']]
                    self.permissionChanges.append(newRow)
                    logger.info("Permission changed updated: {0}".format(str(newRow)))

    def _create_folders(self, project_name, project_key):
        """
        Creates new folders

        Parameters:
            project_name (string): Name of the new project that needs folders to be created.
            project_key (string): Project ID
        """
        # need to change this so there is a sponsor/study nomenclature, dummy for now
        try:
            sponsor, study = (project_name).split('_', 1)
        except:
            logger.error("Project Name not in correct format: {0}".format(project_name))
            return
        stats_loc = self.STATS_LOCATION + sponsor + '\\' + study
        dm_loc = self.DM_LOCATION + sponsor + '\\' + study

        # NOTE(review): the bare excepts below report ANY copytree failure (permissions,
        # missing source, etc.) as "already exists" -- catch FileExistsError instead.
        try:
            shutil.copytree(self.STATS_COPY_LOCATION, stats_loc)
            logger.info('Stats directory created at: {0}'.format(stats_loc))
        except:
            logger.warn('Stats directory already exists at: {0}'.format(stats_loc))
        try:
            shutil.copytree(self.DM_COPY_LOCATION, dm_loc)
            logger.info('DM directory created at: {0}'.format(dm_loc))
        except:
            logger.warn('DM directory already exists at: {0}'.format(dm_loc))

        # if thens because if the stats_copy_location changes, its possible these are in different places.
        # NOTE(review): every format() below passes project_name twice -- the second arg was
        # presumably meant to be project_key.
        if os.path.isdir(stats_loc + '\\Statistics\\Programs\\Primary Programs'):
            primProg = stats_loc + '\\Statistics\\Programs\\Primary Programs'
        else:
            primProg = ''
            logger.warn("Project Mappings primary program not found for {0} {1}".format(project_name, project_name))

        if os.path.isdir(stats_loc + '\\Statistics\\Programs\\Validation Programs'):
            valProg = stats_loc + '\\Statistics\\Programs\\Validation Programs'
        else:
            valProg = ''
            logger.warn("Project Mappings validation program not found for {0} {1}".format(project_name, project_name))

        if os.path.isdir(stats_loc + '\\Statistics\\Randomization'):
            rand = stats_loc + '\\Statistics\\Randomization'
        else:
            rand = ''
            logger.warn("Project Mappings randomization not found for {0} {1}".format(project_name, project_name))

        if os.path.isdir(stats_loc + '\\DMData'):
            dmd = stats_loc + '\\DMData'
        else:
            dmd = ''
            logger.warn("Project Mappings dmdata not found for {0} {1}".format(project_name, project_name))

        newProject = pd.DataFrame([[dm_loc, stats_loc, primProg, valProg, rand, dmd]], columns=[
            'dmDrive', 'statsDrive', 'primaryProgram', 'validationProgram', 'randomization', 'dmdata'], index=[project_key])
        # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # use pd.concat([self.project_mappings, newProject]) going forward.
        self.project_mappings = self.project_mappings.append(newProject)
        logger.info("Project mappings created for {0} {1}".format(project_name, project_name))

    # this function makes me think the caller logic is really minimal. it seems like you put too much thinking into
    # this class. i would probably rewrite this so that this function generates a csv string and returns it, and the
    # caller can put it in a file. that also shifts this function towards being more stateless, because the file
    # system is irrelevant. you could also modify it to take an argument telling it which data to write
    def output_csv(self):
        """
        Outputs CSVs for powershell script to take action on. Also for auditing

        Writes permissionsChanges_<startTime>.csv and newProjects_<startTime>.csv
        into OUTPUT_LOCATION. Requires find_differences() to have run first
        (reads self.permissionChanges).
        """
        # NOTE(review): "succssfully" typo in both log messages below (runtime strings,
        # left untouched here).
        with open(self.OUTPUT_LOCATION + 'permissionsChanges_' + startTime + '.csv', 'w', newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(['folder', 'role', 'person', 'action'])
            writer.writerows(self.permissionChanges)
            logger.info("CSV succssfully output: {0} ".format(self.OUTPUT_LOCATION + 'permissionsChanges_' + startTime + '.csv'))

        with open(self.OUTPUT_LOCATION + 'newProjects_' + startTime + '.csv', 'w', newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(['study'])
            writer.writerows(self.newProjectsDesc)
            logger.info("CSV succssfully output: {0} ".format(self.OUTPUT_LOCATION + 'newProjects_' + startTime + '.csv'))

    def _get_excel_serial_date(self, dttm):
        """
        KeyedIn Assignment API uses the excel serial date format in eastern timezone. This converts to that

        Parameters:
            dttm (datetime): datetime to calculate from


        Creates:
            self.excelSerialDate (float): The float date to be used
        """
        # this should not be a class field, since all usages are in this function
        self.eastern = pytz.timezone('America/New_York')

        # NOTE(review): passing a pytz timezone via tzinfo= yields the historical LMT offset
        # (-04:56), not EST/EDT; the pytz-documented pattern is eastern.localize(datetime(...)).
        temp = datetime(1899, 12, 30, tzinfo=self.eastern)  # Note, not 31st Dec but 30th!

        dttm = dttm.astimezone(self.eastern)

        delta = dttm - temp
        # this should be returned instead of created, which would make this function able to be converted to static
        # which means you can bombard it with unit tests
        self.excelSerialDate = round(float(delta.days) + (float(delta.seconds) / 86400), 3)
        logger.info("Excel Serial Date is: {0}".format(self.excelSerialDate))

    def wrap_up(self):
        """
        Compresses older files for space purposes. Logs and old data compressed every month.

        Outputs the current resourceDict and projectData as previous ones.

        """

        # if the month is different or the year (when Dec goes to Jan)
        # needs more testing.
        if startTimeStamp.month > self.previousProjectTimestamp.month or startTimeStamp.year > self.previousProjectTimestamp.year:
            logger.info("New Month detected, zipping files: {0} to {1}".format(datetime.strftime(self.previousProjectTimestamp, '%Y-%m-%d'),
                                                                               datetime.strftime(startTimeStamp, '%Y-%m-%d')))
            self._zip_files(self.PROJ_DATA_LOCATION)
            self._zip_files(self.RES_DATA_LOCATION)
            self._zip_files(self.DATA_LOCATION + 'logs\\')
            self._zip_files(self.DATA_LOCATION + 'json\\')

        # save most rescent version, overwrite one that is there.
        with open(self.PROJ_DATA_LOCATION + 'projectData_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.projectData, handle)
            logger.info("Project data saved as pickle file: {0}".format(
                self.PROJ_DATA_LOCATION + 'projectData_' + startTime + '.pkl'))

        with open(self.RES_DATA_LOCATION + 'resourceDict_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.resourceDict, handle)
            logger.info("Resource dict saved as pickle file: {0}".format(
                self.RES_DATA_LOCATION + 'resourceDict_' + startTime + '.pkl'))

        with open(self.DATA_LOCATION + 'json\\projectJson_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.projectJson, handle)
            logger.info("Project json saved as pickle file: {0}".format(
                self.DATA_LOCATION + 'json\\projectJson_' + startTime + '.pkl'))

        with open(self.DATA_LOCATION + 'json\\assignmentJson_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.assignmentJson, handle)
            logger.info("Assignment json saved as pickle file: {0}".format(
                self.DATA_LOCATION + 'json\\assignmentJson_' + startTime + '.pkl'))

        # resourceJson does not always exist.
        # NOTE(review): this bare except exists to absorb the AttributeError raised when
        # _create_resource_dict never set self.resourceJson; hasattr() would be clearer.
        try:
            if len(self.resourceJson) > 0:
                with open(self.DATA_LOCATION + 'json\\resourceJson_' + startTime + '.pkl', 'wb') as handle:
                    pickle.dump(self.resourceJson, handle)
                    logger.info("Resource json saved as pickle file: {0}".format(
                        self.DATA_LOCATION + 'json\\resourceJson_' + startTime + '.pkl'))
        except:
            logger.info("Resource json does not exist")

        # need to make sure changes made because of new projects are available for future runs
        self.project_mappings.to_csv('project_mappings.csv')

    def _zip_files(self, loc):
        """
        Creates an archive, deletes the files in the current dir

        Parameters:
            loc (str): The file path of the directory to modify. Expected to end with a
                trailing backslash (loc.split('\\')[-2] is used as the directory name).
        """
        logger.info("Starting archive process for {0}".format(loc))
        loc_name = loc.split('\\')[-2]

        fls = os.listdir(loc)
        fls = [f for f in fls if f != 'mylog.log']
        newest = sorted(fls, reverse=True)[0]
        oldest = sorted(fls, reverse=False)[0]

        if loc_name != 'logs':
            newest_ts = datetime.strptime(newest.replace(loc_name + '_', '').replace('.pkl', ''), '%Y-%m-%d_%H%M%S')
            oldest_ts = datetime.strptime(oldest.replace(loc_name + '_', '').replace('.pkl', ''), '%Y-%m-%d_%H%M%S')
        else:

            newest_ts = datetime.strptime(newest.replace('mylog.log.', '').replace('.pkl', ''), '%Y-%m-%d')
            oldest_ts = datetime.strptime(oldest.replace('mylog.log.', '').replace('.pkl', ''), '%Y-%m-%d')

        newest_ts_str = datetime.strftime(newest_ts, '%Y-%m-%d')
        oldest_ts_str = datetime.strftime(oldest_ts, '%Y-%m-%d')

        logger.info("{0} archive contains {1} files from dates {2} to {3}.".format(
            loc, str(len(fls)), oldest_ts_str, newest_ts_str))

        shutil.make_archive(self.DATA_LOCATION + 'archive\\' + loc_name + '_' + oldest_ts_str +
                            '_' + newest_ts_str, root_dir=loc, format='tar', logger=logger)

        for f in fls:
            os.remove(loc + f)
            logger.info("File removed at {0}".format(loc + f))