# Source snapshot: Apr 23, 2020, 05:02 AM
1import csv
2from datetime import datetime, timezone
3import httpx
4import logging
5import os
6import pandas as pd
7import pickle
8import pytz
9import shutil
10
11# why is there a module with a name very similar to this class? that is confusing, and confusing things cause
12# technical debt.
13from study_automation import configSettings
14
logger = logging.getLogger(__name__)

# Captured when the module is imported (NOT when the process started, so a late
# import shows up as an offset). All run-stamped file names derive from this.
startTimeStamp = datetime.now(timezone.utc)

# Filesystem-safe rendering of the UTC timestamp, used to suffix output files.
startTime = startTimeStamp.strftime('%Y-%m-%d_%H%M%S')

# Naive local-clock counterpart of startTimeStamp.
localStartTime = datetime.now()
25
26# looks like you've got a block meant to clean up the input data. i would argue that is inappropriate here. you should
27# have two separate classes - one that does nothing but clean up data, and one that processes cleaned data. that allows
28# you to have multiple pre-processors that can clean up data from different sources, or pass in data that is clean
29# from the start
30
# Maps the role descriptions from the project ("home page") API onto the
# normalized role codes used by the assignment API (the tasks tab in the UI).
KI_ROLE_LU = {
    # data management
    'Lead Clinical Data Manager': 'DM',
    'Backup Clinical Data Manager': 'DM',
    'Lead Clinical Data Analyst': 'DM',
    # clinical programming
    'Lead Clinical Programmer': 'CP',
    'Medical Coder': 'DM',
    # IRT
    'IRT Project Manager': 'IRT',
    # biostatistics
    'Lead Biostatistician': 'BS-BIOSTAT',
    'Lead Statistical Programmer': 'SAS-PRG',
    'Biostatistics QC': 'BS-QC',
    'Reviewing Biostatistician': 'BS-BIOSTAT',
    'Lead Validating Statistical Programmer': 'SAS-PRG-VAL',
    'Unmasked Biostatistician': 'BS-BIOSTAT',
    # project management and co-leads
    'Project Manager': 'PM',
    'Data Manager Co-Lead': 'DM',
    'IRT Project Manager Co-Lead': 'IRT',
    'Biostatistician Co-Lead': 'BS-BIOSTAT',
}
49
# These are the roles from the task page. Normalizing them a bit. Used for assignment API.
# Keys are the raw role codes seen on individual task assignments; values are the
# normalized role codes that KI_ROLE_LU also maps onto.
KI_TASK_ROLE_LU = {'BS-AS-DIR': 'BS-BIOSTAT',
                   'BS-BIOSTAT': 'BS-BIOSTAT',
                   'BS-BIOSTAT2': 'BS-BIOSTAT',
                   'BS-DIR': 'BS-BIOSTAT',
                   'BS-MGR': 'BS-BIOSTAT',
                   'BS-PRIN-BIOSTAT': 'BS-BIOSTAT',
                   'BS-PRIN-RS': 'BS-BIOSTAT',
                   'BS-QC': 'BS-QC',
                   'CP': 'CP',
                   'CP-MGR': 'CP',
                   'DM-CDA': 'DM',
                   'DM-CDM': 'DM',
                   'DM-DIR': 'DM',
                   'DM-MEDCOD': 'DM',
                   'DM-MEDCOD-MGR': 'DM',
                   'DM-MGR': 'DM',
                   'IRT-PM': 'IRT',
                   'IRT-SERV-MGR': 'IRT',
                   'PM': 'PM',
                   'SAS-PRG': 'SAS-PRG',
                   'SAS-PRG-MGR': 'SAS-PRG',
                   # BUG FIX: value was 'SAS-PRG=VAL' ('=' typo), which matched no real role
                   # code; corrected to 'SAS-PRG-VAL' so it agrees with KI_ROLE_LU above.
                   'SSC-PRG-VAL': 'SAS-PRG-VAL',
                   }
74
75# I don't understand this comment. It sounds like it is tightly associated with the KI_TASK_ROLE_LU dict.
76# could you have the dict value be a 2-tuple instead? this seems like technical debt, because a change to one
77# would trigger a bug if you don't remember to change both.
78
# Custom-field keys extracted from the project API that correspond to the roles
# in KI_ROLE_LU above. Key 218 is the edit-date field, which is handled specially.
role_key = [
    218,
    282, 283, 284,
    293, 294, 295, 296,
    297, 298, 299, 300, 301,
    326, 327, 328,
]
81
82# does this comment add information? ignoredFields seems like a straightforward variable name, and fluff comments
83# are a form of technical debt because you won't get compile or test errors if they are no longer accurate, but can
84# still mislead someone. or, worse yet, someone could cut/paste the code but forget the comment, leaving it hanging
85# inappropriately.
86
# Project-data keys that are skipped when diffing role assignments between runs
# (metadata rather than role -> employee mappings).
ignoredFields = [
    'startDate',
    'Last Edit Date',
    'newStudy',
    'description',
    'clientDescription',
    'duplicates',
    'employees',
]
90
91# i don't really like the name of this class. automation seems to be more about who is calling it, how often, and with
92# what data. and i don't know if study is a noun or a verb.
class StudyAutomation():
    """
    Monitors the KeyedIn API and automates study-setup processes.

    Expected call order (later steps read instance fields populated by
    earlier ones):
        sa = StudyAutomation()
        sa.call_api_get_data()
        sa.find_differences()
        sa.output_csv()
        sa.wrap_up()
    """

    # a class that doesn't take any parameters is a big red flag to me. what data is it operating on? where does it
    # get it's data from? what global state does it read from or write to? it is possible that the class is expected
    # to be built up after init, but this immediately caught my attention. if it requires lots of global state
    # then it will be untestable without very difficult and hard to maintain tests.
    def __init__(self):
        """
        Init reports locations of files and loads the most previous versions of resourceDict and projectData.
        """
        logger.info("Study Automation process begun.")

        # this whole block is odd. each time, you do basically the same thing - transfer a value from one collection
        # to a local field, log the value. what else is in configSettings? Does it need to be whitelisted like this
        # or could you do a full copy, then do a for loop to echo the values?

        # also configSettings should be passed in as a paramter. if this __init__ is any indication, the entire logic
        # of this class depends on it
        self.DATA_LOCATION = configSettings['DATA_LOC']
        logger.info('Previous data located at {0}'.format(self.DATA_LOCATION))

        self.PROJ_DATA_LOCATION = configSettings['DATA_LOC'] + 'projectData\\'
        logger.info('Previous project data located at {0}'.format(self.PROJ_DATA_LOCATION))

        self.RES_DATA_LOCATION = configSettings['DATA_LOC'] + 'resourceDict\\'
        logger.info('Previous resource data located at {0}'.format(self.RES_DATA_LOCATION))

        self.STATS_LOCATION = configSettings['STATS_LOC']
        logger.info('Stats drive root located at {0}'.format(self.STATS_LOCATION))

        self.DM_LOCATION = configSettings['DM_LOC']
        logger.info('DM drive root located at {0}'.format(self.DM_LOCATION))

        self.STATS_COPY_LOCATION = configSettings['STATS_COPY_LOC']
        logger.info('Stats copy drive root located at {0}'.format(self.STATS_COPY_LOCATION))

        self.DM_COPY_LOCATION = configSettings['DM_COPY_LOC']
        logger.info('DM copy drive root located at {0}'.format(self.DM_COPY_LOCATION))

        self.OUTPUT_LOCATION = configSettings['OUTPUT_LOC']
        logger.info('Output location located at {0}'.format(self.OUTPUT_LOCATION))

        # NOTE(review): both CSV reads resolve relative to the process CWD, not the module
        # location -- launching the script from another directory will fail here.
        self.project_mappings = pd.read_csv('project_mappings.csv', index_col='project')
        logger.info('Project mappings located at {}'.format(os.getcwd() + '\\project_mappings.csv'))

        self.permission_mappings = pd.read_csv('permission_mappings.csv')
        logger.info('Permission mappings located at {}'.format(os.getcwd() + '\\permission_mappings.csv'))

        # descriptions of projects new since the previous run; written out by output_csv()
        self.newProjectsDesc = []

        # after reviewing these three functions, i feel you have an aversion to both input parameters and return values
        # and rely very heavily on class fields. that makes testing harder, because those functions are dependent on
        # state, and a stateless function is a wonderful thing

        self._load_previous_resource_dict()
        self._load_previous_project_data()
        self._get_excel_serial_date(self.previousProjectTimestamp)

    # passing in a boolean to a function is a red flag. it typically tells me "this function does two things", and
    # functions should do 1 thing. if the caller can decide if the data should be refreshed, then the caller should
    # just refresh the data
    # i'm also not sure i like this function name. combined with the class name itself, i have absolutely no idea
    # what StudyAutomation.call_api_get_data(False) could possibly do.
    # it seems like both "call api" and "get data" are more about what it does internally, and not so much about
    # why it is doing it
    def call_api_get_data(self, refresh_data=False):
        """
        Collects the data needed to determine differences.

        Parameters:
            refresh_data (bool): Whether to completely refresh data from the API or use non changed values.

        """
        logger.info("Refresh data set to {0}".format(str(refresh_data)))

        # if it has been longer than a day since the resource dict has been refreshed, refresh it

        # NOTE(review): ".days > 1" only triggers once the previous dict is 2+ calendar days
        # old; if "longer than a day" is the intent, this should be ">= 1". Confirm intent.
        if (startTimeStamp.date() - self.previousResourceTimestamp.date()).days > 1:
            self._create_resource_dict(True)
            logger.info('Resource Dict refresh over a day old. Forcing Refresh')
        else:
            self._create_resource_dict(refresh_data)

        # the number of times i see a function take no arguments and return nothing is a red flag. that means this
        # class has to be tested as a whole, rather than being able to test smaller portions. smaller, stateless
        # functions are also really easily relocated into another class if you decide this one has grown monolithic
        # and certain aspects can be split off.
        # also, there is nothing to stop someone from moving this line to above that if/else, although i would guess
        # that the "correct" calling order is _create_resource_dict, then _get_project_data. by relying on inputs and
        # outputs, you can actually enforce that order by making the output of one be the required input of the next
        self._get_project_data()

    def _load_previous_resource_dict(self):
        # javadoc style comments are often unnecessary. the function description is identical to the function name
        # which means you add a bunch of fluff lines that don't provide any useful information
        """
        Loads previous resource dict.

        Creates:
            self.previousResourceTimestamp (datetime): Last time a resource dict was created. In UTC
            self.previousResourceDictLoc (str): Where the file was located
            self.previousResourceDict (dict): Keys are employee codes from KI. Fields include name, role, department
        """

        previousLoc, self.previousResourceTimestamp = self._find_most_recent_file('resourceDict')
        self.previousResourceDictLoc = self.RES_DATA_LOCATION + previousLoc
        # NOTE(review): file handle opened inline is never closed; prefer "with open(...)".
        self.previousResourceDict = pickle.load(open(self.previousResourceDictLoc, 'rb'))
        logger.info("Previous Resource dictionary loaded from {0}".format(self.previousResourceDictLoc))

    def _load_previous_project_data(self):
        # this function and the one above are so close that they really should be merged into a single function
        # it would take in the most recent file as a parameter, and return the found data/dataloc to be stored
        # by the calling function
        """
        Loads previous project data.

        Creates:
            self.previousProjectTimestamp (datetime): Last time project data was created. In UTC.
            self.previousProjectDataLoc (str): Where the file was located
            self.previousProjectData (dict): Project Data. Defined below.
        """
        previousLoc, self.previousProjectTimestamp = self._find_most_recent_file('projectData')
        self.previousProjectDataLoc = self.PROJ_DATA_LOCATION + previousLoc
        # NOTE(review): file handle opened inline is never closed; prefer "with open(...)".
        self.previousProjectData = pickle.load(open(self.previousProjectDataLoc, 'rb'))
        logger.info("Previous Project Data loaded from:{0}".format(self.previousProjectDataLoc))

    def _create_resource_dict(self, refresh_data=True):
        """
        This method creates the resource dict if refresh data is True. If it is false or there is some error, the most recent dictionary is loaded.

        Parameters:
            refresh_data (bool): Whether to completely refresh data from the API or use non changed values.

        Creates:
            self.resourceJson (dict): JSON returned from API. See URL definition below for filters and fields included.
            self.resourceDict (dict): Keys are employee codes from KI. Fields include name, role, department
        """

        if refresh_data:
            try:
                self.resourceJson = self._return_json_from_api('resource')
                self.resourceDict = {}
                # this could be converted to a map() call.
                # map, along with filter and reduce are some of the most powerful functions on collections in most
                # modern languages.
                for item in self.resourceJson:
                    self.resourceDict[item['code']] = {'name': item['name'], 'role': item['primaryRole'],
                                                       'dept': item['departmentDescription'], 'manager': item['lineManagerDescription']}

            # pokemon exceptions (gotta catch em all!) are generally frowned upon, although sometimes they are okay
            except:
                logger.error("Resource Connection failed. Using previous resource Dict")
                self.resourceDict = self.previousResourceDict

        else:
            self.resourceDict = self.previousResourceDict

        logger.info("Resource Dict: {0}".format(self.resourceDict))

    def _get_project_data(self):
        """
        This method calls the project API, determines which projects have changed and then calls the assignment API.

        Note that the employees attached have to come through two different calls. The project call (where their roles are standardized)
        and the assignment call which comes with those standardized names.

        Parameters:


        Creates:
            self.projectJson (dict): JSON returned from API. See URL definition below for filters and fields included.
            self.changedProjects (array): Array of project codes that have changed since last check or are new.
            self.projectData (dict): Has project codes, sponsors, projects, start date, last edit date and employees attached
            self.assignmentJson: JSON returned from API. See URL definition below for filters and fields included.

        """
        self.projectJson = self._return_json_from_api('project')

        self.projectData = {}

        # you can't iterate over JSON, which is a string. so at this point, it's not JSON anymore. it's probably
        # a dict. it's also bad advice to use the type in the variable name because it's often misleading or unneeded
        for item in self.projectJson:
            project_code = item['code']
            code_data = {}
            # emp match is to see if one employee has two different roles
            emp_match = {}
            code_data['description'] = item['description']
            code_data['clientDescription'] = item['clientDescription']
            code_data['startDate'] = item['startDate']

            # dont include None, only valid references
            if item.get('manager') is not None:
                code_data['PM'] = {item['manager']}
                emp_match[item['manager']] = {'PM'}

            for field in item['customFieldValues']:
                if field['key'] in role_key and len(field['value']) > 0:
                    # edit date key
                    if field['key'] == 218:
                        code_data[field['name']] = field['value']
                    else:
                        ki_role = KI_ROLE_LU[field['name']]
                        emp_id = field['value']
                        if ki_role in code_data.keys():
                            code_data[ki_role].add(emp_id)
                        else:
                            code_data[ki_role] = {emp_id}

                        if emp_id in emp_match.keys():
                            emp_match[emp_id].add(ki_role)
                        else:
                            emp_match[emp_id] = {ki_role}

            # check if any employees are in two different roles, alert and remove
            if len(emp_match) > 0:
                employees_with_mult_roles = [k for k, v in emp_match.items() if len(v) > 1]
                employees = list(emp_match.keys())
                projRoles = [k for k, v in code_data.items() if k not in ignoredFields]

                if len(employees_with_mult_roles) > 0:
                    for e in employees_with_mult_roles:
                        for role in projRoles:
                            assignedEmployees = code_data[role]
                            if e in assignedEmployees:
                                code_data[role].remove(e)
                                # NOTE(review): logger.warn is deprecated; prefer logger.warning
                                # (applies to every .warn call in this class).
                                logger.warn("Employee {0} with multiple roles in project {1}. Removing {2} role.".format(
                                    e, project_code, role))

            else:
                logger.info("No employees found for project {0}".format(project_code))

            # NOTE(review): when emp_match is empty, "employees" was never assigned this
            # iteration -- this reuses the previous iteration's list (or raises NameError
            # on the first project). Likely should be "employees = []" in the else branch.
            code_data['employees'] = employees
            self.projectData[project_code] = code_data

        self.assignmentJson = self._return_json_from_api('assignment')

        self.changedProjects = []

        # put the assignment data back into the projectData. However if the employee already has a home page role, do not add.
        for item in self.assignmentJson:
            mappedRole = KI_TASK_ROLE_LU.get(item['role'])

            if item['resource'] in self.projectData[item['project']]['employees']:
                # only log if employee is in different role from home page
                if mappedRole not in self.projectData[item['project']].keys() or item['resource'] not in self.projectData[item['project']][mappedRole]:
                    logger.info("Employee {0} already assigned to role in project {1} from home screen. Ignoring addl role {2} via {3}."
                                .format(item['resource'], item['project'], mappedRole, item['role']))

            else:

                if mappedRole in self.projectData[item['project']].keys():
                    self.projectData[item['project']][mappedRole].add(item['resource'])
                else:
                    self.projectData[item['project']][mappedRole] = {item['resource']}

        for key in self.projectData.keys():
            if key not in self.previousProjectData.keys():
                self.newProjectsDesc.append([self.projectData[key]['description']])
                self.changedProjects.append(key)
                self._create_folders(self.projectData[key]['description'], key)
                logger.info("Project new since last check: {0}".format(key))
            # have to subset the object for the keys we care about
            elif ({i: self.projectData[key][i] for i in self.projectData[key].keys() if i not in ignoredFields} !=
                  {i: self.previousProjectData[key][i] for i in self.previousProjectData[key].keys() if i not in ignoredFields}):
                self.changedProjects.append(key)
                logger.info("Project changed since last check: {0}".format(key))
            else:
                pass

        logger.info("List of changed projects:{0}".format(str(self.changedProjects)))

    # NOTE(review): this bare class-level string appears to be a stray docstring; it reads
    # like it belongs to find_differences / _create_permissions_row below.
    """
    Sometimes people have multiple roles within a project. This section prioritizes and dedupes them all.

    For instance, the validating programmer will also be a SAS-PRG. However, they should not have the same permissions
    as a standard SAS-PRG

    """

    def find_differences(self):
        """
        This finds differences between the current project data and most previous

        Creates:
            self.permissionChanges (array): Project, person, role and action to take for permissions changes

        """
        self.permissionChanges = []

        if len(self.changedProjects) == 0:
            logger.info("There are no changes between the new and most recent project data files.")

        else:
            logger.info("There are {0} changes between the new and most recent project data files.".format(
                str(len(self.changedProjects))))

        for code in self.changedProjects:
            logger.info("Starting {0}".format(code))
            newData = self.projectData[code]
            previousData = self.previousProjectData.get(code, None)

            # this means is a new project, all permissions need to be added after the folders are created
            if previousData is None:

                roles = newData.keys()

                roles = [r for r in roles if r not in ignoredFields]
                for role in roles:
                    for resource in newData[role]:
                        self._create_permissions_row(code, role, resource, 'allow')

            else:
                # check to see if sponsor or project have changed names
                # NOTE(review): format args are swapped in both warnings below -- "Previously"
                # receives newData and "now" receives previousData.
                if newData['clientDescription'] != previousData['clientDescription']:
                    logger.warn("Sponsor names have changed. Previously {0}, now {1}".format(
                        newData['clientDescription'], previousData['clientDescription']))
                if newData['description'] != previousData['description']:
                    logger.warn("Study names have changed. Previously {0}, now {1}".format(
                        newData['description'], previousData['description']))

                roles = list(set(list(newData.keys()) + list(previousData.keys())))
                roles = [f for f in roles if f not in ignoredFields]

                for role in roles:
                    if role not in previousData.keys():
                        for resource in newData[role]:
                            self._create_permissions_row(code, role, resource, 'allow')
                    elif role not in newData.keys():
                        for resource in previousData[role]:
                            self._create_permissions_row(code, role, resource, 'deny')
                    elif previousData[role] == newData[role]:
                        pass
                    else:
                        adds = newData[role] - previousData[role]
                        for resource in adds:
                            self._create_permissions_row(code, role, resource, 'allow')
                        rems = previousData[role] - newData[role]
                        for resource in rems:
                            self._create_permissions_row(code, role, resource, 'deny')

    def _find_most_recent_file(self, typ):
        """
        This finds the most recent file for a given type. Since files are all in the same folder it needs to determine which one

        Parameters:
            typ (string): Either projectData or resourceDict

        Returns:
            (most_recent_file, previous_timestamp): file name and its UTC timestamp parsed
            from the "<typ>_YYYY-mm-dd_HHMMSS.pkl" naming convention.
        """

        if typ == 'projectData':
            eligible_files = [f for f in os.listdir(self.PROJ_DATA_LOCATION)
                              if os.path.isfile(os.path.join(self.PROJ_DATA_LOCATION, f))]
        elif typ == 'resourceDict':
            eligible_files = [f for f in os.listdir(self.RES_DATA_LOCATION)
                              if os.path.isfile(os.path.join(self.RES_DATA_LOCATION, f))]
        else:
            raise ValueError("{0} not a valid type".format(typ))

        # NOTE(review): raises IndexError if the directory is empty (e.g. first run).
        most_recent_file = sorted(eligible_files, reverse=True)[0]
        previous_timestamp = datetime.strptime(most_recent_file.replace(
            typ + '_', '').replace('.pkl', ' +0000'), '%Y-%m-%d_%H%M%S %z')
        logger.info("Most recent {1} timestamp is {0}".format(previous_timestamp, typ))

        logger.info("Most recent file of type {0} is {1}".format(typ, most_recent_file))

        return(most_recent_file, previous_timestamp)

    # this should be pulled into a separate class and passed in with the other config parameters. using a hardcoded
    # external API makes mocks impossible, which is a big part of unit testing. by using dependency injection, you can
    # create a class that shares the same interface, but returns dummy data
    def _return_json_from_api(self, api_type, additional_info=False):
        """
        Returns a json from the api specificed.


        Parameters:
            api_type (string): One of the three APIs that will be called ('resource', 'project' or 'assignment').
            additional_info (bool): Currently unused.
        Returns:
            api_json (dict): The desired JSON
        """

        if api_type == 'resource':
            url = 'https://api.keyedinprojects.com/V3/api/search/resource?fields=code,name,department,primaryRole,lineManager&resultsPerPage=1000&criteria=active=True&pageNumber='
        elif api_type == 'project':
            url = 'https://api.keyedinprojects.com/V3/api/search/project?resultsPerPage=1000&criteria=code contains(PRJ) and active=-1&pageNumber='
        # this is a custom API call based on a report. Edits to the report on the UI would have to take place to change it
        elif api_type == 'assignment':
            url = 'https://api.keyedinprojects.com/V3/api/report?resultsPerPage=1000&key=277&pageNumber='
        else:
            # NOTE(review): execution continues past this branch with "url" unbound, so the
            # logger.info below raises NameError; this should raise or return instead.
            logger.error("Invalid api type: {0}".format(api_type))

        logger.info("API type of {0} with url {1} called.".format(api_type, url))

        complete = False
        api_json = []
        page_number = 1
        attempts = 1

        # the idea behind this is that for every page needed, it tries to obtain it three times before moving on
        # maybe should wait or something in case that is the issue.
        # needs further testing.

        # NOTE(review): if one page fails three times, the outer loop resets attempts and
        # retries the same page indefinitely -- it never actually "moves on".
        while not complete:
            attempts = 1
            while attempts <= 3:
                try:
                    call = url + str(page_number)
                    self.connection = httpx.get(call, auth=(configSettings['KI_USER'], configSettings['KI_PW']))
                    if self.connection.status_code == 200:
                        logger.info("{0} Connection {1} successful: {2}".format(api_type, str(page_number), call))
                        api_json += self.connection.json()['Data']
                        if self.connection.json()['PageNumber'] >= self.connection.json()['TotalPages']:
                            complete = True
                            return(api_json)
                        else:
                            page_number += 1
                    else:
                        # NOTE(review): format() gets one arg for two placeholders and "call"
                        # is passed to logger.error as an extra positional -- this line raises
                        # IndexError, which is swallowed by the bare except below.
                        logger.error("Connection failed with status code:{0}: {1}".format(
                            str(self.connection.status_code)), call)
                except:
                    logger.error("API return failure on attempt {0}.".format(attempts))
                    attempts += 1

    def _create_permissions_row(self, code, role, resource, typ):
        """
        Creates a row for a csv that will be acted on for permissions

        Parameters:
            code (string): Project ID
            role (string): The role of the person
            resource (string): Persons name
            typ (string): Either to be added or removed.

        Creates:
            self.permissionChanges (array): List of changes to be made.
        """

        # based on the role of the person need to determine location of permissions
        # directors and some departments have full access, this is where they are filtered.

        try:
            resourceInfo = self.resourceDict[resource]

        except KeyError:
            logger.warn("Resource not found in dict. {0}".format(resource))
            return
        # people with the following primary role are out of scope for this

        if resourceInfo['role'] in ['AD-EXEC', 'AD-FIN-HR', 'AD-IT', 'AD-IT-INT', 'AD-QA', 'BD', 'BS-AS-DIR', 'BS-DIR',
                                    'BS-PRIN-RS', 'DM-DIR', 'SSC-AI', 'SSC-CSO']:
            logger.info('{0} with primary role {1} out of scope.'.format(resourceInfo['name'], resourceInfo['role']))
        else:

            perChanges = (self.permission_mappings.copy()[(self.permission_mappings['role'] == role) &
                                                          (self.permission_mappings['action'] == typ)])
            for index, row in perChanges.iterrows():
                if code not in self.project_mappings.index:
                    # NOTE(review): message has no {0} placeholder, so .format(code) is a
                    # no-op and the offending code never appears in the log.
                    logger.warn('Project Code not in project mappings'.format(code))
                    return

                try:
                    folder = self.project_mappings.at[code, row['location']]
                except KeyError:
                    # NOTE(review): execution falls through to pd.isnull(folder) below; on the
                    # first iteration "folder" is unbound here and this raises NameError --
                    # this branch should "continue" after logging.
                    logger.warn("Folder not found. {0}".format(str([code, role, row['location']])))

                if pd.isnull(folder):
                    logger.warn("Folder not found. {0}".format(str([code, role, row['location']])))
                else:

                    newRow = [folder, row['role'], resourceInfo['name'], row['permissionChange']]
                    self.permissionChanges.append(newRow)
                    logger.info("Permission changed updated: {0}".format(str(newRow)))

    def _create_folders(self, project_name, project_key):
        """
        Creates new folders

        Parameters:
            project_name (string): Name of the new project that needs folders to be created.
            project_key (string): Project ID
        """
        # need to change this so there is a sponsor/study nomenclature, dummy for now
        try:
            sponsor, study = (project_name).split('_', 1)
        except:
            logger.error("Project Name not in correct format: {0}".format(project_name))
            return
        stats_loc = self.STATS_LOCATION + sponsor + '\\' + study
        dm_loc = self.DM_LOCATION + sponsor + '\\' + study

        # NOTE(review): the bare excepts below report ANY copytree failure (permissions,
        # missing source, etc.) as "already exists" -- catch FileExistsError instead.
        try:
            shutil.copytree(self.STATS_COPY_LOCATION, stats_loc)
            logger.info('Stats directory created at: {0}'.format(stats_loc))
        except:
            logger.warn('Stats directory already exists at: {0}'.format(stats_loc))
        try:
            shutil.copytree(self.DM_COPY_LOCATION, dm_loc)
            logger.info('DM directory created at: {0}'.format(dm_loc))
        except:
            logger.warn('DM directory already exists at: {0}'.format(dm_loc))

        # if thens because if the stats_copy_location changes, its possible these are in different places.
        # NOTE(review): every format() below passes project_name twice -- the second arg was
        # presumably meant to be project_key.
        if os.path.isdir(stats_loc + '\\Statistics\\Programs\\Primary Programs'):
            primProg = stats_loc + '\\Statistics\\Programs\\Primary Programs'
        else:
            primProg = ''
            logger.warn("Project Mappings primary program not found for {0} {1}".format(project_name, project_name))

        if os.path.isdir(stats_loc + '\\Statistics\\Programs\\Validation Programs'):
            valProg = stats_loc + '\\Statistics\\Programs\\Validation Programs'
        else:
            valProg = ''
            logger.warn("Project Mappings validation program not found for {0} {1}".format(project_name, project_name))

        if os.path.isdir(stats_loc + '\\Statistics\\Randomization'):
            rand = stats_loc + '\\Statistics\\Randomization'
        else:
            rand = ''
            logger.warn("Project Mappings randomization not found for {0} {1}".format(project_name, project_name))

        if os.path.isdir(stats_loc + '\\DMData'):
            dmd = stats_loc + '\\DMData'
        else:
            dmd = ''
            logger.warn("Project Mappings dmdata not found for {0} {1}".format(project_name, project_name))

        newProject = pd.DataFrame([[dm_loc, stats_loc, primProg, valProg, rand, dmd]], columns=[
            'dmDrive', 'statsDrive', 'primaryProgram', 'validationProgram', 'randomization', 'dmdata'], index=[project_key])
        # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # use pd.concat([self.project_mappings, newProject]) going forward.
        self.project_mappings = self.project_mappings.append(newProject)
        logger.info("Project mappings created for {0} {1}".format(project_name, project_name))

    # this function makes me think the caller logic is really minimal. it seems like you put too much thinking into
    # this class. i would probably rewrite this so that this function generates a csv string and returns it, and the
    # caller can put it in a file. that also shifts this function towards being more stateless, because the file
    # system is irrelevant. you could also modify it to take an argument telling it which data to write
    def output_csv(self):
        """
        Outputs CSVs for powershell script to take action on. Also for auditing

        Writes permissionsChanges_<startTime>.csv and newProjects_<startTime>.csv
        into OUTPUT_LOCATION. Requires find_differences() to have run first
        (reads self.permissionChanges).
        """
        # NOTE(review): "succssfully" typo in both log messages below (runtime strings,
        # left untouched here).
        with open(self.OUTPUT_LOCATION + 'permissionsChanges_' + startTime + '.csv', 'w', newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(['folder', 'role', 'person', 'action'])
            writer.writerows(self.permissionChanges)
            logger.info("CSV succssfully output: {0} ".format(self.OUTPUT_LOCATION + 'permissionsChanges_' + startTime + '.csv'))

        with open(self.OUTPUT_LOCATION + 'newProjects_' + startTime + '.csv', 'w', newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(['study'])
            writer.writerows(self.newProjectsDesc)
            logger.info("CSV succssfully output: {0} ".format(self.OUTPUT_LOCATION + 'newProjects_' + startTime + '.csv'))

    def _get_excel_serial_date(self, dttm):
        """
        KeyedIn Assignment API uses the excel serial date format in eastern timezone. This converts to that

        Parameters:
            dttm (datetime): datetime to calculate from


        Creates:
            self.excelSerialDate (float): The float date to be used
        """
        # this should not be a class field, since all usages are in this function
        self.eastern = pytz.timezone('America/New_York')

        # NOTE(review): passing a pytz timezone via tzinfo= yields the historical LMT offset
        # (-04:56), not EST/EDT; the pytz-documented pattern is eastern.localize(datetime(...)).
        temp = datetime(1899, 12, 30, tzinfo=self.eastern)  # Note, not 31st Dec but 30th!

        dttm = dttm.astimezone(self.eastern)

        delta = dttm - temp
        # this should be returned instead of created, which would make this function able to be converted to static
        # which means you can bombard it with unit tests
        self.excelSerialDate = round(float(delta.days) + (float(delta.seconds) / 86400), 3)
        logger.info("Excel Serial Date is: {0}".format(self.excelSerialDate))

    def wrap_up(self):
        """
        Compresses older files for space purposes. Logs and old data compressed every month.

        Outputs the current resourceDict and projectData as previous ones.

        """

        # if the month is different or the year (when Dec goes to Jan)
        # needs more testing.
        if startTimeStamp.month > self.previousProjectTimestamp.month or startTimeStamp.year > self.previousProjectTimestamp.year:
            logger.info("New Month detected, zipping files: {0} to {1}".format(datetime.strftime(self.previousProjectTimestamp, '%Y-%m-%d'),
                                                                               datetime.strftime(startTimeStamp, '%Y-%m-%d')))
            self._zip_files(self.PROJ_DATA_LOCATION)
            self._zip_files(self.RES_DATA_LOCATION)
            self._zip_files(self.DATA_LOCATION + 'logs\\')
            self._zip_files(self.DATA_LOCATION + 'json\\')

        # save most rescent version, overwrite one that is there.
        with open(self.PROJ_DATA_LOCATION + 'projectData_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.projectData, handle)
            logger.info("Project data saved as pickle file: {0}".format(
                self.PROJ_DATA_LOCATION + 'projectData_' + startTime + '.pkl'))

        with open(self.RES_DATA_LOCATION + 'resourceDict_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.resourceDict, handle)
            logger.info("Resource dict saved as pickle file: {0}".format(
                self.RES_DATA_LOCATION + 'resourceDict_' + startTime + '.pkl'))

        with open(self.DATA_LOCATION + 'json\\projectJson_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.projectJson, handle)
            logger.info("Project json saved as pickle file: {0}".format(
                self.DATA_LOCATION + 'json\\projectJson_' + startTime + '.pkl'))

        with open(self.DATA_LOCATION + 'json\\assignmentJson_' + startTime + '.pkl', 'wb') as handle:
            pickle.dump(self.assignmentJson, handle)
            logger.info("Assignment json saved as pickle file: {0}".format(
                self.DATA_LOCATION + 'json\\assignmentJson_' + startTime + '.pkl'))

        # resourceJson does not always exist.
        # NOTE(review): this bare except exists to absorb the AttributeError raised when
        # _create_resource_dict never set self.resourceJson; hasattr() would be clearer.
        try:
            if len(self.resourceJson) > 0:
                with open(self.DATA_LOCATION + 'json\\resourceJson_' + startTime + '.pkl', 'wb') as handle:
                    pickle.dump(self.resourceJson, handle)
                    logger.info("Resource json saved as pickle file: {0}".format(
                        self.DATA_LOCATION + 'json\\resourceJson_' + startTime + '.pkl'))
        except:
            logger.info("Resource json does not exist")

        # need to make sure changes made because of new projects are available for future runs
        self.project_mappings.to_csv('project_mappings.csv')

    def _zip_files(self, loc):
        """
        Creates an archive, deletes the files in the current dir

        Parameters:
            loc (str): The file path of the directory to modify. Expected to end with a
                trailing backslash (loc.split('\\')[-2] is used as the directory name).
        """
        logger.info("Starting archive process for {0}".format(loc))
        loc_name = loc.split('\\')[-2]

        fls = os.listdir(loc)
        fls = [f for f in fls if f != 'mylog.log']
        newest = sorted(fls, reverse=True)[0]
        oldest = sorted(fls, reverse=False)[0]

        if loc_name != 'logs':
            newest_ts = datetime.strptime(newest.replace(loc_name + '_', '').replace('.pkl', ''), '%Y-%m-%d_%H%M%S')
            oldest_ts = datetime.strptime(oldest.replace(loc_name + '_', '').replace('.pkl', ''), '%Y-%m-%d_%H%M%S')
        else:

            newest_ts = datetime.strptime(newest.replace('mylog.log.', '').replace('.pkl', ''), '%Y-%m-%d')
            oldest_ts = datetime.strptime(oldest.replace('mylog.log.', '').replace('.pkl', ''), '%Y-%m-%d')

        newest_ts_str = datetime.strftime(newest_ts, '%Y-%m-%d')
        oldest_ts_str = datetime.strftime(oldest_ts, '%Y-%m-%d')

        logger.info("{0} archive contains {1} files from dates {2} to {3}.".format(
            loc, str(len(fls)), oldest_ts_str, newest_ts_str))

        shutil.make_archive(self.DATA_LOCATION + 'archive\\' + loc_name + '_' + oldest_ts_str +
                            '_' + newest_ts_str, root_dir=loc, format='tar', logger=logger)

        for f in fls:
            os.remove(loc + f)
            logger.info("File removed at {0}".format(loc + f))