# HgBEZ6Gr
#
# · 7 years ago · Dec 05, 2018, 09:18 PM
import csv
import os
import random
import string
import time
########################
# Name of this scheme: #
########################
# Written into the scheme_code column of every accounts row by split_csv.
ThisScheme = 'SNKFPenF'
########################
11
# Define the name(s) of the directories to be created.
# Order matters: 'output' must exist before its 'dirty' and 'clean' children.
path0 = 'output'
path1 = 'output/dirty'
path2 = 'output/clean'
path3 = 'originals'

# Create each directory in turn. An already-existing directory is expected on
# re-runs and is reported separately from genuine failures (permissions,
# missing parent, ...), which previously were all reported as "does it
# already exist?".
for _path in (path0, path1, path2, path3):
    try:
        os.mkdir(_path)
    except FileExistsError:
        print("Directory %s already exists, reusing it." % _path)
    except OSError as _err:
        print("Creation of the directory %s failed: %s" % (_path, _err))
    else:
        print("Successfully created the directory %s " % _path)
46
47
# WARNING!
# When editing this file, be sure to use a find and replace on the headers
# coming from the _INPUT FILE_ ONLY!

# Edit these if needed. Input file(s) have NO leading '/' because they are
# read from the current directory.

# List of file(s) to import:
file0 = 'unit_holders.csv'
file8 = 'cashflow.csv'

# List of file(s) to export. Each name is prefixed with its target directory
# (path1 = dirty, path2 = clean, path3 = originals); do not edit the prefixes.
# Note that file4, file11 and file13 were removed as contact info was added
# to person, and ssnit was added to account.

# Exported to /originals:
file1 = path3 + '/persons.csv'
file5 = path3 + '/personal_identification.csv'
file2 = path3 + '/accounts.csv'

# Exported to /dirty:
file6 = path1 + '/persons.csv'
file12 = path1 + '/cashflow.csv'
file14 = path1 + '/personal_identification.csv'
file16 = path1 + '/accounts.csv'

# Exported to /clean:
file3 = path2 + '/sponsors.csv'
file7 = path2 + '/persons.csv'
file10 = path2 + '/accounts.csv'
file9 = path2 + '/cashflow.csv'
file15 = path2 + '/personal_identification.csv'
94
# Edit the name of the logfile below (if needed):
logFile = '/log.txt'

# Do not edit these:
# The log lives inside the top-level output directory (path0).
logFile = path0 + logFile
# Opened once at import time and written to by split_csv; the handle is not
# closed anywhere in the visible part of this file.
log = open(logFile, "w+")
101
# Lists that record duplicate entries:
#   dupes       - [person_code, unique] pairs
#   dupeCodes   - person codes flagged as duplicates
#   dupeUniques - the unique strings of flagged persons
dupes = []
dupeCodes = []
dupeUniques = []

# [original_person_code, new_person_code] pairs for accounts linked to the
# same person.
accountLinks = []

# Person codes of 'unit_holder' rows that should be linked to another
# person code, and the account uniques seen so far.
newAccounts = []
accountUniques = []

# List that stores completed entries.
completedLines = []

# Completed persons (helps to identify duplicates) and their unique strings.
persons = []
personUniques = []

# Lists that store sponsor info to avoid duplicates:
sponsorNames = []
sponsorCodes = []
sponsorWrongCodes = []
sponsorLinks = []
sponsorEntries = []
# Sponsor code/name of the row currently being parsed; updated through the
# Assign*/Clear* helper functions.
currentSponsorCode = 'missing'
currentSponsorName = 'missing'


# This is used for accounts to reference sponsorLinks:
sponsorDupes = []
133
def AssignSponsorCode(code):
    """Store ``code`` as the sponsor code of the row being parsed.

    Rebinds the module-level ``currentSponsorCode`` and echoes the new value
    to the console. Returns ``None``.
    """
    global currentSponsorCode
    currentSponsorCode = code
    print(currentSponsorCode, '/', code, 'updated!')
141
def AssignSponsorName(name):
    """Store ``name`` as the sponsor name of the row being parsed.

    Rebinds the module-level ``currentSponsorName`` and echoes the new value
    to the console. Returns ``None``.
    """
    global currentSponsorName
    currentSponsorName = name
    print(currentSponsorName, 'updated!')
146
def ClearSponsorCode():
    """Reset the module-level ``currentSponsorCode`` to its sentinel value.

    The sentinel is the literal string ``'twasreset'`` (not an empty string),
    so a reset code is still a non-empty string wherever it is written out.
    """
    global currentSponsorCode
    currentSponsorCode = 'twasreset'
    print('Sponsor Code Reset!')
151
def ClearSponsorName():
    """Reset the module-level ``currentSponsorName`` to its sentinel value.

    The sentinel is the literal string ``'twasresettoo'``.
    """
    global currentSponsorName
    currentSponsorName = 'twasresettoo'
    print('Sponsor Name Reset!')
156
157
def CheckDuplicatePerson(unique):
    """Return True if ``unique`` is already recorded in ``dupeUniques``."""
    return unique in dupeUniques
164
def CheckDuplicateAccount(unique):
    """Return True if ``unique`` is already recorded in ``accountUniques``."""
    return unique in accountUniques
171
def CheckNewAccount(code):
    """Return the original person code that ``code`` is linked to, if any.

    ``code`` is only looked up when it appears in ``newAccounts``. Each entry
    of ``accountLinks`` is an ``[original_code, new_code]`` pair; the original
    code of the first pair whose new code equals ``code`` is returned.

    Returns ``None`` when ``code`` is not a new account or has no link.
    """
    if code not in newAccounts:
        return None
    # Index rather than unpack so entries longer than two items still work.
    return next((link[0] for link in accountLinks if link[1] == code), None)
182
def SponsorCodeDupeCheck(code):
    """Return True if ``code`` is already recorded in ``sponsorCodes``."""
    return code in sponsorCodes
188
def SponsorNameDupeCheck(name):
    """Return True if ``name`` is already recorded in ``sponsorNames``."""
    return name in sponsorNames
194
195
def GetSponsorCode(code):
    """Return the first sponsor code that ``code`` is linked to.

    Each entry of ``sponsorLinks`` is a ``[first_code, other_code]`` pair;
    the ``first_code`` of the first pair whose ``other_code`` equals ``code``
    is returned. Returns ``None`` when no pair matches.

    NOTE(review): the visible callers in split_csv pass an employer *name*
    here, while the pairs appended to ``sponsorLinks`` hold codes - confirm
    which one is intended.
    """
    # Index rather than unpack so entries longer than two items still work.
    return next((link[0] for link in sponsorLinks if link[1] == code), None)
205
def GetSponsorName(code):
    """Return the sponsor name recorded for ``code``.

    Each entry of ``sponsorEntries`` is a ``[code, name]`` pair; the name of
    the first pair whose code equals ``code`` is returned. Returns ``None``
    when the code has not been recorded.
    """
    return next((entry[1] for entry in sponsorEntries if entry[0] == code), None)
215
def GetDuplicateByFullPerson(fullPerson):
    """Return True if ``fullPerson`` is already recorded in ``personUniques``."""
    return fullPerson in personUniques
221
def GetDuplicateByAccount(fullAccount):
    """Return True if ``fullAccount`` is already recorded in ``accountUniques``."""
    return fullAccount in accountUniques
227
def GetFirstPersonCode(fullPerson):
    """Return the person code first recorded for ``fullPerson``.

    Entries of ``persons`` start with ``[person_code, unique, ...]``; the
    person code of the first entry whose unique equals ``fullPerson`` is
    returned. Returns ``None`` when no entry matches.
    """
    # Entries may carry extra fields after the unique, so index, don't unpack.
    return next((person[0] for person in persons if person[1] == fullPerson), None)
237
# Generator:
def id_generator(size=10):
    """Return a random string of ``size`` uppercase ASCII letters.

    Letters are drawn independently, so a letter may repeat and ``size`` may
    exceed 26. (Sampling without replacement raised ValueError above 26 and
    could never repeat a letter.)
    """
    return ''.join(random.choices(string.ascii_uppercase, k=size))
241
242
243def split_csv(input_path, output_path1, output_path2, output_path3, output_path5, output_path6):
244    start = time.time()
245
246    # open the input file in read-mode:
247    csv_input_file = open(input_path, 'r')
248
249    # load the headers in the file with DictReader:
250    csv_input = csv.DictReader(csv_input_file, delimiter=',')
251
252    # 0pen the output files in write-mode
253    csv_output_file1 = open(output_path1, 'w', newline='')
254    csv_output_file2 = open(output_path2, 'w', newline='')
255    csv_output_file3 = open(output_path3, 'w', newline='')
256    csv_output_file5 = open(output_path5, 'w', newline='')
257    csv_output_file6 = open(output_path6, 'w', newline='')
258    csv_output_file10 = open(file10, 'w', newline='')
259
260    # Create CSV writers using the opened files
261    csv_1 = csv.writer(csv_output_file1) # people.csv
262    csv_2 = csv.writer(csv_output_file2) # accounts.csv
263    csv_3 = csv.writer(csv_output_file3) # sponsors.csv
264    csv_5 = csv.writer(csv_output_file5) # ids.csv
265    csv_6 = csv.writer(csv_output_file6) # duplicates.csv
266    csv_10 = csv.writer(csv_output_file10) # clean accounts.csv
267
268    ##################################################################################
269    # DO NOT edit the headers being inserted to the new files, they are PAS specific!#
270    ##################################################################################
271
272    # Headers for people.csv
273    person_headers = [
274        'person_code',
275        'ssnit',
276        'employer_code',
277        'first_name',
278        'mid_name',
279        'other_names',
280        'maiden_name',
281        'last_name',
282        'last_modified_by',
283        'last_modified_date',
284        'deleted',
285        'employment_status',
286        'dob',
287        'phone1',
288        'phone2',
289        'email',
290        'address1',
291        'address2',
292        'city',
293        'country',
294        'marital_status',
295        'gender',
296        'father_first_name',
297        'father_last_name',
298        'father_address',
299        'father_city',
300        'father_country',
301        'father_phone',
302        'mother_first_name',
303        'mother_last_name',
304        'mother_maiden_name',
305        'mother_address',
306        'mother_city',
307        'mother_country',
308        'mother_phone',
309        'nationality'
310    ]
311    # Write the headers to file:
312    csv_1.writerow(person_headers)
313    #
314    # Headers for accounts.csv
315    account_headers = [
316        'person_code',
317        'sponsor_code',
318        'scheme_code',
319        'description',
320        'last_modified_by',
321        'last_modified_date',
322        'industry_code',
323        'income_structure',
324        'average_pay',
325        'ssno'
326    ]
327    # Write the headers to file:
328    csv_2.writerow(account_headers)
329    #
330    # Headers for sponsors.csv
331    sponsor_headers = [
332        'sponsor_code',
333        'sponsor_name',
334        'region_code',
335        'category',
336        'last_modified_by',
337        'last_modified_date',
338        'deleted', # (yes/no)
339        'email',
340        'telephone_1',
341        'telephone_2',
342        'fax',
343        'address',
344        'contact_person1',
345        'contact_person2'
346    ]
347    # Write the headers to file:
348    csv_3.writerow(sponsor_headers)
349
350    # Headers for ids.csv
351    id_headers = [
352        'person_code',
353        'id_type',
354        'id_number'
355    ]
356    # Write the headers to file:
357    csv_5.writerow(id_headers)
358
359    # Just a placemark variable to show the script user what row it's on (increased and shown to console):
360    thisLine = 1;
361
362    # Same as above, but counts number of duplicates:
363    duplicates = 0;
364
365    # For each row in our input file, do this:\
366    for input_row in csv_input:
367        # If there wasnt a surname or first name provided, replace it with 'missing'...
368        # the field is nullable, but it looks better to show that the system and/or PiSys
369        # is aware of the missing data
370
371        # Process data for Sankofa to PAS structure..
372        if input_row['surname'] == '':
373           input_row['surname'] = 'missing'
374
375        if input_row['first_name'] == '':
376           input_row['first_name'] = 'missing'
377
378        if input_row['maritalStatus'] == 'Married':
379           input_row['maritalStatus'] = '1'
380
381        if input_row['maritalStatus'] == 'Single':
382           input_row['maritalStatus'] = '2'
383
384        if input_row['maritalStatus'] == 'Divorced':
385           input_row['maritalStatus'] = '3'
386
387        if input_row['employmentStatus'] == 'Unemployed':
388           input_row['employmentStatus'] = '2'
389
390        if input_row['employmentStatus'] == 'Employed':
391           input_row['employmentStatus'] = '1'
392
393        if input_row['employmentStatus'] == 'Retired':
394           input_row['employmentStatus'] = '3'
395
396        if input_row['gender'] == 'Male':
397           input_row['gender'] = '1'
398
399        if input_row['gender'] == 'Female':
400           input_row['gender'] = '2'
401
402        if input_row['Nationality'] != '':
403           input_row['Nationality'] = '82'
404
405        if input_row['fatherCountry'] == 'GH':
406           input_row['fatherCountry'] = '82'
407
408        if input_row['motherCountry'] == 'GH':
409           input_row['motherCountry'] = '82'
410
411        if input_row['fatherCity'] == 'Navrongo':
412           input_row['fatherCity'] = '54'
413
414        if input_row['motherCity'] == 'Navrongo':
415           input_row['motherCity'] = '54'
416
417        ##############################
418        # Beginning of Sponsor Parse #
419        ##############################
420
421        personCode = input_row['unit_holder_id']
422        #SponsorCode = ''
423        #SponsorName = ''
424        if input_row['employer_code'] != '' and input_row['employer_name'] != '':
425            rawcode = input_row['employer_code']
426            codeType = type(rawcode)
427            if codeType == int:
428                print('################# integer! #################')
429                print('################# integer! #################')
430                print('################# integer! #################')
431                print('################# integer! #################')
432                print('################# integer! #################')
433                print('################# integer! #################')
434            if codeType == str:
435                print('string!')
436            #code = ''
437            #if codeType == 'int':
438            #    code = str(code)
439            #else:
440            #    code = code
441            name = input_row['employer_name']
442            codeCheck = SponsorCodeDupeCheck(rawcode)
443            nameCheck = SponsorNameDupeCheck(name)
444            if codeCheck == True and nameCheck == True:
445                print(personCode, 'Duplicate Sponsor (based on name and code) Found and Omitted!')
446                AssignSponsorCode(code)
447                AssignSponsorName(name)
448                print(currentSponsorCode, 'vs', code)
449            if codeCheck == True and nameCheck == False:
450                print(personCode, 'Duplicate Sponsor (based on code) Found and Omitted!')
451                AssignSponsorCode(code)
452                AssignSponsorName(name)
453                print(currentSponsorCode, 'vs', code)
454            if codeCheck == False and nameCheck == True:
455                firstCode = GetSponsorCode(name)
456                if firstCode != code:
457                    appendValue = [firstCode, code]
458                    sponsorLinks.append(appendValue)
459                    sponsorWrongCodes.append(code)
460                    AssignSponsorCode(firstCode)
461                    AssignSponsorName(name)
462                    print(personCode, 'Duplicate Sponsor Name with Different Code Found! Code recorded in Links.')
463                    print(currentSponsorCode, 'vs', code)
464                if firstCode == code:
465                    AssignSponsorCode(code)
466                    AssignSponsorName(name)
467                    print(personCode, 'Duplicate Sponsor Found with same Code! Omitted entry. ')
468                    print(currentSponsorCode, 'vs', code)
469
470            if codeCheck == False and nameCheck == False:
471                csv_3_row = [
472                    currentSponsorCode,
473                    currentSponsorName,
474                    '0', #not null, integer(11)
475                    '-1', #not null, integer(11)
476                    '',
477                    '',
478                    'no', #deleted
479                    '', #email
480                    '', #telephone_1
481                    '', #telephone_2
482                    '', #fax
483                    '', #address
484                    '', #contact_person1
485                    '' #contact_person2
486                ]
487                # Write row to file:
488                csv_3.writerow(csv_3_row)
489                sponsorCodes.append(currentSponsorCode)
490                sponsorNames.append(currentSponsorName)
491                sponsorEntry = [currentSponsorCode, currentSponsorName]
492                sponsorEntries.append(sponsorEntry)
493                AssignSponsorCode(code)
494                AssignSponsorName(name)
495                print(currentSponsorCode, 'vs', code)
496                print(personCode, 'Contained full unique sponsor, added sponsor to file.')
497
498        if input_row['employer_code'] == '' and input_row['employer_name'] != '':
499            name = input_row['employer_name']
500            firstCode = GetSponsorCode(name)
501            if firstCode != '':
502                AssignSponsorCode(firstCode)
503                AssignSponsorName(name)
504                print(personCode, 'Excluded Duplicate Sponsor Entry with Name:', name, 'it linked with:', firstCode)
505                print(currentSponsorCode, 'vs', code)
506            if firstCode == '' or firstCode == None:
507                ClearSponsorCode()
508                AssignSponsorName(name)
509                print(personCode, 'Excluded Duplicate Sponsor Entry with Name:', name, 'as it had no identifying code.')
510                print(currentSponsorCode, 'vs', code)
511
512        if input_row['employer_code'] != '' and input_row['employer_name'] == '':
513            rawcode = input_row['employer_code']
514            codeType = type(rawcode)
515            code = ''
516            if codeType == 'int':
517                code = str(code)
518            else:
519                code = code
520            codeCheck = SponsorCodeDupeCheck(code)
521            # Get generated code:
522            randomString = id_generator()
523            # Add a cap which makes reporting easy in MySQL:
524            cap = 'MISS'
525            # Concatenate cap and generated name:
526            name = cap + randomString
527            if codeCheck == False:
528                if code == '1006' or code == 1006:
529                    logEntry = 'KER-SEE-1006', '\n'
530                    logString =  ''.join(logEntry)
531                    log.write(logString)
532
533                csv_3_row = [
534                    code,
535                    name,
536                    '0', #not null, integer(11)
537                    '-1', #not null, integer(11)
538                    '',
539                    '',
540                    'no', #deleted
541                    '', #email
542                    '', #telephone_1
543                    '', #telephone_2
544                    '', #fax
545                    '', #address
546                    '', #contact_person1
547                    '' #contact_person2
548                ]
549                # Write row to file:
550                csv_3.writerow(csv_3_row)
551                sponsorCodes.append(code)
552                AssignSponsorCode(code)
553                AssignSponsorName(name)
554                print(currentSponsorCode, 'vs', code)
555            else:
556                AssignSponsorCode(code)
557                AssignSponsorName(name)
558                print(personCode, 'Excluded Sponsor Entry without Name, as its code is a duplicate')
559                print(currentSponsorCode, 'vs', code)
560
561        if input_row['employer_code'] == '' and input_row['employer_name'] == '':
562            ClearSponsorCode()
563            ClearSponsorName()
564            print(personCode, 'No sponsor here! Ignoring...')
565
566        #########################
567        # End of Sponsor Parse! #
568        #########################
569
570        ##############################
571        # Beginning of Persons Parse #
572        ##############################
573
574        # Row Identity for People
575        # A lot of information will be missing.. but insert anyway.
576        # The person_code will tie things together, so DO NOT fail to insert
577        # EVEN IF a person_code is ALL you have! - they STILL EXIST!
578        #global CurrentSponsorCode
579        csv_1_row = [
580            input_row['unit_holder_id'],
581            input_row['ssnit_no'],
582            currentSponsorCode,
583            input_row['first_name'],
584            input_row['midName'],
585            input_row['other_names'],
586            '', #maiden name
587            input_row['surname'],
588            '', #last_modified_by
589            '', #last_modified_date
590            'no', #deleted
591            input_row['employmentStatus'],
592            input_row['DOB'],
593            input_row['phone'],
594            input_row['phone2'],
595            input_row['email'],
596            input_row['userAddress'],
597            input_row['userAddress1'],
598            input_row['userCity'],
599            input_row['userCountry'],
600            input_row['maritalStatus'],
601            input_row['gender'],
602            input_row['fatherFName'],
603            input_row['fatherLName'],
604            input_row['fatherAdd'],
605            input_row['fatherCity'],
606            input_row['fatherCountry'],
607            input_row['fatherPhone'],
608            input_row['motherFName'],
609            input_row['motherLName'],
610            '', #maiden name (mother)
611            input_row['motherAdd'],
612            input_row['motherCity'],
613            input_row['motherCountry'],
614            input_row['motherPhone'],
615            input_row['Nationality']
616        ]
617
618        # Remove spaces for insertion in to persons array, not output to CSV:
619        first_name = input_row['first_name'].replace(" ", "")
620        surname = input_row['surname'].replace(" ", "")
621
622        # Create a unique identifier based on these values:
623        fullPerson = input_row['ssnit_no'] + surname + first_name
624        # Extract the person_code:
625        personCode = input_row['unit_holder_id']
626        unverifiedCode = currentSponsorCode
627        sponsorName = currentSponsorName
628
629        #if unverifiedCode == '' or unverifiedCode == None:
630        #    code = 'missing'
631        #    AssignSponsorCode(code)
632        #else:
633        #    AssignSponsorCode(unverifiedCode)
634        #    #sponsorCode = unverifiedCode
635
636        print(currentSponsorCode)
637
638        # Check array of persons for unique:
639        # for pcode, ucode in persons.items():
640
641        # If this is a duplicate, mark it as so and write to duplicates file.
642        fullAccount = currentSponsorCode + surname + first_name
643        personDupeCheck = GetDuplicateByFullPerson(fullPerson) # this checks to see if another entry with the same ssnit, first name, and last name exists already
644        accountDupeCheck = GetDuplicateByAccount(fullAccount)  # this checks to see if another entry with the same sponsor, first name, and last name exists already
645
646        # If the person and the account are duplicates:
647        if personDupeCheck == True and accountDupeCheck == True:
648            logEntry = 'Duplicate Person and Account: personCode: ', personCode, 'schemeCode:', currentSponsorCode, 'unique:', fullPerson, '\n'
649            logString =  ''.join(logEntry)
650            log.write(logString)
651            #print("Duplicate: personCode: ", lastCode, "/", personCode, "- details logged.")
652            appendValue = [personCode, fullPerson]
653            dupes.append(appendValue)
654            dupeCodes.append(personCode)
655            dupeCodes.append(personDupeCheck)
656            dupeUniques.append(fullPerson)
657            # Write to People.csv:
658            csv_1.writerow(csv_1_row)
659
660        # If this is a duplicate person, but the account is new:
661        if personDupeCheck == True and accountDupeCheck == False:
662            OriginalCode = GetFirstPersonCode(fullPerson)
663            logEntry = 'New Account found for a duplicate person: personCode: ', personCode, 'vs', OriginalCode, 'SponsorsCode:', currentSponsorCode, 'unique:', fullPerson, '\n'
664            logString =  ''.join(logEntry)
665            log.write(logString)
666            appendValue = [OriginalCode, personCode]
667            accountLinks.append(appendValue)
668            # Add the personCode from this row to an array(list) of new accounts:
669            newAccounts.append(personCode)
670            accountUniques.append(fullAccount)
671            # Write to People.csv:
672            csv_1.writerow(csv_1_row)
673
674        # If this is an original person, with an original account:
675        if personDupeCheck == False and accountDupeCheck == False:
676            logEntry = 'Original Person and account found for: personCode: ', personCode, 'schemeCode:', currentSponsorCode, 'unique:', fullPerson, '\n'
677            logString =  ''.join(logEntry)
678            log.write(logString)
679            appendValue = [personCode, fullPerson, currentSponsorCode]
680            persons.append(appendValue)
681            personUniques.append(fullPerson)
682            accountUniques.append(fullAccount)
683            # Write to People.csv:
684            csv_1.writerow(csv_1_row)
685
686        # If this is an original person, with a duplicate account:
687        if personDupeCheck == False and accountDupeCheck == True:
688            logEntry = 'Original Person with duplicate account found for: personCode: ', personCode, 'schemeCode:', currentSponsorCode, 'unique:', fullPerson, '\n'
689            logString =  ''.join(logEntry)
690            log.write(logString)
691            appendValue = [personCode, fullPerson]
692            dupes.append(appendValue)
693            dupeCodes.append(personCode)
694            dupeUniques.append(fullPerson)
695            # Write to People.csv:
696            csv_1.writerow(csv_1_row)
697
698
699
700        ########################
701        # End of Persons Parse #
702        ########################
703
704
705
706
707
708        ###############################
709        # Beginning of Account Parse! #
710        ###############################
711
712        unit_holder = input_row['unit_holder_id']
713        unverifiedCode = input_row['employer_code']
714        sponsorName = input_row['employer_name']
715
716        sponsorCode = ''
717
718        if unverifiedCode in sponsorDupes:
719            for entry in sponsorLinks:
720                entryCode = entry[0]
721                entryName = entry[1]
722                if entryName == sponsorName and entryCode != unverifiedCode:
723                    sponsorCode = entryCode
724                    logEntry = 'Account Sponsor Code Updated! (from): Sponsors Code: ', unverifiedCode, ' to: ', entryCode, ' as: ', sponsorName, ' = ', entryName, '\n'
725                    logString =  ''.join(logEntry)
726                    log.write(logString)
727                    # Row identity for Accounts:
728                    csv_2_row = [
729                        unit_holder,
730                        sponsorCode,
731                        ThisScheme, # this is the scheme_code!! replace with input_row if supplied.
732                        '',
733                        '',
734                        '',
735                        input_row['IndustryCode'],
736                        input_row['payRate'],
737                        input_row['averagePay'],
738                        input_row['ssnit_no']
739                    ]
740
741                    # Write (data) to Accounts.csv:
742                    csv_2.writerow(csv_2_row)
743
744                if entryName == sponsorName and entryCode == unverifiedCode:
745                    sponsorCode = unverifiedCode
746                    # Row identity for Accounts:
747                    csv_2_row = [
748                        unit_holder,
749                        sponsorCode,
750                        ThisScheme, # this is the scheme_code!! replace with input_row if supplied.
751                        '',
752                        '',
753                        '',
754                        input_row['IndustryCode'],
755                        input_row['payRate'],
756                        input_row['averagePay'],
757                        input_row['ssnit_no']
758                    ]
759
760                    # Write (data) to Accounts.csv:
761                    csv_2.writerow(csv_2_row)
762                else:
763                    continue
764
765        else:
766            verifiedCode = unverifiedCode # it is not in duplicates, so it is automatically 'verified'
767            logEntry = 'Original sponsor found and added to account: Sponsors Code: ', unverifiedCode, ' Name: ', sponsorName, '\n'
768            logString =  ''.join(logEntry)
769            log.write(logString)
770            # Row identity for Accounts:
771            csv_2_row = [
772                unit_holder,
773                verifiedCode,
774                ThisScheme, # this is the scheme_code!! replace with input_row if supplied.
775                '',
776                '',
777                '',
778                input_row['IndustryCode'],
779                input_row['payRate'],
780                input_row['averagePay'],
781                input_row['ssnit_no']
782            ]
783
784            # Write (data) to Accounts.csv:
785            csv_2.writerow(csv_2_row)
786        ####################################################
787        ####################################################
788        # KR _ NOTE: this needs to check for sponsorLinks!!
789        # find link (if exists) and insert it instead if found!
790        ####################################################
791        ####################################################
792
793
794        # Row identity for Sponsors
795        # Region code is replaced with a value for 'unknown' in the database if empty
796
797
798
799        # Start of ID data parse. As IDType is a different column than ssnit, both are checked separately, and inserted separately.
800        if input_row['IDType'] == 'Drivers License':
801           input_row['IDType'] = '2'
802
803        if input_row['IDType'] == 'Passport':
804           input_row['IDType'] = '1'
805
806        if input_row['IDType'] == 'Voters ID':
807           input_row['IDType'] = '4'
808
809        # Create row for ID:
810        id_row = [
811           input_row['unit_holder_id'],
812           input_row['IDType'],
813           input_row['IDNum']
814        ]
815
816        # Write row to ID file:
817        csv_5.writerow(id_row)
818        ClearSponsorCode()
819        ClearSponsorName()
820
821    # Close the people_with_dupes.csv file so it can be opened for reference to duplicates:
822    csv_output_file1.close()
823    csv_output_file2.close()
824    csv_output_file5.close()
825
826    # Open 'original' files created in read mode:
827    input_persons = open(file1, 'r')
828    input_accounts = open(file2, 'r')
829    input_cashflow = open(file8, 'r')
830    input_ids = open(file5, 'r')
831
832    # Open the new files in /clean and /dirty in write mode:
833    persons_output = open(file7, 'w', newline='')
834    cashflow_clean = open(file9, 'w', newline='')
835    cashflow_dirty = open(file12, 'w', newline='')
836    ids_clean = open(file15, 'w', newline='')
837    ids_dirty = open(file14, 'w', newline='')
838    accounts_clean = open(file10, 'w', newline='')
839    accounts_dirty = open(file16, 'w', newline='')
840
841    # Create a reader for each of the input files:
842    csv_input_persons = csv.DictReader(input_persons, delimiter=',')
843    csv_input_cashflow = csv.DictReader(input_cashflow, delimiter=',')
844    csv_input_ids = csv.DictReader(input_ids, delimiter=',')
845    csv_input_accounts = csv.DictReader(input_accounts, delimiter=',')
846
847    # Create a writer for each of the new files:
848    persons_csv = csv.writer(persons_output)
849    cash_csv_c = csv.writer(cashflow_clean)
850    cash_csv_d = csv.writer(cashflow_dirty)
851    ids_csv_c = csv.writer(ids_clean)
852    ids_csv_d = csv.writer(ids_dirty)
853    accounts_csv_c = csv.writer(accounts_clean)
854    accounts_csv_d = csv.writer(accounts_dirty)
855
856    # Headers for Accounts:
857    account_headers = [
858        'person_code',
859        'sponsor_code',
860        'scheme_code',
861        'description',
862        'last_modified_by',
863        'last_modified_date',
864        'income_structure',
865        'average_pay',
866        'ssno'
867    ]
868    # Write the headers to files:
869    accounts_csv_c.writerow(account_headers)
870    accounts_csv_d.writerow(account_headers)
871
872    # For each row in the accounts table:
873    for input_row in csv_input_accounts:
874        # Get the person_code:
875        thisPerson = input_row['person_code']
876        # Write the row:
877        if thisPerson in newAccounts:
878            #for original, new in accountLinks.items():
879            for link in accountLinks:
880                # Old refers to the original person_code
881                old = link[0]
882                # New refers to a person/account to be replaced by original:
883                new = link[1]
884                # If the new code is equal to the current one:
885                if new == input_row['person_code']:
886                    # Create the row:
887                    account_row = [
888                        old,
889                        input_row['sponsor_code'],
890                        input_row['scheme_code'],
891                        input_row['description'],
892                        input_row['last_modified_by'],
893                        input_row['last_modified_date'],
894                        input_row['income_structure'],
895                        input_row['average_pay'],
896                        input_row['ssno']
897                    ]
898                    # Identify the sponsor/employer (for logging purposes):
899                    sponsor = input_row['sponsor_code']
900
901                    # If they're a duplicate:
902                    if thisPerson in dupeCodes:
903                        # Write the row:
904                        accounts_csv_d.writerow(account_row)
905                        # Show the user what we've done:
906                        print('New Account for Duplicate: |', thisPerson, '| moved to dirty file.' )
907                        # Create and append a log entry:
908                        logEntry = 'Found new account for a Duplicate: PersonCode (old/new): ', old, '/', new, 'SponsorCode:', sponsor, ', moved to Dirty file.', '\n'
909                        logString =  ''.join(logEntry)
910                        log.write(logString)
911                    # If they aren't a duplicate:
912                    else:
913                        # Write the row:
914                        accounts_csv_c.writerow(account_row)
915                        # Show the user what we've done:
916                        print('New Account for Original: |', thisPerson, '| moved to clean file.' )
917                        # Create and append a log entry:
918                        logEntry = 'Found new account for a person: PersonCode (old/new): ', old, '/', new, 'SponsorCode:', sponsor, ', moved to Clean file.', '\n'
919                        logString =  ''.join(logEntry)
920                        log.write(logString)
921                # This continues the loop around the array
922                else:
923                    continue
924        # If this is not a new account:
925        else:
926            # Create the row:
927             account_row = [
928                 input_row['person_code'],
929                 input_row['sponsor_code'],
930                 input_row['scheme_code'],
931                 input_row['description'],
932                 input_row['last_modified_by'],
933                 input_row['last_modified_date'],
934                 input_row['income_structure'],
935                 input_row['average_pay'],
936                 input_row['ssno']
937             ]
938             # If they're a duplicate:
939             if thisPerson in dupeCodes:
940                 # Write the row to dirty file:
941                 accounts_csv_d.writerow(account_row)
942                 # Show the user what we've done:
943                 print('Single account for Duplicate: |', thisPerson, '| moved to dirty file.' )
944                 # Create and append a log entry:
945                 logEntry = 'Found single account for a duplicate: PersonCode: ', thisPerson, 'SchemeCode:', sponsorCode, ', moved to Dirty file.', '\n'
946                 logString =  ''.join(logEntry)
947                 log.write(logString)
948             # If they aren't a duplicate:
949             else:
950                 # Write the row to clean file:
951                 accounts_csv_c.writerow(account_row)
952                 # Show the user what we've done:
953                 print('Single account for Original: |', thisPerson, '| moved to clean file.' )
954                 # Create and append a log entry:
955                 logEntry = 'Found single account for an individual: ', thisPerson, 'SchemeCode:', sponsorCode, ', moved to Clean file.' ,'\n'
956                 logString =  ''.join(logEntry)
957                 log.write(logString)
958
959    # Headers for ID file
960    id_headers = [
961        'person_code',
962        'id_type',
963        'id_number'
964    ]
965    # Write the headers to files:
966    ids_csv_c.writerow(id_headers)
967    ids_csv_d.writerow(id_headers)
968
969    # For each row in the IDs.csv:
970    for input_row in csv_input_ids:
971        # Get the person code and ID number:
972        thisPerson = input_row['person_code']
973        thisNumber = input_row['id_number']
974
975        # If the ID number is not empty, continue:
976        if thisNumber != '':
977            if thisPerson in newAccounts:
978                #for original, new in accountLinks.items():
979                for link in accountLinks:
980                    old = link[0]
981                    new = link[1]
982                    if new == input_row['person_code']:
983                        # Create the row:
984                        id_row = [
985                            old,
986                            input_row['id_type'],
987                            input_row['id_number']
988                        ]
989                        # If this is a duplicate, write to dirty file:
990                        if thisPerson in dupeCodes:
991                            ids_csv_d.writerow(id_row)
992                            print('ID Info for a Duplicate with Person Code w/New Account: |', thisPerson, '| moved to dirty file.' )
993                        # If this is not a duplicate, write to clean file:
994                        else:
995                            ids_csv_c.writerow(id_row)
996                            print('ID Info for an Individual w/ Person Code w/New Account: |', thisPerson, '| moved to clean file.' )
997                    else:
998                        continue
999            else:
1000                # Create the row:
1001                id_row = [
1002                    input_row['person_code'],
1003                    input_row['id_type'],
1004                    input_row['id_number']
1005                ]
1006                if thisPerson in dupeCodes:
1007                    ids_csv_d.writerow(id_row)
1008                    print('ID Info for a Duplicate with Person Code: |', thisPerson, '| moved to dirty file.' )
1009                # If this is not a duplicate, write to clean file:
1010                else:
1011                    ids_csv_c.writerow(id_row)
1012                    print('ID Info for an Individual w/ Person Code: |', thisPerson, '| moved to clean file.' )
1013        # If the ID number is empty, continue process, don't insert:
1014        else:
1015            continue
1016
1017
1018    # Headers for the Cashflow files:
1019    cashflow_headers = [
1020        'person_code',
1021        'sponsor_code',
1022        'scheme_code',
1023        'cashflow_type',
1024        'ledger_date',
1025        'value_date',
1026        'amount',
1027        'deleted',
1028        'last_modified_by',
1029        'last_modified_date'
1030    ]
1031    # Write the headers:
1032    cash_csv_c.writerow(cashflow_headers)
1033    cash_csv_d.writerow(cashflow_headers)
1034    # For each row in the cashflow file:
1035    for input_row in csv_input_cashflow:
1036        # Get the person_code:
1037        thisPerson = input_row['person_code']
1038        # Create the row:
1039        if thisPerson in newAccounts:
1040            #for original, new in accountLinks.items():
1041            for link in accountLinks:
1042                old = link[0]
1043                new = link[1]
1044                if new == input_row['person_code']:
1045                    # Create the row:
1046                    cashrow = [
1047                        old,
1048                        input_row['sponsor_code'],
1049                        input_row['scheme_code'],
1050                        input_row['cashflow_type'],
1051                        input_row['ledger_date'],
1052                        input_row['value_date'],
1053                        input_row['amount'],
1054                        'no', # deleted
1055                        '', #last_modified_by
1056                        ''  #last_modified_date
1057                    ]
1058                    # If this is a duplicate, write to dirty file:
1059                    if thisPerson in dupeCodes:
1060                        cash_csv_d.writerow(cashrow)
1061                        print('Cashflow Info for Duplicate w/new account: |', thisPerson, '| moved to dirty file.' )
1062                        break
1063                    # If this is not a duplicate, write to clean file:
1064                    else:
1065                        cash_csv_c.writerow(cashrow)
1066                        print('Cashflow Info for Original w/new account:  |', thisPerson, '| moved to clean file.' )
1067                        break
1068                else:
1069                    continue
1070
1071        else:
1072            cashrow = [
1073                input_row['person_code'],
1074                input_row['sponsor_code'],
1075                input_row['scheme_code'],
1076                input_row['cashflow_type'],
1077                input_row['ledger_date'],
1078                input_row['value_date'],
1079                input_row['amount'],
1080                'no', # deleted
1081                '', #last_modified_by
1082                ''  #last_modified_date
1083            ]
1084            # If this is a duplicate, write to dirty file:
1085            if thisPerson in dupeCodes:
1086                cash_csv_d.writerow(cashrow)
1087                print('Cashflow Info for Duplicate w/one account: |', thisPerson, '| moved to dirty file.' )
1088            # If this is not a duplicate, write to clean file:
1089            else:
1090                cash_csv_c.writerow(cashrow)
1091                print('Cashflow Info for Original  w/one account: |', thisPerson, '| moved to clean file.' )
1092
1093    # Headers for Persons.csv:
1094    person_headers = [
1095        'person_code',
1096        'first_name',
1097        'mid_name',
1098        'other_names',
1099        'maiden_name',
1100        'last_name',
1101        'last_modified_by',
1102        'last_modified_date',
1103        'deleted',
1104        'employment_status',
1105        'dob',
1106        'phone1',
1107        'phone2',
1108        'email',
1109        'address1',
1110        'address2',
1111        'city',
1112        'country',
1113        'marital_status',
1114        'gender',
1115        'father_first_name',
1116        'father_last_name',
1117        'father_address',
1118        'father_city',
1119        'father_country',
1120        'father_phone',
1121        'mother_first_name',
1122        'mother_last_name',
1123        'mother_maiden_name',
1124        'mother_address', # he has address and maiden name place reversed
1125        'mother_city',
1126        'mother_country',
1127        'mother_phone',
1128        'nationality'
1129    ]
1130    # Write the headers:
1131    persons_csv.writerow(person_headers)
1132    csv_6.writerow(person_headers)
1133    # For each row in the persons_with_dupes file:
1134    for input_row in csv_input_persons:
1135        # Remove spaces for insertion in to persons array, not output to CSV:
1136        if input_row['last_name'] == '':
1137           input_row['last_name'] = 'missing'
1138
1139        if input_row['first_name'] == '':
1140           input_row['first_name'] = 'missing'
1141
1142        first_name = input_row['first_name'].replace(" ", "")
1143        surname = input_row['last_name'].replace(" ", "")
1144
1145        # Create a unique identifier based on these values:
1146        fullPerson = input_row['ssnit'] + surname + first_name
1147        thisPerson = input_row['person_code']
1148        # If this is a duplicate, create row and write to dirty file:
1149        if thisPerson in newAccounts:
1150            #for original, new in accountLinks.items():
1151            for link in accountLinks:
1152                old = link[0]
1153                new = link[1]
1154                if new == input_row['person_code']:
1155                    if fullPerson in dupeUniques:
1156                        # Write the duplicate to the duplicates.csv file:
1157                        print('Duplicate Person Omitted, recorded new account with link to:', old )
1158                        logEntry = 'Omitted entry of person with duplicates for:', new, ' as it exists in:', old, '\n'
1159                        logString =  ''.join(logEntry)
1160                        log.write(logString)
1161                        break
1162                    # If this is not a duplicate, write to clean file:
1163                    else:
1164                        print('Person record omitted, new account recorded with link to:', old )
1165                        logEntry = 'Omitted entry of person with multiple accounts for:', new, ' as it exists in:', old, '\n'
1166                        logString =  ''.join(logEntry)
1167                        log.write(logString)
1168                        break
1169                else:
1170                    continue
1171        else:
1172            person_row = [
1173                input_row['person_code'],
1174                input_row['first_name'],
1175                input_row['mid_name'],
1176                input_row['other_names'],
1177                input_row['maiden_name'],
1178                input_row['last_name'],
1179                input_row['last_modified_by'],
1180                input_row['last_modified_date'],
1181                input_row['deleted'],
1182                input_row['employment_status'],
1183                input_row['dob'],
1184                input_row['phone1'],
1185                input_row['phone2'],
1186                input_row['email'],
1187                input_row['address1'],
1188                input_row['address2'],
1189                input_row['city'],
1190                input_row['country'],
1191                input_row['marital_status'],
1192                input_row['gender'],
1193                input_row['father_first_name'],
1194                input_row['father_last_name'],
1195                input_row['father_address'],
1196                input_row['father_city'],
1197                input_row['father_country'],
1198                input_row['father_phone'],
1199                input_row['mother_first_name'],
1200                input_row['mother_last_name'],
1201                input_row['mother_maiden_name'],
1202                input_row['mother_address'],
1203                input_row['mother_city'],
1204                input_row['mother_country'],
1205                input_row['mother_phone'],
1206                input_row['nationality']
1207            ]
1208            if fullPerson in dupeUniques:
1209                # Write the duplicate to the duplicates.csv file:
1210                csv_6.writerow(person_row)
1211                print('Duplicate Person w/single account using ref# |', input_row['person_code'], '| moved to dirty file.' )
1212            # If this is not a duplicate, write to clean file:
1213            else:
1214                persons_csv.writerow(person_row)
1215                print('Single Person w/single account using ref#    |', input_row['person_code'], '| moved to clean file.' )
1216
1217    # When done, close all of our files:
1218    csv_output_file1.close()
1219    csv_output_file2.close()
1220    csv_output_file3.close()
1221    csv_output_file5.close()
1222    csv_output_file6.close()
1223    persons_output.close()
1224    csv_input_file.close()
1225    cashflow_dirty.close()
1226    cashflow_clean.close()
1227    ids_dirty.close()
1228    ids_clean.close()
1229    accounts_dirty.close()
1230    accounts_clean.close()
1231    log.close()
1232    done = time.time()
1233    elapsed = done - start
1234    print("Completed in:", elapsed)
1235
1236    return "File splice completed. Proceed to import stage."
1237
1238
# Script entry point: only runs when this file is executed directly.
# Passes the module-level file names to split_csv() and prints the status
# message it returns.
if __name__ == '__main__':
    success = split_csv(
        input_path=file0,
        output_path1=file1,
        output_path2=file2,
        output_path3=file3,
        output_path5=file5,
        output_path6=file6,
    )
    print(success)