X8vrun71

· 6 years ago · May 31, 2019, 06:16 AM
1## Import modules
2import os
3from optparse import OptionParser
4import sys
5import subprocess	
6import pyodbc
7import textract
8import time
9
10reload(sys)
11sys.setdefaultencoding('utf8')
12
13## Command line options
14parser = OptionParser(usage="usage: %prog [options]", version="%prog 1.1")
15parser.add_option(
16	"-v"
17	, "--verbose"
18	, action="store_true"
19	, dest="verbose"
20	, default=False
21	, help="Sets verbose output (slows down execution by 50%)")
22parser.add_option(
23	"-u"
24	, "--upload"
25	, action="store_true"
26	, dest="upload"
27	, default=False
28	, help="Auto uploads cached text to Azure SQL Database")
29parser.add_option(
30    "-c"
31    , "--container-name"
32    , dest="container_name"
33    , help="Sets the container name to download documents from")
34parser.add_option(
35    "-d"
36    , "--database-name"
37    , dest="database_name"
38    , help="Sets the database name to upload to")
39parser.add_option(
40    "-s"
41    , "--skip"
42    , action="store_true"
43    , dest="skip"
44    , default=False
45    , help="skips creation")
46
47
48(options, args) = parser.parse_args()
49
50## Azure SQL Server Connection
51driver 		= '{ODBC Driver 17 for SQL Server}'
52cnxn 	= pyodbc.connect('DRIVER='+driver+';SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password)
53cursor 	= cnxn.cursor()
54
55## Global variables
56path_ScriptDir 		= os.path.dirname(os.path.realpath(__file__))	    		    ## Script path
57folder_documents 	= "/blob_files/{0}".format(options.container_name)   ## Folder containing document files
58folder_scripts 		= "{0}/script_files/".format(path_ScriptDir)			    ## Folder containing update scripts
59file_unparsed 		= "{0}/unparsed_documents.txt".format(path_ScriptDir)	            ## Unparsed document file list
60file_empty 			= "{0}/empty_documents.txt".format(path_ScriptDir)	            ## Empty document file list
61file_notexist 		= "{0}/notexist_documents.txt".format(path_ScriptDir)	            ## Empty document file list"
62
63## Colour codes for printing 
64class bcolors:
65    HEADER 		= '\033[95m'
66    OKBLUE 		= '\033[94m'
67    OKGREEN 	= '\033[92m'
68    WARNING 	= '\033[93m'
69    FAIL 		= '\033[91m'
70    ENDC 		= '\033[0m'
71    BOLD 		= '\033[1m'
72    UNDERLINE 	= '\033[4m'
73
74## Create verboseprint function
75if options.verbose:
76	def verboseprint(verbose_print):
77		print bcolors.WARNING + verbose_print + bcolors.ENDC
78else:
79	verboseprint = lambda *a: None
80
81## Remove unparsed document file if exists
82if os.path.exists(file_unparsed):
83    os.remove(file_unparsed)
84    print bcolors.OKGREEN + "[+] Unparsed document file removed" + bcolors.ENDC
85
86## Remove empty document file if exists
87if os.path.exists(file_empty):
88    os.remove(file_empty)
89    print bcolors.OKGREEN + "[+] Empty document file removed" + bcolors.ENDC
90
91## Remove notexist document file if exists
92if os.path.exists(file_notexist):
93    os.remove(file_notexist)
94    print bcolors.OKGREEN + "[+] Notexist document file removed" + bcolors.ENDC
95
96query_counter = 0 	## used to limit the amount of queries in one script
97script_counter = 1	## counter of scripts created
98
99## Get document list
100cursor.execute("SELECT d.id ,REPLACE(dT.FolderPath, '\\', '/') + REPLACE(d.path, '\\', '/') FROM Document d INNER JOIN DocumentType dT ON d.DocumentTypeId = dT.Id WHERE BranchID IN ( 7,9,10,14,16,18,20,27,2) AND ( d.path like '%.doc%' OR d.path like '%.pdf') and documentTypeID = 30")
101row = cursor.fetchone()
102
103
104## Begin text extraction loop
105if not options.skip:
106	while row:
107		
108		path = str(folder_documents + row[1].encode('utf-8').strip())
109		path_print = row[1].encode('utf-8').strip()
110
111		if os.path.exists(path.strip()): 
112
113			try:
114				## Call bash command
115				if os.path.splitext(path)[1] == '.docx':
116					text = subprocess.check_output("unzip -p '" + path + "' word/document.xml | sed -e 's/<\\/w:p>/ /g; s/<[^>]\\{1,\\}>/ /g; s/[^[:print:]]\\{1,\\}/ /g'", shell=True, stderr=subprocess.PIPE)
117					text = text.replace("'", "''")
118					text = text.replace(";", "")  
119					text = ' '.join(text.split())
120					## text = text.decode('utf-8').strip()
121				elif os.path.splitext(path)[1] == '.pdf':
122					text = subprocess.check_output("pdftotext '{0}' -".format(path), shell=True, stderr=subprocess.PIPE)
123					text = text.replace("'", "''") 
124					text = text.replace(";", "")  
125					text = ' '.join(text.split())
126					## text = text.decode('utf-8').strip()
127				elif os.path.splitext(path)[1] == '.doc':
128					text = textract.process(path)
129					text = text.replace("'", "''") 
130					text = text.replace(";", "")  
131					text = ' '.join(text.split())
132					## text = text.decode('utf-8').strip()
133				else:
134				 	text = ''  
135
136				if text:
137					if query_counter >= 500:
138						script_counter += 1
139						query_counter = 0
140
141					## Generate SQL Statements
142					## This needs to be able to run against the correct TRIS 5 ID
143					if text != '':
144						with open(folder_scripts + "update_script_{0}_{1}.sql".format(str(script_counter), options.database_name), "a") as update_script:
145							update_script.write("UPDATE Document SET CachedText = '{0}' WHERE ID = {1};".format(text, row[0]) + "\n")
146						query_counter += 1
147					else:
148						verboseprint("[!] Error on {0} - empty document".format(path_print))
149						with open(file_empty, "a") as empty:
150							empty.write(path_print + "\n")
151
152			except:
153				verboseprint("[!] Error on {0} - unparsed".format(path_print))
154
155				## Append to unparsed documents file
156				with open(file_unparsed, "a") as unparsed:
157					unparsed.write(path_print + "\n")
158
159		else:
160			verboseprint("[!] Error on {0} - Does not exist".format(path))
161			with open(file_notexist, "a") as notexist:
162				notexist.write(path_print + "\n")
163		
164		row = cursor.fetchone() ## Get next row
165
166## Run each file
167## Could run this for each file generated instead of after the exraction.
168if options.upload:
169	for script in os.listdir(folder_scripts):
170		if str(options.database_name) in str(script):
171			print bcolors.OKBLUE + "[*] Executing: {0}".format(script) + bcolors.ENDC
172			file = os.path.join(folder_scripts, script)
173			fd = open(file, 'r')
174			sqlFile = fd.read()
175			fd.close()
176			sqlCommands = sqlFile.split(';')
177
178			# Execute every command from the input file
179			for command in sqlCommands:
180			    # This will skip and report errors
181			    # For example, if the tables do not yet exist, this will skip over
182			    # the DROP TABLE commands
183			    try:
184			        cursor.execute(command)
185			        cursor.commit()
186			    except Exception:
187			    	print Exception
188			        pass
189
190			print bcolors.OKBLUE + "[*] Removing: {0}{1}".format(folder_scripts, script) + bcolors.ENDC
191			os.remove(os.path.join(folder_scripts, script))