· 6 years ago · May 31, 2019, 06:16 AM
1## Import modules
2import os
3from optparse import OptionParser
4import sys
5import subprocess
6import pyodbc
7import textract
8import time
9
10reload(sys)
11sys.setdefaultencoding('utf8')
12
13## Command line options
14parser = OptionParser(usage="usage: %prog [options]", version="%prog 1.1")
15parser.add_option(
16 "-v"
17 , "--verbose"
18 , action="store_true"
19 , dest="verbose"
20 , default=False
21 , help="Sets verbose output (slows down execution by 50%)")
22parser.add_option(
23 "-u"
24 , "--upload"
25 , action="store_true"
26 , dest="upload"
27 , default=False
28 , help="Auto uploads cached text to Azure SQL Database")
29parser.add_option(
30 "-c"
31 , "--container-name"
32 , dest="container_name"
33 , help="Sets the container name to download documents from")
34parser.add_option(
35 "-d"
36 , "--database-name"
37 , dest="database_name"
38 , help="Sets the database name to upload to")
39parser.add_option(
40 "-s"
41 , "--skip"
42 , action="store_true"
43 , dest="skip"
44 , default=False
45 , help="skips creation")
46
47
48(options, args) = parser.parse_args()
49
50## Azure SQL Server Connection
51driver = '{ODBC Driver 17 for SQL Server}'
52cnxn = pyodbc.connect('DRIVER='+driver+';SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password)
53cursor = cnxn.cursor()
54
55## Global variables
56path_ScriptDir = os.path.dirname(os.path.realpath(__file__)) ## Script path
57folder_documents = "/blob_files/{0}".format(options.container_name) ## Folder containing document files
58folder_scripts = "{0}/script_files/".format(path_ScriptDir) ## Folder containing update scripts
59file_unparsed = "{0}/unparsed_documents.txt".format(path_ScriptDir) ## Unparsed document file list
60file_empty = "{0}/empty_documents.txt".format(path_ScriptDir) ## Empty document file list
61file_notexist = "{0}/notexist_documents.txt".format(path_ScriptDir) ## Empty document file list"
62
63## Colour codes for printing
64class bcolors:
65 HEADER = '\033[95m'
66 OKBLUE = '\033[94m'
67 OKGREEN = '\033[92m'
68 WARNING = '\033[93m'
69 FAIL = '\033[91m'
70 ENDC = '\033[0m'
71 BOLD = '\033[1m'
72 UNDERLINE = '\033[4m'
73
74## Create verboseprint function
75if options.verbose:
76 def verboseprint(verbose_print):
77 print bcolors.WARNING + verbose_print + bcolors.ENDC
78else:
79 verboseprint = lambda *a: None
80
81## Remove unparsed document file if exists
82if os.path.exists(file_unparsed):
83 os.remove(file_unparsed)
84 print bcolors.OKGREEN + "[+] Unparsed document file removed" + bcolors.ENDC
85
86## Remove empty document file if exists
87if os.path.exists(file_empty):
88 os.remove(file_empty)
89 print bcolors.OKGREEN + "[+] Empty document file removed" + bcolors.ENDC
90
91## Remove notexist document file if exists
92if os.path.exists(file_notexist):
93 os.remove(file_notexist)
94 print bcolors.OKGREEN + "[+] Notexist document file removed" + bcolors.ENDC
95
96query_counter = 0 ## used to limit the amount of queries in one script
97script_counter = 1 ## counter of scripts created
98
99## Get document list
100cursor.execute("SELECT d.id ,REPLACE(dT.FolderPath, '\\', '/') + REPLACE(d.path, '\\', '/') FROM Document d INNER JOIN DocumentType dT ON d.DocumentTypeId = dT.Id WHERE BranchID IN ( 7,9,10,14,16,18,20,27,2) AND ( d.path like '%.doc%' OR d.path like '%.pdf') and documentTypeID = 30")
101row = cursor.fetchone()
102
103
104## Begin text extraction loop
105if not options.skip:
106 while row:
107
108 path = str(folder_documents + row[1].encode('utf-8').strip())
109 path_print = row[1].encode('utf-8').strip()
110
111 if os.path.exists(path.strip()):
112
113 try:
114 ## Call bash command
115 if os.path.splitext(path)[1] == '.docx':
116 text = subprocess.check_output("unzip -p '" + path + "' word/document.xml | sed -e 's/<\\/w:p>/ /g; s/<[^>]\\{1,\\}>/ /g; s/[^[:print:]]\\{1,\\}/ /g'", shell=True, stderr=subprocess.PIPE)
117 text = text.replace("'", "''")
118 text = text.replace(";", "")
119 text = ' '.join(text.split())
120 ## text = text.decode('utf-8').strip()
121 elif os.path.splitext(path)[1] == '.pdf':
122 text = subprocess.check_output("pdftotext '{0}' -".format(path), shell=True, stderr=subprocess.PIPE)
123 text = text.replace("'", "''")
124 text = text.replace(";", "")
125 text = ' '.join(text.split())
126 ## text = text.decode('utf-8').strip()
127 elif os.path.splitext(path)[1] == '.doc':
128 text = textract.process(path)
129 text = text.replace("'", "''")
130 text = text.replace(";", "")
131 text = ' '.join(text.split())
132 ## text = text.decode('utf-8').strip()
133 else:
134 text = ''
135
136 if text:
137 if query_counter >= 500:
138 script_counter += 1
139 query_counter = 0
140
141 ## Generate SQL Statements
142 ## This needs to be able to run against the correct TRIS 5 ID
143 if text != '':
144 with open(folder_scripts + "update_script_{0}_{1}.sql".format(str(script_counter), options.database_name), "a") as update_script:
145 update_script.write("UPDATE Document SET CachedText = '{0}' WHERE ID = {1};".format(text, row[0]) + "\n")
146 query_counter += 1
147 else:
148 verboseprint("[!] Error on {0} - empty document".format(path_print))
149 with open(file_empty, "a") as empty:
150 empty.write(path_print + "\n")
151
152 except:
153 verboseprint("[!] Error on {0} - unparsed".format(path_print))
154
155 ## Append to unparsed documents file
156 with open(file_unparsed, "a") as unparsed:
157 unparsed.write(path_print + "\n")
158
159 else:
160 verboseprint("[!] Error on {0} - Does not exist".format(path))
161 with open(file_notexist, "a") as notexist:
162 notexist.write(path_print + "\n")
163
164 row = cursor.fetchone() ## Get next row
165
166## Run each file
167## Could run this for each file generated instead of after the exraction.
168if options.upload:
169 for script in os.listdir(folder_scripts):
170 if str(options.database_name) in str(script):
171 print bcolors.OKBLUE + "[*] Executing: {0}".format(script) + bcolors.ENDC
172 file = os.path.join(folder_scripts, script)
173 fd = open(file, 'r')
174 sqlFile = fd.read()
175 fd.close()
176 sqlCommands = sqlFile.split(';')
177
178 # Execute every command from the input file
179 for command in sqlCommands:
180 # This will skip and report errors
181 # For example, if the tables do not yet exist, this will skip over
182 # the DROP TABLE commands
183 try:
184 cursor.execute(command)
185 cursor.commit()
186 except Exception:
187 print Exception
188 pass
189
190 print bcolors.OKBLUE + "[*] Removing: {0}{1}".format(folder_scripts, script) + bcolors.ENDC
191 os.remove(os.path.join(folder_scripts, script))