5ca8TCd5

· 7 years ago · Nov 06, 2018, 04:34 AM
1"""
2Python script for turning semi-structured text files into a csv file. Requires there to be somewhat consistent headings and subheadings through out out the text. The script uses these subheadings to create columns of data and populates the columns with the text in between subheadings 
3Ulysses Pascal
4September 28 2018
5"""
6
7import re
8import pandas as pd
9import email
10import numpy as np
11
12
13###############
14# Configuration
15###############
16
17# Set your input text file here
18input_filename = "data/001_Input_Data/1996ExchangeBinder.txt"
19
20# Set your output file path here. (note: make sure a the folder exists)
21output_path = 'data/002_Text_Parser_Data/parsed-exchanges.csv'
22
23
24###############
25# Set variables
26###############
27
28exchanges = []
29text_raw = open(input_filename, "r").read()
30#text_raw = open(r"1996ExchangeBinder.txt", "r").read()
31headers_dict = {}
32entry = None #Set dummy variable to be replaced by paragraph level text later
33
34### construct a dictionary of headers using this format for each row: 
35#	headers_dict = {
36#		'Header1': '(Header1 Spelling 1)|(Header1 Spelling 2)|(Header1 Spelling 3)',
37#		'Header2': '(Header2 Spelling 2)|(Header2 Spelling 2)|(Header2 Spelling 3)',
38#	}
39
40headers_dict = {
41	'Brief history': '(Brief History)|(Brief history)|(Briefhistory)',
42	'Principal officers': '(Principalofficers)|(Principal Officers)|(Principal officers)',
43	'Structure':'(Structure)',
44	'Market capitalisation':'(Market capitalisation)',
45	'Trading hours':'(Trading hours)|(Tradinghours)',
46	'Main indices':'(Main indices)|(Mainindices)',
47	'Securities traded':'(Securities traded)',
48	'Trading system':'((Trading system)|(Trading System))',
49	'Settling and clearing':'(Settling and clearing)',
50	'Commission rates and other client costs':'(Commission rates and otherclient costs)|(Commission rates and other client costs)',
51	'Taxation and regulations affecting foreign investors':'(Taxation and regulations affecting foreign investors)',
52	'Exchange holiday schedule':'(Exchange holiday schedule)',
53	'Trade volume statistics':'(Trade volume statistics)|(Trade volume)',
54	'Prospective developments':'(Prospectivedevelopments)|(Prospective developments)',
55	'Futures contracts':'(Futures contracts)',
56	'Copyright Mondo Visione': '(Copyright Mondo Visione)',
57
58	
59}
60
61
62
63###############
64# Format text
65###############  cd
66text_formated = re.sub("\s\s+"," ", text_raw) # Remove extra spaces
67
68text_formated = re.sub("<P>"," ", text_formated) # Remove paragraph tags
69text_formated = re.sub("<\/P>"," ", text_formated) # Remove paragraph tags
70	
71text_formated = re.sub("<TD>"," ", text_formated) # Remove table tags
72text_formated = re.sub("<\/TD>"," ", text_formated) # Remove table tags
73text_formated = re.sub("<TD\/>"," ", text_formated) # Remove table tags
74	
75text_formated = re.sub("<Table>"," ", text_formated) # Remove table tags
76text_formated = re.sub("<\/Table>"," ", text_formated) # Remove table tags
77	
78text_formated = re.sub("<TR>"," ", text_formated) # Remove table tags
79text_formated = re.sub("<\/TR>"," ", text_formated) # Remove table tags
80	
81text_formated = re.sub("<TH>"," ", text_formated) # Remove table tags
82text_formated = re.sub("<\/TH>"," ", text_formated) # Remove table tags
83
84text_formated = re.sub("<ImageData"," ", text_formated) # Remove image tags
85text_formated = re.sub("jpg\"\/>","jpg\"", text_formated) # Remove image tags
86
87text_formated = re.sub("<Figure>"," ", text_formated) # Remove figure tags
88text_formated = re.sub("</Figure>"," ", text_formated) # Remove figure tags
89
90text_formated = re.sub("<Sect>"," ", text_formated) # Remove figure tags
91text_formated = re.sub("</Sect>"," ", text_formated) # Remove figure tags
92
93text_formated = re.sub("\s\s+"," ", text_formated) # Remove spaces again Just in case 
94
95
96### You have to fix this: (instead of wrapping header with <h2> </h2> you need to wrap one of the miss spellings
97#for header in headers_dict:
98#	text_formated = re.sub("{}".format(header), "<H2>{}</H2>".format(header), text_formated) #Replace headers with tagged headers
99	
100
101for key, value in headers_dict.items():
102	text_formated = re.sub("{}".format(value), "<H2>{}</H2>".format(key), text_formated)
103
104
105	
106#print(text_formated) #test if format output is correct
107
108###############
109# Parse Text
110###############
111	
112contents = re.split(r"<Part>",text_formated)
113contents.pop(0)
114
115for item in contents:
116	#print(item)
117	# Parse Main Headers
118	for key, value in headers_dict.items():
119		regex = re.search(r"<H2>{}<\/H2>( *)(.*?)( *)((<\/Part>)|(<H2>))".format(key), item) #Searches for text between the end of an H2 
120		if regex is not None:
121			entry = regex.group(2)
122			headers_dict[key] = entry
123		else:
124			entry = 'NA'
125			headers_dict[key] = entry
126	
127	# Add Date Founded
128	brief_history = str(headers_dict['Brief history'])
129	year = re.findall(r'([1-3][0-9]{3})', brief_history)
130	if year:
131		earliest_year = min(int(s) for s in year)
132		headers_dict['Date Founded'] = earliest_year
133	else:
134		headers_dict['Date Founded'] = 'NA'
135		
136	# Contains automated, computerized etc
137	trading_system = str(headers_dict['Trading system'])
138	automated = re.findall(r'(automa|comput|electr)', trading_system)
139	if automated:
140		headers_dict['Automated'] = 1
141	else:
142		headers_dict['Automated'] = 0
143	
144	
145	trading_system = str(headers_dict['Trading system'])
146	automated = re.findall(r'(automa)', trading_system)
147	computerised = re.findall(r'(comput)', trading_system)
148	electronic = re.findall(r'(electr|OTC|counter)', trading_system)
149	floor = re.findall(r'(floor)', trading_system)
150	
151	if automated:
152		headers_dict['Automated'] = 1
153	else:
154		headers_dict['Automated'] = 0
155	
156	
157	
158	
159	
160	## Irregular section categories 
161	exchange_dict = {
162		'Exchange Name': re.search(r"<H5>(.*?)<\/H5>", item).group(1), 
163		'Address': re.search(r"Visione (.*?)Telephone", item).group(1)
164		}
165		
166
167	
168
169	#exchange_dict = dict(metadata_dict.items() + headers_dict.items())
170
171	exchange_dict.update(headers_dict) 
172
173	exchanges.append(exchange_dict) #sets up dictionary for dataframe
174
175
176#Dataframe
177exchanges_df = pd.DataFrame(exchanges)
178#pd.DataFrame.head(exchanges_df, n=5)
179#exchanges_df.to_csv('data/test.csv', encoding='utf-8', index = True) #Export DataFrame to CSV
180exchanges_df.to_csv(output_path, encoding='utf-8', index = True) #Export DataFrame to CSV