· 7 years ago · Nov 06, 2018, 04:34 AM
1"""
2Python script for turning semi-structured text files into a csv file. Requires there to be somewhat consistent headings and subheadings through out out the text. The script uses these subheadings to create columns of data and populates the columns with the text in between subheadings
3Ulysses Pascal
4September 28 2018
5"""
6
7import re
8import pandas as pd
9import email
10import numpy as np
11
12
13###############
14# Configuration
15###############
16
17# Set your input text file here
18input_filename = "data/001_Input_Data/1996ExchangeBinder.txt"
19
20# Set your output file path here. (note: make sure a the folder exists)
21output_path = 'data/002_Text_Parser_Data/parsed-exchanges.csv'
22
23
24###############
25# Set variables
26###############
27
28exchanges = []
29text_raw = open(input_filename, "r").read()
30#text_raw = open(r"1996ExchangeBinder.txt", "r").read()
31headers_dict = {}
32entry = None #Set dummy variable to be replaced by paragraph level text later
33
34### construct a dictionary of headers using this format for each row:
35# headers_dict = {
36# 'Header1': '(Header1 Spelling 1)|(Header1 Spelling 2)|(Header1 Spelling 3)',
37# 'Header2': '(Header2 Spelling 2)|(Header2 Spelling 2)|(Header2 Spelling 3)',
38# }
39
40headers_dict = {
41 'Brief history': '(Brief History)|(Brief history)|(Briefhistory)',
42 'Principal officers': '(Principalofficers)|(Principal Officers)|(Principal officers)',
43 'Structure':'(Structure)',
44 'Market capitalisation':'(Market capitalisation)',
45 'Trading hours':'(Trading hours)|(Tradinghours)',
46 'Main indices':'(Main indices)|(Mainindices)',
47 'Securities traded':'(Securities traded)',
48 'Trading system':'((Trading system)|(Trading System))',
49 'Settling and clearing':'(Settling and clearing)',
50 'Commission rates and other client costs':'(Commission rates and otherclient costs)|(Commission rates and other client costs)',
51 'Taxation and regulations affecting foreign investors':'(Taxation and regulations affecting foreign investors)',
52 'Exchange holiday schedule':'(Exchange holiday schedule)',
53 'Trade volume statistics':'(Trade volume statistics)|(Trade volume)',
54 'Prospective developments':'(Prospectivedevelopments)|(Prospective developments)',
55 'Futures contracts':'(Futures contracts)',
56 'Copyright Mondo Visione': '(Copyright Mondo Visione)',
57
58
59}
60
61
62
63###############
64# Format text
65############### cd
66text_formated = re.sub("\s\s+"," ", text_raw) # Remove extra spaces
67
68text_formated = re.sub("<P>"," ", text_formated) # Remove paragraph tags
69text_formated = re.sub("<\/P>"," ", text_formated) # Remove paragraph tags
70
71text_formated = re.sub("<TD>"," ", text_formated) # Remove table tags
72text_formated = re.sub("<\/TD>"," ", text_formated) # Remove table tags
73text_formated = re.sub("<TD\/>"," ", text_formated) # Remove table tags
74
75text_formated = re.sub("<Table>"," ", text_formated) # Remove table tags
76text_formated = re.sub("<\/Table>"," ", text_formated) # Remove table tags
77
78text_formated = re.sub("<TR>"," ", text_formated) # Remove table tags
79text_formated = re.sub("<\/TR>"," ", text_formated) # Remove table tags
80
81text_formated = re.sub("<TH>"," ", text_formated) # Remove table tags
82text_formated = re.sub("<\/TH>"," ", text_formated) # Remove table tags
83
84text_formated = re.sub("<ImageData"," ", text_formated) # Remove image tags
85text_formated = re.sub("jpg\"\/>","jpg\"", text_formated) # Remove image tags
86
87text_formated = re.sub("<Figure>"," ", text_formated) # Remove figure tags
88text_formated = re.sub("</Figure>"," ", text_formated) # Remove figure tags
89
90text_formated = re.sub("<Sect>"," ", text_formated) # Remove figure tags
91text_formated = re.sub("</Sect>"," ", text_formated) # Remove figure tags
92
93text_formated = re.sub("\s\s+"," ", text_formated) # Remove spaces again Just in case
94
95
96### You have to fix this: (instead of wrapping header with <h2> </h2> you need to wrap one of the miss spellings
97#for header in headers_dict:
98# text_formated = re.sub("{}".format(header), "<H2>{}</H2>".format(header), text_formated) #Replace headers with tagged headers
99
100
101for key, value in headers_dict.items():
102 text_formated = re.sub("{}".format(value), "<H2>{}</H2>".format(key), text_formated)
103
104
105
106#print(text_formated) #test if format output is correct
107
108###############
109# Parse Text
110###############
111
112contents = re.split(r"<Part>",text_formated)
113contents.pop(0)
114
115for item in contents:
116 #print(item)
117 # Parse Main Headers
118 for key, value in headers_dict.items():
119 regex = re.search(r"<H2>{}<\/H2>( *)(.*?)( *)((<\/Part>)|(<H2>))".format(key), item) #Searches for text between the end of an H2
120 if regex is not None:
121 entry = regex.group(2)
122 headers_dict[key] = entry
123 else:
124 entry = 'NA'
125 headers_dict[key] = entry
126
127 # Add Date Founded
128 brief_history = str(headers_dict['Brief history'])
129 year = re.findall(r'([1-3][0-9]{3})', brief_history)
130 if year:
131 earliest_year = min(int(s) for s in year)
132 headers_dict['Date Founded'] = earliest_year
133 else:
134 headers_dict['Date Founded'] = 'NA'
135
136 # Contains automated, computerized etc
137 trading_system = str(headers_dict['Trading system'])
138 automated = re.findall(r'(automa|comput|electr)', trading_system)
139 if automated:
140 headers_dict['Automated'] = 1
141 else:
142 headers_dict['Automated'] = 0
143
144
145 trading_system = str(headers_dict['Trading system'])
146 automated = re.findall(r'(automa)', trading_system)
147 computerised = re.findall(r'(comput)', trading_system)
148 electronic = re.findall(r'(electr|OTC|counter)', trading_system)
149 floor = re.findall(r'(floor)', trading_system)
150
151 if automated:
152 headers_dict['Automated'] = 1
153 else:
154 headers_dict['Automated'] = 0
155
156
157
158
159
160 ## Irregular section categories
161 exchange_dict = {
162 'Exchange Name': re.search(r"<H5>(.*?)<\/H5>", item).group(1),
163 'Address': re.search(r"Visione (.*?)Telephone", item).group(1)
164 }
165
166
167
168
169 #exchange_dict = dict(metadata_dict.items() + headers_dict.items())
170
171 exchange_dict.update(headers_dict)
172
173 exchanges.append(exchange_dict) #sets up dictionary for dataframe
174
175
176#Dataframe
177exchanges_df = pd.DataFrame(exchanges)
178#pd.DataFrame.head(exchanges_df, n=5)
179#exchanges_df.to_csv('data/test.csv', encoding='utf-8', index = True) #Export DataFrame to CSV
180exchanges_df.to_csv(output_path, encoding='utf-8', index = True) #Export DataFrame to CSV