· 7 years ago · Dec 29, 2018, 05:26 PM
1<Butterfly>
2 <Name>Swallowtail&</Name>
3 <HomePage>https://example.com/butterflies/swallowtail/</HomePage>
4 <TaxonomyID>54321&</TaxonomyID>
5 <Grouping>Papilionidae&</Grouping>
6</Butterfly>
7
-- External table over the tab-delimited text files generated from the parsed
-- butterfly XML. Columns are listed in the same order as the child elements of
-- a <Butterfly> record (Name, HomePage, TaxonomyID, Grouping).
-- GROUPING is a reserved word in Hive/Athena DDL, so that column name is
-- backtick-quoted. The field delimiter is the tab escape '\t', not the letter t.
CREATE EXTERNAL TABLE IF NOT EXISTS butterflies (
    Name STRING,
    HomePage STRING,
    TaxonomyID BIGINT,
    `Grouping` STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 's3://butterflies/test_parsed_xml/';
17
# Flatten every row under self.root_node into one line of tab-terminated field
# values, one line per row, and collect the result in `field_values`.
# NOTE(review): self.root_node is presumably an ElementTree-style element whose
# children are rows and whose grandchildren are fields -- confirm against caller.
row_lines = []  # accumulate per-row strings; joined once at the end
row_count = 0
for row in self.root_node:
    values = []
    for field_count, field in enumerate(row):
        # Fields are checked positionally against self.schema; a tag that is
        # not accepted at its position means the row does not match the table.
        if field.tag not in self.schema[field_count]:
            raise ValueError(
                "Unexpected field <{}> at position {} in row {}".format(
                    field.tag, field_count, row_count + 1))
        if field.text:
            # Collapse every whitespace run (including embedded tabs and
            # newlines, which would corrupt the delimited output) to a single
            # space, then drop trailing whitespace.
            text_value = re.sub(r'\s+', ' ', field.text).rstrip()
        else:
            text_value = u''
        values.append(text_value)

    # Every field, including the last, is followed by a tab; the row is
    # terminated by a newline.
    row_lines.append(''.join(value + '\t' for value in values) + '\n')
    row_count += 1
    print("Processed row number {}".format(row_count))

# Build the final string once instead of re-copying it on every append.
field_values = ''.join(row_lines)
# Finally, code which pushes the huge string to s3