· 7 years ago · Feb 24, 2019, 11:30 PM
1import pandas as pd
2import os
3import numpy as np
4import psycopg2
5
6def generate_headers(df):
7 '''function to create list of headers with their respective data type. Input a pandas dataframe'''
8 headers = df.columns.tolist()
9 type_list = []
10 for col in headers: # for loop where we are defining a string representation of the postgres syntax for that column's dataframe datatype and appending it to type_list
11 df_type = str(df[col].dtype)
12 if 'int' in df_type:
13 type_list.append('int')
14 elif 'float' in df_type:
15 type_list.append('decimal')
16 else:
17 type_list.append('varchar')
18 return headers, type_list
19
20def create_table_sql(tablename, headers, type_list):
21 '''function to generate string of "create table" statements. takes in tablename, columns, and datatypes of each column'''
22
23 statement = '''drop table if exists dl.''' + tablename + ''';
24 create table dl.''' + tablename + ''' (''' # may need to change schema here!
25
26 for i in range(len(headers)):
27 if type_list[i] == 'varchar':
28 statement = (statement + '\n{} varchar ,').format(headers[i].lower())
29 else:
30 statement = (statement + '\n' + '{} {}' + ' ,').format(headers[i].lower(), type_list[i])
31
32 statement = statement + ' primary key (frn_adjusted));'
33 return statement
34
35############ Main code to run
36
37df = pd.read_csv('features.csv')
38
39# assuming you have your dataframe df of features from the csv and tablename is 'ml_final_features' and schema is 'dl',
40# will output a sql script that you need to execute in the forked DB
41headers, type_list = generate_headers(df)
42statement = create_table_sql('ml_final_features',headers,type_list)
43outputile = open('create_ml_final_features.sql','w')
44outputile.write(statement) # now definitely inspect the file before running anything below!!
45
46##### Now for inserting the data from your df
47
48# get forked DB credentials whatever they're called
49HOST = os.environ.get("HOST_TEMP")
50USER = os.environ.get("USER_TEMP")
51PASSWORD = os.environ.get("PASSWORD_TEMP")
52DB = os.environ.get("DB_TEMP")
53
54myConnection = psycopg2.connect(host=HOST, user=USER, password=PASSWORD, database=DB, port=5432)
55cursor = myConnection.cursor()
56
57# create the table -- hope this works, not sure if I've ever done this way and not manually
58cursor.execute(open('create_ml_final_features.sql', "r").read())
59
60# function to insert into tables with lots of columns
61def insert_large_table(df, full_tablename, cursor):
62
63 columns_list = df.columns.values.tolist()
64 columns = ', '.join(map(str, columns_list))
65 query = '(' + ','.join(["%s"] * len(columns.split(','))) + ')'
66 df = df.fillna(None) # hope this works; might need to try 'NULL', or maybe even do nothing
67 final_query = ','.join(cursor.mogrify(query, row.values.tolist()) for index, row in df.iterrows())
68
69 cursor.execute("INSERT INTO " + full_tablename + " VALUES " + final_query)
70
71 print(tablename + " inserted")
72
73# finally, insert the data
74insert_large_table(df, 'dl.ml_final_features', cursor)
75
76cursor.close()
77myConnection.commit()
78myConnection.close()