· 7 years ago · Jan 30, 2019, 06:14 PM
1/*
2** Variation Inflation Factor
3*/
4
5-- Functions that calculate Variance Inflation Factor:
6DROP TYPE IF EXISTS demo.vif_return CASCADE;
7CREATE TYPE demo.vif_return AS(
8 columns_analyzed INT,
9 colinear_columns_found INT,
10 output_schema TEXT,
11 output_table TEXT
12);
13
14CREATE OR REPLACE FUNCTION demo.vif(mytable TEXT, input_name TEXT, indep_var TEXT) RETURNS demo.vif_return AS $$
15declare
16 num_columns INT;
17 cur_col INT;
18 cur_name text;
19 table_name text;
20 schema_name text;
21 results_name text;
22 results_schema text;
23
24
25 dot INT;
26 ind INT := 0;
27 arg TEXT[];
28 argid INT[];
29 --res vif_result;
30 ret demo.vif_return;
31 lr madlib.linregr_result;
32
33begin
34 --Parse input
35 --Check if schema name is provided
36 SELECT INTO dot POSITION('.' in input_name);
37 IF (dot != 0 ) THEN
38 --schema specified, parse schema name
39 schema_name = substring(input_name from 1 for dot-1);
40 table_name = substring(input_name from dot+1 for char_length(input_name));
41
42 --check that this schema.table does exist
43 EXECUTE 'SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '''||table_name||''' AND table_schema = '''||schema_name||''';' INTO dot;
44
45 IF (dot = 0) THEN
46 RAISE EXCEPTION 'Table and/or schema does not exist';
47 END IF;
48
49 ELSE
50 --schema not specified, search for schema name
51 --if there are multiple, just pick one
52 table_name = input_name;
53 EXECUTE 'SELECT table_schema from information_schema.columns where table_name = '''||table_name||''' GROUP BY table_schema LIMIT 1;' INTO schema_name;
54
55 IF (schema_name IS NULL) THEN
56 RAISE EXCEPTION 'Table does not exist';
57 END IF;
58
59 END IF;
60
61 SELECT INTO dot POSITION('.' in mytable);
62 IF (dot != 0 ) THEN
63 --schema specified, parse schema name
64 results_schema = substring(mytable from 1 for dot-1);
65 results_name = substring(mytable from dot+1 for char_length(mytable));
66
67 ELSE
68 --schema not specified, make table in current schema
69 results_schema = '';
70 results_name = mytable;
71
72 END IF;
73
74 --Get list of column names from table
75 EXECUTE 'CREATE TEMP TABLE colnames AS SELECT row_number() OVER() AS id, column_name FROM information_schema.columns WHERE table_name = '''||table_name||''' AND table_schema = '''||schema_name||''' AND column_name <> '''||indep_var||'''';
76 EXECUTE 'CREATE TEMP TABLE colrem( nameid INT, pval FLOAT, coef FLOAT)';
77 EXECUTE 'CREATE TEMP TABLE colrem_full( column_name TEXT, nameid INT, pval FLOAT, coef FLOAT)';
78
79 --Get number of columns
80 EXECUTE 'SELECT COUNT(*) FROM colnames' INTO num_columns;
81 --check if there are any columns in the table aside from the indep variable
82 IF (num_columns <= 1) THEN
83 RAISE EXCEPTION 'Fewer than two columns found';
84 END IF;
85
86 --use only because of Greenplum bug which does not allow to return setof records
87 EXECUTE 'CREATE TABLE '||mytable||'(colname TEXT, colinear_columns TEXT[], p_value FLOAT[], vif FLOAT)';
88
89 --loop through each column
90 LOOP
91 ind = ind + 1;
92 IF (ind > num_columns) THEN
93 EXIT;
94 END IF;
95
96 --pick a column
97 EXECUTE 'SELECT id,column_name FROM colnames WHERE id = '||ind INTO cur_col,cur_name;
98 --res.colname = cur_name;
99
100 --compose array of dependent variables
101 --EXECUTE 'SELECT array_agg( column_name::text ),array_agg(id) FROM (SELECT column_name,id FROM colnames WHERE id <> '||cur_col||') AS T' INTO arg,argid;
102 EXECUTE 'SELECT array( SELECT column_name::text FROM (SELECT column_name,id FROM colnames WHERE id <> '||cur_col||' ORDER BY id LIMIT 10000) AS T),
103 array( SELECT id FROM (SELECT column_name,id FROM colnames WHERE id <> '||cur_col||' ORDER BY id LIMIT 10000) AS G)' INTO arg,argid;
104
105 --run linear regression
106 EXECUTE 'SELECT (madlib.linregr('||cur_name||',array[1,'|| array_to_string(arg,',') ||'])).* FROM '|| schema_name||'.'||table_name INTO lr;
107
108 --determine which parameters are collinear
109 EXECUTE 'TRUNCATE TABLE colrem';
110 EXECUTE 'INSERT INTO colrem SELECT unnest(array['||array_to_string(argid,',')||']), unnest(array['||array_to_string(lr.p_values[2:num_columns],',')||']), unnest(array['||array_to_string(lr.coef[2:num_columns],',')||'])';
111
112 EXECUTE 'TRUNCATE TABLE colrem_full';
113 EXECUTE 'INSERT INTO colrem_full SELECT column_name, id, pval, coef FROM colrem JOIN colnames ON (colrem.nameid = colnames.id)';
114
115 --EXECUTE 'SELECT array_agg(column_name), array_agg(pval) FROM colrem_full WHERE pval < 1e-5 AND coef > 1e-14' INTO res.co_col,res.p;
116 --EXECUTE 'SELECT array(SELECT column_name FROM colrem_full WHERE pval < 1e-4 AND abs(coef) > 1e-14 ORDER BY nameid LIMIT 10000)' INTO res.co_col;
117 --EXECUTE 'SELECT array(SELECT pval FROM colrem_full WHERE pval < 1e-4 AND abs(coef) > 1e-14 ORDER BY nameid LIMIT 10000)' INTO res.p;
118
119 --calculate the VIF
120 --res.vif = 1/(1-lr.r2+1e-15);
121
122 --use only because of Greenplum bug which does not allow to return setof records
123 EXECUTE 'INSERT INTO '||mytable||' SELECT '''||cur_name||''' AS colname,
124 array(SELECT column_name FROM colrem_full WHERE pval < 1e-4 AND abs(coef) > 1e-14 ORDER BY nameid LIMIT 10000) AS collinear_columns,
125 array(SELECT pval FROM colrem_full WHERE pval < 1e-4 AND abs(coef) > 1e-14 ORDER BY nameid LIMIT 10000) AS p_value,
126 1/(1-'||lr.r2||'+1e-15) AS vif';
127 --RETURN NEXT res;
128
129 END LOOP;
130
131 --Summary statistics return
132 ret.columns_analyzed = num_columns;
133 EXECUTE 'SELECT COUNT(*) FROM '||mytable||' WHERE colinear_columns IS NOT NULL' INTO ret.colinear_columns_found;
134 ret.output_schema = results_schema;
135 ret.output_table = results_name;
136
137 --Cleanup
138 EXECUTE 'DROP TABLE IF EXISTS colnames';
139 EXECUTE 'DROP TABLE IF EXISTS colrem';
140 EXECUTE 'DROP TABLE IF EXISTS colrem_full';
141
142 RETURN ret;
143
144end
145$$ language plpgsql;
146
147CREATE OR REPLACE FUNCTION demo.vif(myschema TEXT,input_name TEXT) RETURNS demo.vif_return AS $$
148declare
149 ret demo.vif_return;
150begin
151 ret = demo.vif(myschema,input_name,' ');
152 RETURN ret;
153end
154$$ language plpgsql;
155
156-- Run VIF
157DROP TABLE IF EXISTS demo.vif1;
158SELECT * FROM demo.vif('demo.vif1','demo.featureset','f');
159SELECT colname, vif FROM demo.vif1 ORDER BY vif desc;
160-- Total query runtime: 612593 ms. (289 rows retrieved.)