Repository: madlib Updated Branches: refs/heads/master f28c5a07e -> 4aa073294
Feature: Allow NULL in rows for computing correlations and covariance JIRA: MADLIB-1166 Additional Authors: Orhan Kislal <[email protected]> - Impute NULL values with the mean of corresponding column. Closes #206 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/4aa07329 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/4aa07329 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/4aa07329 Branch: refs/heads/master Commit: 4aa0732942523a38ed3e7da60e43e3307ff8a5a1 Parents: f28c5a0 Author: Swatisoni <[email protected]> Authored: Tue Dec 5 10:27:13 2017 -0800 Committer: Orhan Kislal <[email protected]> Committed: Tue Dec 5 10:30:41 2017 -0800 ---------------------------------------------------------------------- .../postgres/modules/stats/correlation.py_in | 37 +++++------ .../postgres/modules/stats/correlation.sql_in | 25 ++++++-- .../modules/stats/test/correlation.sql_in | 65 ++++++++++++++++++++ 3 files changed, 101 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/4aa07329/src/ports/postgres/modules/stats/correlation.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in index 0b08208..f658b48 100644 --- a/src/ports/postgres/modules/stats/correlation.py_in +++ b/src/ports/postgres/modules/stats/correlation.py_in @@ -165,7 +165,6 @@ def _populate_output_table(schema_madlib, source_table, output_table, with MinWarning("info" if verbose else "error"): start = time() col_len = len(col_names) - col_names_as_float_array = py_list_to_sql_string(col_names, "float8") col_names_as_text_array = py_list_to_sql_string(col_names, "varchar") temp_table = unique_string() if get_cov: @@ -180,32 +179,31 @@ def _populate_output_table(schema_madlib, source_table, output_table, function_name = "Correlation" agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib) + cols = ','.join(["coalesce({0}, avg_{0})".format(col) for col in col_names]) + avgs = ','.join(["avg({0}) AS avg_{0}".format(col) for col in col_names]) + avg_array = ','.join(["avg_{0}".format(col) for col in col_names]) # actual computation - plpy.execute(""" + sql1 = """ + CREATE TEMP TABLE {temp_table} AS SELECT - tot_cnt, - count(*) AS non_null_cnt, + count(*) AS tot_cnt, mean, {agg_str} as cor_mat FROM ( - SELECT {col_names_as_float_array} AS x - FROM {source_table} - ) src1, - ( - SELECT - count(*) AS tot_cnt, - {schema_madlib}.avg(x) AS mean - FROM + SELECT ARRAY[ {cols} ] AS x, + ARRAY [ {avg_array} ] AS mean + FROM {source_table}, ( - SELECT {col_names_as_float_array} AS x + SELECT {avgs} FROM {source_table} - ) src2 - ) subq - WHERE NOT {schema_madlib}.array_contains_null(x) - GROUP BY tot_cnt, mean - """.format(**locals())) + )sub1 + ) sub2 + GROUP BY mean + """.format(**locals()) + + plpy.execute(sql1) # create summary table summary_table = add_postfix(output_table, "_summary") @@ -217,8 +215,7 @@ def _populate_output_table(schema_madlib, source_table, output_table, '{output_table}'::varchar AS output_table, {col_names_as_text_array} AS column_names, mean AS mean_vector, - non_null_cnt AS total_rows_processed, - tot_cnt - non_null_cnt AS total_rows_skipped + tot_cnt AS total_rows_processed FROM {temp_table} """.format(**locals()) http://git-wip-us.apache.org/repos/asf/madlib/blob/4aa07329/src/ports/postgres/modules/stats/correlation.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in index 25c2859..79ea82f 100644 --- a/src/ports/postgres/modules/stats/correlation.sql_in +++ b/src/ports/postgres/modules/stats/correlation.sql_in @@ -20,6 +20,7 @@ m4_include(`SQLCommon.m4') <ul> <li><a href="#usage">Correlation Function</a></li> <li><a href="#examples">Examples</a></li> +<li><a href="#literature">Literature</a></li> <li><a href="#related">Related Topics</a></li> </ul> </div> @@ -104,10 +105,6 @@ is also created at the same time, which has the following columns: <th>total_rows_processed</th> <td>BIGINT. Total numbers of rows processed.</td> </tr> -<tr> - <th>total_rows_skipped</th> - <td>BIGINT. Total numbers of rows skipped due to missing values.</td> -</tr> </table> </dd> @@ -207,8 +204,24 @@ Result: </pre> @par Notes -Current implementation ignores a row that contains NULL entirely. This means -any correlation in such a row (with NULLs) does not contribute to the final answer. + +Null values will be replaced by the mean of their respective columns (Mean +imputation/substitution). Mean imputation is a method in which the missing +value on a certain variable is replaced by the mean of the available cases. +This method maintains the sample size and is easy to use, but the variability +in the data is reduced, so the standard deviations and the variance estimates +tend to be underestimated. Please refer to [1] and [2] for details. + +If the mean imputation method is not suitable for the target use case, it is +advised to employ a view that handles the NULL values prior to calling the +correlation/covariance functions. + +@anchor literature +@literature + +[1] https://en.wikipedia.org/wiki/Imputation_(statistics) + +[2] https://www.iriseekhout.com/missing-data/missing-data-methods/imputation-methods/ @anchor related @par Related Topics http://git-wip-us.apache.org/repos/asf/madlib/blob/4aa07329/src/ports/postgres/modules/stats/test/correlation.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/test/correlation.sql_in b/src/ports/postgres/modules/stats/test/correlation.sql_in index 163f641..d6cd014 100644 --- a/src/ports/postgres/modules/stats/test/correlation.sql_in +++ b/src/ports/postgres/modules/stats/test/correlation.sql_in @@ -22,3 +22,68 @@ DROP TABLE IF EXISTS corr_output, corr_output_summary; SELECT * FROM correlation('rand_numeric', 'corr_output', Null); DROP TABLE IF EXISTS corr_output, corr_output_summary; SELECT * FROM correlation('rand_numeric', 'corr_output', 'a, c, e'); + +DROP TABLE IF EXISTS example_data; +CREATE TABLE example_data( + id SERIAL, outlook TEXT, + temperature FLOAT8, humidity FLOAT8, + windy TEXT, class TEXT, new_col FLOAT8); +INSERT INTO example_data VALUES +(1, 'sunny', 85, 85, 'false', 'Dont Play', 1), +(2, 'sunny', 80, 90, 'true', 'Dont Play', 12), +(3, 'overcast', 83, 78, 'false', 'Play', 13), +(4, 'rain', 70, 96, 'false', 'Play', 16), +(5, 'rain', 68, 80, 'false', 'Play', 17), +(6, 'rain', 65, 70, 'true', 'Dont Play', 12), +(7, 'overcast', 64, 65, 'true', 'Play', 15), +(8, 'sunny', 72, 95, 'false', 'Dont Play', 19), +(9, 'sunny', 69, 70, 'false', 'Play', 20), +(10, 'rain', 75, 80, 'false', 'Play', 32), +(11, 'sunny', 75, 70, 'true', 'Play', 31), +(12, 'overcast', 72, 90, 'true', 'Play', 11), +(13, 'overcast', 81, 75, 'false', 'Play', 31), +(14, 'rain', 71, 80, 'true', 'Dont Play', 21), +(15, NULL, 100, 100, 'true', NULL, 11), +(16, NULL, 110, 100, 'true', NULL, 13); + +--- example_data_output will have correlations of only two columns which do +--- not include any null columns in another columns +DROP TABLE IF EXISTS example_data_output, example_data_output_summary; +SELECT correlation( 'example_data', + 'example_data_output', + 'temperature, humidity, new_col', + True); +SELECT * FROM example_data_output; + +INSERT INTO example_data VALUES (17, NULL, 110, 100, 'true', NULL, NULL); + +--- example_data_output will have correlations of only three columns which does +--- include any columns. Hence if this +DROP TABLE IF EXISTS example_data_output_with_null_2column, example_data_output_with_null_2column_summary; +SELECT correlation( 'example_data', + 'example_data_output_with_null_2column', + 'temperature, humidity', + True); + +SELECT * FROM example_data_output_with_null_2column; + +DROP TABLE IF EXISTS example_data_output_with_null_3column, example_data_output_with_null_3column_summary; +SELECT correlation( 'example_data', + 'example_data_output_with_null_3column', + 'temperature, humidity, new_col', + True); +SELECT * FROM example_data_output_with_null_3column; + +SELECT assert(ABS(a.temperature - b.temperature) < 0.000001, + 'Rows with NULL values are ignored.') +FROM + example_data_output_with_null_2column a, + example_data_output_with_null_3column b +WHERE a.column_position = 2 and b.column_position = 2; + +SELECT assert(ABS(a.temperature - b.temperature) > 0.000001, + 'New row with NULL values is ignored.') +FROM + example_data_output a, + example_data_output_with_null_2column b +WHERE a.column_position = 2 and b.column_position = 2;
