Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175548591 --- Diff: src/ports/postgres/modules/utilities/test/minibatch_preprocessing.sql_in --- @@ -0,0 +1,223 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ +DROP TABLE IF EXISTS minibatch_preprocessing_input; +CREATE TABLE minibatch_preprocessing_input( + sex TEXT, + id SERIAL NOT NULL, + length DOUBLE PRECISION, + diameter DOUBLE PRECISION, + height DOUBLE PRECISION, + whole DOUBLE PRECISION, + shucked DOUBLE PRECISION, + viscera DOUBLE PRECISION, + shell DOUBLE PRECISION, + rings INTEGER); + +-- Test empty input table +-- Error out if out table already exists + +INSERT INTO minibatch_preprocessing_input(id,sex,length,diameter,height,whole,shucked,viscera,shell,rings) VALUES +(1040,'F',0.66,0.475,0.18,1.3695,0.641,0.294,0.335,6), +(3160,'F',0.34,0.255,0.085,0.204,0.097,0.021,0.05,6), +(3984,'F',0.585,0.45,0.125,0.874,0.3545,0.2075,0.225,6), +(2551,'I',0.28,0.22,0.08,0.1315,0.066,0.024,0.03,5), +(1246,'I',0.385,0.28,0.09,0.228,0.1025,0.042,0.0655,5), +(519,'M',0.325,0.23,0.09,0.147,0.06,0.034,0.045,4), +(2382,'M',0.155,0.115,0.025,0.024,0.009,0.005,0.0075,5), +(698,'M',0.28,0.205,0.1,0.1165,0.0545,0.0285,0.03,5), +(2381,'M',0.175,0.135,0.04,0.0305,0.011,0.0075,0.01,5), +(516,'M',0.27,0.195,0.08,0.1,0.0385,0.0195,0.03,6); + +-- no of rows = 10, buffer_size = 4, so assert that count = 10/4 = 3 +\set expected_row_count 3 +DROP TABLE IF EXISTS minibatch_preprocessing_out, minibatch_preprocessing_out_standardization, minibatch_preprocessing_out_summary; +SELECT minibatch_preprocessor('minibatch_preprocessing_input', 'minibatch_preprocessing_out', 'length>0.2', 'ARRAY[diameter,height,whole,shucked,viscera,shell]', 4); +SELECT assert + ( + row_count = :expected_row_count, 'Row count validation failed for minibatch_preprocessing_out. + Expected:' || :expected_row_count || ' Actual: ' || row_count + ) from (select count(*) as row_count from minibatch_preprocessing_out) s; + +\set expected_dep_row_count '\'' 2,4,4 '\'' +\set expected_dep_col_count '\'' 2,2,2 '\'' +\set expected_indep_row_count '\'' 2,4,4 '\'' +\set expected_indep_col_count '\'' 6,6,6 '\'' + +-- assert dimensions for both dependent and independent variable +SELECT assert + ( + str_dep_row_count = :expected_dep_row_count, 'Dependent variable row count failed. Actual: ' || str_dep_row_count || ' Expected:' || :expected_dep_row_count + ) from + ( + select array_to_string(array_agg(row_count order by row_count asc), ',') as str_dep_row_count from (select array_upper(dependent_varname,1) as row_count from minibatch_preprocessing_out order by row_count asc) s + ) s; + +SELECT assert + ( + str_dep_col_count = :expected_dep_col_count, 'Dependent variable col count failed. Actual: ' || str_dep_col_count || ' Expected:' || :expected_dep_col_count + ) from + ( + select array_to_string(array_agg(col_count order by col_count asc), ',') as str_dep_col_count from (select array_upper(dependent_varname,2) as col_count from minibatch_preprocessing_out order by col_count asc) s + ) s; + +SELECT assert + ( + str_indep_row_count = :expected_indep_row_count, 'Independent variable row count failed. Actual: ' || str_indep_row_count || ' Expected:' || :expected_indep_row_count + ) from + ( + select array_to_string(array_agg(row_count order by row_count asc), ',') as str_indep_row_count from (select array_upper(independent_varname, 1) as row_count from minibatch_preprocessing_out order by row_count asc) s + ) s; + +SELECT assert + ( + str_indep_col_count = :expected_indep_col_count, 'Independent variable col count failed. Actual: ' || str_indep_col_count || ' Expected:' || :expected_indep_col_count + ) from + ( + select array_to_string(array_agg(col_count order by col_count asc), ',') as str_indep_col_count from (select array_upper(independent_varname,2) as col_count from minibatch_preprocessing_out order by col_count asc) s + ) s; + +SELECT assert + ( + source_table = 'minibatch_preprocessing_input' AND + output_table = 'minibatch_preprocessing_out' AND + dependent_varname = 'length>0.2' AND + independent_varname = 'ARRAY[diameter,height,whole,shucked,viscera,shell]' AND + buffer_size = 4 AND + class_values = '{f,t}' AND -- we sort the class values in python + num_rows_processed = 10 AND + num_missing_rows_skipped = 0 AND + grouping_cols is NULL, + 'Summary Validation failed. Expected:' || __to_char(summary) + ) from (select * from minibatch_preprocessing_out_summary) summary; + + +-- Test null values in x and y +\set expected_row_count 1 +DROP TABLE IF EXISTS minibatch_preprocessing_out, minibatch_preprocessing_out_standardization, minibatch_preprocessing_out_summary; + +TRUNCATE TABLE minibatch_preprocessing_input; +INSERT INTO minibatch_preprocessing_input(id,sex,length,diameter,height,whole,shucked,viscera,shell,rings) VALUES +(1040,'F',0.66,0.475,0.18,NULL,0.641,0.294,0.335,6), +(3160,'F',0.34,0.35,0.085,0.204,0.097,0.021,0.05,6), +(3984,NULL,0.585,0.45,0.25,0.874,0.3545,0.2075,0.225,5), +(861,'M',0.595,0.475,NULL,1.1405,0.547,0.231,0.271,6), +(932,NULL,0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195,6), +(698,'F',0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195,6), +(922,NULL,0.445,0.335,0.11,NULL,0.2025,0.1095,0.1195,6); +SELECT minibatch_preprocessor('minibatch_preprocessing_input', 'minibatch_preprocessing_out', 'sex', 'ARRAY[length,diameter,height,whole,shucked,viscera,shell]', 2); +SELECT assert + ( + row_count = :expected_row_count, 'Row count validation failed for minibatch_preprocessing_out. + Expected:' || :expected_row_count || ' Actual: ' || row_count + ) from (select count(*) as row_count from minibatch_preprocessing_out) s; +SELECT assert + (num_rows_processed = 2 AND num_missing_rows_skipped = 5, + 'Rows processed/skipped validation failed for minibatch_preprocessing_out_summary. + Actual num_rows_processed:' || num_rows_processed || ', Actual num_missing_rows_skipped: ' || num_missing_rows_skipped + ) from (select * from minibatch_preprocessing_out_summary) s; + +-- Test standardization +DROP TABLE IF EXISTS minibatch_preprocessing_input; +DROP TABLE IF EXISTS minibatch_preprocessing_out, minibatch_preprocessing_out_standardization, minibatch_preprocessing_out_summary; +CREATE TABLE minibatch_preprocessing_input(x1 INTEGER ,x2 INTEGER ,y TEXT); +INSERT INTO minibatch_preprocessing_input(x1,x2,y) VALUES +(2,10,'y1'), +(4,30,'y2'); +SELECT minibatch_preprocessor('minibatch_preprocessing_input', 'minibatch_preprocessing_out', 'y', 'ARRAY[x1,x2]', 2); + +-- since the order is not deterministic, we assert for all possible orders +\set expected_normalized_independent_var1 '\'' {{-1, -1},{1, 1}} '\'' +\set expected_normalized_independent_var2 '\'' {{1, 1},{-1, -1}} '\'' + +SELECT assert +( + independent_varname = :expected_normalized_independent_var1 OR + independent_varname = :expected_normalized_independent_var2, + 'Standardization check failed. Actual: ' || independent_varname +) from +( + select __to_char(independent_varname) as independent_varname from minibatch_preprocessing_out +) s; + + +-- Test that the standardization table gets created. +\set expected_row_count 1 +SELECT assert +( + row_count = :expected_row_count, 'Row count validation failed for minibatch_preprocessing_out_standardization. + Expected:' || :expected_row_count || ' Actual: ' || row_count +) from +( + select count(*) as row_count from minibatch_preprocessing_out_standardization +) s; + +-- Test that the summary table gets created. +\set expected_row_count 1 +SELECT assert +( + row_count = :expected_row_count, 'Row count validation failed for minibatch_preprocessing_out_summary. + Expected:' || :expected_row_count || ' Actual: ' || row_count +) from +( + select count(*) as row_count from minibatch_preprocessing_out_summary +) s; + +-- Test for array values in indep column +DROP TABLE IF EXISTS minibatch_preprocessing_input; +DROP TABLE IF EXISTS minibatch_preprocessing_out, minibatch_preprocessing_out_standardization, minibatch_preprocessing_out_summary; +CREATE TABLE minibatch_preprocessing_input( + id INTEGER, + sex TEXT, + attributes double precision[], + rings INTEGER); +TRUNCATE TABLE minibatch_preprocessing_input; +INSERT INTO minibatch_preprocessing_input(id,sex,attributes) VALUES +(1040,'F',ARRAY[0.66,0.475,0.18,NULL,0.641,0.294,0.335]), +(3160,'F',ARRAY[0.34,0.35,0.085,0.204,0.097,0.021,0.05]), +(3984,NULL,ARRAY[0.585,0.45,0.25,0.874,0.3545,0.2075,0.225]), +(861,'M',ARRAY[0.595,0.475,NULL,1.1405,0.547,0.231,0.271]), +(932,NULL,ARRAY[0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195]), +(NULL,'F',ARRAY[0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195]), +(922,NULL,ARRAY[0.445,0.335,0.11,NULL,0.2025,0.1095,0.1195]); +SELECT minibatch_preprocessor('minibatch_preprocessing_input', 'minibatch_preprocessing_out', 'sex', 'attributes', 1); +SELECT assert + ( + row_count = 2, 'Row count validation failed for minibatch_preprocessing_out. + Expected:' || 2 || ' Actual: ' || row_count + ) from (select count(*) as row_count from minibatch_preprocessing_out) s; + +-- Test for array values in dep column +DROP TABLE IF EXISTS minibatch_preprocessing_out, minibatch_preprocessing_out_standardization, minibatch_preprocessing_out_summary; +SELECT minibatch_preprocessor('minibatch_preprocessing_input', 'minibatch_preprocessing_out', 'attributes', 'ARRAY[id]', 1); +SELECT assert + ( + row_count = 3, 'Row count validation failed array values in dependent variable. + Expected:' || 3 || ' Actual: ' || row_count + ) from (select count(*) as row_count from minibatch_preprocessing_out) s; + +-- Test for null buffer size +DROP TABLE IF EXISTS minibatch_preprocessing_out, minibatch_preprocessing_out_standardization, minibatch_preprocessing_out_summary; +SELECT minibatch_preprocessor('minibatch_preprocessing_input', 'minibatch_preprocessing_out', 'attributes', 'ARRAY[id]'); +SELECT assert + ( + row_count = 3, 'Row count validation failed for null buffer size. + Expected:' || 3 || ' Actual: ' || row_count + ) from (select count(*) as row_count from minibatch_preprocessing_out) s; --- End diff -- Please end this and other files with a new line.
---