Github user njayaram2 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/241#discussion_r175548591
--- Diff:
src/ports/postgres/modules/utilities/test/minibatch_preprocessing.sql_in ---
@@ -0,0 +1,223 @@
+/* -----------------------------------------------------------------------
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//*
----------------------------------------------------------------------- */
+DROP TABLE IF EXISTS minibatch_preprocessing_input;
+CREATE TABLE minibatch_preprocessing_input(
+ sex TEXT,
+ id SERIAL NOT NULL,
+ length DOUBLE PRECISION,
+ diameter DOUBLE PRECISION,
+ height DOUBLE PRECISION,
+ whole DOUBLE PRECISION,
+ shucked DOUBLE PRECISION,
+ viscera DOUBLE PRECISION,
+ shell DOUBLE PRECISION,
+ rings INTEGER);
+
+-- Test empty input table
+-- Error out if out table already exists
+
+INSERT INTO
minibatch_preprocessing_input(id,sex,length,diameter,height,whole,shucked,viscera,shell,rings)
VALUES
+(1040,'F',0.66,0.475,0.18,1.3695,0.641,0.294,0.335,6),
+(3160,'F',0.34,0.255,0.085,0.204,0.097,0.021,0.05,6),
+(3984,'F',0.585,0.45,0.125,0.874,0.3545,0.2075,0.225,6),
+(2551,'I',0.28,0.22,0.08,0.1315,0.066,0.024,0.03,5),
+(1246,'I',0.385,0.28,0.09,0.228,0.1025,0.042,0.0655,5),
+(519,'M',0.325,0.23,0.09,0.147,0.06,0.034,0.045,4),
+(2382,'M',0.155,0.115,0.025,0.024,0.009,0.005,0.0075,5),
+(698,'M',0.28,0.205,0.1,0.1165,0.0545,0.0285,0.03,5),
+(2381,'M',0.175,0.135,0.04,0.0305,0.011,0.0075,0.01,5),
+(516,'M',0.27,0.195,0.08,0.1,0.0385,0.0195,0.03,6);
+
+-- no of rows = 10, buffer_size = 4, so assert that count = 10/4 = 3
+\set expected_row_count 3
+DROP TABLE IF EXISTS minibatch_preprocessing_out,
minibatch_preprocessing_out_standardization,
minibatch_preprocessing_out_summary;
+SELECT minibatch_preprocessor('minibatch_preprocessing_input',
'minibatch_preprocessing_out', 'length>0.2',
'ARRAY[diameter,height,whole,shucked,viscera,shell]', 4);
+SELECT assert
+ (
+ row_count = :expected_row_count, 'Row count validation failed for
minibatch_preprocessing_out.
+ Expected:' || :expected_row_count || ' Actual: ' || row_count
+ ) from (select count(*) as row_count from
minibatch_preprocessing_out) s;
+
+\set expected_dep_row_count '\'' 2,4,4 '\''
+\set expected_dep_col_count '\'' 2,2,2 '\''
+\set expected_indep_row_count '\'' 2,4,4 '\''
+\set expected_indep_col_count '\'' 6,6,6 '\''
+
+-- assert dimensions for both dependent and independent variable
+SELECT assert
+ (
+ str_dep_row_count = :expected_dep_row_count, 'Dependent variable
row count failed. Actual: ' || str_dep_row_count || ' Expected:' ||
:expected_dep_row_count
+ ) from
+ (
+ select array_to_string(array_agg(row_count order by row_count
asc), ',') as str_dep_row_count from (select array_upper(dependent_varname,1)
as row_count from minibatch_preprocessing_out order by row_count asc) s
+ ) s;
+
+SELECT assert
+ (
+ str_dep_col_count = :expected_dep_col_count, 'Dependent variable
col count failed. Actual: ' || str_dep_col_count || ' Expected:' ||
:expected_dep_col_count
+ ) from
+ (
+ select array_to_string(array_agg(col_count order by col_count
asc), ',') as str_dep_col_count from (select array_upper(dependent_varname,2)
as col_count from minibatch_preprocessing_out order by col_count asc) s
+ ) s;
+
+SELECT assert
+ (
+ str_indep_row_count = :expected_indep_row_count, 'Independent
variable row count failed. Actual: ' || str_indep_row_count || ' Expected:' ||
:expected_indep_row_count
+ ) from
+ (
+ select array_to_string(array_agg(row_count order by row_count
asc), ',') as str_indep_row_count from (select array_upper(independent_varname,
1) as row_count from minibatch_preprocessing_out order by row_count asc) s
+ ) s;
+
+SELECT assert
+ (
+ str_indep_col_count = :expected_indep_col_count, 'Independent
variable col count failed. Actual: ' || str_indep_col_count || ' Expected:' ||
:expected_indep_col_count
+ ) from
+ (
+ select array_to_string(array_agg(col_count order by col_count
asc), ',') as str_indep_col_count from (select
array_upper(independent_varname,2) as col_count from
minibatch_preprocessing_out order by col_count asc) s
+ ) s;
+
+SELECT assert
+ (
+ source_table = 'minibatch_preprocessing_input' AND
+ output_table = 'minibatch_preprocessing_out' AND
+ dependent_varname = 'length>0.2' AND
+ independent_varname =
'ARRAY[diameter,height,whole,shucked,viscera,shell]' AND
+ buffer_size = 4 AND
+ class_values = '{f,t}' AND -- we sort the class values in
python
+ num_rows_processed = 10 AND
+ num_missing_rows_skipped = 0 AND
+ grouping_cols is NULL,
+ 'Summary Validation failed. Expected:' || __to_char(summary)
+ ) from (select * from minibatch_preprocessing_out_summary) summary;
+
+
+-- Test null values in x and y
+\set expected_row_count 1
+DROP TABLE IF EXISTS minibatch_preprocessing_out,
minibatch_preprocessing_out_standardization,
minibatch_preprocessing_out_summary;
+
+TRUNCATE TABLE minibatch_preprocessing_input;
+INSERT INTO
minibatch_preprocessing_input(id,sex,length,diameter,height,whole,shucked,viscera,shell,rings)
VALUES
+(1040,'F',0.66,0.475,0.18,NULL,0.641,0.294,0.335,6),
+(3160,'F',0.34,0.35,0.085,0.204,0.097,0.021,0.05,6),
+(3984,NULL,0.585,0.45,0.25,0.874,0.3545,0.2075,0.225,5),
+(861,'M',0.595,0.475,NULL,1.1405,0.547,0.231,0.271,6),
+(932,NULL,0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195,6),
+(698,'F',0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195,6),
+(922,NULL,0.445,0.335,0.11,NULL,0.2025,0.1095,0.1195,6);
+SELECT minibatch_preprocessor('minibatch_preprocessing_input',
'minibatch_preprocessing_out', 'sex',
'ARRAY[length,diameter,height,whole,shucked,viscera,shell]', 2);
+SELECT assert
+ (
+ row_count = :expected_row_count, 'Row count validation failed for
minibatch_preprocessing_out.
+ Expected:' || :expected_row_count || ' Actual: ' || row_count
+ ) from (select count(*) as row_count from
minibatch_preprocessing_out) s;
+SELECT assert
+ (num_rows_processed = 2 AND num_missing_rows_skipped = 5,
+ 'Rows processed/skipped validation failed for
minibatch_preprocessing_out_summary.
+ Actual num_rows_processed:' || num_rows_processed || ', Actual
num_missing_rows_skipped: ' || num_missing_rows_skipped
+ ) from (select * from minibatch_preprocessing_out_summary) s;
+
+-- Test standardization
+DROP TABLE IF EXISTS minibatch_preprocessing_input;
+DROP TABLE IF EXISTS minibatch_preprocessing_out,
minibatch_preprocessing_out_standardization,
minibatch_preprocessing_out_summary;
+CREATE TABLE minibatch_preprocessing_input(x1 INTEGER ,x2 INTEGER ,y TEXT);
+INSERT INTO minibatch_preprocessing_input(x1,x2,y) VALUES
+(2,10,'y1'),
+(4,30,'y2');
+SELECT minibatch_preprocessor('minibatch_preprocessing_input',
'minibatch_preprocessing_out', 'y', 'ARRAY[x1,x2]', 2);
+
+-- since the order is not deterministic, we assert for all possible orders
+\set expected_normalized_independent_var1 '\'' {{-1, -1},{1, 1}} '\''
+\set expected_normalized_independent_var2 '\'' {{1, 1},{-1, -1}} '\''
+
+SELECT assert
+(
+ independent_varname = :expected_normalized_independent_var1 OR
+ independent_varname = :expected_normalized_independent_var2,
+ 'Standardization check failed. Actual: ' || independent_varname
+) from
+(
+ select __to_char(independent_varname) as independent_varname from
minibatch_preprocessing_out
+) s;
+
+
+-- Test that the standardization table gets created.
+\set expected_row_count 1
+SELECT assert
+(
+ row_count = :expected_row_count, 'Row count validation failed for
minibatch_preprocessing_out_standardization.
+ Expected:' || :expected_row_count || ' Actual: ' || row_count
+) from
+(
+ select count(*) as row_count from
minibatch_preprocessing_out_standardization
+) s;
+
+-- Test that the summary table gets created.
+\set expected_row_count 1
+SELECT assert
+(
+ row_count = :expected_row_count, 'Row count validation failed for
minibatch_preprocessing_out_summary.
+ Expected:' || :expected_row_count || ' Actual: ' || row_count
+) from
+(
+ select count(*) as row_count from minibatch_preprocessing_out_summary
+) s;
+
+-- Test for array values in indep column
+DROP TABLE IF EXISTS minibatch_preprocessing_input;
+DROP TABLE IF EXISTS minibatch_preprocessing_out,
minibatch_preprocessing_out_standardization,
minibatch_preprocessing_out_summary;
+CREATE TABLE minibatch_preprocessing_input(
+ id INTEGER,
+ sex TEXT,
+ attributes double precision[],
+ rings INTEGER);
+TRUNCATE TABLE minibatch_preprocessing_input;
+INSERT INTO minibatch_preprocessing_input(id,sex,attributes) VALUES
+(1040,'F',ARRAY[0.66,0.475,0.18,NULL,0.641,0.294,0.335]),
+(3160,'F',ARRAY[0.34,0.35,0.085,0.204,0.097,0.021,0.05]),
+(3984,NULL,ARRAY[0.585,0.45,0.25,0.874,0.3545,0.2075,0.225]),
+(861,'M',ARRAY[0.595,0.475,NULL,1.1405,0.547,0.231,0.271]),
+(932,NULL,ARRAY[0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195]),
+(NULL,'F',ARRAY[0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195]),
+(922,NULL,ARRAY[0.445,0.335,0.11,NULL,0.2025,0.1095,0.1195]);
+SELECT minibatch_preprocessor('minibatch_preprocessing_input',
'minibatch_preprocessing_out', 'sex', 'attributes', 1);
+SELECT assert
+ (
+ row_count = 2, 'Row count validation failed for
minibatch_preprocessing_out.
+ Expected:' || 2 || ' Actual: ' || row_count
+ ) from (select count(*) as row_count from
minibatch_preprocessing_out) s;
+
+-- Test for array values in dep column
+DROP TABLE IF EXISTS minibatch_preprocessing_out,
minibatch_preprocessing_out_standardization,
minibatch_preprocessing_out_summary;
+SELECT minibatch_preprocessor('minibatch_preprocessing_input',
'minibatch_preprocessing_out', 'attributes', 'ARRAY[id]', 1);
+SELECT assert
+ (
+ row_count = 3, 'Row count validation failed array values in
dependent variable.
+ Expected:' || 3 || ' Actual: ' || row_count
+ ) from (select count(*) as row_count from
minibatch_preprocessing_out) s;
+
+-- Test for null buffer size
+DROP TABLE IF EXISTS minibatch_preprocessing_out,
minibatch_preprocessing_out_standardization,
minibatch_preprocessing_out_summary;
+SELECT minibatch_preprocessor('minibatch_preprocessing_input',
'minibatch_preprocessing_out', 'attributes', 'ARRAY[id]');
+SELECT assert
+ (
+ row_count = 3, 'Row count validation failed for null buffer size.
+ Expected:' || 3 || ' Actual: ' || row_count
+ ) from (select count(*) as row_count from
minibatch_preprocessing_out) s;
--- End diff --
Please end this and other files with a new line.
---