Github user reductionista commented on a diff in the pull request: https://github.com/apache/madlib/pull/342#discussion_r243694232 --- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in --- @@ -51,6 +51,105 @@ m4_changequote(`<!', `!>') MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" +class MiniBatchPreProcessorDL: + def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + normalizing_const, **kwargs): + self.schema_madlib = schema_madlib + self.source_table = source_table + self.output_table = output_table + self.dependent_varname = dependent_varname + self.independent_varname = independent_varname + self.buffer_size = buffer_size + self.normalizing_const = normalizing_const + self.module_name = "minibatch_preprocessor_DL" + self.output_summary_table = add_postfix(self.output_table, "_summary") + self._validate_args() + self.num_of_buffers = self._get_num_buffers() + + def minibatch_preprocessor_dl(self): + norm_tbl = unique_string(desp='normalized') + # Create a temp table that has independent var normalized. + scalar_mult_sql = """ + CREATE TEMP TABLE {norm_tbl} AS + SELECT {self.schema_madlib}.array_scalar_mult( + {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, + {self.dependent_varname} AS y, + row_number() over() AS row_id + FROM {self.source_table} + """.format(**locals()) + plpy.execute(scalar_mult_sql) + # Create the mini-batched output table + if is_platform_pg(): + distributed_by_clause = '' + else: + distributed_by_clause= ' DISTRIBUTED BY (buffer_id) ' + sql = """ + CREATE TABLE {self.output_table} AS + SELECT * FROM + ( + SELECT {self.schema_madlib}.agg_array_concat( + ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x}, + array_agg({norm_tbl}.y) AS {y}, + ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id + FROM {norm_tbl} + GROUP BY buffer_id + ) b + {distributed_by_clause} + """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME, + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, --- End diff -- I don't think we should change the names of these columns while batching tables. IMO, the column names in the output table should remain the same as whatever they were in the input table; with the extra column "buffer_id" added. eg, if they were x and y to begin with, they should remain x and y. If they were features and labels, they should remain features and labels. In other words, instead of the literal strings "independent_varname" and "dependent_varname", the names should be self.independent_varname and self.dependent_varname, as specified by the user. If there is some reason why we want to force the user to always use the same column names in a batched table, then I'd suggest instead calling them either x and y, or independent_var and dependent_var. Naming them "independent_varname" and "dependent_varname" is problematic for at least two reasons: 1. These columns contain numeric data, not variable names. So the column heading would not reflect what is in the column. and 2. These column names conflict with the column names in the summary table, which actually do refer to variable names. It also conflicts with the names of the parameters being passed in by the user. I think users will be very confused if we give two different things exactly the same name in the same function.
---