[madlib] branch master updated: DL: fix default_buffer_size calculation in input_data_preprocessor

njayaram Fri, 31 May 2019 14:06:23 -0700

This is an automated email from the ASF dual-hosted git repository.

njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git



The following commit(s) were added to refs/heads/master by this push:
     new 6c8643c  DL: fix default_buffer_size calculation in input_data_preprocessor
6c8643c is described below

commit 6c8643ceb318f5d2fcb281233d483da321386958
Author: Jingyi Mei <[email protected]>
AuthorDate: Tue May 28 17:47:51 2019 -0700

    DL: fix default_buffer_size calculation in input_data_preprocessor
    
    JIRA: MADLIB-1340
    In buffer size calculator, we assumed the array to be packed is one
    dimensional, and used the array_upper(x, 1) to get the length
    of the array. In DL input data preprocessor, we passed the first element
    in array_ndims as the length, which is not right because the
    array can be multi-dimensional. This resulted in
    default_buffer_size_calculator returning a bigger buffer size than it
    was supposed to, leading to the 1GB limit error.
    
    Instead, we should use product of all the elements from array_ndims to
    represent the actual length of the array. For example, if array_ndims
    returns [32,32,3], we should pass 32*32*3 instead of 32. This commit
    fixes this issue by passing the right length to default buffer calculator.
    
    Closes #401
    Co-authored-by: Nikhil Kak <[email protected]>
---
 .../deep_learning/input_data_preprocessor.py_in    | 10 ++++----
 src/ports/postgres/modules/internal/db_utils.py_in | 28 ++++++++++++++++++++--
 .../postgres/modules/utilities/validate_args.py_in |  1 +
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index bc42f63..2b78a3b 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -27,6 +27,7 @@ import plpy
 
 from internal.db_utils import get_distinct_col_levels
 from internal.db_utils import quote_literal
+from internal.db_utils import get_product_of_dimensions
 from utilities.minibatch_preprocessing import MiniBatchBufferSizeCalculator
 from utilities.utilities import _assert
 from utilities.utilities import add_postfix
@@ -40,7 +41,6 @@ from utilities.utilities import strip_end_quotes
 from utilities.utilities import unique_string
 from utilities.utilities import validate_module_input_params
 
-from utilities.validate_args import _tbl_dimension_rownum
 from utilities.validate_args import input_tbl_valid
 from utilities.validate_args import get_expr_type
 
@@ -288,13 +288,13 @@ class InputDataPreprocessorDL(object):
                 SELECT count(*) AS cnt FROM {0}
             """.format(self.source_table))[0]['cnt']
         buffer_size_calculator = MiniBatchBufferSizeCalculator()
-        indepdent_var_dim = _tbl_dimension_rownum(
-            self.schema_madlib, self.source_table,
-            self.independent_varname, skip_row_count=True)
+        indepdent_var_dim = get_product_of_dimensions(self.source_table,
+            self.independent_varname)
         self.buffer_size = buffer_size_calculator.calculate_default_buffer_size(
-            self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0])
+            self.buffer_size, num_rows_in_tbl, indepdent_var_dim)
         return ceil((1.0 * num_rows_in_tbl) / self.buffer_size)
 
+
 class ValidationDataPreprocessorDL(InputDataPreprocessorDL):
     def __init__(self, schema_madlib, source_table, output_table,
                  dependent_varname, independent_varname,
diff --git a/src/ports/postgres/modules/internal/db_utils.py_in b/src/ports/postgres/modules/internal/db_utils.py_in
index 934107c..90c09ca 100644
--- a/src/ports/postgres/modules/internal/db_utils.py_in
+++ b/src/ports/postgres/modules/internal/db_utils.py_in
@@ -19,6 +19,7 @@
 
 import plpy
 from utilities.utilities import is_psql_char_type
+from utilities.validate_args import get_col_dimension
 from utilities.validate_args import get_expr_type
 
 m4_changequote(`<!', `!>')
@@ -26,8 +27,7 @@ m4_changequote(`<!', `!>')
 QUOTE_DELIMITER="$__madlib__$"
 
 
-def get_distinct_col_levels(source_table, col_name, col_type=None,
-                            include_nulls=False):
+def get_distinct_col_levels(source_table, col_name, col_type=None, include_nulls=False):
     """
     Add description here
     :return:
@@ -93,3 +93,27 @@ def is_col_1d_array(source_table, col_name):
     """.format(col_name, source_table)
     result = plpy.execute(query)
     return result[0]["n_y"]
+
+# ------------------------------------------------------------------------------
+# This function runs postgres array_ndims function to get
+# the number of dimensions of an array. For example, for a
+# 3 dimensional array whose dimensions are [32,32,3], it
+# will return 3.
+def get_ndims(source_table, col_name):
+    array_ndims = plpy.execute("""
+        SELECT array_ndims({0}) AS ndims
+        FROM {1}
+    """.format(col_name, source_table), 1)[0]['ndims']
+    return array_ndims
+
+# This function is to calculate the total `length` of a
+# multi dimensional array. For example, if an array has
+# 3 dimensions of sizes [32,32,3], this function
+# will return the product of them, which is 32*32*3
+def get_product_of_dimensions(source_table, col_name):
+    ndims = get_ndims(source_table, col_name)
+    dimension_product = 1
+    for i in range(1, ndims + 1):
+        dimension = get_col_dimension(source_table, col_name, i)
+        dimension_product *= dimension
+    return dimension_product
diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index b35e7ad..073563c 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -635,6 +635,7 @@ def _tbl_dimension_rownum(schema_madlib, tbl, col_name, skip_row_count=False):
                           col_name=col_name))[0]["count"]
 
     return (dimension, row_num)
+
 # ------------------------------------------------------------------------

[madlib] branch master updated: DL: fix default_buffer_size calculation in input_data_preprocessor

Reply via email to