This is an automated email from the ASF dual-hosted git repository.
njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new 6c8643c DL: fix default_buffer_size calculation in
input_data_preprocessor
6c8643c is described below
commit 6c8643ceb318f5d2fcb281233d483da321386958
Author: Jingyi Mei <[email protected]>
AuthorDate: Tue May 28 17:47:51 2019 -0700
DL: fix default_buffer_size calculation in input_data_preprocessor
JIRA: MADLIB-1340
In the buffer size calculator, we assumed the array to be packed is one
dimensional, and used array_upper(x, 1) to get the length
of the array. In the DL input data preprocessor, we passed the first element
in array_ndims as the length, which is not right because the
array can be multi-dimensional. This resulted in
default_buffer_size_calculator returning a bigger buffer size than it
was supposed to, leading to the 1GB limit error.
Instead, we should use the product of all the elements from array_ndims to
represent the actual length of the array. For example, if array_ndims
returns [32,32,3], we should pass 32*32*3 instead of 32. This commit
fixes this issue by passing the right length to the default buffer calculator.
Closes #401
Co-authored-by: Nikhil Kak <[email protected]>
---
.../deep_learning/input_data_preprocessor.py_in | 10 ++++----
src/ports/postgres/modules/internal/db_utils.py_in | 28 ++++++++++++++++++++--
.../postgres/modules/utilities/validate_args.py_in | 1 +
3 files changed, 32 insertions(+), 7 deletions(-)
diff --git
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index bc42f63..2b78a3b 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -27,6 +27,7 @@ import plpy
from internal.db_utils import get_distinct_col_levels
from internal.db_utils import quote_literal
+from internal.db_utils import get_product_of_dimensions
from utilities.minibatch_preprocessing import MiniBatchBufferSizeCalculator
from utilities.utilities import _assert
from utilities.utilities import add_postfix
@@ -40,7 +41,6 @@ from utilities.utilities import strip_end_quotes
from utilities.utilities import unique_string
from utilities.utilities import validate_module_input_params
-from utilities.validate_args import _tbl_dimension_rownum
from utilities.validate_args import input_tbl_valid
from utilities.validate_args import get_expr_type
@@ -288,13 +288,13 @@ class InputDataPreprocessorDL(object):
SELECT count(*) AS cnt FROM {0}
""".format(self.source_table))[0]['cnt']
buffer_size_calculator = MiniBatchBufferSizeCalculator()
- indepdent_var_dim = _tbl_dimension_rownum(
- self.schema_madlib, self.source_table,
- self.independent_varname, skip_row_count=True)
+ indepdent_var_dim = get_product_of_dimensions(self.source_table,
+ self.independent_varname)
self.buffer_size =
buffer_size_calculator.calculate_default_buffer_size(
- self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0])
+ self.buffer_size, num_rows_in_tbl, indepdent_var_dim)
return ceil((1.0 * num_rows_in_tbl) / self.buffer_size)
+
class ValidationDataPreprocessorDL(InputDataPreprocessorDL):
def __init__(self, schema_madlib, source_table, output_table,
dependent_varname, independent_varname,
diff --git a/src/ports/postgres/modules/internal/db_utils.py_in
b/src/ports/postgres/modules/internal/db_utils.py_in
index 934107c..90c09ca 100644
--- a/src/ports/postgres/modules/internal/db_utils.py_in
+++ b/src/ports/postgres/modules/internal/db_utils.py_in
@@ -19,6 +19,7 @@
import plpy
from utilities.utilities import is_psql_char_type
+from utilities.validate_args import get_col_dimension
from utilities.validate_args import get_expr_type
m4_changequote(`<!', `!>')
@@ -26,8 +27,7 @@ m4_changequote(`<!', `!>')
QUOTE_DELIMITER="$__madlib__$"
-def get_distinct_col_levels(source_table, col_name, col_type=None,
- include_nulls=False):
+def get_distinct_col_levels(source_table, col_name, col_type=None,
include_nulls=False):
"""
Add description here
:return:
@@ -93,3 +93,27 @@ def is_col_1d_array(source_table, col_name):
""".format(col_name, source_table)
result = plpy.execute(query)
return result[0]["n_y"]
+
+#
------------------------------------------------------------------------------
+# This function runs postgres array_ndims function to get
+# the dimension of an array. For example if it is a 3
+# dimension array it will be an array with 3 elements
+# like [32,32,3].
+def get_ndims(source_table, col_name):
+ array_ndims = plpy.execute("""
+ SELECT array_ndims({0}) AS ndims
+ FROM {1}
+ """.format(col_name, source_table), 1)[0]['ndims']
+ return array_ndims
+
+# This function is to calculate the total `length` of a
+# multi dimensional array. For example, if an array is
+# with 3 dimensions and ndims=[32,32,3], this function
+# will return the product of them, which is 32*32*3
+def get_product_of_dimensions(source_table, col_name):
+ ndims = get_ndims(source_table, col_name)
+ dimension_product = 1
+ for i in range(1, ndims + 1):
+ dimension = get_col_dimension(source_table, col_name, i)
+ dimension_product *= dimension
+ return dimension_product
diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in
b/src/ports/postgres/modules/utilities/validate_args.py_in
index b35e7ad..073563c 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -635,6 +635,7 @@ def _tbl_dimension_rownum(schema_madlib, tbl, col_name,
skip_row_count=False):
col_name=col_name))[0]["count"]
return (dimension, row_num)
+
# ------------------------------------------------------------------------