Github user kaknikhil commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/218#discussion_r157892828
  
    --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
    @@ -0,0 +1,322 @@
    +# coding=utf-8
    +#
    +# Licensed to the Apache Software Foundation (ASF) under one
    +# or more contributor license agreements.  See the NOTICE file
    +# distributed with this work for additional information
    +# regarding copyright ownership.  The ASF licenses this file
    +# to you under the Apache License, Version 2.0 (the
    +# "License"); you may not use this file except in compliance
    +# with the License.  You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing,
    +# software distributed under the License is distributed on an
    +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    +# KIND, either express or implied.  See the License for the
    +# specific language governing permissions and limitations
    +# under the License.
    +import plpy
    +from utilities.control import MinWarning
    +from utilities.utilities import _assert
    +from utilities.utilities import unique_string
    +from utilities.validate_args import table_exists
    +from utilities.validate_args import columns_exist_in_table
    +from utilities.validate_args import table_is_empty
    +from utilities.validate_args import get_cols
    +from utilities.utilities import py_list_to_sql_string
    +
    +m4_changequote(`<!', `!>')
    +
    +def balance_sample(schema_madlib, source_table, output_table, class_col,
    +    class_sizes, output_table_size, grouping_cols, with_replacement, 
**kwargs):
    +
    +    """
    +    Balance sampling function
    +    Args:
    +        @param source_table     Input table name.
    +        @param output_table     Output table name.
    +        @param class_col        Name of the column containing the class to 
be 
    +                                balanced.
    +        @param with_replacement (Default: FALSE) The sampling method.
    +
    +    """
    +    with MinWarning("warning"):
    +
    +        class_counts = unique_string(desp='class_counts')
    +        
    +        _validate_strs(source_table, output_table, class_col, class_sizes, 
    +            output_table_size, grouping_cols)
    +        source_table_columns = ','.join(get_cols(source_table))
    +        grp_by = "GROUP BY {0}".format(class_col)
    +        """
    +            Frequency table for classes
    +        """
    +        plpy.execute(""" CREATE VIEW {class_counts} AS (
    +                         SELECT 
    +                            {class_col} AS classes, 
    +                            count(*) AS class_count
    +                         FROM {source_table} 
    +                            {grp_by})
    +                     """.format(**locals())) 
    +
    +        if class_sizes.lower() == 'undersample':
    +
    +            if not with_replacement:               
    +                """
    +                    Random undersample without replacement
    +                """
    +                output_sql = """ 
    +                    CREATE TABLE {output_table} AS (
    +                        SELECT 
    +                            {source_table_columns} 
    +                        FROM
    +                            (SELECT 
    +                                *,
    +                                row_number() OVER(PARTITION BY 
    +                                    {class_col} ORDER BY random()) 
    +                                    AS __row_no
    +                            FROM {source_table}) AS foo 
    +                        WHERE __row_no <= 
    +                            (SELECT 
    +                                MIN(class_count) 
    +                             FROM {class_counts}))
    +                    """.format(**locals())
    +                plpy.execute(output_sql)
    +            else:
    +                """
    +                    Random undersample with replacement
    --- End diff --
    
    I think the comments can be more verbose and it might be easier to 
understand the implementation if the comments are consolidated and explained in 
a bit more detail. It's a bit hard to understand by just looking at these 
comments to understand what  the algorithm is doing. 


---

Reply via email to