Github user fmcquillan99 commented on the issue:

    https://github.com/apache/madlib/pull/256
  
    LGTM
    
    Default selection looks reasonable:
    
    (0) data
    ```
    DROP TABLE IF EXISTS iris_data;
    CREATE TABLE iris_data(
        id serial,
        attributes numeric[],
        class_text text,
        class integer,
        state text
    );
    INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
    (1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
    (2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
    (3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
    (4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
    (5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
    (6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
    (7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
    (8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
    (9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
    (10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
    (11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
    (12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
    (13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
    (14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
    (15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
    (16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
    (17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
    (18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
    (19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
    (20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
    (21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
    (22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
    (23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
    (24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
    (25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
    (26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
    (27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
    (28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
    (29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
    (30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
    (31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
    (32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
    (33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
    (34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
    (35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
    (36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
    (37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
    (38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
    (39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
    (40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
    (41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
    (42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
    (43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
    (44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
    (45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
    (46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
    (47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
    (48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
    (49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
    (50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
    (51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
    (52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
    ```
    
    
    (1) no groups, 2 segments, default buffer size
    ```
    select * from iris_data_packed_summary;
    
    -[ RECORD 1 ]------------+------------------------------
    source_table             | iris_data
    output_table             | iris_data_packed
    dependent_varname        | class_text
    independent_varname      | attributes
    buffer_size              | 26
    class_values             | {Iris_setosa,Iris_versicolor}
    num_rows_processed       | 52
    num_missing_rows_skipped | 0
    grouping_cols            | 
    ```
    
    
    (2) no groups, 2 segments, buffer size=10
    ```
    madlib=# select * from iris_data_packed_summary;
    
    -[ RECORD 1 ]------------+------------------------------
    source_table             | iris_data
    output_table             | iris_data_packed
    dependent_varname        | class_text
    independent_varname      | attributes
    buffer_size              | 10
    class_values             | {Iris_setosa,Iris_versicolor}
    num_rows_processed       | 52
    num_missing_rows_skipped | 0
    grouping_cols            | 
    ```
    
    
    (3) groups, 2 segments, default buffer size
    ```
    select * from iris_data_packed_summary;
    
    -[ RECORD 1 ]------------+------------------------------
    source_table             | iris_data
    output_table             | iris_data_packed
    dependent_varname        | class_text
    independent_varname      | attributes
    buffer_size              | 13
    class_values             | {Iris_setosa,Iris_versicolor}
    num_rows_processed       | 52
    num_missing_rows_skipped | 0
    grouping_cols            | state
    ```
    
    ```
    select __id__, state , dependent_varname from iris_data_packed order by state, __id__;

     __id__ |   state   |                                dependent_varname
    --------+-----------+---------------------------------------------------------------------------------
          0 | Alaska    | {{1,0},{0,1},{0,1},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0}}
          1 | Alaska    | {{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{0,1}}
          2 | Alaska    | {{0,1}}
          0 | Tennessee | {{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1}}
          1 | Tennessee | {{0,1},{1,0},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0}}
    (5 rows)
    ```
    
    ^^^ The buffer size above is based on the average group size
    (Alaska=27, Tennessee=25, so avg=26)
    divided by the number of segments (2),
    i.e., 26/2=13.
    
    
    (4) groups, 2 segments, buffer size=10
    ```
    -[ RECORD 1 ]------------+------------------------------
    source_table             | iris_data
    output_table             | iris_data_packed
    dependent_varname        | class_text
    independent_varname      | attributes
    buffer_size              | 10
    class_values             | {Iris_setosa,Iris_versicolor}
    num_rows_processed       | 52
    num_missing_rows_skipped | 0
    grouping_cols            | state
    ```
    
    ```
    select __id__, state , dependent_varname from iris_data_packed order by state, __id__;

     __id__ |   state   |                       dependent_varname
    --------+-----------+---------------------------------------------------------------
          0 | Alaska    | {{0,1},{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1}}
          1 | Alaska    | {{0,1},{0,1},{1,0},{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1}}
          2 | Alaska    | {{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}}
          0 | Tennessee | {{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{0,1}}
          1 | Tennessee | {{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0}}
          2 | Tennessee | {{1,0},{1,0},{0,1},{1,0},{0,1}}
    (6 rows)
    ```
    
    (5) mnist
    Tested the MNIST training set of 60,000 rows and got a buffer size of
    30,000, which is correct for 2 segments.


---

Reply via email to