Github user fmcquillan99 commented on the issue:
https://github.com/apache/madlib/pull/256
LGTM
Default selection looks reasonable:
(0) data
```
-- Fixture: a 52-row subset of the iris data set, tagged with a "state" column
-- so the same table can be used both with and without grouping.
--   Alaska    : ids  1-27 (12 Iris_setosa, 15 Iris_versicolor)
--   Tennessee : ids 28-52 (10 Iris_setosa, 15 Iris_versicolor)
DROP TABLE IF EXISTS iris_data;

CREATE TABLE iris_data (
    id          SERIAL,
    attributes  NUMERIC[],  -- four numeric measurements per row
    class_text  TEXT,       -- class label as text
    class       INTEGER,    -- class label as integer (1 = Iris_setosa, 2 = Iris_versicolor)
    state       TEXT        -- used as the grouping column in the grouped runs
);

INSERT INTO iris_data (id, attributes, class_text, class, state)
VALUES
    -- Alaska, Iris_setosa
    (1,  ARRAY[5.0,3.2,1.2,0.2], 'Iris_setosa', 1, 'Alaska'),
    (2,  ARRAY[5.5,3.5,1.3,0.2], 'Iris_setosa', 1, 'Alaska'),
    (3,  ARRAY[4.9,3.1,1.5,0.1], 'Iris_setosa', 1, 'Alaska'),
    (4,  ARRAY[4.4,3.0,1.3,0.2], 'Iris_setosa', 1, 'Alaska'),
    (5,  ARRAY[5.1,3.4,1.5,0.2], 'Iris_setosa', 1, 'Alaska'),
    (6,  ARRAY[5.0,3.5,1.3,0.3], 'Iris_setosa', 1, 'Alaska'),
    (7,  ARRAY[4.5,2.3,1.3,0.3], 'Iris_setosa', 1, 'Alaska'),
    (8,  ARRAY[4.4,3.2,1.3,0.2], 'Iris_setosa', 1, 'Alaska'),
    (9,  ARRAY[5.0,3.5,1.6,0.6], 'Iris_setosa', 1, 'Alaska'),
    (10, ARRAY[5.1,3.8,1.9,0.4], 'Iris_setosa', 1, 'Alaska'),
    (11, ARRAY[4.8,3.0,1.4,0.3], 'Iris_setosa', 1, 'Alaska'),
    (12, ARRAY[5.1,3.8,1.6,0.2], 'Iris_setosa', 1, 'Alaska'),
    -- Alaska, Iris_versicolor
    (13, ARRAY[5.7,2.8,4.5,1.3], 'Iris_versicolor', 2, 'Alaska'),
    (14, ARRAY[6.3,3.3,4.7,1.6], 'Iris_versicolor', 2, 'Alaska'),
    (15, ARRAY[4.9,2.4,3.3,1.0], 'Iris_versicolor', 2, 'Alaska'),
    (16, ARRAY[6.6,2.9,4.6,1.3], 'Iris_versicolor', 2, 'Alaska'),
    (17, ARRAY[5.2,2.7,3.9,1.4], 'Iris_versicolor', 2, 'Alaska'),
    (18, ARRAY[5.0,2.0,3.5,1.0], 'Iris_versicolor', 2, 'Alaska'),
    (19, ARRAY[5.9,3.0,4.2,1.5], 'Iris_versicolor', 2, 'Alaska'),
    (20, ARRAY[6.0,2.2,4.0,1.0], 'Iris_versicolor', 2, 'Alaska'),
    (21, ARRAY[6.1,2.9,4.7,1.4], 'Iris_versicolor', 2, 'Alaska'),
    (22, ARRAY[5.6,2.9,3.6,1.3], 'Iris_versicolor', 2, 'Alaska'),
    (23, ARRAY[6.7,3.1,4.4,1.4], 'Iris_versicolor', 2, 'Alaska'),
    (24, ARRAY[5.6,3.0,4.5,1.5], 'Iris_versicolor', 2, 'Alaska'),
    (25, ARRAY[5.8,2.7,4.1,1.0], 'Iris_versicolor', 2, 'Alaska'),
    (26, ARRAY[6.2,2.2,4.5,1.5], 'Iris_versicolor', 2, 'Alaska'),
    (27, ARRAY[5.6,2.5,3.9,1.1], 'Iris_versicolor', 2, 'Alaska'),
    -- Tennessee, Iris_setosa
    (28, ARRAY[5.0,3.4,1.5,0.2], 'Iris_setosa', 1, 'Tennessee'),
    (29, ARRAY[4.4,2.9,1.4,0.2], 'Iris_setosa', 1, 'Tennessee'),
    (30, ARRAY[4.9,3.1,1.5,0.1], 'Iris_setosa', 1, 'Tennessee'),
    (31, ARRAY[5.4,3.7,1.5,0.2], 'Iris_setosa', 1, 'Tennessee'),
    (32, ARRAY[4.8,3.4,1.6,0.2], 'Iris_setosa', 1, 'Tennessee'),
    (33, ARRAY[4.8,3.0,1.4,0.1], 'Iris_setosa', 1, 'Tennessee'),
    (34, ARRAY[4.3,3.0,1.1,0.1], 'Iris_setosa', 1, 'Tennessee'),
    (35, ARRAY[5.8,4.0,1.2,0.2], 'Iris_setosa', 1, 'Tennessee'),
    (36, ARRAY[5.7,4.4,1.5,0.4], 'Iris_setosa', 1, 'Tennessee'),
    (37, ARRAY[5.4,3.9,1.3,0.4], 'Iris_setosa', 1, 'Tennessee'),
    -- Tennessee, Iris_versicolor
    (38, ARRAY[6.0,2.9,4.5,1.5], 'Iris_versicolor', 2, 'Tennessee'),
    (39, ARRAY[5.7,2.6,3.5,1.0], 'Iris_versicolor', 2, 'Tennessee'),
    (40, ARRAY[5.5,2.4,3.8,1.1], 'Iris_versicolor', 2, 'Tennessee'),
    (41, ARRAY[5.5,2.4,3.7,1.0], 'Iris_versicolor', 2, 'Tennessee'),
    (42, ARRAY[5.8,2.7,3.9,1.2], 'Iris_versicolor', 2, 'Tennessee'),
    (43, ARRAY[6.0,2.7,5.1,1.6], 'Iris_versicolor', 2, 'Tennessee'),
    (44, ARRAY[5.4,3.0,4.5,1.5], 'Iris_versicolor', 2, 'Tennessee'),
    (45, ARRAY[6.0,3.4,4.5,1.6], 'Iris_versicolor', 2, 'Tennessee'),
    (46, ARRAY[6.7,3.1,4.7,1.5], 'Iris_versicolor', 2, 'Tennessee'),
    (47, ARRAY[6.3,2.3,4.4,1.3], 'Iris_versicolor', 2, 'Tennessee'),
    (48, ARRAY[5.6,3.0,4.1,1.3], 'Iris_versicolor', 2, 'Tennessee'),
    (49, ARRAY[5.5,2.5,4.0,1.3], 'Iris_versicolor', 2, 'Tennessee'),
    (50, ARRAY[5.5,2.6,4.4,1.2], 'Iris_versicolor', 2, 'Tennessee'),
    (51, ARRAY[6.1,3.0,4.6,1.4], 'Iris_versicolor', 2, 'Tennessee'),
    (52, ARRAY[5.8,2.6,4.0,1.2], 'Iris_versicolor', 2, 'Tennessee');
```
(1) no groups, 2 segments, default buffer size
```
select * from iris_data_packed_summary;
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 26
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
```
(2) no groups, 2 segments, buffer size=10
```
madlib=# select * from iris_data_packed_summary;
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 10
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
```
(3) groups, 2 segments, default buffer size
```
select * from iris_data_packed_summary;
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 13
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols | state
```
```
select __id__, state , dependent_varname from iris_data_packed order by
state, __id__;
__id__ | state | dependent_varname
--------+-----------+---------------------------------------------------------------------------------
0 | Alaska | {{1,0},{0,1},{0,1},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0}}
1 | Alaska | {{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{0,1}}
2 | Alaska | {{0,1}}
0 | Tennessee | {{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1}}
1 | Tennessee | {{0,1},{1,0},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0}}
(5 rows)
```
^^^ The buffer size above is derived from the average group size and the
number of segments: Alaska has 27 rows and Tennessee has 25, so the average
group size is 26; with 2 segments, the buffer size is 26/2 = 13.
(4) groups, 2 segments, buffer size=10
```
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 10
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols | state
```
```
select __id__, state , dependent_varname from iris_data_packed order by
state, __id__;
__id__ | state | dependent_varname
--------+-----------+---------------------------------------------------------------
0 | Alaska | {{0,1},{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1}}
1 | Alaska | {{0,1},{0,1},{1,0},{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1}}
2 | Alaska | {{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}}
0 | Tennessee | {{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{0,1}}
1 | Tennessee | {{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0}}
2 | Tennessee | {{1,0},{1,0},{0,1},{1,0},{0,1}}
(6 rows)
```
(5) mnist
Tested the MNIST training set of 60,000 rows and got a buffer size of 30,000,
which is correct for 2 segments (60,000/2 = 30,000).
---