Frank McQuillan created MADLIB-1295:
---------------------------------------

             Summary: Encoding module is not handling bigint properly
                 Key: MADLIB-1295
                 URL: https://issues.apache.org/jira/browse/MADLIB-1295
             Project: Apache MADlib
          Issue Type: Bug
          Components: Module: Utilities
            Reporter: Frank McQuillan
             Fix For: v1.16



From 
http://madlib.apache.org/docs/latest/group__grp__encode__categorical.html
"all Boolean, integer and text columns are considered categorical columns and 
will be encoded when ‘*’ is specified for this argument."

It works for int, however does not work for bigint:

1) int - OK

{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
    id serial,
    sex character varying,
    length double precision,
    diameter double precision,
    height double precision,
    rings int
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M',    0.455,  0.365,  0.095,  15),
('M',    0.35,   0.265,  0.09,   7),
('F',    0.53,   0.42,   0.135,  9),
('M',    0.44,   0.365,  0.125,  10),
('I',    0.33,   0.255,  0.08,   7),
('I',    0.425,  0.3,    0.095,  8),
('F',    0.53,   0.415,  0.15,   20),
('F',    0.545,  0.425,  0.125,  16),
('M',    0.475,  0.37,   0.125,  9),
(null,   0.55,   0.44,   0.15,   19),
('F',    0.525,  0.38,   0.14,   14),
('M',    0.43,   0.35,   0.11,   10),
('M',    0.49,   0.38,   0.135,  11),
('F',    0.535,  0.405,  0.145,  10),
('F',    0.47,   0.355,  0.1,    10),
('M',    0.5,    0.4,    0.13,   12),
('I',    0.355,  0.28,   0.085,  7),
('F',    0.44,   0.34,   0.1,    10),
('M',    0.365,  0.295,  0.08,   7),
(null,   0.45,   0.32,   0.1,    9);
{code}

{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id'                         -- Index columns
        );
SELECT * FROM abalone_out ORDER BY id;
{code}

{code}
 id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | 
rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9 
----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
  1 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |       
 1 |        0 |        0 |        0 |       0 |       0 |       0
  2 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       1 |       0 |       0
  3 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       1
  4 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
  5 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       1 |       0 |       0
  6 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       1 |       0
  7 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        1 |       0 |       0 |       0
  8 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |       
 0 |        1 |        0 |        0 |       0 |       0 |       0
  9 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       1
 10 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |       
 0 |        0 |        1 |        0 |       0 |       0 |       0
 11 |     1 |     0 |     0 |        0 |        0 |        0 |        1 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
 12 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
 13 |     0 |     0 |     1 |        0 |        1 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
 14 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
 15 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
 16 |     0 |     0 |     1 |        0 |        0 |        1 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
 17 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       1 |       0 |       0
 18 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       0
 19 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       1 |       0 |       0
 20 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |       
 0 |        0 |        0 |        0 |       0 |       0 |       1
(20 rows)
{code}

2) bigint - not OK

{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
    id serial,
    sex character varying,
    length double precision,
    diameter double precision,
    height double precision,
    rings bigint
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M',    0.455,  0.365,  0.095,  15),
('M',    0.35,   0.265,  0.09,   7),
('F',    0.53,   0.42,   0.135,  9),
('M',    0.44,   0.365,  0.125,  10),
('I',    0.33,   0.255,  0.08,   7),
('I',    0.425,  0.3,    0.095,  8),
('F',    0.53,   0.415,  0.15,   20),
('F',    0.545,  0.425,  0.125,  16),
('M',    0.475,  0.37,   0.125,  9),
(null,   0.55,   0.44,   0.15,   19),
('F',    0.525,  0.38,   0.14,   14),
('M',    0.43,   0.35,   0.11,   10),
('M',    0.49,   0.38,   0.135,  11),
('F',    0.535,  0.405,  0.145,  10),
('F',    0.47,   0.355,  0.1,    10),
('M',    0.5,    0.4,    0.13,   12),
('I',    0.355,  0.28,   0.085,  7),
('F',    0.44,   0.34,   0.1,    10),
('M',    0.365,  0.295,  0.08,   7),
(null,   0.45,   0.32,   0.1,    9);
{code}

{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id'                         -- Index columns
        );
SELECT * FROM abalone_out ORDER BY id;
{code}

{code}
 id | sex_F | sex_I | sex_M 
----+-------+-------+-------
  1 |     0 |     0 |     1
  2 |     0 |     0 |     1
  3 |     1 |     0 |     0
  4 |     0 |     0 |     1
  5 |     0 |     1 |     0
  6 |     0 |     1 |     0
  7 |     1 |     0 |     0
  8 |     1 |     0 |     0
  9 |     0 |     0 |     1
 10 |     0 |     0 |     0
 11 |     1 |     0 |     0
 12 |     0 |     0 |     1
 13 |     0 |     0 |     1
 14 |     1 |     0 |     0
 15 |     1 |     0 |     0
 16 |     0 |     0 |     1
 17 |     0 |     1 |     0
 18 |     1 |     0 |     0
 19 |     0 |     0 |     1
 20 |     0 |     0 |     0
(20 rows)
{code}

so it is not encoding rings.




--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to