[ 
https://issues.apache.org/jira/browse/MADLIB-1517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Nikhil Kak updated MADLIB-1517:
-------------------------------
    Description: 
Our regression models include the intercept as a predictor in the exported PMML 
file, which isn't ideal when using that PMML file for predictions.
h1. Reproduction Steps

Create a GLM model and export it to PMML:
{code:sql}
CREATE TABLE warpbreaks(
    id      serial,
    breaks  integer,
    wool    char(1),
    tension char(1)
);
INSERT INTO warpbreaks(breaks, wool, tension) VALUES
(26, 'A', 'L'),
(30, 'A', 'L'),
(54, 'A', 'L'),
(25, 'A', 'L'),
(70, 'A', 'L'),
(52, 'A', 'L'),
(51, 'A', 'L'),
(26, 'A', 'L'),
(67, 'A', 'L'),
(18, 'A', 'M'),
(21, 'A', 'M'),
(29, 'A', 'M'),
(17, 'A', 'M'),
(12, 'A', 'M'),
(18, 'A', 'M'),
(35, 'A', 'M'),
(30, 'A', 'M'),
(36, 'A', 'M'),
(36, 'A', 'H'),
(21, 'A', 'H'),
(24, 'A', 'H'),
(18, 'A', 'H'),
(10, 'A', 'H'),
(43, 'A', 'H'),
(28, 'A', 'H'),
(15, 'A', 'H'),
(26, 'A', 'H'),
(27, 'B', 'L'),
(14, 'B', 'L'),
(29, 'B', 'L'),
(19, 'B', 'L'),
(29, 'B', 'L'),
(31, 'B', 'L'),
(41, 'B', 'L'),
(20, 'B', 'L'),
(44, 'B', 'L'),
(42, 'B', 'M'),
(26, 'B', 'M'),
(19, 'B', 'M'),
(16, 'B', 'M'),
(39, 'B', 'M'),
(28, 'B', 'M'),
(21, 'B', 'M'),
(39, 'B', 'M'),
(29, 'B', 'M'),
(20, 'B', 'H'),
(21, 'B', 'H'),
(24, 'B', 'H'),
(17, 'B', 'H'),
(13, 'B', 'H'),
(15, 'B', 'H'),
(15, 'B', 'H'),
(16, 'B', 'H'),
(28, 'B', 'H');
SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', 
'wool,tension');

DROP TABLE IF EXISTS glm_model, glm_model_summary;
SELECT glm('warpbreaks_dummy',
           'glm_model',
           'breaks',
           'ARRAY[1.0,"wool_B","tension_M", "tension_H"]',
           'family=poisson, link=log');
COPY (SELECT madlib.pmml('glm_model')) TO '/tmp/glm.pmml';

SELECT madlib.glm_predict(coef,  ARRAY[1, 0, 1, 0]::float8[], 'log') FROM 
glm_model;
    glm_predict
--------------------
 29.097222222222218

SELECT madlib.glm_predict(coef, ARRAY[1, 0, 0, 0]::float8[], 'log') FROM 
glm_model;
    glm_predict
--------------------
 40.123538011695906
{code}
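Before moving to prediction, it can help to check how the intercept is represented in the export. The snippet below is only an inspection sketch (not MADlib code): it lists the DataField and MiningField entries in the exported file, stripping the PMML namespace generically since the exact schema version may vary; per the behaviour described later, the intercept shows up among them as a field named "1.0".
{code:python}
# A minimal inspection sketch (not MADlib code): print the fields declared in
# the exported PMML to see how the intercept is represented.
import xml.etree.ElementTree as ET

root = ET.parse('/tmp/glm.pmml').getroot()

def local_name(tag):
    # ElementTree prefixes tags with '{namespace}'; keep only the local name.
    return tag.rsplit('}', 1)[-1]

for elem in root.iter():
    if local_name(elem.tag) in ('DataField', 'MiningField'):
        print(local_name(elem.tag), elem.get('name'))
{code}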
Use pypmml to run predictions against the generated PMML file:
{code:python}
from pypmml import Model
model = Model.fromFile('/tmp/glm.pmml')
data = [{ "wool_B": 0, "tension_M": 0, "tension_H": 0},
        { "wool_B": 0, "tension_M": 1, "tension_H": 0},
        ]
for d in data:
    result = model.predict(d)
    print(d)
    print(result)

# Output:
{'wool_B': 0, 'tension_M': 0, 'tension_H': 0}
{'predicted_breaks_pmml_prediction': nan}
{'wool_B': 0, 'tension_M': 1, 'tension_H': 0}
{'predicted_breaks_pmml_prediction': nan}
{code}
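A quick way to see why these rows fail is to ask the loaded model which input fields it declares; given the behaviour above, the list should include the intercept field "1.0". This diagnostic sketch relies on pypmml's inputNames property:
{code:python}
# Diagnostic sketch: list the input fields the PMML model declares.
# If the intercept field "1.0" appears here, that explains why rows
# without it predict nan.
from pypmml import Model

model = Model.fromFile('/tmp/glm.pmml')
print(model.inputNames)
{code}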
Obviously the nan results are wrong. To make prediction work with the existing 
PMML file, we need to modify the input a bit:
{code:python}
from pypmml import Model
model = Model.fromFile('/tmp/glm.pmml')
data = [{ "1.0": 1, "wool_B": 0, "tension_M": 0, "tension_H": 0},
        { "1.0": 1, "wool_B": 0, "tension_M": 1, "tension_H": 0}
        ]


for d in data:
    result = model.predict(d)
    print(d)
    print(result)

# Output:
{'1.0': 1, 'wool_B': 0, 'tension_M': 0, 'tension_H': 0}
{'predicted_breaks_pmml_prediction': 40.123538011695906}
{'1.0': 1, 'wool_B': 0, 'tension_M': 1, 'tension_H': 0}
{'predicted_breaks_pmml_prediction': 29.097222222222218}
{code}
These values now match the MADlib glm_predict output above.
h1. Goal

The goal of this story is to fix the PMML export code so that we don't need to 
awkwardly pass the intercept as "1.0": 1 in every data row. Note that the name 
"1.0" is used because that's how the intercept is stored in the PMML file.
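Until the export itself is fixed, one interim option is to hide the extra key behind a small wrapper at prediction time. This is only a sketch (the helper name and the INTERCEPT_FIELD constant are ours, not part of MADlib or pypmml); it injects the intercept field into each input row before calling pypmml:
{code:python}
# Interim workaround sketch (not part of MADlib or pypmml): inject the
# intercept field into every row before predicting, so callers don't have to
# remember the awkward "1.0": 1 key themselves.
from pypmml import Model

INTERCEPT_FIELD = "1.0"  # assumed to match the intercept name in the exported PMML

def predict_with_intercept(model, row, intercept_field=INTERCEPT_FIELD):
    # Copy the row so the caller's dict is not mutated, then add the constant
    # intercept term that the exported model currently expects as an input.
    full_row = dict(row)
    full_row.setdefault(intercept_field, 1)
    return model.predict(full_row)

model = Model.fromFile('/tmp/glm.pmml')
print(predict_with_intercept(model, {"wool_B": 0, "tension_M": 0, "tension_H": 0}))
# Expected (from the run above): {'predicted_breaks_pmml_prediction': 40.123538011695906}
{code}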


> Do not include intercept as predictor for regression models
> -----------------------------------------------------------
>
>                 Key: MADLIB-1517
>                 URL: https://issues.apache.org/jira/browse/MADLIB-1517
>             Project: Apache MADlib
>          Issue Type: Bug
>          Components: Module: Utilities
>            Reporter: Ekta Khanna
>            Priority: Major
>             Fix For: v2.2.0
>
>

