[ https://issues.apache.org/jira/browse/MADLIB-1517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Nikhil Kak updated MADLIB-1517: ------------------------------- Description: Our regression models include the intercept as a predictor in the exported pmml file which isn't ideal when using this pmml file for predictions h1. Reproduction Steps Create glm model and generate pmml {code:java} CREATE TABLE warpbreaks( id serial, breaks integer, wool char(1), tension char(1) ); INSERT INTO warpbreaks(breaks, wool, tension) VALUES (26, 'A', 'L'), (30, 'A', 'L'), (54, 'A', 'L'), (25, 'A', 'L'), (70, 'A', 'L'), (52, 'A', 'L'), (51, 'A', 'L'), (26, 'A', 'L'), (67, 'A', 'L'), (18, 'A', 'M'), (21, 'A', 'M'), (29, 'A', 'M'), (17, 'A', 'M'), (12, 'A', 'M'), (18, 'A', 'M'), (35, 'A', 'M'), (30, 'A', 'M'), (36, 'A', 'M'), (36, 'A', 'H'), (21, 'A', 'H'), (24, 'A', 'H'), (18, 'A', 'H'), (10, 'A', 'H'), (43, 'A', 'H'), (28, 'A', 'H'), (15, 'A', 'H'), (26, 'A', 'H'), (27, 'B', 'L'), (14, 'B', 'L'), (29, 'B', 'L'), (19, 'B', 'L'), (29, 'B', 'L'), (31, 'B', 'L'), (41, 'B', 'L'), (20, 'B', 'L'), (44, 'B', 'L'), (42, 'B', 'M'), (26, 'B', 'M'), (19, 'B', 'M'), (16, 'B', 'M'), (39, 'B', 'M'), (28, 'B', 'M'), (21, 'B', 'M'), (39, 'B', 'M'), (29, 'B', 'M'), (20, 'B', 'H'), (21, 'B', 'H'), (24, 'B', 'H'), (17, 'B', 'H'), (13, 'B', 'H'), (15, 'B', 'H'), (15, 'B', 'H'), (16, 'B', 'H'), (28, 'B', 'H'); SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', 'wool,tension'); DROP TABLE IF EXISTS glm_model, glm_model_summary; SELECT glm('warpbreaks_dummy', 'glm_model', 'breaks', 'ARRAY[1.0,"wool_B","tension_M", "tension_H"]', 'family=poisson, link=log'); COPY (SELECT madlib.pmml('glm_model')) TO '/tmp/glm.pmml'; SELECT madlib.glm_predict(coef, ARRAY[1, 0, 1, 0]::float8[], 'log') FROM glm_model; glm_predict -------------------- 29.097222222222218 SELECT madlib.glm_predict(coef, ARRAY[1, 0, 0, 0]::float8[], 'log') FROM glm_model; glm_predict -------------------- 40.123538011695906 {code} Use pypmml to predict the data using the generated pmml: {code:java} from pypmml import Model model 
= Model.fromFile('/tmp/glm.pmml') data = [{ "wool_B": 0, "tension_M": 0, "tension_H": 0}, { "wool_B": 0, "tension_M": 1, "tension_H": 0}, ] for d in data: result = model.predict(d) print(d) print(result) {'wool_B': 0, 'tension_M': 0, 'tension_H': 0} {'predicted_breaks_pmml_prediction': nan} {'wool_B': 0, 'tension_M': 1, 'tension_H': 0} {'predicted_breaks_pmml_prediction': nan} {code} Obviously the nan results are wrong. To make it work with the existing pmml file, we need to modify the input a bit. {code:java} from pypmml import Model model = Model.fromFile('/tmp/glm.pmml') data = [{ "1.0": 1, "wool_B": 0, "tension_M": 0, "tension_H": 0}, { "1.0": 1, "wool_B": 0, "tension_M": 1, "tension_H": 0} ] for d in data: result = model.predict(d) print(d) print(result) {'1.0': 1, 'wool_B': 0, 'tension_M': 0, 'tension_H': 0} {'predicted_breaks_pmml_prediction': 40.123538011695906} {'1.0': 1, 'wool_B': 0, 'tension_M': 1, 'tension_H': 0} {'predicted_breaks_pmml_prediction': 29.097222222222218} {code} Now these values match the madlib glm_predict output h1. Goal The goal of this story is to fix the pmml code so that we don't need to awkwardly pass the intercept as "'1.0':1" before the beginning of each data row. Note that the name "1.0" is used because that's how it's stored in the pmml file was: Our regression models include the intercept as a predictor in the exported pmml file which isn't ideal when using this pmml file for predictions h1. 
Reproduction Steps Create glm model and generate pmml {code} CREATE TABLE warpbreaks( id serial, breaks integer, wool char(1), tension char(1) ); INSERT INTO warpbreaks(breaks, wool, tension) VALUES (26, 'A', 'L'), (30, 'A', 'L'), (54, 'A', 'L'), (25, 'A', 'L'), (70, 'A', 'L'), (52, 'A', 'L'), (51, 'A', 'L'), (26, 'A', 'L'), (67, 'A', 'L'), (18, 'A', 'M'), (21, 'A', 'M'), (29, 'A', 'M'), (17, 'A', 'M'), (12, 'A', 'M'), (18, 'A', 'M'), (35, 'A', 'M'), (30, 'A', 'M'), (36, 'A', 'M'), (36, 'A', 'H'), (21, 'A', 'H'), (24, 'A', 'H'), (18, 'A', 'H'), (10, 'A', 'H'), (43, 'A', 'H'), (28, 'A', 'H'), (15, 'A', 'H'), (26, 'A', 'H'), (27, 'B', 'L'), (14, 'B', 'L'), (29, 'B', 'L'), (19, 'B', 'L'), (29, 'B', 'L'), (31, 'B', 'L'), (41, 'B', 'L'), (20, 'B', 'L'), (44, 'B', 'L'), (42, 'B', 'M'), (26, 'B', 'M'), (19, 'B', 'M'), (16, 'B', 'M'), (39, 'B', 'M'), (28, 'B', 'M'), (21, 'B', 'M'), (39, 'B', 'M'), (29, 'B', 'M'), (20, 'B', 'H'), (21, 'B', 'H'), (24, 'B', 'H'), (17, 'B', 'H'), (13, 'B', 'H'), (15, 'B', 'H'), (15, 'B', 'H'), (16, 'B', 'H'), (28, 'B', 'H'); SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', 'wool,tension'); DROP TABLE IF EXISTS glm_model, glm_model_summary; SELECT glm('warpbreaks_dummy', 'glm_model', 'breaks', 'ARRAY[1.0,"wool_B","tension_M", "tension_H"]', 'family=poisson, link=log'); COPY (SELECT madlib.pmml('glm_model')) TO '/tmp/glm.pmml'; SELECT madlib.glm_predict(coef, ARRAY[1, 0, 1, 0]::float8[], 'log') FROM glm_model; glm_predict -------------------- 29.097222222222218 SELECT madlib.glm_predict(coef, ARRAY[1, 0, 0, 0]::float8[], 'log') FROM glm_model; glm_predict -------------------- 40.123538011695906 {code} Use pypmml to predict the data using the generated pmml: {code} from pypmml import Model model = Model.fromFile('/tmp/glm.pmml') data = [{ "wool_B": 0, "tension_M": 0, "tension_H": 0}, { "wool_B": 0, "tension_M": 1, "tension_H": 0}, ] for d in data: result = model.predict(d) print(d) print(result) {'wool_B': 0, 'tension_M': 0, 
'tension_H': 0} {'predicted_breaks_pmml_prediction': nan} {'wool_B': 0, 'tension_M': 1, 'tension_H': 0} {'predicted_breaks_pmml_prediction': nan} {code} Obviously the nan results are wrong. To make it work with the existing pmml file, we need to modify the input a bit. {code} from pypmml import Model model = Model.fromFile('/tmp/glm.pmml') data = [{ "1.0": 1, "wool_B": 0, "tension_M": 0, "tension_H": 0}, { "1.0": 1, "wool_B": 0, "tension_M": 1, "tension_H": 0} ] for d in data: result = model.predict(d) print(d) print(result) {'1.0': 1, 'wool_B': 0, 'tension_M': 0, 'tension_H': 0} {'predicted_breaks_pmml_prediction': 40.123538011695906} {'1.0': 1, 'wool_B': 0, 'tension_M': 1, 'tension_H': 0} {'predicted_breaks_pmml_prediction': 29.097222222222218} {code} Now these values match the madlib glm_predict output h1. Goal The goal of this story is to fix the pmml code so that we don't need to awkwardly pass the intercept as "'1.0':1" before the beginning of each data row. Note that the name "1.0" is used because that's how it's stored in the pmml file > Do not include intercept as predictor for regression models > ----------------------------------------------------------- > > Key: MADLIB-1517 > URL: https://issues.apache.org/jira/browse/MADLIB-1517 > Project: Apache MADlib > Issue Type: Bug > Components: Module: Utilities > Reporter: Ekta Khanna > Priority: Major > Fix For: v2.2.0 > > > Our regression models include the intercept as a predictor in the exported > pmml file which isn't ideal when using this pmml file for predictions > h1. 
Reproduction Steps > Create glm model and generate pmml > {code:java} > CREATE TABLE warpbreaks( > id serial, > breaks integer, > wool char(1), > tension char(1) > ); > INSERT INTO warpbreaks(breaks, wool, tension) VALUES > (26, 'A', 'L'), > (30, 'A', 'L'), > (54, 'A', 'L'), > (25, 'A', 'L'), > (70, 'A', 'L'), > (52, 'A', 'L'), > (51, 'A', 'L'), > (26, 'A', 'L'), > (67, 'A', 'L'), > (18, 'A', 'M'), > (21, 'A', 'M'), > (29, 'A', 'M'), > (17, 'A', 'M'), > (12, 'A', 'M'), > (18, 'A', 'M'), > (35, 'A', 'M'), > (30, 'A', 'M'), > (36, 'A', 'M'), > (36, 'A', 'H'), > (21, 'A', 'H'), > (24, 'A', 'H'), > (18, 'A', 'H'), > (10, 'A', 'H'), > (43, 'A', 'H'), > (28, 'A', 'H'), > (15, 'A', 'H'), > (26, 'A', 'H'), > (27, 'B', 'L'), > (14, 'B', 'L'), > (29, 'B', 'L'), > (19, 'B', 'L'), > (29, 'B', 'L'), > (31, 'B', 'L'), > (41, 'B', 'L'), > (20, 'B', 'L'), > (44, 'B', 'L'), > (42, 'B', 'M'), > (26, 'B', 'M'), > (19, 'B', 'M'), > (16, 'B', 'M'), > (39, 'B', 'M'), > (28, 'B', 'M'), > (21, 'B', 'M'), > (39, 'B', 'M'), > (29, 'B', 'M'), > (20, 'B', 'H'), > (21, 'B', 'H'), > (24, 'B', 'H'), > (17, 'B', 'H'), > (13, 'B', 'H'), > (15, 'B', 'H'), > (15, 'B', 'H'), > (16, 'B', 'H'), > (28, 'B', 'H'); > SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', > 'wool,tension'); > DROP TABLE IF EXISTS glm_model, glm_model_summary; > SELECT glm('warpbreaks_dummy', > 'glm_model', > 'breaks', > 'ARRAY[1.0,"wool_B","tension_M", "tension_H"]', > 'family=poisson, link=log'); > COPY (SELECT madlib.pmml('glm_model')) TO '/tmp/glm.pmml'; > SELECT madlib.glm_predict(coef, ARRAY[1, 0, 1, 0]::float8[], 'log') FROM > glm_model; > glm_predict > -------------------- > 29.097222222222218 > SELECT madlib.glm_predict(coef, ARRAY[1, 0, 0, 0]::float8[], 'log') FROM > glm_model; > glm_predict > -------------------- > 40.123538011695906 > {code} > Use pypmml to predict the data using the generated pmml: > {code:java} > from pypmml import Model > model = Model.fromFile('/tmp/glm.pmml') > data = [{ 
"wool_B": 0, "tension_M": 0, "tension_H": 0}, > { "wool_B": 0, "tension_M": 1, "tension_H": 0}, > ] > for d in data: > result = model.predict(d) > print(d) > print(result) > {'wool_B': 0, 'tension_M': 0, 'tension_H': 0} > {'predicted_breaks_pmml_prediction': nan} > {'wool_B': 0, 'tension_M': 1, 'tension_H': 0} > {'predicted_breaks_pmml_prediction': nan} > {code} > Obviously the nan results are wrong. To make it work with the existing pmml > file, we need to modify the input a bit. > {code:java} > from pypmml import Model > model = Model.fromFile('/tmp/glm.pmml') > data = [{ "1.0": 1, "wool_B": 0, "tension_M": 0, "tension_H": 0}, > { "1.0": 1, "wool_B": 0, "tension_M": 1, "tension_H": 0} > ] > for d in data: > result = model.predict(d) > print(d) > print(result) > {'1.0': 1, 'wool_B': 0, 'tension_M': 0, 'tension_H': 0} > {'predicted_breaks_pmml_prediction': 40.123538011695906} > {'1.0': 1, 'wool_B': 0, 'tension_M': 1, 'tension_H': 0} > {'predicted_breaks_pmml_prediction': 29.097222222222218} > {code} > Now these values match the madlib glm_predict output > h1. Goal > The goal of this story is to fix the pmml code so that we don't need to > awkwardly pass the intercept as "'1.0':1" before the beginning of each data > row. Note the the name "1.0" is used because that's how it's stored in the > pmml file -- This message was sent by Atlassian Jira (v8.20.10#820010)