This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
     new a1bc419  [SYSTEMML-2509] Fix transform sequences of binning/dummy 
coding, tests
a1bc419 is described below

commit a1bc419b033f635273e7b91cf9a8dea329e03567
Author: Matthias Boehm <[email protected]>
AuthorDate: Mon Feb 4 20:56:29 2019 +0100

    [SYSTEMML-2509] Fix transform sequences of binning/dummy coding, tests
    
    This patch is a follow-up on fixing the binning support, specifically
    for columns that are both binned and dummy coded. For an example
    scenario of {recode: [1,2,7], bin: [3,8], dummycode: [3,8]}, we
    incorrectly constructed the following composite encoder (which assumed
    that all dummy coded columns need to be recoded):
    
    CompositeEncoder(4):
    -- EncoderRecode: [1, 2, 3, 7, 8]
    -- EncoderPassThrough: [4, 5, 6, 9]
    -- EncoderDummycode: [3, 8]
    -- EncoderBin: [3, 8]
    
    Now, we fixed that by only adding dummy coded columns that are not
    binned to the recode list and bringing the basic encoders into the right
    sequence (i.e., binning before dummy coding):
    
    CompositeEncoder(4):
    -- EncoderRecode: [1, 2, 7]
    -- EncoderPassThrough: [4, 5, 6, 9]
    -- EncoderBin: [3, 8]
    -- EncoderDummycode: [3, 8]
    
    Finally, this patch also includes the necessary tests to ensure such
    issues don't occur in the future.
---
 .../runtime/transform/encode/EncoderFactory.java   | 15 ++--
 .../transform/TransformFrameEncodeApplyTest.java   | 88 +++++++++++++++++-----
 .../input/homes3/homes.tfspec_binDummy.json        |  6 ++
 .../input/homes3/homes.tfspec_binDummy2.json       |  6 ++
 4 files changed, 88 insertions(+), 27 deletions(-)

diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index 3d2a100..1118ca6 100644
--- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -56,19 +56,20 @@ public class EncoderFactory
                        List<Encoder> lencoders = new ArrayList<>();
                
                        //prepare basic id lists (recode, dummycode, 
pass-through)
-                       //note: any dummycode column requires recode as 
preparation
                        List<Integer> rcIDs = Arrays.asList(ArrayUtils.toObject(
                                        TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_RECODE)));
                        List<Integer> dcIDs = Arrays.asList(ArrayUtils.toObject(
                                        TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_DUMMYCODE))); 
-                       rcIDs = new 
ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs));
                        List<Integer> binIDs = 
TfMetaUtils.parseBinningColIDs(jSpec, colnames);
+                       //note: any dummycode column requires recode as 
preparation, unless it follows binning
+                       rcIDs = new ArrayList<Integer>(
+                               CollectionUtils.union(rcIDs, 
CollectionUtils.subtract(dcIDs, binIDs)));
                        List<Integer> ptIDs = new 
ArrayList<Integer>(CollectionUtils.subtract(
-                                       
CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), 
binIDs)); 
+                                       
CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs));
                        List<Integer> oIDs = Arrays.asList(ArrayUtils.toObject(
-                                       TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_OMIT))); 
+                                       TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_OMIT)));
                        List<Integer> mvIDs = Arrays.asList(ArrayUtils.toObject(
-                                       
TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_IMPUTE))); 
+                                       
TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_IMPUTE)));
                        
                        //create individual encoders
                        if( !rcIDs.isEmpty() ) {
@@ -79,10 +80,10 @@ public class EncoderFactory
                        if( !ptIDs.isEmpty() )
                                lencoders.add(new EncoderPassThrough(
                                        
ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen));
-                       if( !dcIDs.isEmpty() )
-                               lencoders.add(new EncoderDummycode(jSpec, 
colnames, schema.length));
                        if( !binIDs.isEmpty() )
                                lencoders.add(new EncoderBin(jSpec, colnames, 
schema.length));
+                       if( !dcIDs.isEmpty() )
+                               lencoders.add(new EncoderDummycode(jSpec, 
colnames, schema.length));
                        if( !oIDs.isEmpty() )
                                lencoders.add(new EncoderOmit(jSpec, colnames, 
schema.length));
                        if( !mvIDs.isEmpty() ) {
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
index c27a4a2..cb1b66e 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
@@ -31,7 +31,7 @@ import org.apache.sysml.test.integration.TestConfiguration;
 import org.apache.sysml.test.utils.TestUtils;
 import org.apache.sysml.utils.Statistics;
 
-public class TransformFrameEncodeApplyTest extends AutomatedTestBase 
+public class TransformFrameEncodeApplyTest extends AutomatedTestBase
 {
        private final static String TEST_NAME1 = "TransformFrameEncodeApply";
        private final static String TEST_DIR = "functions/transform/";
@@ -39,14 +39,16 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
        
        //dataset and transform tasks without missing values
        private final static String DATASET1 = "homes3/homes.csv";
-       private final static String SPEC1    = 
"homes3/homes.tfspec_recode.json"; 
-       private final static String SPEC1b   = 
"homes3/homes.tfspec_recode2.json"; 
+       private final static String SPEC1    = 
"homes3/homes.tfspec_recode.json";
+       private final static String SPEC1b   = 
"homes3/homes.tfspec_recode2.json";
        private final static String SPEC2    = "homes3/homes.tfspec_dummy.json";
        private final static String SPEC2b   = 
"homes3/homes.tfspec_dummy2.json";
-       private final static String SPEC3    = "homes3/homes.tfspec_bin.json"; 
//incl recode
-       private final static String SPEC3b   = "homes3/homes.tfspec_bin2.json"; 
//incl recode
-       private final static String SPEC6    = 
"homes3/homes.tfspec_recode_dummy.json"; 
-       private final static String SPEC6b   = 
"homes3/homes.tfspec_recode_dummy2.json"; 
+       private final static String SPEC3    = "homes3/homes.tfspec_bin.json"; 
//recode
+       private final static String SPEC3b   = "homes3/homes.tfspec_bin2.json"; 
//recode
+       private final static String SPEC6    = 
"homes3/homes.tfspec_recode_dummy.json";
+       private final static String SPEC6b   = 
"homes3/homes.tfspec_recode_dummy2.json";
+       private final static String SPEC7    = 
"homes3/homes.tfspec_binDummy.json"; //recode+dummy
+       private final static String SPEC7b   = 
"homes3/homes.tfspec_binDummy2.json"; //recode+dummy
        
        //dataset and transform tasks with missing values
        private final static String DATASET2 = "homes/homes.csv";
@@ -55,11 +57,15 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
        private final static String SPEC5    = "homes3/homes.tfspec_omit.json";
        private final static String SPEC5b   = "homes3/homes.tfspec_omit2.json";
        
+       private static final int[] BIN_col3 = new int[]{1,4,2,3,3,2,4};
+       private static final int[] BIN_col8 = new int[]{1,2,2,2,2,2,3};
+       
        public enum TransformType {
                RECODE,
                DUMMY,
                RECODE_DUMMY,
                BIN,
+               BIN_DUMMY,
                IMPUTE,
                OMIT,
        }
@@ -120,10 +126,10 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
                runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", 
TransformType.BIN, false);
        }
        
-       @Test
-       public void testHomesBinningIDsSparkCSV() {
-               runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", 
TransformType.BIN, false);
-       }
+//     @Test
+//     public void testHomesBinningIDsSparkCSV() {
+//             runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", 
TransformType.BIN, false);
+//     }
        
        @Test
        public void testHomesBinningIDsHybridCSV() {
@@ -131,6 +137,21 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
        }
        
        @Test
+       public void testHomesBinningDummyIDsSingleNodeCSV() {
+               runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", 
TransformType.BIN_DUMMY, false);
+       }
+
+//     @Test
+//     public void testHomesBinningDummyIDsSparkCSV() {
+//             runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", 
TransformType.BIN_DUMMY, false);
+//     }
+       
+       @Test
+       public void testHomesBinningDummyIDsHybridCSV() {
+               runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", 
TransformType.BIN_DUMMY, false);
+       }
+       
+       @Test
        public void testHomesOmitIDsSingleNodeCSV() {
                runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", 
TransformType.OMIT, false);
        }
@@ -210,10 +231,10 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
                runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", 
TransformType.BIN, true);
        }
        
-       @Test
-       public void testHomesBinningColnamesSparkCSV() {
-               runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", 
TransformType.BIN, true);
-       }
+//     @Test
+//     public void testHomesBinningColnamesSparkCSV() {
+//             runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", 
TransformType.BIN, true);
+//     }
        
        @Test
        public void testHomesBinningColnamesHybridCSV() {
@@ -221,6 +242,21 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
        }
        
        @Test
+       public void testHomesBinningDummyColnamesSingleNodeCSV() {
+               runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", 
TransformType.BIN_DUMMY, true);
+       }
+       
+//     @Test
+//     public void testHomesBinningDummyColnamesSparkCSV() {
+//             runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", 
TransformType.BIN_DUMMY, true);
+//     }
+       
+       @Test
+       public void testHomesBinningDummyColnamesHybridCSV() {
+               runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", 
TransformType.BIN_DUMMY, true);
+       }
+       
+       @Test
        public void testHomesOmitColnamesSingleNodeCSV() {
                runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", 
TransformType.OMIT, true);
        }
@@ -266,6 +302,7 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
                        case IMPUTE: SPEC = colnames?SPEC4b:SPEC4; DATASET = 
DATASET2; break;
                        case OMIT:   SPEC = colnames?SPEC5b:SPEC5; DATASET = 
DATASET2; break;
                        case RECODE_DUMMY: SPEC = colnames?SPEC6b:SPEC6; 
DATASET = DATASET1; break;
+                       case BIN_DUMMY: SPEC = colnames?SPEC7b:SPEC7; DATASET = 
DATASET1; break;
                }
 
                if( !ofmt.equals("csv") )
@@ -302,12 +339,23 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase
                        
                        //additional checks for binning as encode-decode 
impossible
                        //TODO fix distributed binning as well
-                       if( type == TransformType.BIN && rt != 
RUNTIME_PLATFORM.SPARK ) {
-                               int[] col3 = new int[]{1,4,2,3,3,2,4};
-                               int[] col8 = new int[]{1,2,2,2,2,2,3};
+                       if( type == TransformType.BIN ) {
+                               for(int i=0; i<7; i++) {
+                                       Assert.assertEquals(BIN_col3[i], 
R1[i][2], 1e-8);
+                                       Assert.assertEquals(BIN_col8[i], 
R1[i][7], 1e-8);
+                               }
+                       }
+                       else if( type == TransformType.BIN_DUMMY ) {
+                               Assert.assertEquals(14, R1[0].length);
                                for(int i=0; i<7; i++) {
-                                       Assert.assertEquals(col3[i], R1[i][2], 
1e-8);
-                                       Assert.assertEquals(col8[i], R1[i][7], 
1e-8);
+                                       for(int j=0; j<4; j++) { //check dummy 
coded
+                                               
Assert.assertEquals((j==BIN_col3[i]-1)?
+                                                       1:0, R1[i][2+j], 1e-8);
+                                       }
+                                       for(int j=0; j<3; j++) { //check dummy 
coded
+                                               
Assert.assertEquals((j==BIN_col8[i]-1)?
+                                                       1:0, R1[i][10+j], 1e-8);
+                                       }
                                }
                        }
                }
diff --git 
a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_binDummy.json 
b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_binDummy.json
new file mode 100644
index 0000000..82a2d51
--- /dev/null
+++ 
b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_binDummy.json
@@ -0,0 +1,6 @@
+{
+ "ids": true, "recode": [ 1, 2, 7 ], "bin": [
+ { "id": 8  , "method": "equi-width", "numbins": 3 },
+ { "id": 3, "method": "equi-width", "numbins": 4 }],
+ "dummycode": [ 3, 8 ]
+  }
\ No newline at end of file
diff --git 
a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_binDummy2.json 
b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_binDummy2.json
new file mode 100644
index 0000000..34fd517
--- /dev/null
+++ 
b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_binDummy2.json
@@ -0,0 +1,6 @@
+{
+ "recode": [ zipcode, "district", "view" ], "bin": [
+ { "name": "saleprice"  , "method": "equi-width", "numbins": 3 },
+ { "name": "sqft", "method": "equi-width", "numbins": 4 }],
+ "dummycode": [ sqft, "saleprice" ]
+  }
\ No newline at end of file

Reply via email to