This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 8dc11d9  [SYSTEMDS-3227] Equi-height binning in transformencode/apply
8dc11d9 is described below

commit 8dc11d953a18d80b73ae5c8412f1daf3c1962318
Author: hieale <[email protected]>
AuthorDate: Sun Jan 23 20:40:38 2022 +0100

    [SYSTEMDS-3227] Equi-height binning in transformencode/apply
    
    This patch extends the built-in functions transformencode and 
transformapply by equi-height binning on the local runtime. For this purpose, 
the selected column is sorted, and the bin boundaries are calculated as 
quantiles, which are then converted to input indices and applied as bin 
boundaries, analogous to the already existing equi-width binning. The binning 
problem for non-divisible bin numbers has been addressed with spillover. This 
has the advantage that all but the last bin are guaranteed [...]
    
    DIA project WS2021/22.
    Closes #1517.
    
    Co-authored-by: Clemens Hofmann <[email protected]>
    Co-authored-by: Staša Mandic <[email protected]>
---
 .../runtime/transform/encode/ColumnEncoderBin.java | 54 ++++++++++++++++++++--
 .../runtime/transform/encode/EncoderFactory.java   | 11 ++++-
 .../transform/TransformFrameEncodeApplyTest.java   | 50 ++++++++++++++++++--
 .../homes3/homes.tfspec_binHeightDummy.json        |  6 +++
 .../homes3/homes.tfspec_binHeightDummy2.json       |  6 +++
 .../datasets/homes3/homes.tfspec_bin_height.json   |  5 ++
 .../datasets/homes3/homes.tfspec_bin_height2.json  |  5 ++
 7 files changed, 126 insertions(+), 11 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java 
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
index 005c2c2..d6e79c4 100644
--- 
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
+++ 
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
@@ -42,6 +42,8 @@ public class ColumnEncoderBin extends ColumnEncoder {
        public static final String NBINS_PREFIX = "nbins";
        private static final long serialVersionUID = 1917445005206076078L;
        protected int _numBin = -1;
+       private BinMethod _binMethod = BinMethod.EQUI_WIDTH;
+       private double[] _sortedInput = null;
 
        // frame transform-apply attributes
        // a) column bin boundaries
@@ -55,9 +57,10 @@ public class ColumnEncoderBin extends ColumnEncoder {
                super(-1);
        }
 
-       public ColumnEncoderBin(int colID, int numBin) {
+       public ColumnEncoderBin(int colID, int numBin, BinMethod binMethod)  {
                super(colID);
                _numBin = numBin;
+               _binMethod = binMethod;
        }
 
        public ColumnEncoderBin(int colID, int numBin, double[] binMins, 
double[] binMaxs) {
@@ -88,8 +91,15 @@ public class ColumnEncoderBin extends ColumnEncoder {
                long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                if(!isApplicable())
                        return;
-               double[] pairMinMax = getMinMaxOfCol(in, _colID, 0, -1);
-               computeBins(pairMinMax[0], pairMinMax[1]);
+               if(_binMethod == BinMethod.EQUI_WIDTH) {
+                       double[] pairMinMax = getMinMaxOfCol(in, _colID, 0, -1);
+                       computeBins(pairMinMax[0], pairMinMax[1]);
+               }
+               else if(_binMethod == BinMethod.EQUI_HEIGHT) {
+                       prepareDataForEqualHeightBins(in, _colID, 0, -1);
+                       computeEqualHeightBins();
+               }
+
                if(DMLScript.STATISTICS)
                        
TransformStatistics.incBinningBuildTime(System.nanoTime()-t0);
        }
@@ -102,10 +112,9 @@ public class ColumnEncoderBin extends ColumnEncoder {
                }
                // Returns NaN if value is missing, so can't be assigned a Bin
                double inVal = in.getDoubleNaN(row, _colID - 1);
-               if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > 
_binMaxs[_binMaxs.length-1] )
+               if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > 
_binMaxs[_binMaxs.length-1])
                        return Double.NaN;
                int ix = Arrays.binarySearch(_binMaxs, inVal);
-
                return ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
        }
        
@@ -150,6 +159,20 @@ public class ColumnEncoderBin extends ColumnEncoder {
                return new double[] {min, max};
        }
 
+       private void prepareDataForEqualHeightBins(CacheBlock in, int colID, 
int startRow, int blockSize) {
+               int numRows = getEndIndex(in.getNumRows(), startRow, blockSize) 
- startRow;
+               _sortedInput = new double[numRows];
+               for(int i = startRow; i < numRows; i++) {
+                       double inVal = in.getDouble(i, colID - 1);
+                       //FIXME current NaN handling introduces 0s and thus
+                       // impacts the computation of bin boundaries
+                       if(Double.isNaN(inVal))
+                               continue;
+                       _sortedInput[i] = inVal;
+               }
+               Arrays.sort(_sortedInput);
+       }
+
        @Override
        public Callable<Object> getBuildTask(CacheBlock in) {
                return new ColumnBinBuildTask(this, in);
@@ -178,6 +201,23 @@ public class ColumnEncoderBin extends ColumnEncoder {
                }
        }
 
+       private void computeEqualHeightBins() {
+               if(_binMins == null || _binMaxs == null) {
+                       _binMins = new double[_numBin];
+                       _binMaxs = new double[_numBin];
+               }
+               int n = _sortedInput.length;
+               for(int i = 0; i < _numBin; i++) {
+                       double pos = n * (i + 1d) / _numBin;
+                       _binMaxs[i] = (pos % 1 == 0) ? // pos is integer
+                               _sortedInput[(int) pos-1] :
+                               _sortedInput[(int) Math.floor(pos)];
+               }
+               _binMaxs[_numBin-1] = _sortedInput[n-1];
+               _binMins[0] = _sortedInput[0];
+               System.arraycopy(_binMaxs, 0, _binMins, 1, _numBin - 1);
+       }
+
        public void prepareBuildPartial() {
                // ensure allocated min/max arrays
                _colMins = -1f;
@@ -290,6 +330,10 @@ public class ColumnEncoderBin extends ColumnEncoder {
                }
        }
 
+       public enum BinMethod {
+               INVALID, EQUI_WIDTH, EQUI_HEIGHT
+       }
+
        private static class BinSparseApplyTask extends 
ColumnApplyTask<ColumnEncoderBin> {
 
                public BinSparseApplyTask(ColumnEncoderBin encoder, CacheBlock 
input,
diff --git 
a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java 
b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
index 67c0ea1..33b7682 100644
--- 
a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
+++ 
b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
@@ -117,7 +117,15 @@ public class EncoderFactory {
                                        int id = 
TfMetaUtils.parseJsonObjectID(colspec, colnames, minCol, maxCol, ids);
                                        if(id <= 0)
                                                continue;
-                                       ColumnEncoderBin bin = new 
ColumnEncoderBin(id, numBins);
+                                       String method = 
colspec.get("method").toString().toUpperCase();
+                                       ColumnEncoderBin.BinMethod binMethod;
+                                       if ("EQUI-WIDTH".equals(method))
+                                               binMethod = 
ColumnEncoderBin.BinMethod.EQUI_WIDTH;
+                                       else if ("EQUI-HEIGHT".equals(method))
+                                               binMethod = 
ColumnEncoderBin.BinMethod.EQUI_HEIGHT;
+                                       else
+                                               throw new 
DMLRuntimeException("Unsupported binning method: " + method);
+                                       ColumnEncoderBin bin = new 
ColumnEncoderBin(id, numBins, binMethod);
                                        addEncoderToMap(bin, colEncoders);
                                }
                        if(!dcIDs.isEmpty())
@@ -219,5 +227,4 @@ public class EncoderFactory {
                        ret.put(colnames[i], i);
                return ret;
        }
-
 }
diff --git 
a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
 
b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
index f852a4d..5aa586d 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
@@ -44,10 +44,14 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
        private final static String SPEC2b   = 
"homes3/homes.tfspec_dummy2.json";
        private final static String SPEC3    = "homes3/homes.tfspec_bin.json"; 
//recode
        private final static String SPEC3b   = "homes3/homes.tfspec_bin2.json"; 
//recode
+       private final static String SPEC3c   = 
"homes3/homes.tfspec_bin_height.json"; //recode
+       private final static String SPEC3d   = 
"homes3/homes.tfspec_bin_height2.json"; //recode
        private final static String SPEC6    = 
"homes3/homes.tfspec_recode_dummy.json";
        private final static String SPEC6b   = 
"homes3/homes.tfspec_recode_dummy2.json";
        private final static String SPEC7    = 
"homes3/homes.tfspec_binDummy.json"; //recode+dummy
        private final static String SPEC7b   = 
"homes3/homes.tfspec_binDummy2.json"; //recode+dummy
+       private final static String SPEC7c   = 
"homes3/homes.tfspec_binHeightDummy.json"; //recode+dummy
+       private final static String SPEC7d   = 
"homes3/homes.tfspec_binHeightDummy2.json"; //recode+dummy
        private final static String SPEC8    = "homes3/homes.tfspec_hash.json";
        private final static String SPEC8b   = "homes3/homes.tfspec_hash2.json";
        private final static String SPEC9    = 
"homes3/homes.tfspec_hash_recode.json";
@@ -63,6 +67,8 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
        
        private static final int[] BIN_col3 = new int[]{1,4,2,3,3,2,4};
        private static final int[] BIN_col8 = new int[]{1,2,2,2,2,2,3};
+       private static final int[] BIN_HEIGHT_col3 = new int[]{1,3,1,3,3,2,3};
+       private static final int[] BIN_HEIGHT_col8 = new int[]{1,2,2,3,2,2,3};
        
        public enum TransformType {
                RECODE,
@@ -70,6 +76,8 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
                RECODE_DUMMY,
                BIN,
                BIN_DUMMY,
+               BIN_HEIGHT,
+               BIN_HEIGHT_DUMMY,
                IMPUTE,
                OMIT,
                HASH,
@@ -131,6 +139,11 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
        public void testHomesBinningIDsSingleNodeCSV() {
                runTransformTest(ExecMode.SINGLE_NODE, "csv", 
TransformType.BIN, false);
        }
+
+       @Test
+       public void testHomesEqualHeightBinningIDsSingleNodeCSV() {
+               runTransformTest(ExecMode.SINGLE_NODE, "csv", 
TransformType.BIN_HEIGHT, true);
+       }
        
        @Test
        public void testHomesBinningIDsSparkCSV() {
@@ -148,6 +161,12 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
        }
 
        @Test
+       public void testHomesHeightBinningDummyIDsSingleNodeCSV() {
+               runTransformTest(ExecMode.SINGLE_NODE, "csv", 
TransformType.BIN_HEIGHT_DUMMY, false);
+       }
+
+
+       @Test
        public void testHomesBinningDummyIDsSparkCSV() {
                runTransformTest(ExecMode.SPARK, "csv", 
TransformType.BIN_DUMMY, false);
        }
@@ -251,6 +270,11 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
        public void testHomesBinningDummyColnamesSingleNodeCSV() {
                runTransformTest(ExecMode.SINGLE_NODE, "csv", 
TransformType.BIN_DUMMY, true);
        }
+
+       @Test
+       public void testHomesHeightBinningDummyColnamesSingleNodeCSV() {
+               runTransformTest(ExecMode.SINGLE_NODE, "csv", 
TransformType.BIN_HEIGHT_DUMMY, true);
+       }
        
        @Test
        public void testHomesBinningDummyColnamesSparkCSV() {
@@ -369,10 +393,12 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
                        case RECODE: SPEC = colnames?SPEC1b:SPEC1; DATASET = 
DATASET1; break;
                        case DUMMY:  SPEC = colnames?SPEC2b:SPEC2; DATASET = 
DATASET1; break;
                        case BIN:    SPEC = colnames?SPEC3b:SPEC3; DATASET = 
DATASET1; break;
+                       case BIN_HEIGHT:    SPEC = colnames?SPEC3d:SPEC3c; 
DATASET = DATASET1; break;
                        case IMPUTE: SPEC = colnames?SPEC4b:SPEC4; DATASET = 
DATASET2; break;
                        case OMIT:   SPEC = colnames?SPEC5b:SPEC5; DATASET = 
DATASET2; break;
                        case RECODE_DUMMY: SPEC = colnames?SPEC6b:SPEC6; 
DATASET = DATASET1; break;
                        case BIN_DUMMY: SPEC = colnames?SPEC7b:SPEC7; DATASET = 
DATASET1; break;
+                       case BIN_HEIGHT_DUMMY:    SPEC = 
colnames?SPEC7d:SPEC7c; DATASET = DATASET1; break;
                        case HASH:       SPEC = colnames?SPEC8b:SPEC8; DATASET 
= DATASET1; break;
                        case HASH_RECODE: SPEC = colnames?SPEC9b:SPEC9; DATASET 
= DATASET1; break;
                }
@@ -386,7 +412,7 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
                        
                        String HOME = SCRIPT_DIR + TEST_DIR;
                        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
-                       programArgs = new String[]{"-nvargs", 
+                       programArgs = new String[]{"-nvargs",
                                "DATA=" + DATASET_DIR + DATASET,
                                "TFSPEC=" + DATASET_DIR + SPEC,
                                "TFDATA1=" + output("tfout1"),
@@ -412,13 +438,17 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
                        
                        //additional checks for binning as encode-decode 
impossible
                        //TODO fix distributed binning as well
-                       if( type == TransformType.BIN ) {
+                       if (type == TransformType.BIN ) {
                                for(int i=0; i<7; i++) {
                                        Assert.assertEquals(BIN_col3[i], 
R1[i][2], 1e-8);
                                        Assert.assertEquals(BIN_col8[i], 
R1[i][7], 1e-8);
                                }
-                       }
-                       else if( type == TransformType.BIN_DUMMY ) {
+                       } else if (type == TransformType.BIN_HEIGHT) {
+                               for(int i=0; i<7; i++) {
+                                       Assert.assertEquals(BIN_HEIGHT_col3[i], 
R1[i][2], 1e-8);
+                                       Assert.assertEquals(BIN_HEIGHT_col8[i], 
R1[i][7], 1e-8);
+                               }
+                       } else if (type == TransformType.BIN_DUMMY) {
                                Assert.assertEquals(14, R1[0].length);
                                for(int i=0; i<7; i++) {
                                        for(int j=0; j<4; j++) { //check dummy 
coded
@@ -430,6 +460,18 @@ public class TransformFrameEncodeApplyTest extends 
AutomatedTestBase {
                                                        1:0, R1[i][10+j], 1e-8);
                                        }
                                }
+                       } else if (type == TransformType.BIN_HEIGHT_DUMMY) {
+                               Assert.assertEquals(14, R1[0].length);
+                               for(int i=0; i<7; i++) {
+                                       for(int j=0; j<4; j++) { //check dummy 
coded
+                                               
Assert.assertEquals((j==BIN_HEIGHT_col3[i]-1)?
+                                                               1:0, 
R1[i][2+j], 1e-8);
+                                       }
+                                       for(int j=0; j<3; j++) { //check dummy 
coded
+                                               
Assert.assertEquals((j==BIN_HEIGHT_col8[i]-1)?
+                                                               1:0, 
R1[i][10+j], 1e-8);
+                                       }
+                               }
                        } else if (type == TransformType.IMPUTE){
                                // Column 8 had GLOBAL_MEAN applied
                                Assert.assertFalse(TestUtils.containsNan(R1, 
8));
diff --git 
a/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy.json 
b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy.json
new file mode 100644
index 0000000..15ddea8
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy.json
@@ -0,0 +1,6 @@
+{
+ "ids": true, "recode": [ 1, 2, 7 ], "bin": [
+ { "id": 8  , "method": "equi-height", "numbins": 3 },
+ { "id": 3, "method": "equi-height", "numbins": 4 }],
+ "dummycode": [ 3, 8 ]
+  }
\ No newline at end of file
diff --git 
a/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy2.json 
b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy2.json
new file mode 100644
index 0000000..04be764
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy2.json
@@ -0,0 +1,6 @@
+{
+ "recode": [ zipcode, "district", "view" ], "bin": [
+ { "name": "saleprice"  , "method": "equi-height", "numbins": 3 },
+ { "name": "sqft", "method": "equi-height", "numbins": 4 }],
+ "dummycode": [ sqft, "saleprice" ]
+  }
\ No newline at end of file
diff --git a/src/test/resources/datasets/homes3/homes.tfspec_bin_height.json 
b/src/test/resources/datasets/homes3/homes.tfspec_bin_height.json
new file mode 100644
index 0000000..4b1e1a4
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_bin_height.json
@@ -0,0 +1,5 @@
+{
+  "ids": true, "recode": [ 1, 2, 7 ], "bin": [
+  { "id": 8  , "method": "equi-height", "numbins": 3 }
+,{ "id": 3, "method": "equi-height", "numbins": 4 }]
+}
\ No newline at end of file
diff --git a/src/test/resources/datasets/homes3/homes.tfspec_bin_height2.json 
b/src/test/resources/datasets/homes3/homes.tfspec_bin_height2.json
new file mode 100644
index 0000000..6c2be01
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_bin_height2.json
@@ -0,0 +1,5 @@
+{
+  "recode": [ zipcode, "district", "view" ], "bin": [
+  { "name": "saleprice"  , "method": "equi-height", "numbins": 3 }
+,{ "name": "sqft", "method": "equi-height", "numbins": 4 }]
+}
\ No newline at end of file

Reply via email to