This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 8dc11d9 [SYSTEMDS-3227] Equi-height binning in transformencode/apply
8dc11d9 is described below
commit 8dc11d953a18d80b73ae5c8412f1daf3c1962318
Author: hieale <[email protected]>
AuthorDate: Sun Jan 23 20:40:38 2022 +0100
[SYSTEMDS-3227] Equi-height binning in transformencode/apply
This patch extends the built-in functions transformencode and
transformapply by equi-height binning on the local runtime. For this purpose
the selected column gets sorted, then the bin boundaries are calculated as
quantiles which then are converted to input indices and applied as bin
boundaries analogous to already existing equi-width binning. The binning problem
for non-divisible bin numbers has been addressed with spillover. This has the
advantage that all but the last bin are guaranteed [...]
DIA project WS2021/22.
Closes #1517.
Co-authored-by: Clemens Hofmann <[email protected]>
Co-authored-by: Staša Mandic <[email protected]>
---
.../runtime/transform/encode/ColumnEncoderBin.java | 54 ++++++++++++++++++++--
.../runtime/transform/encode/EncoderFactory.java | 11 ++++-
.../transform/TransformFrameEncodeApplyTest.java | 50 ++++++++++++++++++--
.../homes3/homes.tfspec_binHeightDummy.json | 6 +++
.../homes3/homes.tfspec_binHeightDummy2.json | 6 +++
.../datasets/homes3/homes.tfspec_bin_height.json | 5 ++
.../datasets/homes3/homes.tfspec_bin_height2.json | 5 ++
7 files changed, 126 insertions(+), 11 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
index 005c2c2..d6e79c4 100644
---
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
+++
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
@@ -42,6 +42,8 @@ public class ColumnEncoderBin extends ColumnEncoder {
public static final String NBINS_PREFIX = "nbins";
private static final long serialVersionUID = 1917445005206076078L;
protected int _numBin = -1;
+ private BinMethod _binMethod = BinMethod.EQUI_WIDTH;
+ private double[] _sortedInput = null;
// frame transform-apply attributes
// a) column bin boundaries
@@ -55,9 +57,10 @@ public class ColumnEncoderBin extends ColumnEncoder {
super(-1);
}
- public ColumnEncoderBin(int colID, int numBin) {
+ public ColumnEncoderBin(int colID, int numBin, BinMethod binMethod) {
super(colID);
_numBin = numBin;
+ _binMethod = binMethod;
}
public ColumnEncoderBin(int colID, int numBin, double[] binMins,
double[] binMaxs) {
@@ -88,8 +91,15 @@ public class ColumnEncoderBin extends ColumnEncoder {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
if(!isApplicable())
return;
- double[] pairMinMax = getMinMaxOfCol(in, _colID, 0, -1);
- computeBins(pairMinMax[0], pairMinMax[1]);
+ if(_binMethod == BinMethod.EQUI_WIDTH) {
+ double[] pairMinMax = getMinMaxOfCol(in, _colID, 0, -1);
+ computeBins(pairMinMax[0], pairMinMax[1]);
+ }
+ else if(_binMethod == BinMethod.EQUI_HEIGHT) {
+ prepareDataForEqualHeightBins(in, _colID, 0, -1);
+ computeEqualHeightBins();
+ }
+
if(DMLScript.STATISTICS)
TransformStatistics.incBinningBuildTime(System.nanoTime()-t0);
}
@@ -102,10 +112,9 @@ public class ColumnEncoderBin extends ColumnEncoder {
}
// Returns NaN if value is missing, so can't be assigned a Bin
double inVal = in.getDoubleNaN(row, _colID - 1);
- if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal >
_binMaxs[_binMaxs.length-1] )
+ if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal >
_binMaxs[_binMaxs.length-1])
return Double.NaN;
int ix = Arrays.binarySearch(_binMaxs, inVal);
-
return ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
}
@@ -150,6 +159,20 @@ public class ColumnEncoderBin extends ColumnEncoder {
return new double[] {min, max};
}
+ /**
+  * Collects the values of one column for the row range [startRow, endRow)
+  * into _sortedInput and sorts them in ascending order. The end row is
+  * obtained from getEndIndex(in.getNumRows(), startRow, blockSize).
+  *
+  * @param in        input block the column is read from
+  * @param colID     column ID; values are read at column index colID - 1
+  * @param startRow  first row (inclusive) of the range to collect
+  * @param blockSize block size handed to getEndIndex to determine the end row
+  */
+ private void prepareDataForEqualHeightBins(CacheBlock in, int colID, int startRow, int blockSize) {
+     int endRow = getEndIndex(in.getNumRows(), startRow, blockSize);
+     _sortedInput = new double[endRow - startRow];
+     // iterate over absolute row indexes but store at the offset relative to
+     // startRow, so that ranges not starting at row 0 are collected completely
+     for(int i = startRow; i < endRow; i++) {
+         double inVal = in.getDouble(i, colID - 1);
+         //FIXME current NaN handling introduces 0s and thus
+         // impacts the computation of bin boundaries
+         if(Double.isNaN(inVal))
+             continue;
+         _sortedInput[i - startRow] = inVal;
+     }
+     Arrays.sort(_sortedInput);
+ }
+
@Override
public Callable<Object> getBuildTask(CacheBlock in) {
return new ColumnBinBuildTask(this, in);
@@ -178,6 +201,23 @@ public class ColumnEncoderBin extends ColumnEncoder {
}
}
+ /**
+  * Computes the equi-height bin boundaries from the sorted column values in
+  * _sortedInput. The upper boundary of bin i is the sorted value at position
+  * n * (i + 1) / _numBin; the lower boundary of each bin is the upper
+  * boundary of its predecessor, and the first bin starts at the smallest
+  * value.
+  *
+  * @throws IllegalStateException if _sortedInput contains no values
+  */
+ private void computeEqualHeightBins() {
+     int n = _sortedInput.length;
+     // without any value the boundary lookup below would index position -1
+     // and fail with an uninformative ArrayIndexOutOfBoundsException
+     if(n == 0)
+         throw new IllegalStateException(
+             "Equi-height binning requires at least one input value (column " + _colID + ").");
+     if(_binMins == null || _binMaxs == null) {
+         _binMins = new double[_numBin];
+         _binMaxs = new double[_numBin];
+     }
+     for(int i = 0; i < _numBin; i++) {
+         double pos = n * (i + 1d) / _numBin;
+         _binMaxs[i] = (pos % 1 == 0) ? // pos is integer
+             _sortedInput[(int) pos-1] :
+             _sortedInput[(int) Math.floor(pos)];
+     }
+     // pin the outer boundaries to the observed minimum and maximum
+     _binMaxs[_numBin-1] = _sortedInput[n-1];
+     _binMins[0] = _sortedInput[0];
+     // each bin starts where the previous bin ends
+     System.arraycopy(_binMaxs, 0, _binMins, 1, _numBin - 1);
+ }
+
public void prepareBuildPartial() {
// ensure allocated min/max arrays
_colMins = -1f;
@@ -290,6 +330,10 @@ public class ColumnEncoderBin extends ColumnEncoder {
}
}
+ /**
+  * Binning strategy selected for a bin encoder. During build, EQUI_WIDTH
+  * derives the bins from the column min/max, whereas EQUI_HEIGHT derives
+  * them from the sorted column values.
+  * NOTE(review): INVALID is not referenced in the visible code - confirm
+  * its intended use.
+  */
+ public enum BinMethod {
+ INVALID, EQUI_WIDTH, EQUI_HEIGHT
+ }
+
private static class BinSparseApplyTask extends
ColumnApplyTask<ColumnEncoderBin> {
public BinSparseApplyTask(ColumnEncoderBin encoder, CacheBlock
input,
diff --git
a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
index 67c0ea1..33b7682 100644
---
a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
+++
b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
@@ -117,7 +117,15 @@ public class EncoderFactory {
int id =
TfMetaUtils.parseJsonObjectID(colspec, colnames, minCol, maxCol, ids);
if(id <= 0)
continue;
- ColumnEncoderBin bin = new
ColumnEncoderBin(id, numBins);
+ String method =
colspec.get("method").toString().toUpperCase();
+ ColumnEncoderBin.BinMethod binMethod;
+ if ("EQUI-WIDTH".equals(method))
+ binMethod =
ColumnEncoderBin.BinMethod.EQUI_WIDTH;
+ else if ("EQUI-HEIGHT".equals(method))
+ binMethod =
ColumnEncoderBin.BinMethod.EQUI_HEIGHT;
+ else
+ throw new
DMLRuntimeException("Unsupported binning method: " + method);
+ ColumnEncoderBin bin = new
ColumnEncoderBin(id, numBins, binMethod);
addEncoderToMap(bin, colEncoders);
}
if(!dcIDs.isEmpty())
@@ -219,5 +227,4 @@ public class EncoderFactory {
ret.put(colnames[i], i);
return ret;
}
-
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
index f852a4d..5aa586d 100644
---
a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java
@@ -44,10 +44,14 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
private final static String SPEC2b =
"homes3/homes.tfspec_dummy2.json";
private final static String SPEC3 = "homes3/homes.tfspec_bin.json";
//recode
private final static String SPEC3b = "homes3/homes.tfspec_bin2.json";
//recode
+ private final static String SPEC3c =
"homes3/homes.tfspec_bin_height.json"; //recode
+ private final static String SPEC3d =
"homes3/homes.tfspec_bin_height2.json"; //recode
private final static String SPEC6 =
"homes3/homes.tfspec_recode_dummy.json";
private final static String SPEC6b =
"homes3/homes.tfspec_recode_dummy2.json";
private final static String SPEC7 =
"homes3/homes.tfspec_binDummy.json"; //recode+dummy
private final static String SPEC7b =
"homes3/homes.tfspec_binDummy2.json"; //recode+dummy
+ private final static String SPEC7c =
"homes3/homes.tfspec_binHeightDummy.json"; //recode+dummy
+ private final static String SPEC7d =
"homes3/homes.tfspec_binHeightDummy2.json"; //recode+dummy
private final static String SPEC8 = "homes3/homes.tfspec_hash.json";
private final static String SPEC8b = "homes3/homes.tfspec_hash2.json";
private final static String SPEC9 =
"homes3/homes.tfspec_hash_recode.json";
@@ -63,6 +67,8 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
private static final int[] BIN_col3 = new int[]{1,4,2,3,3,2,4};
private static final int[] BIN_col8 = new int[]{1,2,2,2,2,2,3};
+ private static final int[] BIN_HEIGHT_col3 = new int[]{1,3,1,3,3,2,3};
+ private static final int[] BIN_HEIGHT_col8 = new int[]{1,2,2,3,2,2,3};
public enum TransformType {
RECODE,
@@ -70,6 +76,8 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
RECODE_DUMMY,
BIN,
BIN_DUMMY,
+ BIN_HEIGHT,
+ BIN_HEIGHT_DUMMY,
IMPUTE,
OMIT,
HASH,
@@ -131,6 +139,11 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
public void testHomesBinningIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv",
TransformType.BIN, false);
}
+
+ @Test
+ public void testHomesEqualHeightBinningIDsSingleNodeCSV() {
+     // "IDs" variant: colnames=false selects the ID-based equi-height spec,
+     // consistent with testHomesBinningIDsSingleNodeCSV
+     runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.BIN_HEIGHT, false);
+ }
@Test
public void testHomesBinningIDsSparkCSV() {
@@ -148,6 +161,12 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
}
@Test
+ public void testHomesHeightBinningDummyIDsSingleNodeCSV() {
+ runTransformTest(ExecMode.SINGLE_NODE, "csv",
TransformType.BIN_HEIGHT_DUMMY, false);
+ }
+
+
+ @Test
public void testHomesBinningDummyIDsSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv",
TransformType.BIN_DUMMY, false);
}
@@ -251,6 +270,11 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
public void testHomesBinningDummyColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv",
TransformType.BIN_DUMMY, true);
}
+
+ @Test
+ public void testHomesHeightBinningDummyColnamesSingleNodeCSV() {
+ runTransformTest(ExecMode.SINGLE_NODE, "csv",
TransformType.BIN_HEIGHT_DUMMY, true);
+ }
@Test
public void testHomesBinningDummyColnamesSparkCSV() {
@@ -369,10 +393,12 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
case RECODE: SPEC = colnames?SPEC1b:SPEC1; DATASET =
DATASET1; break;
case DUMMY: SPEC = colnames?SPEC2b:SPEC2; DATASET =
DATASET1; break;
case BIN: SPEC = colnames?SPEC3b:SPEC3; DATASET =
DATASET1; break;
+ case BIN_HEIGHT: SPEC = colnames?SPEC3d:SPEC3c;
DATASET = DATASET1; break;
case IMPUTE: SPEC = colnames?SPEC4b:SPEC4; DATASET =
DATASET2; break;
case OMIT: SPEC = colnames?SPEC5b:SPEC5; DATASET =
DATASET2; break;
case RECODE_DUMMY: SPEC = colnames?SPEC6b:SPEC6;
DATASET = DATASET1; break;
case BIN_DUMMY: SPEC = colnames?SPEC7b:SPEC7; DATASET =
DATASET1; break;
+ case BIN_HEIGHT_DUMMY: SPEC =
colnames?SPEC7d:SPEC7c; DATASET = DATASET1; break;
case HASH: SPEC = colnames?SPEC8b:SPEC8; DATASET
= DATASET1; break;
case HASH_RECODE: SPEC = colnames?SPEC9b:SPEC9; DATASET
= DATASET1; break;
}
@@ -386,7 +412,7 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
- programArgs = new String[]{"-nvargs",
+ programArgs = new String[]{"-nvargs",
"DATA=" + DATASET_DIR + DATASET,
"TFSPEC=" + DATASET_DIR + SPEC,
"TFDATA1=" + output("tfout1"),
@@ -412,13 +438,17 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
//additional checks for binning as encode-decode
impossible
//TODO fix distributed binning as well
- if( type == TransformType.BIN ) {
+ if (type == TransformType.BIN ) {
for(int i=0; i<7; i++) {
Assert.assertEquals(BIN_col3[i],
R1[i][2], 1e-8);
Assert.assertEquals(BIN_col8[i],
R1[i][7], 1e-8);
}
- }
- else if( type == TransformType.BIN_DUMMY ) {
+ } else if (type == TransformType.BIN_HEIGHT) {
+ for(int i=0; i<7; i++) {
+ Assert.assertEquals(BIN_HEIGHT_col3[i],
R1[i][2], 1e-8);
+ Assert.assertEquals(BIN_HEIGHT_col8[i],
R1[i][7], 1e-8);
+ }
+ } else if (type == TransformType.BIN_DUMMY) {
Assert.assertEquals(14, R1[0].length);
for(int i=0; i<7; i++) {
for(int j=0; j<4; j++) { //check dummy
coded
@@ -430,6 +460,18 @@ public class TransformFrameEncodeApplyTest extends
AutomatedTestBase {
1:0, R1[i][10+j], 1e-8);
}
}
+ } else if (type == TransformType.BIN_HEIGHT_DUMMY) {
+ Assert.assertEquals(14, R1[0].length);
+ for(int i=0; i<7; i++) {
+ for(int j=0; j<4; j++) { //check dummy
coded
+
Assert.assertEquals((j==BIN_HEIGHT_col3[i]-1)?
+ 1:0,
R1[i][2+j], 1e-8);
+ }
+ for(int j=0; j<3; j++) { //check dummy
coded
+
Assert.assertEquals((j==BIN_HEIGHT_col8[i]-1)?
+ 1:0,
R1[i][10+j], 1e-8);
+ }
+ }
} else if (type == TransformType.IMPUTE){
// Column 8 had GLOBAL_MEAN applied
Assert.assertFalse(TestUtils.containsNan(R1,
8));
diff --git
a/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy.json
b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy.json
new file mode 100644
index 0000000..15ddea8
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy.json
@@ -0,0 +1,6 @@
+{
+ "ids": true, "recode": [ 1, 2, 7 ], "bin": [
+ { "id": 8 , "method": "equi-height", "numbins": 3 },
+ { "id": 3, "method": "equi-height", "numbins": 4 }],
+ "dummycode": [ 3, 8 ]
+ }
\ No newline at end of file
diff --git
a/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy2.json
b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy2.json
new file mode 100644
index 0000000..04be764
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_binHeightDummy2.json
@@ -0,0 +1,6 @@
+{
+ "recode": [ "zipcode", "district", "view" ], "bin": [
+ { "name": "saleprice" , "method": "equi-height", "numbins": 3 },
+ { "name": "sqft", "method": "equi-height", "numbins": 4 }],
+ "dummycode": [ "sqft", "saleprice" ]
+ }
\ No newline at end of file
diff --git a/src/test/resources/datasets/homes3/homes.tfspec_bin_height.json
b/src/test/resources/datasets/homes3/homes.tfspec_bin_height.json
new file mode 100644
index 0000000..4b1e1a4
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_bin_height.json
@@ -0,0 +1,5 @@
+{
+ "ids": true, "recode": [ 1, 2, 7 ], "bin": [
+ { "id": 8 , "method": "equi-height", "numbins": 3 }
+,{ "id": 3, "method": "equi-height", "numbins": 4 }]
+}
\ No newline at end of file
diff --git a/src/test/resources/datasets/homes3/homes.tfspec_bin_height2.json
b/src/test/resources/datasets/homes3/homes.tfspec_bin_height2.json
new file mode 100644
index 0000000..6c2be01
--- /dev/null
+++ b/src/test/resources/datasets/homes3/homes.tfspec_bin_height2.json
@@ -0,0 +1,5 @@
+{
+ "recode": [ "zipcode", "district", "view" ], "bin": [
+ { "name": "saleprice" , "method": "equi-height", "numbins": 3 }
+,{ "name": "sqft", "method": "equi-height", "numbins": 4 }]
+}
\ No newline at end of file