This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 2aae692  [MINOR] Bug fixes in TransformEncode
2aae692 is described below

commit 2aae6922b38b6b50ee30adaf5cf7cb34231456eb
Author: Lukas Erlbacher <[email protected]>
AuthorDate: Fri Jun 11 08:32:31 2021 +0200

    [MINOR] Bug fixes in TransformEncode
    
    Closes #1309
---
 .../transform/encode/ColumnEncoderComposite.java   |  8 ++++++--
 .../transform/encode/MultiColumnEncoder.java       | 24 ++++++++++++++++------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
 
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
index f3640f3..1973741 100644
--- 
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
+++ 
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
@@ -213,9 +213,13 @@ public class ColumnEncoderComposite extends ColumnEncoder {
                                addEncoder(otherEnc);
                        }
                }
-               else {
+               else
                        addEncoder(other);
-               }
+
+               updateAllDCEncoders();
+       }
+
+       public void updateAllDCEncoders(){
                // update dummycode encoder domain sizes based on distinctness 
information from other encoders
                ColumnEncoderDummycode dc = 
getEncoder(ColumnEncoderDummycode.class);
                if(dc != null)
diff --git 
a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
 
b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
index 4a8570a..aff078b 100644
--- 
a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
+++ 
b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
@@ -83,8 +83,13 @@ public class MultiColumnEncoder implements Encoder {
                MatrixBlock out;
                try {
                        build(in, k);
-                       _meta = getMetaData(new FrameBlock(in.getNumColumns(), 
Types.ValueType.STRING));
-                       initMetaData(_meta);
+                       if(_legacyMVImpute != null){
+                               // These operations are redundant for every 
encoder excluding the legacyMVImpute, the workaround to fix
+                               // it for this encoder would be very dirty. 
This will only have a performance impact if there is a lot of
+                               // recoding in combination with the 
legacyMVImpute. But since it is legacy this should be fine
+                               _meta = getMetaData(new 
FrameBlock(in.getNumColumns(), Types.ValueType.STRING));
+                               initMetaData(_meta);
+                       }
                        // apply meta data
                        out = apply(in, k);
                }
@@ -104,8 +109,10 @@ public class MultiColumnEncoder implements Encoder {
                        buildMT(in, k);
                }
                else {
-                       for(ColumnEncoder columnEncoder : _columnEncoders)
+                       for(ColumnEncoderComposite columnEncoder : 
_columnEncoders){
                                columnEncoder.build(in);
+                               columnEncoder.updateAllDCEncoders();
+                       }
                }
                legacyBuild(in);
        }
@@ -117,6 +124,8 @@ public class MultiColumnEncoder implements Encoder {
                try {
                        if(blockSize != in.getNumRows()) {
                                // Partial builds and merges
+                               // Most of the time not worth it for RC with 
the current implementation, GC overhead is to large.
+                               // Depending on unique values and rows more 
testing need to be done
                                List<List<Future<Object>>> partials = new 
ArrayList<>();
                                for(ColumnEncoderComposite encoder : 
_columnEncoders) {
                                        List<Callable<Object>> 
partialBuildTasks = encoder.getPartialBuildTasks(in, blockSize);
@@ -124,7 +133,7 @@ public class MultiColumnEncoder implements Encoder {
                                                partials.add(null);
                                                continue;
                                        }
-                                       
partials.add(pool.invokeAll(partialBuildTasks));
+                                       
partials.add(partialBuildTasks.stream().map(pool::submit).collect(Collectors.toList()));
                                }
                                for(int e = 0; e < _columnEncoders.size(); e++) 
{
                                        List<Future<Object>> partial = 
partials.get(e);
@@ -179,7 +188,7 @@ public class MultiColumnEncoder implements Encoder {
                if(in.getNumColumns() != numEncoders)
                        throw new DMLRuntimeException("Not every column in has 
a CompositeEncoder. Please make sure every column "
                                + "has a encoder or slice the input 
accordingly");
-               // Denseblock allocation since access is only on the DenseBlock
+               // Block allocation for MT access
                out.allocateBlock();
                if(out.isInSparseFormat()) {
                        SparseBlock block = out.getSparseBlock();
@@ -204,7 +213,7 @@ public class MultiColumnEncoder implements Encoder {
                                        offset += 
columnEncoder.getEncoder(ColumnEncoderDummycode.class)._domainSize - 1;
                        }
                }
-               // Recomputing NNZ since we access the Dense block directly
+               // Recomputing NNZ since we access the block directly
                // TODO set NNZ explicit count them in the encoders
                out.recomputeNonZeros();
                if(_legacyOmit != null)
@@ -634,6 +643,8 @@ public class MultiColumnEncoder implements Encoder {
                @Override
                public Integer call() throws Exception {
                        _encoder.build(_input);
+                       if(_encoder instanceof ColumnEncoderComposite)
+                               ((ColumnEncoderComposite) 
_encoder).updateAllDCEncoders();
                        return 1;
                }
        }
@@ -652,6 +663,7 @@ public class MultiColumnEncoder implements Encoder {
                @Override
                public Integer call() throws Exception {
                        _encoder.mergeBuildPartial(_partials, 0, 
_partials.size());
+                       _encoder.updateAllDCEncoders();
                        return 1;
                }
        }

Reply via email to