phaniarnab commented on a change in pull request #1203:
URL: https://github.com/apache/systemds/pull/1203#discussion_r597894910



##########
File path: 
src/main/java/org/apache/sysds/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java
##########
@@ -74,166 +74,164 @@
        protected final LinkedHashMap<String, String> params;
 
        protected ParameterizedBuiltinCPInstruction(Operator op, 
LinkedHashMap<String, String> paramsMap, CPOperand out,
-                       String opcode, String istr) {
+               String opcode, String istr) {
                super(CPType.ParameterizedBuiltin, op, null, null, out, opcode, 
istr);
                params = paramsMap;
        }
-       
-       public HashMap<String,String> getParameterMap() { 
-               return params; 
+
+       public HashMap<String, String> getParameterMap() {
+               return params;

Review comment:
       Probably your IDE is automatically applying the coding template. Please 
avoid such formatting-only changes in future commits, or open a separate PR for 
all the formatting-related changes. 

##########
File path: 
src/main/java/org/apache/sysds/runtime/transform/meta/TfMetaUtils.java
##########
@@ -182,6 +188,17 @@ else if(minCol == -1 && maxCol == -1)
                return arr;
        }
 
+       /**
+        * Get K value used for calculation during feature hashing from parsed 
specifications.
+        * @param parsedSpec parsed specifications
+        * @return K value
+        * @throws JSONException

Review comment:
       Javadoc generation fails with `warning: no description for @throws`, 
which in turn causes all the Python tests to fail.

##########
File path: 
src/main/java/org/apache/sysds/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
##########
@@ -829,9 +806,7 @@ public RDDTokenizeFunction(Tokenizer tokenizer, int blen) {
                }

Review comment:
       It looks like `_blen` is unused, so it can be removed.

##########
File path: 
src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
##########
@@ -0,0 +1,446 @@
+package org.apache.sysds.runtime.transform.encode;
+
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectOutput;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.matrix.data.FrameBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.IndexRange;
+
+public class MultiColumnEncoder implements Encoder {
+
+       protected static final Log LOG = 
LogFactory.getLog(MultiColumnEncoder.class.getName());
+       private List<ColumnEncoderComposite> _columnEncoders;
+       // These encoders are deprecated and will be phased out soon.
+       private EncoderMVImpute _legacyMVImpute = null;
+       private EncoderOmit _legacyOmit = null;
+       private int _colOffset = 0; // offset for federated Workers who are 
using subrange encoders
+       private FrameBlock _meta = null;
+
+       public MultiColumnEncoder(List<ColumnEncoderComposite> columnEncoders) {
+               _columnEncoders = columnEncoders;
+       }
+
+       public MultiColumnEncoder() {
+               _columnEncoders = new ArrayList<>();
+       }
+
+       public MatrixBlock encode(FrameBlock in) {
+               MatrixBlock out;
+               try {
+                       build(in);
+                       _meta = getMetaData(new FrameBlock(in.getNumColumns(), 
Types.ValueType.STRING));
+                       initMetaData(_meta);
+                       // apply meta data
+                       out = apply(in);
+               }
+               catch(Exception ex) {
+                       LOG.error("Failed transform-encode frame with \n" + 
this);
+                       throw ex;
+               }
+               return out;
+       }
+
+       public void build(FrameBlock in) {
+               for(ColumnEncoder columnEncoder : _columnEncoders)
+                       columnEncoder.build(in);
+               legacyBuild(in);
+       }
+
+       public void legacyBuild(FrameBlock in) {
+               if(_legacyOmit != null)
+                       _legacyOmit.build(in);
+               if(_legacyMVImpute != null)
+                       _legacyMVImpute.build(in);
+       }
+
+       public MatrixBlock apply(FrameBlock in) {
+               int numCols = in.getNumColumns() + getNumExtraCols();
+               MatrixBlock out = new MatrixBlock(in.getNumRows(), numCols, 
false);
+               return apply(in, out, 0);
+       }
+
+       public MatrixBlock apply(FrameBlock in, MatrixBlock out, int outputCol) 
{
+               // There should be an encoder for every column
+               int numEncoders = getFromAll(ColumnEncoderComposite.class, 
ColumnEncoder::getColID).size();
+               if(in.getNumColumns() != numEncoders)
+                       throw new DMLRuntimeException("Not every column in has 
a CompositeEncoder. Please make sure every column "
+                               + "has a encoder or slice the input 
accordingly");
+
+               try {
+                       int offset = outputCol;
+                       for(ColumnEncoderComposite columnEncoder : 
_columnEncoders) {
+                               columnEncoder.apply(in, out, 
columnEncoder._colID - 1 + offset);
+                               
if(columnEncoder.hasEncoder(ColumnEncoderDummycode.class))
+                                       offset += 
columnEncoder.getEncoder(ColumnEncoderDummycode.class)._domainSize - 1;
+                       }
+                       if(_legacyOmit != null)
+                               out = _legacyOmit.apply(in, out);
+                       if(_legacyMVImpute != null)
+                               out = _legacyMVImpute.apply(in, out);
+               }
+               catch(Exception ex) {
+                       LOG.error("Failed to transform-apply frame with \n" + 
this);
+                       throw ex;
+               }
+               return out;
+       }
+
+       @Override
+       public FrameBlock getMetaData(FrameBlock meta) {
+               if(_meta != null)
+                       return _meta;
+               for(ColumnEncoder columnEncoder : _columnEncoders)
+                       columnEncoder.getMetaData(meta);
+               if(_legacyOmit != null)
+                       _legacyOmit.getMetaData(meta);
+               if(_legacyMVImpute != null)
+                       _legacyMVImpute.getMetaData(meta);
+               return meta;
+       }
+
+       @Override
+       public void initMetaData(FrameBlock meta) {
+               for(ColumnEncoder columnEncoder : _columnEncoders)
+                       columnEncoder.initMetaData(meta);
+               if(_legacyOmit != null)
+                       _legacyOmit.initMetaData(meta);
+               if(_legacyMVImpute != null)
+                       _legacyMVImpute.initMetaData(meta);
+       }
+
+       @Override
+       public void prepareBuildPartial() {
+               for(Encoder encoder : _columnEncoders)
+                       encoder.prepareBuildPartial();
+       }
+
+       @Override
+       public void buildPartial(FrameBlock in) {
+               for(Encoder encoder : _columnEncoders)
+                       encoder.buildPartial(in);
+       }
+
+       /**
+        * Obtain the column mapping of encoded frames based on the passed meta 
data frame.
+        *
+        * @param meta meta data frame block
+        * @return matrix with column mapping (one row per attribute)
+        */
+       public MatrixBlock getColMapping(FrameBlock meta) {
+               MatrixBlock out = new MatrixBlock(meta.getNumColumns(), 3, 
false);
+               List<ColumnEncoderDummycode> dc = 
getColumnEncoders(ColumnEncoderDummycode.class);
+
+               for(int i = 0, ni = 0; i < out.getNumRows(); i++) {
+                       final int colID = i + 1; // 1-based
+                       int nColID = ni + 1;
+                       List<ColumnEncoderDummycode> encoder = 
dc.stream().filter(e -> e.getColID() == colID)
+                               .collect(Collectors.toList());
+                       assert encoder.size() <= 1;
+                       if(encoder.size() == 1) {
+                               ni += 
meta.getColumnMetadata(i).getNumDistinct();
+                       }
+                       else {
+                               ni++;
+                       }
+                       out.quickSetValue(i, 0, colID);
+                       out.quickSetValue(i, 1, nColID);
+                       out.quickSetValue(i, 2, ni);
+               }
+               return out;
+       }
+
+       @Override
+       public void updateIndexRanges(long[] beginDims, long[] endDims, int 
offset) {
+               _columnEncoders.forEach(encoder -> 
encoder.updateIndexRanges(beginDims, endDims, offset));
+               if(_legacyOmit != null)
+                       _legacyOmit.updateIndexRanges(beginDims, endDims);
+               if(_legacyMVImpute != null)
+                       _legacyMVImpute.updateIndexRanges(beginDims, endDims);
+       }
+
+       @Override
+       public void writeExternal(ObjectOutput out) throws IOException {
+               out.writeBoolean(_legacyMVImpute != null);
+               if(_legacyMVImpute != null)
+                       _legacyMVImpute.writeExternal(out);
+               out.writeBoolean(_legacyOmit != null);
+               if(_legacyOmit != null)
+                       _legacyOmit.writeExternal(out);
+
+               out.writeInt(_colOffset);
+               out.writeInt(_columnEncoders.size());
+               for(ColumnEncoder columnEncoder : _columnEncoders) {
+                       out.writeInt(columnEncoder._colID);
+                       columnEncoder.writeExternal(out);
+               }
+               out.writeBoolean(_meta != null);
+               if(_meta != null)
+                       _meta.write(out);
+       }
+
+       @Override
+       public void readExternal(ObjectInput in) throws IOException, 
ClassNotFoundException {
+               if(in.readBoolean()) {
+                       _legacyMVImpute = new EncoderMVImpute();
+                       _legacyMVImpute.readExternal(in);
+               }
+               if(in.readBoolean()) {
+                       _legacyOmit = new EncoderOmit();
+                       _legacyOmit.readExternal(in);
+               }
+
+               _colOffset = in.readInt();
+               int encodersSize = in.readInt();
+               _columnEncoders = new ArrayList<>();
+               for(int i = 0; i < encodersSize; i++) {
+                       int colID = in.readInt();
+                       ColumnEncoderComposite columnEncoder = new 
ColumnEncoderComposite();
+                       columnEncoder.readExternal(in);
+                       columnEncoder.setColID(colID);
+                       _columnEncoders.add(columnEncoder);
+               }
+               if(in.readBoolean()) {
+                       FrameBlock meta = new FrameBlock();
+                       meta.readFields(in);
+                       _meta = meta;
+               }
+       }
+
+       public <T extends ColumnEncoder> List<T> getColumnEncoders(Class<T> 
type) {
+               // TODO cache results for faster access
+               List<T> ret = new ArrayList<>();
+               for(ColumnEncoder encoder : _columnEncoders) {
+                       
if(encoder.getClass().equals(ColumnEncoderComposite.class) && type != 
ColumnEncoderComposite.class) {
+                               encoder = ((ColumnEncoderComposite) 
encoder).getEncoder(type);
+                       }
+                       if(encoder != null && encoder.getClass().equals(type)) {
+                               ret.add((T) encoder);

Review comment:
       This unchecked cast produces a type safety warning. There are a few more in this file.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to