Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-03-14 Thread via GitHub


florian-jobs commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2935289213


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import 
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with 
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the 
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential 
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+   private static final long serialVersionUID = -5769772089913918987L;
+
+   /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+   private final int[] _dataLZW;
+   private final int _nRows;
+   private final int _nUnique;
+
+   private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict, 
AMapToData data, int[] cachedCounts) {
+   super(colIndexes, dict, cachedCounts);
+   _nRows = data.size();
+   _nUnique = dict.getNumberOfValues(colIndexes.size());
+   _dataLZW = compress(data);
+
+   if(CompressedMatrixBlock.debug) {
+   if(getNumValues() == 0)
+   throw new DMLCompressionException("Invalid 
construction with empty dictionary");
+   if(_nRows == 0)
+   throw new DMLCompressionException("Invalid 
length of the data. is

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-03-08 Thread via GitHub


Baunsgaard commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2902285913


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import 
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with 
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the 
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential 
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+   private static final long serialVersionUID = -5769772089913918987L;
+
+   /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+   private final int[] _dataLZW;
+   private final int _nRows;
+   private final int _nUnique;
+
+   private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict, 
AMapToData data, int[] cachedCounts) {
+   super(colIndexes, dict, cachedCounts);
+   _nRows = data.size();
+   _nUnique = dict.getNumberOfValues(colIndexes.size());
+   _dataLZW = compress(data);
+
+   if(CompressedMatrixBlock.debug) {
+   if(getNumValues() == 0)
+   throw new DMLCompressionException("Invalid 
construction with empty dictionary");
+   if(_nRows == 0)
+   throw new DMLCompressionException("Invalid 
length of the data. is z

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-03-06 Thread via GitHub


florian-jobs commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2896821729


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import 
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with 
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the 
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential 
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+   private static final long serialVersionUID = -5769772089913918987L;
+
+   /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+   private final int[] _dataLZW;
+   private final int _nRows;
+   private final int _nUnique;
+
+   private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict, 
AMapToData data, int[] cachedCounts) {
+   super(colIndexes, dict, cachedCounts);
+   _nRows = data.size();
+   _nUnique = dict.getNumberOfValues(colIndexes.size());
+   _dataLZW = compress(data);
+
+   if(CompressedMatrixBlock.debug) {
+   if(getNumValues() == 0)
+   throw new DMLCompressionException("Invalid 
construction with empty dictionary");
+   if(_nRows == 0)
+   throw new DMLCompressionException("Invalid 
length of the data. is

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-03-02 Thread via GitHub


janniklinde commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2871100874


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import 
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with 
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the 
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential 
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+   private static final long serialVersionUID = -5769772089913918987L;
+
+   /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+   private final int[] _dataLZW;
+   private final int _nRows;
+   private final int _nUnique;
+
+   private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict, 
AMapToData data, int[] cachedCounts) {
+   super(colIndexes, dict, cachedCounts);
+   _nRows = data.size();
+   _nUnique = dict.getNumberOfValues(colIndexes.size());
+   _dataLZW = compress(data);
+
+   if(CompressedMatrixBlock.debug) {
+   if(getNumValues() == 0)
+   throw new DMLCompressionException("Invalid 
construction with empty dictionary");
+   if(_nRows == 0)
+   throw new DMLCompressionException("Invalid 
length of the data. is 

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-02-23 Thread via GitHub


janniklinde commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2839754001


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import 
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with 
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the 
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential 
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+   private static final long serialVersionUID = -5769772089913918987L;
+
+   /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+   private final int[] _dataLZW;
+   private final int _nRows;
+   private final int _nUnique;
+
+   private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict, 
AMapToData data, int[] cachedCounts) {
+   super(colIndexes, dict, cachedCounts);
+   _nRows = data.size();
+   _nUnique = dict.getNumberOfValues(colIndexes.size());
+   _dataLZW = compress(data);
+
+   if(CompressedMatrixBlock.debug) {
+   if(getNumValues() == 0)
+   throw new DMLCompressionException("Invalid 
construction with empty dictionary");
+   if(_nRows == 0)
+   throw new DMLCompressionException("Invalid 
length of the data. is 

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-02-15 Thread via GitHub


florian-jobs commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3904758596

   I added tests to ensure the reconstructed matrices in the benchmarking class 
are equivalent.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-02-10 Thread via GitHub


Baunsgaard commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3877592143

   okay, now the memory numbers sound more plausible !
   
   Can you verify on all the tested instances that when we decompress either 
the DDC or the DDCLZW they reconstruct equivalent matrices?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-02-04 Thread via GitHub


florian-jobs commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3846326231

   We changed the `ColGroupDDCLZWBenchmark` class to use 
`estimateInMemorySize()` instead of `getExactSizeOnDisk()` for memory 
estimation. While `getExactSizeOnDisk()` returns the exact serialized size 
produced by `write()`, `estimateInMemorySize()` is the intended method in 
SystemDS for estimating the in-memory footprint of column groups. We also 
updated `estimateInMemorySize()` to account for the LZW metadata and the LZW 
mapping.
   
   Observation from the “distributed” benchmark:
   - The absolute byte numbers differ between the two modes (expected: 
in-memory estimate includes JVM overhead, whereas on-disk size is a compact 
serialization format).
   - However, the qualitative behavior and relative trends are very similar 
between `estimateInMemorySize()` and `getExactSizeOnDisk()` across the tested 
(size, nUnique) points (i.e., where DDCLZW is beneficial/harmful stays 
consistent).
   - As expected, DDCLZW tends to be unfavorable for very small inputs (fixed 
overhead dominates), while for larger sizes and low-to-moderate nUnique it 
achieves strong reductions. Around typical DDC representation boundaries (e.g., 
256→257, 65536→65537) the baseline DDC memory changes noticeably, which is 
reflected in the reported reductions as well.
   
   Below are the results from `benchmarkDistributed` using both modes for
comparison.
   
   ```java
   

   Benchmark: benchmarkDistributed using estimateInMemorySize
   

   
   ... Size: 100 
...
   Size: 100 | nUnique: 2 | Entropy: 100,00% | DDC: 172 bytes | 
DDCLZW: 248 bytes | Memory reduction:  -44,19% | De-/Compression speedup: 
0,01/0,00 times
   Size: 100 | nUnique: 3 | Entropy:  99,99% | DDC: 280 bytes | 
DDCLZW: 272 bytes | Memory reduction:2,86% | De-/Compression speedup: 
0,01/0,00 times
   Size: 100 | nUnique: 5 | Entropy: 100,00% | DDC: 296 bytes | 
DDCLZW: 312 bytes | Memory reduction:   -5,41% | De-/Compression speedup: 
0,02/0,00 times
   Size: 100 | nUnique:10 | Entropy: 100,00% | DDC: 336 bytes | 
DDCLZW: 392 bytes | Memory reduction:  -16,67% | De-/Compression speedup: 
0,00/0,00 times
   Size: 100 | nUnique:20 | Entropy: 100,00% | DDC: 416 bytes | 
DDCLZW: 552 bytes | Memory reduction:  -32,69% | De-/Compression speedup: 
0,00/0,00 times
   Size: 100 | nUnique:50 | Entropy: 100,00% | DDC: 656 bytes | 
DDCLZW: 952 bytes | Memory reduction:  -45,12% | De-/Compression speedup: 
0,01/0,00 times
   Size: 100 | nUnique:   100 | Entropy: 100,00% | DDC:1056 bytes | 
DDCLZW:1352 bytes | Memory reduction:  -28,03% | De-/Compression speedup: 
0,00/0,00 times
   ... Size: 10 
...
   Size:  10 | nUnique: 2 | Entropy: 100,00% | DDC:6420 bytes | 
DDCLZW:2696 bytes | Memory reduction:   58,01% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique: 3 | Entropy: 100,00% | DDC:  100184 bytes | 
DDCLZW:3272 bytes | Memory reduction:   96,73% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique: 5 | Entropy: 100,00% | DDC:  100200 bytes | 
DDCLZW:4192 bytes | Memory reduction:   95,82% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:10 | Entropy: 100,00% | DDC:  100240 bytes | 
DDCLZW:5872 bytes | Memory reduction:   94,14% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:20 | Entropy: 100,00% | DDC:  100320 bytes | 
DDCLZW:8312 bytes | Memory reduction:   91,71% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:50 | Entropy: 100,00% | DDC:  100560 bytes | 
DDCLZW:   13152 bytes | Memory reduction:   86,92% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:   100 | Entropy: 100,00% | DDC:  100960 bytes | 
DDCLZW:   18952 bytes | Memory reduction:   81,23% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:   200 | Entropy: 100,00% | DDC:  101760 bytes | 
DDCLZW:   27352 bytes | Memory reduction:   73,12% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:   256 | Entropy:  99,99% | DDC:  102208 bytes | 
DDCLZW:   30896 bytes | Memory reduction:   69,77% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:   257 | Entropy: 100,00% | DDC:  202216 bytes | 
DDCLZW:   30992 bytes | Memory reduction:   84,67% | De-/Compression speedup: 
0,00/0,00 times
   Size:  10 | nUnique:   500 | Entropy: 100,00% | DDC:  204160 bytes | 
DDCLZW:   44152 bytes | Memory reduction:   78,37% | De-/Compression speedup: 
0,00/0,00 times
 

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-02-03 Thread via GitHub


florian-jobs commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3842375217

   Yes, I think our memory estimates are indeed wrong @Baunsgaard. We were
using `getExactSizeOnDisk()`.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-02-02 Thread via GitHub


Baunsgaard commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3834193814

   I am still very sure the memory estimates are wrong (but I would be happy to 
be wrong).
   
   Please overwrite this method in your DDCLZW file: 
   
https://github.com/apache/systemds/blob/3f841b7383a6cc626acbb2193a111d28f5a19404/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java#L745C2-L750C3
 
   
   Please use the utilities associated with estimating memory size as the other 
column group does.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-31 Thread via GitHub


LukaDeka commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3828098919

   > A recipe for X unique values at length L could be:
   
   I also changed the `genPatternLZWOptimal` to do exactly this. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-31 Thread via GitHub


LukaDeka commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3828074855

   I have just tested it with the highest "optimal" values for DDC in the 
"distributed" benchmark, so with datasets like:
   
   `[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]`
   for `nUnique = 4, size = 16`.
   
   ```r
   Size:  10 | nUnique: 2 | Entropy: 100.00% | DDC:   12540 bytes | 
DDCLZW:2567 bytes | Memory reduction:   79.53% | De-/Compression speedup: 
0.00/0.00 times
   Size:  10 | nUnique: 3 | Entropy: 100.00% | DDC:  100044 bytes | 
DDCLZW:3147 bytes | Memory reduction:   96.85% | De-/Compression speedup: 
0.00/0.00 times
   ...
   Size:  10 | nUnique:   256 | Entropy:  99.99% | DDC:  102068 bytes | 
DDCLZW:   30767 bytes | Memory reduction:   69.86% | De-/Compression speedup: 
0.00/0.00 times
   Size:  10 | nUnique:   257 | Entropy: 100.00% | DDC:  202076 bytes | 
DDCLZW:   30867 bytes | Memory reduction:   84.73% | De-/Compression speedup: 
0.00/0.00 times
   ...
   Size:  10 | nUnique: 65536 | Entropy:  71.34% | DDC:  724308 bytes | 
DDCLZW:  787507 bytes | Memory reduction:   -8.73% | De-/Compression speedup: 
0.00/0.00 times
   Size:  10 | nUnique: 65537 | Entropy:  71.34% | DDC:  824316 bytes | 
DDCLZW:  787519 bytes | Memory reduction:4.46% | De-/Compression speedup: 
0.00/0.00 times
   ```
   
   There is a big jump at the `2-3` margin, as well as `256-257`. But the 
reduction from `65536-65537` isn't that substantial.
   
   Nevertheless, whenever `nUnique/size` approaches `7/10`, DDC and DDCLZW get 
similar memory usage results (for `size > 1` approximately). For datasets 
with this many unique values, simple compression is expected to make things 
worse though.
   
   I have also noticed that the entropy doesn't really influence the 
compression rate that much since entropy measures "how distributed" the values 
are and not "how they're arranged". So
   `[ 0, 1, 2, 0, 1, 2 ]`
   is going to have the same entropy as
   `[ 0, 1, 2, 3, 4, 5]`
   with both being 100%. The percentage is calculated by 
`entropy/log_2{nUnique}` so divided by the possible max.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-30 Thread via GitHub


florian-jobs commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3824021872

   > Okay, cool progress on the results!
   > 
   > However, I'm a bit skeptical about your byte estimates for the sizes. Do 
you do extra packing based on the number of bits in your implementation?
   > 
   > The ideal values for the current DDC implementation are 2, 256, and 65,536 
unique values to avoid bit manipulations on lookup (see `AMapToData` 
specializations). Please explicitly compare against these cases and 
double-check your memory calculations.
   > 
   > I'd love to see some results with your idealized input to get a range of 
what to expect vs. what you get.
   > 
   > A recipe for X unique values at length L could be:
   > 
   > 1. Use all X unique values once in sequence
   >(e.g., for X=4: `1,2,3,4`)
   > 2. Double repeatedly until you reach length L
   >
   >* Round 1: `1,2,3,4` → `1,2,3,4,1,2,3,4` (length 8)
   >* Round 2: → `1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4` (length 16)
   >* Round 3: → length 32
   >* ...and so on
   > 
   > I don't know if it's exactly optimal, but it should be pretty good.
   
   Good question! At the moment the codes are still stored as int values by the 
LZW logic, but I’m in the process of changing the storage representation.
   
   Instead of storing one code per array element, I’m implementing a bit-packed 
long wordstream, where codes are packed based on a fixed bit width (derived 
from the maximum emitted code), with the option to extend this to a growing 
bit-width policy later if needed.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-30 Thread via GitHub


Baunsgaard commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3823936897

   Okay, cool progress on the results!
   
   However, I'm a bit skeptical about your byte estimates for the sizes. Do you 
do extra packing based on the number of bits in your implementation?
   
   The ideal values for the current DDC implementation are 2, 256, and 65,536 
unique values to avoid bit manipulations on lookup (see `AMapToData` 
specializations). Please explicitly compare against these cases and 
double-check your memory calculations.
   
   I'd love to see some results with your idealized input to get a range of 
what to expect vs. what you get.
   
   A recipe for X unique values at length L could be:
   
   1. Use all X unique values once in sequence  
  (e.g., for X=4: `1,2,3,4`)
   
   2. Double repeatedly until you reach length L
  - Round 1: `1,2,3,4` → `1,2,3,4,1,2,3,4` (length 8)
  - Round 2: → `1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4` (length 16)
  - Round 3: → length 32
  - ...and so on
   
   I don't know if it's exactly optimal, but it should be pretty good.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-28 Thread via GitHub


LukaDeka commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3813675888

   # Update for benchmarks
   
   ## Addressing the feedback
   
   > 1) What you are looking for is to control the entropy of your data.
   
   I wasn't able to "generate" data that matched a given entropy (percentage), 
but I added a helper function to calculate "Shannon-entropy" for the given 
arrays. It's displayed now in the benchmarks.
   
   > 2) You can generate data that has exploitable patterns specific to LZW.
   
   I added `genPatternLZWOptimal` which features "repeating patterns". Right 
now, it just repeats the same pattern (length 10) twice, but based on my 
observations, any repeating pattern is compressed very well.
   
   > 3) Do not worry about input data that is smaller than 100 elements for 
these experiments.
   
   I adjusted the sizes to `100, 1000, 10.000, 40.000`.
   
   > 4) ...explicitly mention the number of distinct items you have...
   
   `nUnique` is now displayed with the benchmarks.
   
   I also added another `for` loop so that both `nUnique` and `size` are 
incremented:
   ```r
   

   Benchmark: benchmarkUniquesLZWOptimal
   

   
   ... Size: 100 
...
   Size: 100 | nUnique:2 | Entropy:  99.88% | DDC:  52 bytes | 
DDCLZW: 123 bytes | Memory reduction: -136.54% | De-/Compression speedup: 
0.02/0.00 times
   Size: 100 | nUnique:3 | Entropy:  99.66% | DDC: 144 bytes | 
DDCLZW: 151 bytes | Memory reduction:   -4.86% | De-/Compression speedup: 
0.01/0.00 times
   Size: 100 | nUnique:5 | Entropy:  99.41% | DDC: 160 bytes | 
DDCLZW: 187 bytes | Memory reduction:  -16.87% | De-/Compression speedup: 
0.01/0.00 times
   Size: 100 | nUnique:   10 | Entropy:  99.03% | DDC: 200 bytes | 
DDCLZW: 263 bytes | Memory reduction:  -31.50% | De-/Compression speedup: 
0.01/0.00 times
   Size: 100 | nUnique:   20 | Entropy:  83.91% | DDC: 280 bytes | 
DDCLZW: 367 bytes | Memory reduction:  -31.07% | De-/Compression speedup: 
0.01/0.00 times
   Size: 100 | nUnique:   50 | Entropy:  64.25% | DDC: 520 bytes | 
DDCLZW: 607 bytes | Memory reduction:  -16.73% | De-/Compression speedup: 
0.01/0.00 times
   Size: 100 | nUnique:  100 | Entropy:  54.58% | DDC: 920 bytes | 
DDCLZW:1007 bytes | Memory reduction:   -9.46% | De-/Compression speedup: 
0.01/0.00 times
   ... Size: 1000 
...
   Size:1000 | nUnique:2 | Entropy:  99.96% | DDC: 164 bytes | 
DDCLZW: 355 bytes | Memory reduction: -116.46% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:3 | Entropy:  99.93% | DDC:1044 bytes | 
DDCLZW: 439 bytes | Memory reduction:   57.95% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:5 | Entropy:  99.86% | DDC:1060 bytes | 
DDCLZW: 527 bytes | Memory reduction:   50.28% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:   10 | Entropy:  99.64% | DDC:1100 bytes | 
DDCLZW: 659 bytes | Memory reduction:   40.09% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:   20 | Entropy:  98.53% | DDC:1180 bytes | 
DDCLZW: 911 bytes | Memory reduction:   22.80% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:   50 | Entropy:  85.20% | DDC:1420 bytes | 
DDCLZW:1291 bytes | Memory reduction:9.08% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:  100 | Entropy:  72.37% | DDC:1820 bytes | 
DDCLZW:1691 bytes | Memory reduction:7.09% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:  200 | Entropy:  62.91% | DDC:2620 bytes | 
DDCLZW:2491 bytes | Memory reduction:4.92% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique:  500 | Entropy:  53.63% | DDC:6020 bytes | 
DDCLZW:4891 bytes | Memory reduction:   18.75% | De-/Compression speedup: 
0.00/0.00 times
   Size:1000 | nUnique: 1000 | Entropy:  48.25% | DDC:   10020 bytes | 
DDCLZW:8891 bytes | Memory reduction:   11.27% | De-/Compression speedup: 
0.00/0.00 times
   ... Size: 1 
...
   Size:   1 | nUnique:2 | Entropy:  99.99% | DDC:1292 bytes | 
DDCLZW:1147 bytes | Memory reduction:   11.22% | De-/Compression speedup: 
0.00/0.00 times
   Size:   1 | nUnique:3 | Entropy:  99.99% | DDC:   10044 bytes | 
DDCLZW:1379 bytes | Memory reduction:   86.27% | De-/Compression speedup: 
0.00/0.00 times
   Size:   1 | nUnique:5 | Entropy:  99.98% | DDC:   10060 bytes | 
DDCLZW:1719 bytes | Memory reduction:   82.91% | De-/C

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-24 Thread via GitHub


florian-jobs commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3794267132

   I have marked some comments as resolved. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-22 Thread via GitHub


Baunsgaard commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785480221

   > > When you process some of the comments feel free to mark them as resolved!
   > 
   > I wanted to before, but I think I don't have the permission in GitHub to 
do that. Not sure if Florian has it.
   
   Alternatively if you do not have permissions, make a comment saying 
resolved. Then when we go though the PR, it is cleaner.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-22 Thread via GitHub


LukaDeka commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785280548

   > When you process some of the comments feel free to mark them as resolved!
   
   I wanted to before, but I think I don't have the permission in GitHub to do 
that. Not sure if Florian has it.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-22 Thread via GitHub


Baunsgaard commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785258950

   When you process some of the comments feel free to mark them as resolved!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-22 Thread via GitHub


florian-jobs commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785218879

   Status update:
   
   Many methods that operate sequentially on the original mapping have been 
implemented using partial on the fly decoding of the compressed LZW mapping via 
an iterator.
   
   Methods with more complex or non sequential access patterns are not yet 
handled in this way (for example leftMultByMatrixNoPreAgg) and currently fall 
back to DDC. These will be addressed in follow-up work.
   
   Most decompression paths now rely on partial decoding of the LZW mapping 
rather than full materialization. Scalar and unary operations have also been 
implemented.
   
   Several previously reported issues have been fixed. I have reverted the 
unintended formatting changes in the affected files and ensured alignment with 
the existing code style.
   
   I will continue working on the remaining improvements suggested by 
@Baunsgaard and @janniklinde.
   
   What is still missing at this point are more dedicated tests for the 
individual methods to ensure correctness.
   
   Thanks for the detailed feedback and reviews, they were very helpful!
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-21 Thread via GitHub


Baunsgaard commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3781620971

   @LukaDeka  
   Good to see some numbers. However, the ones you have reported are a bit 
unfortunate. I have a few points you should consider:
   
   1. Random data is not very compressible, and in actuality, truly random data 
would tend to make DDC superior for your use case. What you are looking for is 
to control the entropy of your data. If the entropy is low, you should get more 
benefits from LZW; if it is high, then your compression ratio should tend 
towards DDC.
   
   2. As an additional experiment, you can generate data that has exploitable 
patterns specific to LZW. Try to generate some data that is in the "best" 
possible structure. This should ideally show scaling close to (O(sqrt{n})) of 
the input size with standard LZW, while DDC, being a dense format, always has 
(O(n)).
   
   3. Do not worry about input data that is smaller than 100 elements for these 
experiments. For instance, experiments with 1 input row trivially show that 
other encodings can perform better than DDC. It starts getting interesting at 
larger sizes.
   
   4. Control and explicitly mention the number of distinct items you have as a 
parameter for your experiment. Additionally, calculate the entropy and use that 
as an additional measure of compressibility of the data. These two changes will 
improve the experiments.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-21 Thread via GitHub


LukaDeka commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3781072027

   Added a few benchmarks that mostly compare memory as well as operation times 
for methods (so far, only for `getIdx`).
   
   Right now, the comparison is only done for `DDCLZW` with `DDC`.
   
   There are sizable memory savings for datasets with repeating patterns or 
large datasets:
   ```r
   

   Benchmark: benchmarkRandomData
   

   
   Size:   1 | DDC:   61 bytes | DDCLZW:   67 bytes | Memory 
reduction:  -9.84% | De-/Compression speedup: 0.09/0.00 times
   Size:  10 | DDC:   70 bytes | DDCLZW:   95 bytes | Memory 
reduction: -35.71% | De-/Compression speedup: 0.04/0.00 times
   Size: 100 | DDC:  160 bytes | DDCLZW:  299 bytes | Memory 
reduction: -86.87% | De-/Compression speedup: 0.01/0.00 times
   Size:1000 | DDC: 1060 bytes | DDCLZW: 1551 bytes | Memory 
reduction: -46.32% | De-/Compression speedup: 0.00/0.00 times
   Size:   1 | DDC:10060 bytes | DDCLZW:10487 bytes | Memory 
reduction:  -4.24% | De-/Compression speedup: 0.00/0.00 times
   Size:  10 | DDC:   100060 bytes | DDCLZW:78783 bytes | Memory 
reduction:  21.26% | De-/Compression speedup: 0.00/0.00 times
   ``` 
   
   I also added the `De-/Compression speedup` field to compare other 
compression types with each other as well.
   
   I also added a benchmark for the slices, but it doesn't look too useful at 
the moment:
   ```r
   

   Benchmark: benchmarkSlice
   

   
   Size:   1 | Slice[0:0] | DDC:  0 ms | DDCLZW:  1 ms | 
Slowdown: 37.09 times
   Size:  10 | Slice[2:7] | DDC:  0 ms | DDCLZW: 20 ms | 
Slowdown: 1141.72 times
   Size: 100 | Slice[   25:   75] | DDC:  0 ms | DDCLZW:  3 ms | 
Slowdown: 169.34 times
   Size:1000 | Slice[  250:  750] | DDC:  0 ms | DDCLZW:  3 ms | 
Slowdown: 348.98 times
   Size:   1 | Slice[ 2500: 7500] | DDC:  0 ms | DDCLZW:  6 ms | 
Slowdown: 483.40 times
   Size:  10 | Slice[25000:75000] | DDC:  0 ms | DDCLZW: 24 ms | 
Slowdown: 325.22 times
   ```
   
   The file might be in a wrong directory as well and wrongly labeled as a 
"test". We wouldn't want benchmarks running on every GitHub Actions trigger etc.
   
   Would it make more sense to refactor it into a `main` function?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-20 Thread via GitHub


Baunsgaard commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2710528127


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/DDCLZWScheme.java:
##
@@ -0,0 +1,25 @@
+package org.apache.sysds.runtime.compress.colgroup.scheme;
+
+import org.apache.sysds.runtime.compress.colgroup.AColGroup;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupDDCLZW;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+public abstract class DDCLZWScheme extends DDCScheme {
+// TODO: private int nUnique; Zu Datenspezifisch, überhaupt sinnvoll

Review Comment:
   probably, not so meaningfull to implement specialization for the Scheme 
class. 
   
   The main goal of  this is serialization and applying similar schemes to 
other groups. For the project of LZW, it is out of scope. so in my opinion you 
can ignore all Scheme parts.



##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,943 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.ExecutorService;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+import org.apache.arrow.vector.complex.writer.BitWriter;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.*;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.indexes.RangeIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffsetIterator;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.data.SparseRow;
+import org.apache.sysds.runtime.functionobjects.*;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.jboss.netty.handler.codec.compression.CompressionException;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.util.HashMap;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with 
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw

Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-18 Thread via GitHub


LukaDeka commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3765608547

   Added new unit tests for ColGroupDDCLZW (they're subject to change and only 
an initial draft).
   
   They might include redundant/unnecessary checks.
   
   The rest of the methods are also untested. I'll do it later and possibly 
refactor the helper functions for the tests.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-16 Thread via GitHub


Baunsgaard commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2698722547


##
src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupDDCTest.java:
##


Review Comment:
   Similarly here, do not change the indentation of existing methods, and keep 
the correct formatting of the new added elements.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-16 Thread via GitHub


Baunsgaard commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2698716665


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java:
##


Review Comment:
   Please revert the indentation to tabs again, to avoid changing the DDC base 
class.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-16 Thread via GitHub


janniklinde commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3759484118

   Please add some more tests to really verify correctness. For example, you 
should do a full compression and then decompress it again. Then it should be 
compared to the original data


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]

2026-01-16 Thread via GitHub


janniklinde commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2697728126


##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##


Review Comment:
   In general, all not properly implemented methods should throw a 
NotImplementedException.
   Also, you should implement some of the operations that can be done on the 
compressed representation (e.g., scalar ops, unary, ...). Further, 
`getExactSizeOnDisk()` should be implemented



##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,642 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+import org.apache.arrow.vector.complex.writer.BitWriter;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.indexes.RangeIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffsetIterator;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.data.SparseRow;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.jboss.netty.handler.codec.compression.CompressionException;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Stack;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with 
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed.
+ * Idea:
+ * - DDCLZW stores the mapping vector exclusively in compressed form.
+ * - No persistent MapToData cache is maintained.
+ * - Sequential operations decode on-the-f