Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2935289213
##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+ private static final long serialVersionUID = -5769772089913918987L;
+
+ /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+ private final int[] _dataLZW;
+ private final int _nRows;
+ private final int _nUnique;
+
+ private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict,
AMapToData data, int[] cachedCounts) {
+ super(colIndexes, dict, cachedCounts);
+ _nRows = data.size();
+ _nUnique = dict.getNumberOfValues(colIndexes.size());
+ _dataLZW = compress(data);
+
+ if(CompressedMatrixBlock.debug) {
+ if(getNumValues() == 0)
+ throw new DMLCompressionException("Invalid
construction with empty dictionary");
+ if(_nRows == 0)
+ throw new DMLCompressionException("Invalid
length of the data. is
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2902285913
##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+ private static final long serialVersionUID = -5769772089913918987L;
+
+ /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+ private final int[] _dataLZW;
+ private final int _nRows;
+ private final int _nUnique;
+
+ private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict,
AMapToData data, int[] cachedCounts) {
+ super(colIndexes, dict, cachedCounts);
+ _nRows = data.size();
+ _nUnique = dict.getNumberOfValues(colIndexes.size());
+ _dataLZW = compress(data);
+
+ if(CompressedMatrixBlock.debug) {
+ if(getNumValues() == 0)
+ throw new DMLCompressionException("Invalid
construction with empty dictionary");
+ if(_nRows == 0)
+ throw new DMLCompressionException("Invalid
length of the data. is z
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2896821729
##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+ private static final long serialVersionUID = -5769772089913918987L;
+
+ /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+ private final int[] _dataLZW;
+ private final int _nRows;
+ private final int _nUnique;
+
+ private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict,
AMapToData data, int[] cachedCounts) {
+ super(colIndexes, dict, cachedCounts);
+ _nRows = data.size();
+ _nUnique = dict.getNumberOfValues(colIndexes.size());
+ _dataLZW = compress(data);
+
+ if(CompressedMatrixBlock.debug) {
+ if(getNumValues() == 0)
+ throw new DMLCompressionException("Invalid
construction with empty dictionary");
+ if(_nRows == 0)
+ throw new DMLCompressionException("Invalid
length of the data. is
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
janniklinde commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2871100874
##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+ private static final long serialVersionUID = -5769772089913918987L;
+
+ /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+ private final int[] _dataLZW;
+ private final int _nRows;
+ private final int _nUnique;
+
+ private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict,
AMapToData data, int[] cachedCounts) {
+ super(colIndexes, dict, cachedCounts);
+ _nRows = data.size();
+ _nUnique = dict.getNumberOfValues(colIndexes.size());
+ _dataLZW = compress(data);
+
+ if(CompressedMatrixBlock.debug) {
+ if(getNumValues() == 0)
+ throw new DMLCompressionException("Invalid
construction with empty dictionary");
+ if(_nRows == 0)
+ throw new DMLCompressionException("Invalid
length of the data. is
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
janniklinde commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2839754001
##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,1011 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysds.utils.MemoryEstimates;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.NoSuchElementException;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw compressed. Idea: - DDCLZW stores the
mapping vector exclusively in compressed
+ * form. - No persistent MapToData cache is maintained. - Sequential
operations decode on-the-fly, while operations
+ * requiring random access explicitly materialize and fall back to DDC.
+ */
+public class ColGroupDDCLZW extends APreAgg implements IMapToDataGroup {
+ private static final long serialVersionUID = -5769772089913918987L;
+
+ /**
+* Stores the LZW-compressed representation of data mapping.
+*/
+ private final int[] _dataLZW;
+ private final int _nRows;
+ private final int _nUnique;
+
+ private ColGroupDDCLZW(IColIndex colIndexes, IDictionary dict,
AMapToData data, int[] cachedCounts) {
+ super(colIndexes, dict, cachedCounts);
+ _nRows = data.size();
+ _nUnique = dict.getNumberOfValues(colIndexes.size());
+ _dataLZW = compress(data);
+
+ if(CompressedMatrixBlock.debug) {
+ if(getNumValues() == 0)
+ throw new DMLCompressionException("Invalid
construction with empty dictionary");
+ if(_nRows == 0)
+ throw new DMLCompressionException("Invalid
length of the data. is
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3904758596 I added tests to ensure the reconstructed matrices in the benchmarking class are equivalent. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3877592143 okay, now the memory numbers sound more plausible ! Can you verify on all the tested instances that when we decompress either the DDC or the DDCLZW they reconstruct equivalent matrices? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3846326231 We changed the `ColGroupDDCLZWBenchmark` class to use `estimateInMemorySize()` instead of `getExactSizeOnDisk()` for memory estimation. While `getExactSizeOnDisk()` returns the exact serialized size produced by `write()`, `estimateInMemorySize()` is the intended method in SystemDS for estimating the in-memory footprint of column groups. We also updated `estimateInMemorySize()` to account for the LZW metadata and the LZW mapping. Observation from the “distributed” benchmark: - The absolute byte numbers differ between the two modes (expected: in-memory estimate includes JVM overhead, whereas on-disk size is a compact serialization format). - However, the qualitative behavior and relative trends are very similar between `estimateInMemorySize()` and `getExactSizeOnDisk()` across the tested (size, nUnique) points (i.e., where DDCLZW is beneficial/harmful stays consistent). - As expected, DDCLZW tends to be unfavorable for very small inputs (fixed overhead dominates), while for larger sizes and low-to-moderate nUnique it achieves strong reductions. Around typical DDC representation boundaries (e.g., 256→257, 65536→65537) the baseline DDC memory changes noticeably, which is reflected in the reported reductions as well. Below are the results from `benchmarkDistributed` using both modes for comparison. ```java Benchmark: benchmarkDistributed using estimateInMemorySize ... Size: 100 ... 
Size: 100 | nUnique: 2 | Entropy: 100,00% | DDC: 172 bytes | DDCLZW: 248 bytes | Memory reduction: -44,19% | De-/Compression speedup: 0,01/0,00 times Size: 100 | nUnique: 3 | Entropy: 99,99% | DDC: 280 bytes | DDCLZW: 272 bytes | Memory reduction:2,86% | De-/Compression speedup: 0,01/0,00 times Size: 100 | nUnique: 5 | Entropy: 100,00% | DDC: 296 bytes | DDCLZW: 312 bytes | Memory reduction: -5,41% | De-/Compression speedup: 0,02/0,00 times Size: 100 | nUnique:10 | Entropy: 100,00% | DDC: 336 bytes | DDCLZW: 392 bytes | Memory reduction: -16,67% | De-/Compression speedup: 0,00/0,00 times Size: 100 | nUnique:20 | Entropy: 100,00% | DDC: 416 bytes | DDCLZW: 552 bytes | Memory reduction: -32,69% | De-/Compression speedup: 0,00/0,00 times Size: 100 | nUnique:50 | Entropy: 100,00% | DDC: 656 bytes | DDCLZW: 952 bytes | Memory reduction: -45,12% | De-/Compression speedup: 0,01/0,00 times Size: 100 | nUnique: 100 | Entropy: 100,00% | DDC:1056 bytes | DDCLZW:1352 bytes | Memory reduction: -28,03% | De-/Compression speedup: 0,00/0,00 times ... Size: 10 ... 
Size: 10 | nUnique: 2 | Entropy: 100,00% | DDC:6420 bytes | DDCLZW:2696 bytes | Memory reduction: 58,01% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique: 3 | Entropy: 100,00% | DDC: 100184 bytes | DDCLZW:3272 bytes | Memory reduction: 96,73% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique: 5 | Entropy: 100,00% | DDC: 100200 bytes | DDCLZW:4192 bytes | Memory reduction: 95,82% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique:10 | Entropy: 100,00% | DDC: 100240 bytes | DDCLZW:5872 bytes | Memory reduction: 94,14% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique:20 | Entropy: 100,00% | DDC: 100320 bytes | DDCLZW:8312 bytes | Memory reduction: 91,71% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique:50 | Entropy: 100,00% | DDC: 100560 bytes | DDCLZW: 13152 bytes | Memory reduction: 86,92% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique: 100 | Entropy: 100,00% | DDC: 100960 bytes | DDCLZW: 18952 bytes | Memory reduction: 81,23% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique: 200 | Entropy: 100,00% | DDC: 101760 bytes | DDCLZW: 27352 bytes | Memory reduction: 73,12% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique: 256 | Entropy: 99,99% | DDC: 102208 bytes | DDCLZW: 30896 bytes | Memory reduction: 69,77% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique: 257 | Entropy: 100,00% | DDC: 202216 bytes | DDCLZW: 30992 bytes | Memory reduction: 84,67% | De-/Compression speedup: 0,00/0,00 times Size: 10 | nUnique: 500 | Entropy: 100,00% | DDC: 204160 bytes | DDCLZW: 44152 bytes | Memory reduction: 78,37% | De-/Compression speedup: 0,00/0,00 times
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3842375217 Yes, I think our memory estimates are indeed wrong @Baunsgaard. We were using getExactSizeOnDisk(). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3834193814 I am still very sure the memory estimates are wrong (but I would be happy to be wrong). Please overwrite this method in your DDCLZW file: https://github.com/apache/systemds/blob/3f841b7383a6cc626acbb2193a111d28f5a19404/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java#L745C2-L750C3 Please use the utilities associated with estimating memory size as the other column group does. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
LukaDeka commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3828098919 > A recipe for X unique values at length L could be: I also changed the `genPatternLZWOptimal` to do exactly this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
LukaDeka commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3828074855
I have just tested it with the highest "optimal" values for DDC in the
"distributed" benchmark, so with datasets like:
`[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]`
for `nUnique = 4, size = 16`.
```r
Size: 10 | nUnique: 2 | Entropy: 100.00% | DDC: 12540 bytes |
DDCLZW:2567 bytes | Memory reduction: 79.53% | De-/Compression speedup:
0.00/0.00 times
Size: 10 | nUnique: 3 | Entropy: 100.00% | DDC: 100044 bytes |
DDCLZW:3147 bytes | Memory reduction: 96.85% | De-/Compression speedup:
0.00/0.00 times
...
Size: 10 | nUnique: 256 | Entropy: 99.99% | DDC: 102068 bytes |
DDCLZW: 30767 bytes | Memory reduction: 69.86% | De-/Compression speedup:
0.00/0.00 times
Size: 10 | nUnique: 257 | Entropy: 100.00% | DDC: 202076 bytes |
DDCLZW: 30867 bytes | Memory reduction: 84.73% | De-/Compression speedup:
0.00/0.00 times
...
Size: 10 | nUnique: 65536 | Entropy: 71.34% | DDC: 724308 bytes |
DDCLZW: 787507 bytes | Memory reduction: -8.73% | De-/Compression speedup:
0.00/0.00 times
Size: 10 | nUnique: 65537 | Entropy: 71.34% | DDC: 824316 bytes |
DDCLZW: 787519 bytes | Memory reduction:4.46% | De-/Compression speedup:
0.00/0.00 times
```
There is a big jump at the `2-3` margin, as well as `256-257`. But the
reduction from `65536-65537` isn't that substantial.
Nevertheless, whenever `nUnique/size` approaches `7/10`, DDC and DDCLZW get
similar memory usage results (for `size > 1` approximately). For datasets
with this many unique values, simple compression is expected to make things
worse though.
I have also noticed that the entropy doesn't really influence the
compression rate that much since entropy measures "how distributed" the values
are and not "how they're arranged". So
`[ 0, 1, 2, 0, 1, 2 ]`
is going to have the same entropy as
`[ 0, 1, 2, 3, 4, 5]`
with both being 100%. The percentage is calculated by
`entropy/log_2{nUnique}` so divided by the possible max.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3824021872 > Okay, cool progress on the results! > > However, I'm a bit skeptical about your byte estimates for the sizes. Do you do extra packing based on the number of bits in your implementation? > > The ideal values for the current DDC implementation are 2, 256, and 65,536 unique values to avoid bit manipulations on lookup (see `AMapToData` specializations). Please explicitly compare against these cases and double-check your memory calculations. > > I'd love to see some results with your idealized input to get a range of what to expect vs. what you get. > > A recipe for X unique values at length L could be: > > 1. Use all X unique values once in sequence >(e.g., for X=4: `1,2,3,4`) > 2. Double repeatedly until you reach length L > >* Round 1: `1,2,3,4` → `1,2,3,4,1,2,3,4` (length 8) >* Round 2: → `1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4` (length 16) >* Round 3: → length 32 >* ...and so on > > I don't know if it's exactly optimal, but it should be pretty good. Good question! At the moment the codes are still stored as int values by the LZW logic, but I’m in the process of changing the storage representation. Instead of storing one code per array element, I’m implementing a bit-packed long wordstream, where codes are packed based on a fixed bit width (derived from the maximum emitted code), with the option to extend this to a growing bit-width policy later if needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3823936897 Okay, cool progress on the results! However, I'm a bit skeptical about your byte estimates for the sizes. Do you do extra packing based on the number of bits in your implementation? The ideal values for the current DDC implementation are 2, 256, and 65,536 unique values to avoid bit manipulations on lookup (see `AMapToData` specializations). Please explicitly compare against these cases and double-check your memory calculations. I'd love to see some results with your idealized input to get a range of what to expect vs. what you get. A recipe for X unique values at length L could be: 1. Use all X unique values once in sequence (e.g., for X=4: `1,2,3,4`) 2. Double repeatedly until you reach length L - Round 1: `1,2,3,4` → `1,2,3,4,1,2,3,4` (length 8) - Round 2: → `1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4` (length 16) - Round 3: → length 32 - ...and so on I don't know if it's exactly optimal, but it should be pretty good. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
LukaDeka commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3813675888 # Update for benchmarks ## Addressing the feedback > 1) What you are looking for is to control the entropy of your data. I wasn't able to "generate" data that matched a given entropy (percentage), but I added a helper function to calculate "Shannon-entropy" for the given arrays. It's displayed now in the benchmarks. > 2) You can generate data that has exploitable patterns specific to LZW. I added `genPatternLZWOptimal` which features "repeating patterns". Right now, it just repeats the same pattern (length 10) twice, but based on my observations, any repeating pattern is compressed very well. > 3) Do not worry about input data that is smaller than 100 elements for these experiments. I adjusted the sizes to `100, 1000, 10.000, 40.000`. > 4) ...explicitly mention the number of distinct items you have... `nUnique` is now displayed with the benchmarks. I also added another `for` loop so that both `nUnique` and `size` are incremented: ```r Benchmark: benchmarkUniquesLZWOptimal ... Size: 100 ... 
Size: 100 | nUnique:2 | Entropy: 99.88% | DDC: 52 bytes | DDCLZW: 123 bytes | Memory reduction: -136.54% | De-/Compression speedup: 0.02/0.00 times Size: 100 | nUnique:3 | Entropy: 99.66% | DDC: 144 bytes | DDCLZW: 151 bytes | Memory reduction: -4.86% | De-/Compression speedup: 0.01/0.00 times Size: 100 | nUnique:5 | Entropy: 99.41% | DDC: 160 bytes | DDCLZW: 187 bytes | Memory reduction: -16.87% | De-/Compression speedup: 0.01/0.00 times Size: 100 | nUnique: 10 | Entropy: 99.03% | DDC: 200 bytes | DDCLZW: 263 bytes | Memory reduction: -31.50% | De-/Compression speedup: 0.01/0.00 times Size: 100 | nUnique: 20 | Entropy: 83.91% | DDC: 280 bytes | DDCLZW: 367 bytes | Memory reduction: -31.07% | De-/Compression speedup: 0.01/0.00 times Size: 100 | nUnique: 50 | Entropy: 64.25% | DDC: 520 bytes | DDCLZW: 607 bytes | Memory reduction: -16.73% | De-/Compression speedup: 0.01/0.00 times Size: 100 | nUnique: 100 | Entropy: 54.58% | DDC: 920 bytes | DDCLZW:1007 bytes | Memory reduction: -9.46% | De-/Compression speedup: 0.01/0.00 times ... Size: 1000 ... 
Size:1000 | nUnique:2 | Entropy: 99.96% | DDC: 164 bytes | DDCLZW: 355 bytes | Memory reduction: -116.46% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique:3 | Entropy: 99.93% | DDC:1044 bytes | DDCLZW: 439 bytes | Memory reduction: 57.95% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique:5 | Entropy: 99.86% | DDC:1060 bytes | DDCLZW: 527 bytes | Memory reduction: 50.28% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique: 10 | Entropy: 99.64% | DDC:1100 bytes | DDCLZW: 659 bytes | Memory reduction: 40.09% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique: 20 | Entropy: 98.53% | DDC:1180 bytes | DDCLZW: 911 bytes | Memory reduction: 22.80% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique: 50 | Entropy: 85.20% | DDC:1420 bytes | DDCLZW:1291 bytes | Memory reduction:9.08% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique: 100 | Entropy: 72.37% | DDC:1820 bytes | DDCLZW:1691 bytes | Memory reduction:7.09% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique: 200 | Entropy: 62.91% | DDC:2620 bytes | DDCLZW:2491 bytes | Memory reduction:4.92% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique: 500 | Entropy: 53.63% | DDC:6020 bytes | DDCLZW:4891 bytes | Memory reduction: 18.75% | De-/Compression speedup: 0.00/0.00 times Size:1000 | nUnique: 1000 | Entropy: 48.25% | DDC: 10020 bytes | DDCLZW:8891 bytes | Memory reduction: 11.27% | De-/Compression speedup: 0.00/0.00 times ... Size: 1 ... Size: 1 | nUnique:2 | Entropy: 99.99% | DDC:1292 bytes | DDCLZW:1147 bytes | Memory reduction: 11.22% | De-/Compression speedup: 0.00/0.00 times Size: 1 | nUnique:3 | Entropy: 99.99% | DDC: 10044 bytes | DDCLZW:1379 bytes | Memory reduction: 86.27% | De-/Compression speedup: 0.00/0.00 times Size: 1 | nUnique:5 | Entropy: 99.98% | DDC: 10060 bytes | DDCLZW:1719 bytes | Memory reduction: 82.91% | De-/C
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3794267132 I have marked some comments as resolved. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785480221 > > When you process some of the comments feel free to mark them as resolved! > > I wanted to before, but I think I don't have the permission in GitHub to do that. Not sure if Florian has it. Alternatively if you do not have permissions, make a comment saying resolved. Then when we go though the PR, it is cleaner. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
LukaDeka commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785280548 > When you process some of the comments feel free to mark them as resolved! I wanted to before, but I think I don't have the permission in GitHub to do that. Not sure if Florian has it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785258950 When you process some of the comments feel free to mark them as resolved! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
florian-jobs commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3785218879 Status update: Many methods that operate sequentially on the original mapping have been implemented using partial on-the-fly decoding of the compressed LZW mapping via an iterator. Methods with more complex or non-sequential access patterns are not yet handled in this way (for example leftMultByMatrixNoPreAgg) and currently fall back to DDC. These will be addressed in follow-up work. Most decompression paths now rely on partial decoding of the LZW mapping rather than full materialization. Scalar and unary operations have also been implemented. Several previously reported issues have been fixed. I have reverted the unintended formatting changes in the affected files and ensured alignment with the existing code style. I will continue working on the remaining improvements suggested by @Baunsgaard and @janniklinde. What is still missing at this point are more dedicated tests for the individual methods to ensure correctness. Thanks for the detailed feedback and reviews, they were very helpful! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on PR #2398:
URL: https://github.com/apache/systemds/pull/2398#issuecomment-3781620971
@LukaDeka
Good to see some numbers. However, the ones you have reported are a bit
unfortunate. I have a few points you should consider:
1. Random data is not very compressible, and in actuality, truly random data
would tend to make DDC superior for your use case. What you are looking for is
to control the entropy of your data. If the entropy is low, you should get more
benefits from LZW; if it is high, then your compression ratio should tend
towards DDC.
2. As an additional experiment, you can generate data that has exploitable
patterns specific to LZW. Try to generate some data that is in the "best"
possible structure. This should ideally show scaling close to O(√n) of
the input size with standard LZW, while DDC, being a dense format, always has
O(n).
3. Do not worry about input data that is smaller than 100 elements for these
experiments. For instance, experiments with 1 input row trivially show that
other encodings can perform better than DDC. It starts getting interesting at
larger sizes.
4. Control and explicitly mention the number of distinct items you have as a
parameter for your experiment. Additionally, calculate the entropy and use that
as an additional measure of compressibility of the data. These two changes will
improve the experiments.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
LukaDeka commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3781072027 Added a few benchmarks that mostly compare memory as well as operation times for methods (so far, only for `getIdx`). Right now, the comparison is only done for `DDCLZW` with `DDC`. There are sizable memory savings for datasets with repeating patterns or large datasets: ```r Benchmark: benchmarkRandomData Size: 1 | DDC: 61 bytes | DDCLZW: 67 bytes | Memory reduction: -9.84% | De-/Compression speedup: 0.09/0.00 times Size: 10 | DDC: 70 bytes | DDCLZW: 95 bytes | Memory reduction: -35.71% | De-/Compression speedup: 0.04/0.00 times Size: 100 | DDC: 160 bytes | DDCLZW: 299 bytes | Memory reduction: -86.87% | De-/Compression speedup: 0.01/0.00 times Size:1000 | DDC: 1060 bytes | DDCLZW: 1551 bytes | Memory reduction: -46.32% | De-/Compression speedup: 0.00/0.00 times Size: 1 | DDC:10060 bytes | DDCLZW:10487 bytes | Memory reduction: -4.24% | De-/Compression speedup: 0.00/0.00 times Size: 10 | DDC: 100060 bytes | DDCLZW:78783 bytes | Memory reduction: 21.26% | De-/Compression speedup: 0.00/0.00 times ``` I also added the `De-/Compression speedup` field to compare other compression types with each other as well. I also added a benchmark for the slides, but it doesn't look too useful at the moment: ```r Benchmark: benchmarkSlice Size: 1 | Slice[0:0] | DDC: 0 ms | DDCLZW: 1 ms | Slowdown: 37.09 times Size: 10 | Slice[2:7] | DDC: 0 ms | DDCLZW: 20 ms | Slowdown: 1141.72 times Size: 100 | Slice[ 25: 75] | DDC: 0 ms | DDCLZW: 3 ms | Slowdown: 169.34 times Size:1000 | Slice[ 250: 750] | DDC: 0 ms | DDCLZW: 3 ms | Slowdown: 348.98 times Size: 1 | Slice[ 2500: 7500] | DDC: 0 ms | DDCLZW: 6 ms | Slowdown: 483.40 times Size: 10 | Slice[25000:75000] | DDC: 0 ms | DDCLZW: 24 ms | Slowdown: 325.22 times ``` The file might be in a wrong directory as well and wrongly labeled as a "test". We wouldn't want benchmarks running on every GitHub Actions trigger etc. 
Would it make more sense to refactor it into a `main` function? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on code in PR #2398:
URL: https://github.com/apache/systemds/pull/2398#discussion_r2710528127
##
src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/DDCLZWScheme.java:
##
@@ -0,0 +1,25 @@
+package org.apache.sysds.runtime.compress.colgroup.scheme;
+
+import org.apache.sysds.runtime.compress.colgroup.AColGroup;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupDDCLZW;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+public abstract class DDCLZWScheme extends DDCScheme {
+// TODO: private int nUnique; Zu Datenspezifisch, überhaupt sinnvoll
Review Comment:
probably, not so meaningfull to implement specialization for the Scheme
class.
The main goal of this is serialization and applying similar schemes to
other groups. For the project of LZW, it is out of scope. so in my opinion you
can ignore all Scheme parts.
##
src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java:
##
@@ -0,0 +1,943 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.ExecutorService;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+import org.apache.arrow.vector.complex.writer.BitWriter;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.*;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.indexes.RangeIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffsetIterator;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCLZWScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.DDCScheme;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
+import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
+import org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory;
+import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.data.SparseRow;
+import org.apache.sysds.runtime.functionobjects.*;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+import org.jboss.netty.handler.codec.compression.CompressionException;
+import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList;
+import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+
+import java.util.HashMap;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
dense dictionary encoding (DDC) whose
+ * mapping vector is additionally lzw
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
LukaDeka commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3765608547 Added new unit tests for ColGroupDDCLZW (they're subject to change and only an initial draft). They might include redundant/unnecessary checks. The rest of the methods are also untested. I'll do it later and possibly refactor the helper functions for the tests. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on code in PR #2398: URL: https://github.com/apache/systemds/pull/2398#discussion_r2698722547 ## src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupDDCTest.java: ## Review Comment: Similarly here, do not change the indentation of existing methods, and keep the correct formatting of the new added elements. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
Baunsgaard commented on code in PR #2398: URL: https://github.com/apache/systemds/pull/2398#discussion_r2698716665 ## src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java: ## Review Comment: Please revert the indentation to tabs again, to avoid changing the DDC base class. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
janniklinde commented on PR #2398: URL: https://github.com/apache/systemds/pull/2398#issuecomment-3759484118 Please add some more tests to really verify correctness. For example, you should do a full compression and then decompress it again. Then it should be compared to the original data -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] [SYSTEMDS-3779] Add ColGroupDDCLZW with LZW-compressed MapToData [systemds]
janniklinde commented on code in PR #2398: URL: https://github.com/apache/systemds/pull/2398#discussion_r2697728126 ## src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java: ## Review Comment: In general, all not properly implemented methods should throw a NotImplementedException. Also, you should implement some of the operations that can be done on the compressed representation (e.g., scalar ops, unary, ...). Further, `getExactSizeOnDisk()` should be implemented ## src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCLZW.java: ## @@ -0,0 +1,642 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.compress.colgroup; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.concurrent.ExecutorService; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; +import org.apache.arrow.vector.complex.writer.BitWriter; +import org.apache.commons.lang3.NotImplementedException; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.compress.CompressedMatrixBlock; +import org.apache.sysds.runtime.compress.DMLCompressionException; +import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P; +import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; +import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary; +import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary; +import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; +import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; +import org.apache.sysds.runtime.compress.colgroup.indexes.RangeIndex; +import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; +import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffsetIterator; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; +import org.apache.sysds.runtime.compress.colgroup.scheme.DDCScheme; +import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme; +import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator; +import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; +import org.apache.sysds.runtime.compress.estim.EstimationFactors; +import 
org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory; +import org.apache.sysds.runtime.compress.estim.encoding.IEncode; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.data.SparseBlockMCSR; +import org.apache.sysds.runtime.data.SparseRow; +import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.functionobjects.Minus; +import org.apache.sysds.runtime.functionobjects.Plus; +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; +import org.apache.sysds.runtime.matrix.operators.RightScalarOperator; +import org.apache.sysds.runtime.matrix.operators.ScalarOperator; +import org.apache.sysds.runtime.matrix.operators.UnaryOperator; +import org.jboss.netty.handler.codec.compression.CompressionException; +import shaded.parquet.it.unimi.dsi.fastutil.ints.IntArrayList; +import shaded.parquet.it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap; + + +import java.util.Map; +import java.util.HashMap; +import java.util.Stack; + +/** + * Class to encapsulate information about a column group that is encoded with dense dictionary encoding (DDC) whose + * mapping vector is additionally lzw compressed. + * Idea: + * - DDCLZW stores the mapping vector exclusively in compressed form. + * - No persistent MapToData cache is maintained. + * - Sequential operations decode on-the-f
