This is an automated email from the ASF dual-hosted git repository.
shangxinli pushed a commit to branch parquet-1.12.x
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/parquet-1.12.x by this push:
new 7597c74 PARQUET-2052: Integer overflow when writing huge binary using
dictionary encoding (#910)
7597c74 is described below
commit 7597c74c55c02980021eb9e8c5345ea0d86fa62a
Author: Chao Sun <[email protected]>
AuthorDate: Wed May 26 00:43:22 2021 -0700
PARQUET-2052: Integer overflow when writing huge binary using dictionary
encoding (#910)
---
.../column/values/dictionary/DictionaryValuesWriter.java | 6 +++---
.../parquet/column/values/dictionary/TestDictionary.java | 15 +++++++++++++++
pom.xml | 4 ++++
3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
index 2999f3c..c4a9852 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
@@ -81,7 +81,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter implements Req
protected boolean dictionaryTooBig;
/* current size in bytes the dictionary will take once serialized */
- protected int dictionaryByteSize;
+ protected long dictionaryByteSize;
/* size in bytes of the dictionary at the end of last dictionary encoded
page (in case the current page falls back to PLAIN) */
protected int lastUsedDictionaryByteSize;
@@ -173,7 +173,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter implements Req
BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
// remember size of dictionary when we last wrote a page
lastUsedDictionarySize = getDictionarySize();
- lastUsedDictionaryByteSize = dictionaryByteSize;
+ lastUsedDictionaryByteSize = Math.toIntExact(dictionaryByteSize);
return bytes;
} catch (IOException e) {
throw new ParquetEncodingException("could not encode the values", e);
@@ -249,7 +249,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter implements Req
id = binaryDictionaryContent.size();
binaryDictionaryContent.put(v.copy(), id);
// length as int (4 bytes) + actual bytes
- dictionaryByteSize += 4 + v.length();
+ dictionaryByteSize += 4L + v.length();
}
encodedValues.add(id);
}
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
index 2783b69..174fad8 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
@@ -53,6 +53,7 @@ import org.apache.parquet.column.values.plain.PlainValuesReader;
import org.apache.parquet.column.values.plain.PlainValuesWriter;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.mockito.Mockito;
public class TestDictionary {
@@ -172,6 +173,20 @@ public class TestDictionary {
}
@Test
+ public void testBinaryDictionaryIntegerOverflow() {
+ Binary mock = Mockito.mock(Binary.class);
+ Mockito.when(mock.length()).thenReturn(Integer.MAX_VALUE - 1);
+ // make the writer happy
+ Mockito.when(mock.copy()).thenReturn(Binary.fromString(" world"));
+
+ final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(100, 100);
+ cw.writeBytes(Binary.fromString("hello"));
+ cw.writeBytes(mock);
+
+ assertEquals(PLAIN, cw.getEncoding());
+ }
+
+ @Test
public void testBinaryDictionaryChangedValues() throws IOException {
int COUNT = 100;
ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(200, 10000);
diff --git a/pom.xml b/pom.xml
index 30b65ab..8916ccc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -506,6 +506,10 @@
</excludeModules>
<excludes>
<exclude>${shade.prefix}</exclude>
+ <!-- In PARQUET-2052 this field is changed from int to long which is a minor API
+ change to fix a integer overflow issue.
+ TODO: remove this after Parquet 1.13 release -->
+ <exclude>org.apache.parquet.column.values.dictionary.DictionaryValuesWriter#dictionaryByteSize</exclude>
</excludes>
</parameter>
</configuration>