[
https://issues.apache.org/jira/browse/ORC-192?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16020025#comment-16020025
]
ASF GitHub Bot commented on ORC-192:
------------------------------------
Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/122#discussion_r117822601
--- Diff: c++/src/Compression.cc ---
@@ -636,6 +884,33 @@ DIAGNOSTIC_POP
return static_cast<uint64_t>(result);
}
+ std::unique_ptr<BufferedOutputStream>
+ createCompressor(
+ CompressionKind kind,
+ OutputStream * outStream,
+ CompressionStrategy strategy,
+ uint64_t bufferCapacity,
+ uint64_t blockSize,
+ MemoryPool& pool) {
+ switch (static_cast<int64_t>(kind)) {
+ case CompressionKind_NONE: {
+ return std::unique_ptr<BufferedOutputStream>
+ (new BufferedOutputStream(pool, outStream, bufferCapacity,
blockSize));
+ }
+ case CompressionKind_ZLIB: {
+ int level = (strategy == CompressionStrategy_SPEED) ? -1 : 9;
--- End diff --
According to this - https://orc.apache.org/docs/hive-config.html, there are
only two compression strategies defined: SPEED and COMPRESSION. I also checked
the Java implementation: SPEED maps to zlib level Z_BEST_SPEED + 1, and COMPRESSION
maps to Z_DEFAULT_COMPRESSION. I will do the same for C++.
The Java implementation, for your reference -
WriterImpl.java
`
CompressionCodec result = physicalWriter.getCompressionCodec();
if (result != null) {
switch (kind) {
case BLOOM_FILTER:
case DATA:
case DICTIONARY_DATA:
case BLOOM_FILTER_UTF8:
if (compressionStrategy == OrcFile.CompressionStrategy.SPEED) {
result =
result.modify(EnumSet.of(CompressionCodec.Modifier.FAST,
CompressionCodec.Modifier.TEXT));
} else {
result =
result.modify(EnumSet.of(CompressionCodec.Modifier.DEFAULT,
CompressionCodec.Modifier.TEXT));
}
break;
case LENGTH:
case DICTIONARY_COUNT:
case PRESENT:
case ROW_INDEX:
case SECONDARY:
// easily compressed using the fastest modes
result =
result.modify(EnumSet.of(CompressionCodec.Modifier.FASTEST,
CompressionCodec.Modifier.BINARY));
break;
default:
LOG.info("Missing ORC compression modifiers for " + kind);
break;
}
}
`
ZlibCodec.java
`
public CompressionCodec modify(/* @Nullable */ EnumSet<Modifier> modifiers)
{
if (modifiers == null) {
return this;
}
int l = this.level;
int s = this.strategy;
for (Modifier m : modifiers) {
switch (m) {
case BINARY:
/* filtered == less LZ77, more huffman */
s = Deflater.FILTERED;
break;
case TEXT:
s = Deflater.DEFAULT_STRATEGY;
break;
case FASTEST:
// deflate_fast looking for 8 byte patterns
l = Deflater.BEST_SPEED;
break;
case FAST:
// deflate_fast looking for 16 byte patterns
l = Deflater.BEST_SPEED + 1;
break;
case DEFAULT:
// deflate_slow looking for 128 byte patterns
l = Deflater.DEFAULT_COMPRESSION;
break;
default:
break;
}
}
return new ZlibCodec(l, s);
}
`
> Zlib compression stream
> -----------------------
>
> Key: ORC-192
> URL: https://issues.apache.org/jira/browse/ORC-192
> Project: ORC
> Issue Type: Sub-task
> Components: C++
> Reporter: Xiening Dai
> Assignee: Xiening Dai
>
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)