Hello Brian, hello Kenton, hello list, as an alternative to GzipInputStream and GzipOutputStream, I have written a compression and a decompression stream class which can be stacked onto Protocol Buffers streams. They are named LzipInputStream and LzipOutputStream and use the Lempel-Ziv-Markov chain algorithm (LZMA), as implemented by LZIP: http://www.nongnu.org/lzip/lzip.html
An advantage of using Lzip instead of Gzip is that Lzip supports multi-member compression, so one can jump into the stream at any position, skip forward to the next synchronization boundary, and start reading from there. With the default compression level, Lzip achieves a better compression ratio at the cost of being slower than Gzip; when Lzip is used with a low compression level, however, its speed and output size are comparable to those of Gzip. I would like to donate these classes to the ProtoBuf software repository. They will be released under an OSS license compatible with both LZIP's and Google's. Could someone please review them and tell me in what kind of repository I can publish them? Google's license agreement contains the following passage: "Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission." Since I have to use the name "google" in the C++ namespace of LzipInputStream and LzipOutputStream, I hereby ask for permission to do so. Comments are appreciated, Jacob -- You received this message because you are subscribed to the Google Groups "Protocol Buffers" group. To post to this group, send email to [email protected]. To unsubscribe from this group, send email to [email protected]. For more options, visit this group at http://groups.google.com/group/protobuf?hl=en.
// This file contains the implementation of classes // LzipInputStream and LzipOutputStream used to compress // and decompress Google's Protocol Buffer Streams using // the Lempel-Ziv-Markow-Algorithm. // // Derived from http://protobuf.googlecode.com/svn/tags/2.2.0/src/google/protobuf/io/gzip_stream.cc // Copyright 2009 by Jacob Rief <[email protected]> // Evaluation copy - don't use in production code #include <lzip_stream.h> #include <google/protobuf/stubs/common.h> namespace google { namespace protobuf { namespace io { static const int kDefaultBufferSize = 8192; // === LzipInputStream === LzipInputStream::LzipInputStream(ZeroCopyInputStream* sub_stream) : sub_stream_(sub_stream), finished_(false), output_buffer_length_(kDefaultBufferSize), output_buffer_(operator new(output_buffer_length_)), output_position_(NULL), next_out_(NULL), avail_out_(0), errno_(LZ_ok) { GOOGLE_CHECK(output_buffer_ != NULL); decoder_ = LZ_decompress_open(); errno_ = LZ_decompress_errno(decoder_); GOOGLE_CHECK(errno_ == LZ_ok); } LzipInputStream::~LzipInputStream() { if (decoder_ != NULL) { Close(); } if (output_buffer_ != NULL) { operator delete(output_buffer_); } } bool LzipInputStream::Close() { errno_ = LZ_decompress_errno(decoder_); bool ok = LZ_decompress_close(decoder_) == LZ_ok; decoder_ = NULL; return ok; } // --- implements ZeroCopyInputStream --- bool LzipInputStream::Next(const void** data, int* size) { GOOGLE_CHECK_GE(next_out_, output_position_); if (next_out_ == output_position_) { if (finished_ && LZ_decompress_finished(decoder_)) return false; output_position_ = next_out_ = static_cast<uint8_t*>(output_buffer_); avail_out_ = output_buffer_length_; Decompress(); } *data = output_position_; *size = next_out_ - output_position_; output_position_ = next_out_; return true; } void LzipInputStream::BackUp(int count) { GOOGLE_CHECK_GE(output_position_-static_cast<uint8_t*>(output_buffer_), count); output_position_ -= count; } bool LzipInputStream::Skip(int count) { const void* 
data; int size; bool ok = Next(&data, &size); while (ok && (size < count)) { count -= size; ok = Next(&data, &size); } if (size > count) { BackUp(size - count); } return ok; } int64 LzipInputStream::ByteCount() const { return LZ_decompress_total_out_size(decoder_); } // --- private --- void LzipInputStream::Decompress() { GOOGLE_CHECK_GT(avail_out_, 0); if (!finished_) { int avail_in; const void* next_in; if (sub_stream_->Next(&next_in, &avail_in)) { int bytes_written = LZ_decompress_write(decoder_, static_cast<const uint8_t*>(next_in), avail_in); errno_ = LZ_decompress_errno(decoder_); GOOGLE_CHECK(errno_ == LZ_ok); GOOGLE_CHECK_GE(bytes_written, 0); sub_stream_->BackUp(avail_in - bytes_written); } else { GOOGLE_CHECK(LZ_decompress_finish(decoder_) == LZ_ok); finished_ = true; } } int bytes_read = LZ_decompress_read(decoder_, next_out_, avail_out_); errno_ = LZ_decompress_errno(decoder_); GOOGLE_CHECK(errno_ == LZ_ok); GOOGLE_CHECK_GE(bytes_read, 0); next_out_ += bytes_read; avail_out_ -= bytes_read; } // === LzipOutputStream === LzipOutputStream::LzipOutputStream(ZeroCopyOutputStream* sub_stream, size_t compression_level, int64_t member_size) : input_buffer_length_(kDefaultBufferSize), input_buffer_(operator new(input_buffer_length_)), input_position_(static_cast<uint8_t*>(input_buffer_)), input_buffer_end_(input_position_ + input_buffer_length_), sub_stream_(sub_stream), finished_(false), member_size_(member_size) { GOOGLE_CHECK(input_buffer_ != NULL); GOOGLE_CHECK_GT(compression_level, 0); compression_level--; GOOGLE_CHECK_LT(compression_level, sizeof(options)/sizeof(Options)); encoder_ = LZ_compress_open(options[compression_level].dictionary_size, options[compression_level].match_len_limit, member_size); errno_ = LZ_compress_errno(encoder_); GOOGLE_CHECK(errno_ == LZ_ok); } LzipOutputStream::~LzipOutputStream() { if (encoder_ != NULL) { Close(); } if (input_buffer_ != NULL) { operator delete(input_buffer_); } } bool LzipOutputStream::Flush() { Compress(true); 
input_position_ = static_cast<uint8_t*>(input_buffer_); return true; } bool LzipOutputStream::Close() { if (finished_) return false; Compress(); GOOGLE_CHECK(LZ_compress_finish(encoder_) == LZ_ok); do { int avail_out; void* next_out; if (sub_stream_->Next(&next_out, &avail_out)) { int bytes_read = LZ_compress_read(encoder_, static_cast<uint8_t*>(next_out), avail_out); errno_ = LZ_compress_errno(encoder_); GOOGLE_CHECK(errno_ == LZ_ok); GOOGLE_CHECK_GE(bytes_read, 0); sub_stream_->BackUp(avail_out - bytes_read); } else { // disk full? return false; } } while (!LZ_compress_finished(encoder_)); bool ok = LZ_compress_close(encoder_) == LZ_ok; encoder_ = NULL; return ok; } // --- implements ZeroCopyOutputStream --- bool LzipOutputStream::Next(void** data, int* size) { GOOGLE_CHECK_LE(input_position_, input_buffer_end_); if (input_position_ == input_buffer_end_) { if (finished_) return false; Compress(); *data = input_buffer_; *size = input_buffer_length_; } else { *data = input_position_; *size = input_buffer_end_ - input_position_; } input_position_ = input_buffer_end_; return true; } void LzipOutputStream::BackUp(int count) { GOOGLE_CHECK_LE(input_buffer_length_ - count, input_position_ - static_cast<uint8_t*>(input_buffer_)); input_position_ -= count; } int64 LzipOutputStream::ByteCount() const { return LZ_compress_total_in_size(encoder_); } // --- private --- void LzipOutputStream::Compress(bool flush) { uint8_t* next_in = static_cast<uint8_t*>(input_buffer_); int avail_in = input_position_ - next_in; int bytes_written, bytes_read; do { bytes_written = LZ_compress_write(encoder_, next_in, avail_in); errno_ = LZ_compress_errno(encoder_); GOOGLE_CHECK(errno_ == LZ_ok); GOOGLE_CHECK_GE(bytes_written, 0); next_in += bytes_written; avail_in -= bytes_written; if (flush) { GOOGLE_CHECK(LZ_compress_sync_flush(encoder_) == LZ_ok); flush = false; } int avail_out; void* next_out; if (sub_stream_->Next(&next_out, &avail_out)) { bytes_read = LZ_compress_read(encoder_, 
static_cast<uint8_t*>(next_out), avail_out); errno_ = LZ_compress_errno(encoder_); GOOGLE_CHECK(errno_ == LZ_ok); GOOGLE_CHECK_GE(bytes_read, 0); if (LZ_compress_member_finished(encoder_)==1) { LZ_compress_restart_member(encoder_, member_size_); } sub_stream_->BackUp(avail_out - bytes_read); } else { // disk full? finished_ = true; } } while (bytes_written>0 || bytes_read>0); } const LzipOutputStream::Options LzipOutputStream::options[9] = { { 1 << 20, 10 }, // -1 { 1 << 20, 12 }, // -2 { 1 << 20, 17 }, // -3 { 1 << 21, 26 }, // -4 { 1 << 22, 44 }, // -5 { 1 << 23, 80 }, // -6 { 1 << 24, 108 }, // -7 { 1 << 24, 163 }, // -8 { 1 << 25, 273 } // -9 }; } // namespace io } // namespace protobuf } // namespace google
// This file contains the declaration of classes // LzipInputStream and LzipOutputStream used to compress // and decompress Google's Protocol Buffer Streams using // the Lempel-Ziv-Markow-Algorithm. // // Derived from http://protobuf.googlecode.com/svn/tags/2.2.0/src/google/protobuf/io/gzip_stream.h // Copyright 2009 by Jacob Rief <[email protected]> // Evaluation copy - don't use in production code #ifndef GOOGLE_PROTOBUF_IO_LZIP_STREAM_H__ #define GOOGLE_PROTOBUF_IO_LZIP_STREAM_H__ #include <stdint.h> #include <lzlib.h> #include <limits> #include <google/protobuf/io/zero_copy_stream.h> namespace google { namespace protobuf { namespace io { // A ZeroCopyInputStream that reads compressed data through lzib class LIBPROTOBUF_EXPORT LzipInputStream : public ZeroCopyInputStream { public: explicit LzipInputStream(ZeroCopyInputStream* sub_stream); virtual ~LzipInputStream(); // Releases the decoder. bool Close(); // --- implements ZeroCopyInputStream --- bool Next(const void** data, int* size); void BackUp(int count); bool Skip(int count); int64 ByteCount() const; private: GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(LzipInputStream); void Decompress(); // compressed input stream ZeroCopyInputStream* sub_stream_; bool finished_; // plain text output stream const int output_buffer_length_; void* const output_buffer_; uint8_t* output_position_; uint8_t* next_out_; int avail_out_; // Lzip decoder void* decoder_; LZ_errno errno_; }; class LIBPROTOBUF_EXPORT LzipOutputStream : public ZeroCopyOutputStream { public: // Create a LzipOutputStream with default options. explicit LzipOutputStream(ZeroCopyOutputStream* sub_stream, size_t compression_level = 5, int64_t member_size = std::numeric_limits<long long>::max()); virtual ~LzipOutputStream(); // Flushes data written so far to zipped data in the underlying stream. // It is the caller's responsibility to flush the underlying stream if // necessary. // Compression may be less efficient stopping and starting around flushes. 
// Returns true if no error. bool Flush(); // Writes out all data and closes the lzip stream. // It is the caller's responsibility to close the underlying stream if // necessary. // Returns true if no error. bool Close(); // --- implements ZeroCopyOutputStream --- bool Next(void** data, int* size); void BackUp(int count); int64 ByteCount() const; private: GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(LzipOutputStream); void Compress(bool flush = false); // plain text input stream const int input_buffer_length_; void* const input_buffer_; uint8_t* input_position_; uint8_t* const input_buffer_end_; // compressed output stream ZeroCopyOutputStream* sub_stream_; bool finished_; // Lzip encoder struct Options { int dictionary_size; // 4KiB..512MiB int match_len_limit; // 5..273 }; static const Options options[9]; void* encoder_; int member_size_; LZ_errno errno_; }; } // namespace io } // namespace protobuf } // namespace google #endif // GOOGLE_PROTOBUF_IO_LZIP_STREAM_H__
