Repository: orc Updated Branches: refs/heads/master 6fa860f2e -> aad3581bc
ORC-8. Reimplement file-metadata to use the reader API. (omalley reviewed by asandryh) Signed-off-by: Owen O'Malley <[email protected]> Fixes apache/orc#15 Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/aad3581b Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/aad3581b Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/aad3581b Branch: refs/heads/master Commit: aad3581bc9f66d8f16eacaf87949f743e548f20d Parents: 6fa860f Author: Owen O'Malley <[email protected]> Authored: Fri Jan 15 12:48:41 2016 -0800 Committer: Owen O'Malley <[email protected]> Committed: Tue Jan 19 13:11:34 2016 -0800 ---------------------------------------------------------------------- c++/include/orc/Reader.hh | 114 +++++++++++++++- c++/src/Reader.cc | 301 ++++++++++++++++++++++++++++++++++++----- proto/orc_proto.proto | 3 +- tools/src/FileMetadata.cc | 287 +++++++++++++++++++-------------------- 4 files changed, 518 insertions(+), 187 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/c++/include/orc/Reader.hh ---------------------------------------------------------------------- diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index d924fbf..b6c5480 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -40,6 +40,22 @@ namespace orc { }; /** + * Get the name of the CompressionKind. + */ + std::string compressionKindToString(CompressionKind kind); + + enum WriterVersion { + WriterVersion_ORIGINAL = 0, + WriterVersion_HIVE_8732 = 1, + WriterVersion_HIVE_4243 = 2 + }; + + /** + * Get the name of the WriterVersion. + */ + std::string writerVersionToString(WriterVersion kind); + + /** * Statistics that are available for all types of columns. */ class ColumnStatistics { @@ -337,6 +353,41 @@ namespace orc { virtual int64_t getMaximum() const = 0; }; + enum StreamKind { + StreamKind_PRESENT = 0, + StreamKind_DATA = 1, + StreamKind_LENGTH = 2, + StreamKind_DICTIONARY_DATA = 3, + StreamKind_DICTIONARY_COUNT = 4, + StreamKind_SECONDARY = 5, + StreamKind_ROW_INDEX = 6, + StreamKind_BLOOM_FILTER = 7 + }; + + /** + * Get the string representation of the StreamKind. + */ + std::string streamKindToString(StreamKind kind); + + class StreamInformation { + public: + virtual ~StreamInformation(); + + virtual StreamKind getKind() const = 0; + virtual uint64_t getColumnId() const = 0; + virtual uint64_t getOffset() const = 0; + virtual uint64_t getLength() const = 0; + }; + + enum ColumnEncodingKind { + ColumnEncodingKind_DIRECT = 0, + ColumnEncodingKind_DICTIONARY = 1, + ColumnEncodingKind_DIRECT_V2 = 2, + ColumnEncodingKind_DICTIONARY_V2 = 3 + }; + + std::string columnEncodingKindToString(ColumnEncodingKind kind); + class StripeInformation { public: virtual ~StripeInformation(); @@ -376,6 +427,35 @@ namespace orc { * @return a count of the number of rows */ virtual uint64_t getNumberOfRows() const = 0; + + /** + * Get the number of streams in the stripe. + */ + virtual uint64_t getNumberOfStreams() const = 0; + + /** + * Get the StreamInformation for the given stream. + */ + virtual ORC_UNIQUE_PTR<StreamInformation> + getStreamInformation(uint64_t streamId) const = 0; + + /** + * Get the column encoding for the given column. + * @param colId the columnId + */ + virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0; + + /** + * Get the dictionary size. + * @param colId the columnId + * @return the size of the dictionary or 0 if there isn't one + */ + virtual uint64_t getDictionarySize(uint64_t colId) const = 0; + + /** + * Get the writer timezone. + */ + virtual const std::string& getWriterTimezone() const = 0; }; class Statistics { @@ -616,6 +696,12 @@ namespace orc { virtual uint64_t getCompressionSize() const = 0; /** + * Get the version of the writer. + * @return the version of the writer. + */ + virtual WriterVersion getWriterVersion() const = 0; + + /** * Get the number of rows per a entry in the row index. * @return the number of rows per an entry in the row index or 0 if there * is no row index. @@ -651,12 +737,36 @@ namespace orc { getStripeStatistics(uint64_t stripeIndex) const = 0; /** - * Get the length of the file. - * @return the number of bytes in the file + * Get the length of the data stripes in the file. + * @return the number of bytes in stripes */ virtual uint64_t getContentLength() const = 0; /** + * Get the length of the file stripe statistics + * @return the number of compressed bytes in the file stripe statistics + */ + virtual uint64_t getStripeStatisticsLength() const = 0; + + /** + * Get the length of the file footer + * @return the number of compressed bytes in the file footer + */ + virtual uint64_t getFileFooterLength() const = 0; + + /** + * Get the length of the file postscript + * @return the number of bytes in the file postscript + */ + virtual uint64_t getFilePostscriptLength() const = 0; + + /** + * Get the total length of the file. + * @return the number of bytes in the file + */ + virtual uint64_t getFileLength() const = 0; + + /** * Get the statistics about the columns in the file. * @return the information about the column */ http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/c++/src/Reader.cc ---------------------------------------------------------------------- diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 940ef16..29bd439 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -38,6 +38,36 @@ namespace orc { + std::string compressionKindToString(CompressionKind kind) { + switch (kind) { + case CompressionKind_NONE: + return "none"; + case CompressionKind_ZLIB: + return "zlib"; + case CompressionKind_SNAPPY: + return "snappy"; + case CompressionKind_LZO: + return "LZO"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string writerVersionToString(WriterVersion version) { + switch (version) { + case WriterVersion_ORIGINAL: + return "original"; + case WriterVersion_HIVE_8732: + return "HIVE-8732"; + case WriterVersion_HIVE_4243: + return "HIVE-4243"; + } + std::stringstream buffer; + buffer << "future - " << version; + return buffer.str(); + } + struct ReaderOptionsPrivate { bool setIndexes; bool setNames; @@ -199,8 +229,12 @@ namespace orc { return privateBits->serializedTail; } - StripeInformation::~StripeInformation() { + StreamInformation::~StreamInformation() { + // PASS + } + StripeInformation::~StripeInformation() { + // PASS } class ColumnStatisticsImpl: public ColumnStatistics { @@ -746,28 +780,123 @@ namespace orc { } }; + std::string streamKindToString(StreamKind kind) { + switch (kind) { + case StreamKind_PRESENT: + return "present"; + case StreamKind_DATA: + return "data"; + case StreamKind_LENGTH: + return "length"; + case StreamKind_DICTIONARY_DATA: + return "dictionary"; + case StreamKind_DICTIONARY_COUNT: + return "dictionary count"; + case StreamKind_SECONDARY: + return "secondary"; + case StreamKind_ROW_INDEX: + return "index"; + case StreamKind_BLOOM_FILTER: + return "bloom"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string columnEncodingKindToString(ColumnEncodingKind kind) { + switch (kind) { + case ColumnEncodingKind_DIRECT: + return "direct"; + case ColumnEncodingKind_DICTIONARY: + return "dictionary"; + case ColumnEncodingKind_DIRECT_V2: + return "direct rle2"; + case ColumnEncodingKind_DICTIONARY_V2: + return "dictionary rle2"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + class StreamInformationImpl: public StreamInformation { + private: + StreamKind kind; + uint64_t column; + uint64_t offset; + uint64_t length; + public: + StreamInformationImpl(uint64_t _offset, + const proto::Stream& stream + ): kind(static_cast<StreamKind>(stream.kind())), + column(stream.column()), + offset(_offset), + length(stream.length()) { + // PASS + } + + ~StreamInformationImpl(); + + StreamKind getKind() const override { + return kind; + } + + uint64_t getColumnId() const override { + return column; + } + + uint64_t getOffset() const override { + return offset; + } + + uint64_t getLength() const override { + return length; + } + }; + + StreamInformationImpl::~StreamInformationImpl() { + // PASS + } + class StripeInformationImpl : public StripeInformation { uint64_t offset; uint64_t indexLength; uint64_t dataLength; uint64_t footerLength; uint64_t numRows; - + InputStream* stream; + MemoryPool& memory; + CompressionKind compression; + uint64_t blockSize; + mutable std::unique_ptr<proto::StripeFooter> stripeFooter; + void ensureStripeFooterLoaded() const; public: StripeInformationImpl(uint64_t _offset, uint64_t _indexLength, uint64_t _dataLength, uint64_t _footerLength, - uint64_t _numRows) : - offset(_offset), - indexLength(_indexLength), - dataLength(_dataLength), - footerLength(_footerLength), - numRows(_numRows) - {} - - virtual ~StripeInformationImpl(); + uint64_t _numRows, + InputStream* _stream, + MemoryPool& _memory, + CompressionKind _compression, + uint64_t _blockSize + ) : offset(_offset), + indexLength(_indexLength), + dataLength(_dataLength), + footerLength(_footerLength), + numRows(_numRows), + stream(_stream), + memory(_memory), + compression(_compression), + blockSize(_blockSize) { + // PASS + } + + virtual ~StripeInformationImpl() { + // PASS + } uint64_t getOffset() const override { return offset; @@ -791,8 +920,68 @@ namespace orc { uint64_t getNumberOfRows() const override { return numRows; } + + uint64_t getNumberOfStreams() const override { + ensureStripeFooterLoaded(); + return static_cast<uint64_t>(stripeFooter->streams_size()); + } + + std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId + ) const override; + + ColumnEncodingKind getColumnEncoding(uint64_t colId) const override { + ensureStripeFooterLoaded(); + return static_cast<ColumnEncodingKind>(stripeFooter-> + columns(static_cast<int>(colId)) + .kind()); + } + + uint64_t getDictionarySize(uint64_t colId) const override { + ensureStripeFooterLoaded(); + return static_cast<ColumnEncodingKind>(stripeFooter-> + columns(static_cast<int>(colId)) + .dictionarysize()); + } + + const std::string& getWriterTimezone() const override { + ensureStripeFooterLoaded(); + return stripeFooter->writertimezone(); + } }; + void StripeInformationImpl::ensureStripeFooterLoaded() const { + if (stripeFooter.get() == nullptr) { + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(stream, + offset + + indexLength + + dataLength, + footerLength, + memory)), + blockSize, + memory); + stripeFooter.reset(new proto::StripeFooter()); + if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse the stripe footer"); + } + } + } + + std::unique_ptr<StreamInformation> + StripeInformationImpl::getStreamInformation(uint64_t streamId) const { + ensureStripeFooterLoaded(); + uint64_t streamOffset = offset; + for(uint64_t s=0; s < streamId; ++s) { + streamOffset += stripeFooter->streams(static_cast<int>(s)).length(); + } + return ORC_UNIQUE_PTR<StreamInformation> + (new StreamInformationImpl(streamOffset, + stripeFooter-> + streams(static_cast<int>(streamId)))); + } + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, bool correctStats) { if (s.has_intstatistics()) { @@ -869,10 +1058,6 @@ namespace orc { // PASS } - StripeInformationImpl::~StripeInformationImpl() { - // PASS - } - static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024; class ReaderImpl : public Reader { @@ -882,7 +1067,8 @@ namespace orc { // inputs std::unique_ptr<InputStream> stream; ReaderOptions options; - const uint64_t footerStart; + const uint64_t fileLength; + const uint64_t postscriptLength; std::vector<bool> selectedColumns; // custom memory pool @@ -931,13 +1117,15 @@ namespace orc { * @param options options for reading * @param postscript the postscript for the file * @param footer the footer for the file - * @param footerStart the byte offset of the start of the footer + * @param fileLength the length of the file in bytes + * @param postscriptLength the length of the postscript in bytes */ ReaderImpl(std::unique_ptr<InputStream> stream, const ReaderOptions& options, std::unique_ptr<proto::PostScript> postscript, std::unique_ptr<proto::Footer> footer, - uint64_t footerStart); + uint64_t fileLength, + uint64_t postscriptLength); const ReaderOptions& getReaderOptions() const; @@ -945,6 +1133,8 @@ namespace orc { std::string getFormatVersion() const override; + WriterVersion getWriterVersion() const override; + uint64_t getNumberOfRows() const override; uint64_t getRowIndexStride() const override; @@ -971,6 +1161,10 @@ namespace orc { uint64_t getContentLength() const override; + uint64_t getStripeStatisticsLength() const override; + uint64_t getFileFooterLength() const override; + uint64_t getFilePostscriptLength() const override; + uint64_t getFileLength() const override; std::unique_ptr<Statistics> getStatistics() const override; @@ -1039,11 +1233,13 @@ namespace orc { const ReaderOptions& opts, std::unique_ptr<proto::PostScript> _postscript, std::unique_ptr<proto::Footer> _footer, - uint64_t _footerStart + uint64_t _fileLength, + uint64_t _postscriptLength ): epochOffset(getEpochOffset()), stream(std::move(input)), options(opts), - footerStart(_footerStart), + fileLength(_fileLength), + postscriptLength(_postscriptLength), memoryPool(*opts.getMemoryPool()), postscript(std::move(_postscript)), blockSize(getCompressionBlockSize(*postscript)), @@ -1114,7 +1310,8 @@ namespace orc { mutable_ps->CopyFrom(*postscript); proto::Footer *mutableFooter = tail.mutable_footer(); mutableFooter->CopyFrom(*footer); - tail.set_footerstart(footerStart); + tail.set_filelength(fileLength); + tail.set_postscriptlength(postscriptLength); std::string result; if (!tail.SerializeToString(&result)) { throw ParseError("Failed to serialize file tail"); @@ -1160,7 +1357,11 @@ namespace orc { stripeInfo.indexlength(), stripeInfo.datalength(), stripeInfo.footerlength(), - stripeInfo.numberofrows())); + stripeInfo.numberofrows(), + stream.get(), + memoryPool, + compression, + blockSize)); } std::string ReaderImpl::getFormatVersion() const { @@ -1178,10 +1379,33 @@ namespace orc { return footer->numberofrows(); } + WriterVersion ReaderImpl::getWriterVersion() const { + if (!postscript->has_writerversion()) { + return WriterVersion_ORIGINAL; + } + return static_cast<WriterVersion>(postscript->writerversion()); + } + uint64_t ReaderImpl::getContentLength() const { return footer->contentlength(); } + uint64_t ReaderImpl::getStripeStatisticsLength() const { + return postscript->metadatalength(); + } + + uint64_t ReaderImpl::getFileFooterLength() const { + return postscript->footerlength(); + } + + uint64_t ReaderImpl::getFilePostscriptLength() const { + return postscriptLength; + } + + uint64_t ReaderImpl::getFileLength() const { + return fileLength; + } + uint64_t ReaderImpl::getRowIndexStride() const { return footer->rowindexstride(); } @@ -1255,7 +1479,8 @@ namespace orc { void ReaderImpl::readMetadata() const { uint64_t metadataSize = postscript->metadatalength(); - uint64_t metadataStart = footerStart - metadataSize; + uint64_t metadataStart = fileLength - metadataSize + - postscript->footerlength() - postscriptLength - 1; if (metadataSize != 0) { std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(compression, @@ -1331,7 +1556,7 @@ namespace orc { } bool ReaderImpl::hasCorrectStatistics() const { - return postscript->has_writerversion() && postscript->writerversion(); + return getWriterVersion() != WriterVersion_ORIGINAL; } proto::StripeFooter ReaderImpl::getStripeFooter @@ -1711,8 +1936,9 @@ namespace orc { MemoryPool *memoryPool = options.getMemoryPool(); std::unique_ptr<proto::PostScript> ps; std::unique_ptr<proto::Footer> footer; - uint64_t footerStart; std::string serializedFooter = options.getSerializedFileTail(); + uint64_t fileLength; + uint64_t postscriptLength; if (serializedFooter.length() != 0) { // Parse the file tail from the serialized one. proto::FileTail tail; @@ -1721,30 +1947,30 @@ namespace orc { } ps.reset(new proto::PostScript(tail.postscript())); footer.reset(new proto::Footer(tail.footer())); - footerStart = tail.footerstart(); + fileLength = tail.filelength(); + postscriptLength = tail.postscriptlength(); } else { // figure out the size of the file using the option or filesystem - uint64_t size = std::min(options.getTailLocation(), - static_cast<uint64_t>(stream->getLength())); + fileLength = std::min(options.getTailLocation(), + static_cast<uint64_t>(stream->getLength())); //read last bytes into buffer to get PostScript - uint64_t readSize = std::min(size, DIRECTORY_SIZE_GUESS); + uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS); if (readSize < 4) { throw ParseError("File size too small"); } DataBuffer<char> *buffer = new DataBuffer<char>(*memoryPool, readSize); - stream->read(buffer->data(), readSize, size - readSize); + stream->read(buffer->data(), readSize, fileLength - readSize); - uint64_t postscriptSize = buffer->data()[readSize - 1] & 0xff; - ps = readPostscript(stream.get(), buffer, postscriptSize); + postscriptLength = buffer->data()[readSize - 1] & 0xff; + ps = readPostscript(stream.get(), buffer, postscriptLength); uint64_t footerSize = ps->footerlength(); - uint64_t tailSize = 1 + postscriptSize + footerSize; - footerStart = size - tailSize; + uint64_t tailSize = 1 + postscriptLength + footerSize; uint64_t footerOffset; if (tailSize > readSize) { buffer->resize(footerSize); - stream->read(buffer->data(), footerSize, size - tailSize); + stream->read(buffer->data(), footerSize, fileLength - tailSize); footerOffset = 0; } else { footerOffset = readSize - tailSize; @@ -1758,7 +1984,8 @@ namespace orc { options, std::move(ps), std::move(footer), - footerStart)); + fileLength, + postscriptLength)); } ColumnStatistics::~ColumnStatistics() { @@ -1952,7 +2179,7 @@ namespace orc { _hasMinimum = false; _hasMaximum = false; _hasTotalLength = false; - + totalLength = 0; }else{ const proto::StringStatistics& stats = pb.stringstatistics(); http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/proto/orc_proto.proto ---------------------------------------------------------------------- diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index 21b7f7c..502667f 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -222,5 +222,6 @@ message PostScript { message FileTail { optional PostScript postscript = 1; optional Footer footer = 2; - optional uint64 footerStart = 3; + optional uint64 fileLength = 3; + optional uint64 postscriptLength = 4; } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/tools/src/FileMetadata.cc ---------------------------------------------------------------------- diff --git a/tools/src/FileMetadata.cc b/tools/src/FileMetadata.cc index 13db666..950e955 100644 --- a/tools/src/FileMetadata.cc +++ b/tools/src/FileMetadata.cc @@ -16,168 +16,161 @@ * limitations under the License. */ +#include <getopt.h> #include <iostream> #include <fstream> #include <vector> #include <string> #include <sstream> -#include <iomanip> -#include "wrap/orc-proto-wrapper.hh" #include "orc/OrcFile.hh" -using namespace orc::proto; - -uint64_t getTotalPaddingSize(const Footer& footer) { - uint64_t paddedBytes = 0; - StripeInformation stripe; - for (int stripeIx=1; stripeIx<footer.stripes_size(); stripeIx++) { - stripe = footer.stripes(stripeIx-1); - uint64_t prevStripeOffset = stripe.offset(); - uint64_t prevStripeLen = stripe.datalength() + stripe.indexlength() + - stripe.footerlength(); - paddedBytes += footer.stripes(stripeIx).offset() - - (prevStripeOffset + prevStripeLen); - }; - return paddedBytes; +void printStripeInformation(std::ostream& out, + uint64_t index, + uint64_t columns, + std::unique_ptr<orc::StripeInformation> stripe, + bool verbose) { + out << " { \"stripe\": " << index + << ", \"rows\": " << stripe->getNumberOfRows() << ",\n"; + out << " \"offset\": " << stripe->getOffset() + << ", \"length\": " << stripe->getLength() << ",\n"; + out << " \"index\": " << stripe->getIndexLength() + << ", \"data\": " << stripe->getDataLength() + << ", \"footer\": " << stripe->getFooterLength(); + if (verbose) { + out << ",\n \"encodings\": [\n"; + for(uint64_t col=0; col < columns; ++col) { + if (col != 0) { + out << ",\n"; + } + orc::ColumnEncodingKind encoding = stripe->getColumnEncoding(col); + out << " { \"column\": " << col + << ", \"encoding\": \"" + << columnEncodingKindToString(encoding) << "\""; + if (encoding == orc::ColumnEncodingKind_DICTIONARY || + encoding == orc::ColumnEncodingKind_DICTIONARY_V2) { + out << ", \"count\": " << stripe->getDictionarySize(col); + } + out << " }"; + } + out << "\n ],\n"; + out << " \"streams\": [\n"; + for(uint64_t str = 0; str < stripe->getNumberOfStreams(); ++str) { + if (str != 0) { + out << ",\n"; + } + ORC_UNIQUE_PTR<orc::StreamInformation> stream = + stripe->getStreamInformation(str); + out << " { \"id\": " << str + << ", \"column\": " << stream->getColumnId() + << ", \"kind\": \"" << streamKindToString(stream->getKind()) + << "\", \"offset\": " << stream->getOffset() + << ", \"length\": " << stream->getLength() << " }"; + } + out << "\n ]"; + std::string tz = stripe->getWriterTimezone(); + if (tz.length() != 0) { + out << ",\n \"timezone\": \"" << tz << "\""; + } + } + out << "\n }"; } -void printMetadata(const char*filename) { - std::streamsize origPrecision(std::cout.precision()); - std::ios::fmtflags origFlags(std::cout.flags()); - std::cout << "Structure for " << filename << std::endl; - std::ifstream input; - - input.open(filename, std::ios::in | std::ios::binary); - input.seekg(0,input.end); - std::streamoff fileSize = input.tellg(); - - // Read the postscript size - input.seekg(fileSize-1); - int result = input.get(); - if (result == EOF) { - throw std::runtime_error("Failed to read postscript size"); +void printMetadata(std::ostream & out, const char*filename, bool verbose) { + std::unique_ptr<orc::Reader> reader = + orc::createReader(orc::readLocalFile(filename), orc::ReaderOptions()); + out << "{ \"name\": \"" << filename << "\",\n"; + uint64_t numberColumns = reader->getType().getMaximumColumnId() + 1; + out << " \"type\": \"" + << reader->getType().toString() << "\",\n"; + out << " \"rows\": " << reader->getNumberOfRows() << ",\n"; + uint64_t stripeCount = reader->getNumberOfStripes(); + out << " \"stripe count\": " << stripeCount << ",\n"; + out << " \"format\": \"" << reader->getFormatVersion() + << "\", \"writer version\": \"" + << orc::writerVersionToString(reader->getWriterVersion()) + << "\",\n"; + out << " \"compression\": \"" + << orc::compressionKindToString(reader->getCompression()) + << "\","; + if (reader->getCompression() != orc::CompressionKind_NONE) { + out << " \"compression block\": " + << reader->getCompressionSize() << ","; } - std::streamoff postscriptSize = result; - - // Read the postscript - input.seekg(fileSize - postscriptSize-1); - std::vector<char> buffer(static_cast<size_t>(postscriptSize)); - input.read(buffer.data(), postscriptSize); - PostScript postscript ; - postscript.ParseFromArray(buffer.data(), - static_cast<int>(postscriptSize)); - std::cout << std::endl << " === Postscript === " << std::endl ; - postscript.PrintDebugString(); - - // Everything but the postscript is compressed - switch (static_cast<int>(postscript.compression())) { - case NONE: - break; - case ZLIB: - case SNAPPY: - case LZO: - default: - input.close(); - throw std::logic_error("ORC files with compression are not supported"); + out << "\n \"file length\": " << reader->getFileLength() << ",\n"; + out << " \"content\": " << reader->getContentLength() + << ", \"stripe stats\": " << reader->getStripeStatisticsLength() + << ", \"footer\": " << reader->getFileFooterLength() + << ", \"postscript\": " << reader->getFilePostscriptLength() << ",\n"; + if (reader->getRowIndexStride()) { + out << " \"row index stride\": " + << reader->getRowIndexStride() << ",\n"; } - - std::streamoff footerSize = - static_cast<std::streamoff>(postscript.footerlength()); - std::streamoff metadataSize = - static_cast<std::streamoff>(postscript.metadatalength()); - - // Read the metadata - input.seekg(fileSize - 1 - postscriptSize - footerSize - metadataSize); - buffer.resize(static_cast<size_t>(metadataSize)); - input.read(buffer.data(), metadataSize); - Metadata metadata ; - metadata.ParseFromArray(buffer.data(), static_cast<int>(metadataSize)); - - // Read the footer - //input.seekg(fileSize -1 - postscriptSize-footerSize); - buffer.resize(static_cast<size_t>(footerSize)); - input.read(buffer.data(), footerSize); - Footer footer ; - footer.ParseFromArray(buffer.data(), static_cast<int>(footerSize)); - std::cout << std::endl << " === Footer === " << std::endl ; - footer.PrintDebugString(); - - std::cout << std::endl << "=== Stripe Statistics ===" << std::endl; - - StripeInformation stripe ; - Stream section; - ColumnEncoding encoding; - for (int stripeIx=0; stripeIx<footer.stripes_size(); stripeIx++) { - std::cout << "Stripe " << stripeIx+1 <<": " << std::endl ; - stripe = footer.stripes(stripeIx); - stripe.PrintDebugString(); - - std::streamoff offset = - static_cast<std::streamoff>(stripe.offset() + stripe.indexlength() + - stripe.datalength()); - std::streamoff tailLength = - static_cast<std::streamoff>(stripe.footerlength()); - - // read the stripe footer - input.seekg(offset); - buffer.resize(static_cast<size_t>(tailLength)); - input.read(buffer.data(), tailLength); - - StripeFooter stripeFooter; - stripeFooter.ParseFromArray(buffer.data(), static_cast<int>(tailLength)); - //stripeFooter.PrintDebugString(); - uint64_t stripeStart = stripe.offset(); - uint64_t sectionStart = stripeStart; - for (int streamIx=0; streamIx<stripeFooter.streams_size(); streamIx++) { - section = stripeFooter.streams(streamIx); - std::cout << " Stream: column " << section.column() - << " section " - << section.kind() << " start: " << sectionStart - << " length " << section.length() << std::endl; - sectionStart += section.length(); - }; - for (int columnIx=0; columnIx<stripeFooter.columns_size(); - columnIx++) { - encoding = stripeFooter.columns(columnIx); - std::cout << " Encoding column " << columnIx << ": " - << encoding.kind() ; - if (encoding.kind() == ColumnEncoding_Kind_DICTIONARY || - encoding.kind() == ColumnEncoding_Kind_DICTIONARY_V2) - std::cout << "[" << encoding.dictionarysize() << "]"; - std::cout << std::endl; - }; - }; - - uint64_t paddedBytes = getTotalPaddingSize(footer); - // empty ORC file is ~45 bytes. Assumption here is file length always >0 - double percentPadding = - static_cast<double>(paddedBytes) * 100 / static_cast<double>(fileSize); - std::cout << "File length: " << fileSize << " bytes" << std::endl; - std::cout <<"Padding length: " << paddedBytes << " bytes" << std::endl; - std::cout <<"Padding ratio: " << std::fixed << std::setprecision(2) - << percentPadding << " %" << std::endl; - std::cout.precision(origPrecision); - std::cout.flags(origFlags); - input.close(); + out << " \"user metadata\": {"; + std::list<std::string> keys = reader->getMetadataKeys(); + uint64_t remaining = keys.size(); + for(std::list<std::string>::const_iterator itr = keys.begin(); + itr != keys.end(); ++itr) { + out << "\n \"" << *itr << "\": \"" + << reader->getMetadataValue(*itr) << "\""; + if (--remaining != 0) { + out << ","; + } + } + out << "\n },\n"; + out << " \"stripes\": [\n"; + for(uint64_t i=0; i < stripeCount; ++i) { + printStripeInformation(out, i, numberColumns, reader->getStripe(i), + verbose); + if (i == stripeCount - 1) { + out << "\n"; + } else { + out << ",\n"; + } + } + out << " ]\n"; + out << "}\n"; } int main(int argc, char* argv[]) { - GOOGLE_PROTOBUF_VERIFY_VERSION; - - if (argc < 2) { - std::cout << "Usage: file-metadata <filename>\n"; - } - try { - printMetadata(argv[1]); - } catch (std::exception& ex) { - std::cerr << "Caught exception: " << ex.what() << "\n"; - return 1; + static struct option longOptions[] = { + {"help", no_argument, nullptr, 'h'}, + {"verbose", no_argument, nullptr, 'v'}, + {nullptr, 0, nullptr, 0} + }; + bool helpFlag = false; + bool verboseFlag = false; + int opt; + do { + opt = getopt_long(argc, argv, "hv", longOptions, nullptr); + switch (opt) { + case '?': + case 'h': + helpFlag = true; + opt = -1; + break; + case 'v': + verboseFlag = true; + break; + } + } while (opt != -1); + argc -= optind; + argv += optind; + + if (argc < 1 || helpFlag) { + std::cerr + << "Usage: file-metadata [-h] [--help] [-v] [--verbose] <filename>\n"; + exit(1); + } else { + for(int i=0; i < argc; ++i) { + try { + printMetadata(std::cout, argv[i], verboseFlag); + } catch (std::exception& ex) { + std::cerr << "Caught exception in " << argv[i] + << ": " << ex.what() << "\n"; + return 1; + } + } } - - google::protobuf::ShutdownProtobufLibrary(); - return 0; } - -
