Github user majetideepak commented on a diff in the pull request: https://github.com/apache/orc/pull/199#discussion_r158422184 --- Diff: tools/src/CSVFileImport.cc --- @@ -0,0 +1,476 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Exceptions.hh" +#include "orc/OrcFile.hh" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <memory> +#include <getopt.h> +#include <string> +#include <sys/time.h> +#include <time.h> + +static char gDelimiter = ','; + +// extract one column raw text from one line +std::string extractColumn(std::string s, uint64_t colIndex) { + uint64_t col = 0; + size_t start = 0; + size_t end = s.find(gDelimiter); + while (col < colIndex && end != std::string::npos) { + start = end + 1; + end = s.find(gDelimiter, start); + ++col; + } + return col == colIndex ? 
s.substr(start, end - start) : ""; +} + +static const char* GetDate(void) { + static char buf[200]; + time_t t = time(NULL); + struct tm* p = localtime(&t); + strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p); + return buf; +} + +void fillLongValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* longBatch = + dynamic_cast<orc::LongVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + longBatch->data[i] = atoll(col.c_str()); + } + } + longBatch->hasNulls = hasNull; + longBatch->numElements = numValues; +} + +void fillStringValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex, + orc::DataBuffer<char>& buffer, + uint64_t& offset) { + orc::StringVectorBatch* stringBatch = + dynamic_cast<orc::StringVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + if (buffer.size() - offset < col.size()) { + buffer.reserve(buffer.size() * 2); + } + memcpy(buffer.data() + offset, + col.c_str(), + col.size()); + stringBatch->data[i] = buffer.data() + offset; + stringBatch->length[i] = static_cast<int64_t>(col.size()); + offset += col.size(); + } + } + stringBatch->hasNulls = hasNull; + stringBatch->numElements = numValues; +} + +void fillDoubleValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::DoubleVectorBatch* dblBatch = + dynamic_cast<orc::DoubleVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = 
extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + dblBatch->data[i] = atof(col.c_str()); + } + } + dblBatch->hasNulls = hasNull; + dblBatch->numElements = numValues; +} + +// parse fixed point decimal numbers +void fillDecimalValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex, + size_t scale, + size_t precision) { + + + orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR; + orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR; + if (precision <= 18) { + d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch); + d64Batch->scale = static_cast<int32_t>(scale); + } else { + d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch); + d128Batch->scale = static_cast<int32_t>(scale); + } + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + size_t ptPos = col.find('.'); + size_t curScale = 0; + std::string num = col; + if (ptPos != std::string::npos) { + curScale = col.length() - ptPos - 1; + num = col.substr(0, ptPos) + col.substr(ptPos + 1); + } + orc::Int128 decimal(num); + while (curScale != scale) { + curScale++; + decimal *= 10; + } + if (precision <= 18) { + d64Batch->values[i] = decimal.toLong(); + } else { + d128Batch->values[i] = decimal; + } + } + } + batch->hasNulls = hasNull; + batch->numElements = numValues; +} + +void fillBoolValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* boolBatch = + dynamic_cast<orc::LongVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + 
batch->notNull[i] = 1; + std::transform(col.begin(), col.end(), col.begin(), ::tolower); + if (col == "true" || col == "t") { + boolBatch->data[i] = true; + } else { + boolBatch->data[i] = false; + } + } + } + boolBatch->hasNulls = hasNull; + boolBatch->numElements = numValues; +} + +// parse date string from format YYYY-mm-dd +void fillDateValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* longBatch = + dynamic_cast<orc::LongVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + struct tm tm; + memset(&tm, 0, sizeof(struct tm)); + strptime(col.c_str(), "%Y-%m-%d", &tm); + time_t t = mktime(&tm); + time_t t1970 = 0; + double seconds = difftime(t, t1970); + int64_t days = static_cast<int64_t>(seconds / (60*60*24)); + longBatch->data[i] = days; + } + } + longBatch->hasNulls = hasNull; + longBatch->numElements = numValues; +} + +// parse timestamp values in seconds +void fillTimestampValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::TimestampVectorBatch* tsBatch = + dynamic_cast<orc::TimestampVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + batch->notNull[i] = 1; + tsBatch->data[i] = atoll(col.c_str()); + tsBatch->nanoseconds[i] = 0; + } + } + tsBatch->hasNulls = hasNull; + tsBatch->numElements = numValues; +} + +void usage() { + std::cout << "Usage: csv-import --input <input file> --output <output> " --- End diff -- We need to include the short names too ``` std::cout << "Usage: csv-import [-h] [--help]\n" << " [-d<character>] 
[--delimiter=<character>]\n" << " [-s<size>] [--stripe=<size>]\n" << " [-c<size>] [--block=<size>]\n" << " [-b<size>] [--batch=<size>]\n" << " <schema> <input> <output>\n" << "Import CSV file into an Orc file using the specified schema.\n" << "Compound types are not yet supported.\n"; ```
---