GitHub user majetideepak commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155629237
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+// Character that separates fields within a CSV line; read by extractColumn().
+// Defaults to a comma.
+static char gDelimiter = ',';
+
+// Return the field at zero-based position colIndex of the line s, where
+// fields are separated by gDelimiter. An empty string is returned when the
+// line has fewer than colIndex + 1 fields, and also when the field itself is
+// empty. Delimiters are matched literally; quoting is not interpreted.
+// The line is taken by const reference so it is not copied on every call.
+std::string extractColumn(const std::string& s, uint64_t colIndex) {
+  uint64_t col = 0;
+  size_t start = 0;
+  size_t end = s.find(gDelimiter);
+  // Step over one field per iteration until the requested field is reached
+  // or the line runs out of delimiters.
+  while (col < colIndex && end != std::string::npos) {
+    start = end + 1;
+    end = s.find(gDelimiter, start);
+    ++col;
+  }
+  if (col != colIndex) {
+    return std::string();
+  }
+  // For the last field end is npos; substr clamps the count to the line end.
+  return s.substr(start, end - start);
+}
+
+// Return the current local time formatted as "[YYYY-MM-DD HH:MM:SS]".
+// The returned pointer refers to a function-local static buffer that is
+// overwritten by every call. An empty string is returned if the time cannot
+// be converted or formatted.
+static const char* GetDate(void)
+{
+  static char buf[200];
+  time_t t = time(NULL);
+  struct tm local;
+  // localtime_r writes into a caller-owned struct and reports failure with
+  // NULL, which must not be handed to strftime.
+  if (localtime_r(&t, &local) == NULL ||
+      strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", &local) == 0) {
+    buf[0] = '\0';
+  }
+  return buf;
+}
+
+// Fill an integer column of batch from field colIndex of the first numValues
+// lines in data. An empty field is recorded as null; any other field is
+// converted with atoll().
+void fillLongValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* target = dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    target->data[row] = atoll(field.c_str());
+  }
+  target->numElements = numValues;
+  target->hasNulls = sawNull;
+}
+
+// Fill a string column of batch from field colIndex of the first numValues
+// lines in data. An empty field is recorded as null. The bytes of every
+// non-empty field are appended to buffer starting at offset, and offset is
+// advanced past them; the batch stores pointers into buffer, not copies.
+//
+// The buffer is grown with resize() so that size() really reflects the usable
+// space, and it is grown far enough for the current field even when one
+// doubling is not sufficient. Growing may move the storage, so the pointers
+// already stored for earlier rows of this call are recomputed afterwards.
+// NOTE(review): this assumes orc::DataBuffer::resize() keeps the existing
+// bytes - confirm. Pointers into buffer held by other columns or batches are
+// not visible here and are not rebased - confirm against the caller.
+void fillStringValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex,
+                      orc::DataBuffer<char>& buffer,
+                      uint64_t& offset) {
+  orc::StringVectorBatch* stringBatch =
+    dynamic_cast<orc::StringVectorBatch*>(batch);
+  // Where this call started writing; used to recompute row pointers.
+  const uint64_t firstOffset = offset;
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      const uint64_t needed = offset + col.size();
+      if (buffer.size() < needed) {
+        uint64_t newSize = buffer.size() * 2;
+        if (newSize < needed) {
+          newSize = needed;
+        }
+        buffer.resize(newSize);
+        // Non-null rows of this call were written back to back starting at
+        // firstOffset, so their addresses follow from the stored lengths.
+        uint64_t pos = firstOffset;
+        for (uint64_t j = 0; j < i; ++j) {
+          if (batch->notNull[j]) {
+            stringBatch->data[j] = buffer.data() + pos;
+            pos += static_cast<uint64_t>(stringBatch->length[j]);
+          }
+        }
+      }
+      memcpy(buffer.data() + offset,
+             col.c_str(),
+             col.size());
+      stringBatch->data[i] = buffer.data() + offset;
+      stringBatch->length[i] = static_cast<int64_t>(col.size());
+      offset += col.size();
+    }
+  }
+  stringBatch->hasNulls = hasNull;
+  stringBatch->numElements = numValues;
+}
+
+// Fill a floating point column of batch from field colIndex of the first
+// numValues lines in data. An empty field is recorded as null; any other
+// field is converted with atof().
+void fillDoubleValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex) {
+  orc::DoubleVectorBatch* target =
+    dynamic_cast<orc::DoubleVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    target->data[row] = atof(field.c_str());
+  }
+  target->numElements = numValues;
+  target->hasNulls = sawNull;
+}
+
+// Fill a fixed point decimal column of batch from field colIndex of the first
+// numValues lines in data. An empty field is recorded as null.
+//
+// A precision of 18 or less selects the Decimal64 batch, anything larger the
+// Decimal128 batch; the batch scale is set to scale. Each value is stored as
+// an unscaled integer: the decimal point is removed and the digits are
+// multiplied by ten until they carry exactly scale fractional digits.
+// Fractional digits beyond scale are dropped (truncated, not rounded); before
+// this guard such input made the scaling loop run forever.
+void fillDecimalValues(const std::vector<std::string>& data,
+                       orc::ColumnVectorBatch* batch,
+                       uint64_t numValues,
+                       uint64_t colIndex,
+                       size_t scale,
+                       size_t precision) {
+  orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+  orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+  if (precision <= 18) {
+    d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+    d64Batch->scale = static_cast<int32_t>(scale);
+  } else {
+    d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+    d128Batch->scale = static_cast<int32_t>(scale);
+  }
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      size_t ptPos = col.find('.');
+      size_t curScale = 0;
+      std::string num = col;
+      if (ptPos != std::string::npos) {
+        std::string fraction = col.substr(ptPos + 1);
+        // Keep at most scale fractional digits so the loop below terminates.
+        if (fraction.length() > scale) {
+          fraction.erase(scale);
+        }
+        curScale = fraction.length();
+        num = col.substr(0, ptPos) + fraction;
+      }
+      orc::Int128 decimal(num);
+      // Pad with trailing zeros up to the column scale.
+      while (curScale < scale) {
+        curScale++;
+        decimal *= 10;
+      }
+      if (precision <= 18) {
+        d64Batch->values[i] = decimal.toLong();
+      } else {
+        d128Batch->values[i] = decimal;
+      }
+    }
+  }
+  batch->hasNulls = hasNull;
+  batch->numElements = numValues;
+}
+
+// Fill a boolean column of batch from field colIndex of the first numValues
+// lines in data. An empty field is recorded as null. A field equal to "true"
+// or "t", compared case-insensitively, is stored as 1; any other non-empty
+// field is stored as 0.
+void fillBoolValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* boolBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      // tolower() is only defined for values representable as unsigned char
+      // (or EOF), so go through unsigned char to stay defined for bytes
+      // outside the ASCII range.
+      for (std::string::size_type j = 0; j < col.size(); ++j) {
+        col[j] = static_cast<char>(
+            ::tolower(static_cast<unsigned char>(col[j])));
+      }
+      boolBatch->data[i] = (col == "true" || col == "t") ? 1 : 0;
+    }
+  }
+  boolBatch->hasNulls = hasNull;
+  boolBatch->numElements = numValues;
+}
+
+// Fill a date column of batch from field colIndex of the first numValues
+// lines in data. Fields are parsed as YYYY-MM-dd and stored as the number of
+// days since 1970-01-01. An empty field, or one that strptime() cannot parse,
+// is recorded as null.
+//
+// The broken-down date is converted with timegm() so that it is interpreted
+// as UTC midnight. mktime() would apply the local time zone, which shifts the
+// resulting day count by one in zones east of UTC and for dates before 1970.
+void fillDateValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* longBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      struct tm tm;
+      memset(&tm, 0, sizeof(struct tm));
+      if (strptime(col.c_str(), "%Y-%m-%d", &tm) == NULL) {
+        // Not a date: store null instead of a value derived from a zeroed tm.
+        batch->notNull[i] = 0;
+        hasNull = true;
+      } else {
+        batch->notNull[i] = 1;
+        time_t t = timegm(&tm);
+        // UTC midnight is a whole number of days from the epoch, so the
+        // integer division is exact for negative values as well.
+        longBatch->data[i] = static_cast<int64_t>(t) / (60 * 60 * 24);
+      }
+    }
+  }
+  longBatch->hasNulls = hasNull;
+  longBatch->numElements = numValues;
+}
+
+// Fill a timestamp column of batch from field colIndex of the first numValues
+// lines in data. An empty field is recorded as null; any other field is
+// converted with atoll() and stored as the seconds value, with the
+// nanoseconds part set to zero.
+void fillTimestampValues(const std::vector<std::string>& data,
+                         orc::ColumnVectorBatch* batch,
+                         uint64_t numValues,
+                         uint64_t colIndex) {
+  orc::TimestampVectorBatch* target =
+    dynamic_cast<orc::TimestampVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    target->data[row] = atoll(field.c_str());
+    target->nanoseconds[row] = 0;
+  }
+  target->numElements = numValues;
+  target->hasNulls = sawNull;
+}
+
+// Print the command-line help for the csv-import tool to standard output.
+void usage() {
+  std::cout
+    << "Usage: csv-import <input> <output> --schema=<file schema>"
+    << " [--delimiter=<delimiter character>]\n"
+    << "Import CSV file into an Orc file using the specified schema.\n"
+    << "Compound types are not supported at the moment.\n";
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." <<
std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType =
orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ if (argc > 4) {
--- End diff --
Can you handle options like other tools via the `static struct option
longOptions[]`?
---