Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155597298
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+#define DELIMITER ','
+
+/**
+ * Return the colIndex-th (0-based) DELIMITER-separated field of s.
+ *
+ * An empty string is returned both when the requested field is empty and
+ * when s has fewer than colIndex + 1 fields, so a short row never yields
+ * the text of a different column.
+ *
+ * @param s one line of delimited text
+ * @param colIndex zero-based index of the wanted field
+ * @return a copy of the field's text, or "" if it is empty or absent
+ */
+std::string extractColumn(std::string s, uint64_t colIndex) {
+  uint64_t col = 0;
+  size_t start = 0;
+  size_t end = s.find(DELIMITER);
+  while (col < colIndex && end != std::string::npos) {
+    start = end + 1;
+    end = s.find(DELIMITER, start);
+    ++col;
+  }
+  // The loop also stops when the line runs out of delimiters. In that case
+  // col < colIndex and [start, end) is the last field, not the requested one.
+  if (col != colIndex) {
+    return std::string();
+  }
+  // end may be npos here; substr clamps the length to the end of the string.
+  return s.substr(start, end - start);
+}
+
+/**
+ * Format the current local time as "[YYYY-MM-DD HH:MM:SS]".
+ *
+ * The returned pointer refers to a function-local static buffer, so every
+ * call overwrites the text produced by the previous one.
+ */
+static const char* GetDate(void)
+{
+  static char stamp[200];
+  time_t now = time(NULL);
+  strftime(stamp, sizeof(stamp), "[%Y-%m-%d %H:%M:%S]", localtime(&now));
+  return stamp;
+}
+
+/**
+ * Fill a long column batch from one delimited column of the given rows.
+ *
+ * For each of the first numValues rows in data, the colIndex-th field is
+ * extracted; an empty field is recorded as null, anything else is parsed
+ * with atoll() and stored in the batch.
+ *
+ * @param data rows of delimited text; must hold at least numValues entries
+ * @param batch batch to fill; it is cast to orc::LongVectorBatch and used
+ *              without a null check, so it must really be one
+ * @param numValues number of rows to convert
+ * @param colIndex zero-based index of the field to read from each row
+ */
+void fillLongValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* longBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      longBatch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      // Mark the slot valid explicitly: if this batch was filled before, a
+      // null flag left over from the earlier fill would otherwise persist.
+      longBatch->notNull[i] = 1;
+      longBatch->data[i] = atoll(col.c_str());
+    }
+  }
+  longBatch->hasNulls = hasNull;
+  longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ stringBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ memcpy(buffer.data() + offset,
--- End diff --
We might want to resize the buffer again when the 4M buffer runs out, or
at least raise an exception so that we don't overflow it.
---