Github user xndai commented on a diff in the pull request: https://github.com/apache/orc/pull/199#discussion_r155597298 --- Diff: tools/src/CSVFileImport.cc --- @@ -0,0 +1,411 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Exceptions.hh" +#include "orc/OrcFile.hh" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <memory> +#include <string> +#include <sys/time.h> +#include <time.h> + +#define DELIMITER ',' + +std::string extractColumn(std::string s, uint64_t colIndex) { + uint64_t col = 0; + size_t start = 0; + size_t end = s.find(DELIMITER); + while (col < colIndex && end != std::string::npos) { + start = end + 1; + end = s.find(DELIMITER, start); + ++col; + } + return s.substr(start, end - start); +} + +static const char* GetDate(void) +{ + static char buf[200]; + time_t t = time(NULL); + struct tm* p = localtime(&t); + strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p); + return buf; +} + +void fillLongValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* longBatch = + dynamic_cast<orc::LongVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < 
numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + longBatch->notNull[i] = 0; + hasNull = true; + } else { + longBatch->data[i] = atoll(col.c_str()); + } + } + longBatch->hasNulls = hasNull; + longBatch->numElements = numValues; +} + +void fillStringValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex, + orc::DataBuffer<char>& buffer, + uint64_t& offset) { + orc::StringVectorBatch* stringBatch = + dynamic_cast<orc::StringVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + stringBatch->notNull[i] = 0; + hasNull = true; + } else { + memcpy(buffer.data() + offset, --- End diff -- We might want to resize the buffer again when we run out of the 4M buffer size, or at least raise an exception so that we don't overflow the buffer.
---