[jira] [Commented] (ORC-276) [C++] Create a simple tool to import CSV files

2017-12-21 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ORC-276?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16300913#comment-16300913
 ] 

ASF GitHub Bot commented on ORC-276:


Github user majetideepak commented on the issue:

https://github.com/apache/orc/pull/199
  
I just realized we need to add tests for each tool as well.  `/tools/test` 
has some examples.
Some of the tools are missing tests as well. I will file a JIRA to cover 
those.
Sorry for not noticing this earlier.


> [C++] Create a simple tool to import CSV files
> --
>
> Key: ORC-276
> URL: https://issues.apache.org/jira/browse/ORC-276
> Project: ORC
>  Issue Type: Sub-task
>  Components: C++, tools
>Reporter: Gang Wu
>Assignee: Gang Wu
>




--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Assigned] (ORC-273) Bad memory access in TypeImpl::parseType

2017-12-21 Thread rip.nsk (JIRA)

 [ 
https://issues.apache.org/jira/browse/ORC-273?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

rip.nsk reassigned ORC-273:
---

Assignee: rip.nsk

> Bad memory access in TypeImpl::parseType
> 
>
> Key: ORC-273
> URL: https://issues.apache.org/jira/browse/ORC-273
> Project: ORC
>  Issue Type: Bug
>Reporter: rip.nsk
>Assignee: rip.nsk
> Attachments: 010-TypeImpl.patch
>
>
> TypeImpl::parseType can access bad memory (types[nextPos]) for some inputs 
> (for example "map"), due to missing range checks (nextPos < 
> types.size()).
> An immediate fix for this example is attached, but not all possible cases 
> are covered.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ORC-276) [C++] Create a simple tool to import CSV files

2017-12-21 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ORC-276?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16300384#comment-16300384
 ] 

ASF GitHub Bot commented on ORC-276:


Github user wgtmac commented on a diff in the pull request:

https://github.com/apache/orc/pull/199#discussion_r158342083
  
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,476 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static char gDelimiter = ',';
+
+// extract one column raw text from one line
+std::string extractColumn(std::string s, uint64_t colIndex) {
+  uint64_t col = 0;
+  size_t start = 0;
+  size_t end = s.find(gDelimiter);
+  while (col < colIndex && end != std::string::npos) {
+start = end + 1;
+end = s.find(gDelimiter, start);
+++col;
+  }
+  return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void) {
+  static char buf[200];
+  time_t t = time(NULL);
+  struct tm* p = localtime(&t);
+  strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+  return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+orc::ColumnVectorBatch* batch,
+uint64_t numValues,
+uint64_t colIndex) {
+  orc::LongVectorBatch* longBatch =
+dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+std::string col = extractColumn(data[i], colIndex);
+if (col.empty()) {
+  batch->notNull[i] = 0;
+  hasNull = true;
+} else {
+  batch->notNull[i] = 1;
+  longBatch->data[i] = atoll(col.c_str());
+}
+  }
+  longBatch->hasNulls = hasNull;
+  longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+  orc::ColumnVectorBatch* batch,
+  uint64_t numValues,
+  uint64_t colIndex,
+  orc::DataBuffer<char>& buffer,
+  uint64_t& offset) {
+  orc::StringVectorBatch* stringBatch =
+dynamic_cast<orc::StringVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+std::string col = extractColumn(data[i], colIndex);
+if (col.empty()) {
+  batch->notNull[i] = 0;
+  hasNull = true;
+} else {
+  batch->notNull[i] = 1;
+  if (buffer.size() - offset < col.size()) {
+buffer.reserve(buffer.size() * 2);
+  }
+  memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+  stringBatch->data[i] = buffer.data() + offset;
+  stringBatch->length[i] = static_cast<int64_t>(col.size());
+  offset += col.size();
+}
+  }
+  stringBatch->hasNulls = hasNull;
+  stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+  orc::ColumnVectorBatch* batch,
+  uint64_t numValues,
+  uint64_t colIndex) {
+  orc::DoubleVectorBatch* dblBatch =
+dynamic_cast<orc::DoubleVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+std::string col = extractColumn(data[i], colIndex);
+if (col.empty()) {
+  batch->notNull[i] = 0;
+  hasNull = true;
+} else {
+  batch->notNull[i] = 1;
+  dblBatch->data[i] = atof(col.c_str());
+}
+  }
+  dblBatch->hasNulls = hasNull;
+  dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector& data,
+   

[jira] [Commented] (ORC-256) Add unmasked ranges option for redact mask

2017-12-21 Thread Sandeep More (JIRA)

[ 
https://issues.apache.org/jira/browse/ORC-256?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16300252#comment-16300252
 ] 

Sandeep More commented on ORC-256:
--

Thanks [~owen.omalley] ! 
Your changes (improvements) look good, thanks for taking out a lot of 
redundancy and dead code from the patch.

> Add unmasked ranges option for redact mask
> --
>
> Key: ORC-256
> URL: https://issues.apache.org/jira/browse/ORC-256
> Project: ORC
>  Issue Type: Sub-task
>Reporter: Owen O'Malley
>Assignee: Sandeep More
> Fix For: 1.5.0
>
>
> It would be good to extend the Redact DataMask so that you could leave 
> certain ranges of strings unmasked.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Closed] (ORC-282) [ORC] column name matching while schema evolution should be case unaware.

2017-12-21 Thread piyush mukati (JIRA)

 [ 
https://issues.apache.org/jira/browse/ORC-282?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

piyush mukati closed ORC-282.
-
Resolution: Invalid

Created in the wrong project. It should be in Hive.

> [ORC] column name matching while schema evolution should be case unaware. 
> --
>
> Key: ORC-282
> URL: https://issues.apache.org/jira/browse/ORC-282
> Project: ORC
>  Issue Type: Bug
>Reporter: piyush mukati
>
> In the ORC data reader, the schema passed by Hive is all lowercase, so if a 
> column name stored in the file contains any uppercase characters, null 
> values are returned for that column even though the data is present in the 
> file. Column name matching during schema evolution should be 
> case-insensitive. We need to pass a config for this from Hive; the config 
> in ORC will be exposed by https://issues.apache.org/jira/browse/ORC-264 
>  



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ORC-283) Enable the cmake build to pick specified libraries over the default libraries

2017-12-21 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ORC-283?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16299975#comment-16299975
 ] 

ASF GitHub Bot commented on ORC-283:


Github user majetideepak commented on a diff in the pull request:

https://github.com/apache/orc/pull/204#discussion_r158269151
  
--- Diff: cmake_modules/FindGTest.cmake ---
@@ -28,7 +28,7 @@ find_path (GTEST_INCLUDE_DIR gmock/gmock.h HINTS
   NO_DEFAULT_PATH
   PATH_SUFFIXES "include")
 
-find_library (GTEST_LIBRARIES NAMES gmock PATHS
+find_library (GTEST_LIBRARIES NAMES gmock HINTS
--- End diff --

`HINTS` is apt here. `PATHS` must only be used for hardcoded guesses.
https://cmake.org/cmake/help/v3.0/command/find_library.html


> Enable the cmake build to pick specified libraries over the default libraries
> -
>
> Key: ORC-283
> URL: https://issues.apache.org/jira/browse/ORC-283
> Project: ORC
>  Issue Type: Bug
>Reporter: Deepak Majeti
>Assignee: Deepak Majeti
>
> The changes in https://github.com/apache/orc/pull/194 are causing libraries 
> in the default path to be picked over the specified path.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ORC-283) Enable the cmake build to pick specified libraries over the default libraries

2017-12-21 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ORC-283?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16299961#comment-16299961
 ] 

ASF GitHub Bot commented on ORC-283:


GitHub user majetideepak opened a pull request:

https://github.com/apache/orc/pull/204

ORC-283: Enable the cmake build to pick specified libraries over the …

…default libraries

You can merge this pull request into a Git repository by running:

$ git pull https://github.com/majetideepak/orc ORC-283

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/orc/pull/204.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #204


commit 1a34380e7eb6323e3f0b92b377943f1ba5562d5e
Author: Deepak Majeti 
Date:   2017-12-21T12:17:12Z

ORC-283: Enable the cmake build to pick specified libraries over the 
default libraries




> Enable the cmake build to pick specified libraries over the default libraries
> -
>
> Key: ORC-283
> URL: https://issues.apache.org/jira/browse/ORC-283
> Project: ORC
>  Issue Type: Bug
>Reporter: Deepak Majeti
>Assignee: Deepak Majeti
>
> The changes in https://github.com/apache/orc/pull/194 are causing libraries 
> in the default path to be picked over the specified path.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ORC-276) [C++] Create a simple tool to import CSV files

2017-12-21 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ORC-276?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16299904#comment-16299904
 ] 

ASF GitHub Bot commented on ORC-276:


Github user majetideepak commented on a diff in the pull request:

https://github.com/apache/orc/pull/199#discussion_r158096524
  
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,476 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static char gDelimiter = ',';
+
+// extract one column raw text from one line
+std::string extractColumn(std::string s, uint64_t colIndex) {
+  uint64_t col = 0;
+  size_t start = 0;
+  size_t end = s.find(gDelimiter);
+  while (col < colIndex && end != std::string::npos) {
+start = end + 1;
+end = s.find(gDelimiter, start);
+++col;
+  }
+  return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void) {
+  static char buf[200];
+  time_t t = time(NULL);
+  struct tm* p = localtime(&t);
+  strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+  return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+orc::ColumnVectorBatch* batch,
+uint64_t numValues,
+uint64_t colIndex) {
+  orc::LongVectorBatch* longBatch =
+dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+std::string col = extractColumn(data[i], colIndex);
+if (col.empty()) {
+  batch->notNull[i] = 0;
+  hasNull = true;
+} else {
+  batch->notNull[i] = 1;
+  longBatch->data[i] = atoll(col.c_str());
+}
+  }
+  longBatch->hasNulls = hasNull;
+  longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+  orc::ColumnVectorBatch* batch,
+  uint64_t numValues,
+  uint64_t colIndex,
+  orc::DataBuffer<char>& buffer,
+  uint64_t& offset) {
+  orc::StringVectorBatch* stringBatch =
+dynamic_cast<orc::StringVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+std::string col = extractColumn(data[i], colIndex);
+if (col.empty()) {
+  batch->notNull[i] = 0;
+  hasNull = true;
+} else {
+  batch->notNull[i] = 1;
+  if (buffer.size() - offset < col.size()) {
+buffer.reserve(buffer.size() * 2);
+  }
+  memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+  stringBatch->data[i] = buffer.data() + offset;
+  stringBatch->length[i] = static_cast<int64_t>(col.size());
+  offset += col.size();
+}
+  }
+  stringBatch->hasNulls = hasNull;
+  stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+  orc::ColumnVectorBatch* batch,
+  uint64_t numValues,
+  uint64_t colIndex) {
+  orc::DoubleVectorBatch* dblBatch =
+dynamic_cast<orc::DoubleVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+std::string col = extractColumn(data[i], colIndex);
+if (col.empty()) {
+  batch->notNull[i] = 0;
+  hasNull = true;
+} else {
+  batch->notNull[i] = 1;
+  dblBatch->data[i] = atof(col.c_str());
+}
+  }
+  dblBatch->hasNulls = hasNull;
+  dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector& data,
+