carbondata git commit: [CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat

2018-03-07 Thread gvramana
Repository: carbondata
Updated Branches:
  refs/heads/branch-1.3 ce9695633 -> a781515c5


[CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat

This closes #2038


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/a781515c
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/a781515c
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/a781515c

Branch: refs/heads/branch-1.3
Commit: a781515c5c06a28187cfee1ef4ca8b38085649d9
Parents: ce96956
Author: KanakaKumar 
Authored: Mon Mar 5 16:58:18 2018 +0530
Committer: Venkata Ramana G 
Committed: Wed Mar 7 21:17:41 2018 +0530

--
 integration/presto/pom.xml  |   5 
 .../loading/csvinput/CSVInputFormat.java|   6 +++-
 .../loading/csvinput/CSVInputFormatTest.java|  30 ++-
 .../src/test/resources/csv/csv_with_bom.csv |   3 ++
 .../src/test/resources/csv/csv_with_bom.csv.bz2 | Bin 0 -> 129 bytes
 .../src/test/resources/csv/csv_with_bom.csv.gz  | Bin 0 -> 110 bytes
 6 files changed, 42 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/integration/presto/pom.xml
--
diff --git a/integration/presto/pom.xml b/integration/presto/pom.xml
index 00a397f..d0dcf4a 100644
--- a/integration/presto/pom.xml
+++ b/integration/presto/pom.xml
@@ -484,6 +484,11 @@
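       <artifactId>hk2-utils</artifactId>
       <version>2.5.0-b42</version>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>2.4</version>
+    </dependency>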
 
   
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
--
diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
index 259b6da..aebaf3b 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
@@ -29,6 +29,7 @@ import org.apache.carbondata.core.util.CarbonProperties;
 
 import com.univocity.parsers.csv.CsvParser;
 import com.univocity.parsers.csv.CsvParserSettings;
+import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.lang.BooleanUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -271,8 +272,11 @@ public class CSVInputFormat extends FileInputFormat
[remainder of this hunk was lost when the message was archived]

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
--
diff --git a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
index 14c680e..d89f10d 100644
--- a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
+++ b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
@@ -128,6 +128,7 @@ public class CSVInputFormatTest extends TestCase {
   @Test public void testReadCSVFiles() throws Exception{
 Configuration conf = new Configuration();
 prepareConf(conf);
+conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true);
 File output = new File("target/output_CSVInputFormatTest");
 conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
 Job job = Job.getInstance(conf, "CSVInputFormat_normal");
@@ -149,8 +150,35 @@ public class CSVInputFormatTest extends TestCase {
 Assert.assertTrue(job.waitForCompletion(true));
   }
 
+  /**
+   * test read csv files encoded as UTF-8 with BOM
+   * @throws Exception
+   */
+  @Test public void testReadCSVFilesWithBOM() throws Exception{
+
+Configuration conf = new Configuration();
+prepareConf(conf);
+conf.setBoolean(CSVInputFormat.HEADER_PRESENT, false);
+File output = new File("target/output_CSVInputFormatTest_bom");
+conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
+Job job = Job.getInstance(conf, "CSVInputFormat_normal_bom");
+job.setJarByClass(CSVInputFormatTest.class);
+job.setMapperClass(CSVCheckMapper.class);
+job.setNumReduceTasks(0);
+job.setInputFormatClass(CSVInputFormat.class);
+
+String inputFolder = new File("src/test/resources/csv").getCanonicalPath();
+FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv"));
+FileInputFormat

carbondata git commit: [CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat

2018-03-07 Thread gvramana
Repository: carbondata
Updated Branches:
  refs/heads/master 9f2884a04 -> 910f26171


[CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat

This closes #2038


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/910f2617
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/910f2617
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/910f2617

Branch: refs/heads/master
Commit: 910f26171750276be5ccfe404be9d8ab0f2ead42
Parents: 9f2884a
Author: KanakaKumar 
Authored: Mon Mar 5 16:58:18 2018 +0530
Committer: Venkata Ramana G 
Committed: Wed Mar 7 21:16:16 2018 +0530

--
 integration/presto/pom.xml  |   5 
 .../loading/csvinput/CSVInputFormat.java|   6 +++-
 .../loading/csvinput/CSVInputFormatTest.java|  30 ++-
 .../src/test/resources/csv/csv_with_bom.csv |   3 ++
 .../src/test/resources/csv/csv_with_bom.csv.bz2 | Bin 0 -> 129 bytes
 .../src/test/resources/csv/csv_with_bom.csv.gz  | Bin 0 -> 110 bytes
 6 files changed, 42 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/integration/presto/pom.xml
--
diff --git a/integration/presto/pom.xml b/integration/presto/pom.xml
index aaaf175..17f5d41 100644
--- a/integration/presto/pom.xml
+++ b/integration/presto/pom.xml
@@ -484,6 +484,11 @@
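       <artifactId>hk2-utils</artifactId>
       <version>2.5.0-b42</version>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>2.4</version>
+    </dependency>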
 
   
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
--
diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
index 259b6da..aebaf3b 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
@@ -29,6 +29,7 @@ import org.apache.carbondata.core.util.CarbonProperties;
 
 import com.univocity.parsers.csv.CsvParser;
 import com.univocity.parsers.csv.CsvParserSettings;
+import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.lang.BooleanUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -271,8 +272,11 @@ public class CSVInputFormat extends FileInputFormat
[remainder of this hunk was lost when the message was archived]

http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
--
diff --git a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
index 14c680e..d89f10d 100644
--- a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
+++ b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
@@ -128,6 +128,7 @@ public class CSVInputFormatTest extends TestCase {
   @Test public void testReadCSVFiles() throws Exception{
 Configuration conf = new Configuration();
 prepareConf(conf);
+conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true);
 File output = new File("target/output_CSVInputFormatTest");
 conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
 Job job = Job.getInstance(conf, "CSVInputFormat_normal");
@@ -149,8 +150,35 @@ public class CSVInputFormatTest extends TestCase {
 Assert.assertTrue(job.waitForCompletion(true));
   }
 
+  /**
+   * test read csv files encoded as UTF-8 with BOM
+   * @throws Exception
+   */
+  @Test public void testReadCSVFilesWithBOM() throws Exception{
+
+Configuration conf = new Configuration();
+prepareConf(conf);
+conf.setBoolean(CSVInputFormat.HEADER_PRESENT, false);
+File output = new File("target/output_CSVInputFormatTest_bom");
+conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
+Job job = Job.getInstance(conf, "CSVInputFormat_normal_bom");
+job.setJarByClass(CSVInputFormatTest.class);
+job.setMapperClass(CSVCheckMapper.class);
+job.setNumReduceTasks(0);
+job.setInputFormatClass(CSVInputFormat.class);
+
+String inputFolder = new File("src/test/resources/csv").getCanonicalPath();
+FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv"));
+FileInputFormat.addInpu