Repository: carbondata
Updated Branches:
  refs/heads/branch-1.3 ce9695633 -> a781515c5


[CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat

This closes #2038


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/a781515c
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/a781515c
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/a781515c

Branch: refs/heads/branch-1.3
Commit: a781515c5c06a28187cfee1ef4ca8b38085649d9
Parents: ce96956
Author: KanakaKumar <kanaka.avv...@huawei.com>
Authored: Mon Mar 5 16:58:18 2018 +0530
Committer: Venkata Ramana G <ramana.gollam...@huawei.com>
Committed: Wed Mar 7 21:17:41 2018 +0530

----------------------------------------------------------------------
 integration/presto/pom.xml                      |   5 ++++
 .../loading/csvinput/CSVInputFormat.java        |   6 +++-
 .../loading/csvinput/CSVInputFormatTest.java    |  30 ++++++++++++++++++-
 .../src/test/resources/csv/csv_with_bom.csv     |   3 ++
 .../src/test/resources/csv/csv_with_bom.csv.bz2 | Bin 0 -> 129 bytes
 .../src/test/resources/csv/csv_with_bom.csv.gz  | Bin 0 -> 110 bytes
 6 files changed, 42 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/integration/presto/pom.xml
----------------------------------------------------------------------
diff --git a/integration/presto/pom.xml b/integration/presto/pom.xml
index 00a397f..d0dcf4a 100644
--- a/integration/presto/pom.xml
+++ b/integration/presto/pom.xml
@@ -484,6 +484,11 @@
       <artifactId>hk2-utils</artifactId>
       <version>2.5.0-b42</version>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>2.4</version>
+    </dependency>
 
   </dependencies>
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
----------------------------------------------------------------------
diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
index 259b6da..aebaf3b 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
@@ -29,6 +29,7 @@ import org.apache.carbondata.core.util.CarbonProperties;
 
 import com.univocity.parsers.csv.CsvParser;
 import com.univocity.parsers.csv.CsvParserSettings;
+import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.lang.BooleanUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -271,8 +272,11 @@ public class CSVInputFormat extends FileInputFormat<NullWritable, StringArrayWri
         filePosition = fileIn;
         inputStream = boundedInputStream;
       }
-      reader = new InputStreamReader(inputStream,
+
+      //Wrap input stream with BOMInputStream to skip UTF-8 BOM characters
+      reader = new InputStreamReader(new BOMInputStream(inputStream),
           Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
+
       CsvParserSettings settings = extractCsvParserSettings(job);
       if (start == 0) {
         settings.setHeaderExtractionEnabled(job.getBoolean(HEADER_PRESENT,

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
----------------------------------------------------------------------
diff --git a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
index 14c680e..d89f10d 100644
--- a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
+++ b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
@@ -128,6 +128,7 @@ public class CSVInputFormatTest extends TestCase {
   @Test public void testReadCSVFiles() throws Exception{
     Configuration conf = new Configuration();
     prepareConf(conf);
+    conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true);
     File output = new File("target/output_CSVInputFormatTest");
     conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
     Job job = Job.getInstance(conf, "CSVInputFormat_normal");
@@ -149,8 +150,35 @@ public class CSVInputFormatTest extends TestCase {
     Assert.assertTrue(job.waitForCompletion(true));
   }
 
+  /**
+   * test read csv files encoded as UTF-8 with BOM
+   * @throws Exception
+   */
+  @Test public void testReadCSVFilesWithBOM() throws Exception{
+
+    Configuration conf = new Configuration();
+    prepareConf(conf);
+    conf.setBoolean(CSVInputFormat.HEADER_PRESENT, false);
+    File output = new File("target/output_CSVInputFormatTest_bom");
+    conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
+    Job job = Job.getInstance(conf, "CSVInputFormat_normal_bom");
+    job.setJarByClass(CSVInputFormatTest.class);
+    job.setMapperClass(CSVCheckMapper.class);
+    job.setNumReduceTasks(0);
+    job.setInputFormatClass(CSVInputFormat.class);
+
+    String inputFolder = new File("src/test/resources/csv").getCanonicalPath();
+    FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv"));
+    FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv.bz2"));
+    FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv.gz"));
+
+    deleteOutput(output);
+    FileOutputFormat.setOutputPath(job, new Path(output.getCanonicalPath()));
+
+    Assert.assertTrue(job.waitForCompletion(true));
+  }
+
   private void prepareConf(Configuration conf) {
-    conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true);
     conf.set(CSVInputFormat.MAX_COLUMNS, "10");
     conf.set(CSVInputFormat.NUMBER_OF_COLUMNS, "7");
   }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/test/resources/csv/csv_with_bom.csv
----------------------------------------------------------------------
diff --git a/processing/src/test/resources/csv/csv_with_bom.csv b/processing/src/test/resources/csv/csv_with_bom.csv
new file mode 100644
index 0000000..ea4cfcc
--- /dev/null
+++ b/processing/src/test/resources/csv/csv_with_bom.csv
@@ -0,0 +1,3 @@
+1,2015/7/23,china,aaa1,phone197,ASD69643,15000
+2,2015/7/24,china,aaa2,phone756,ASD42892,15001
+3,2015/7/25,china,aaa3,phone1904,ASD37014,15002
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/test/resources/csv/csv_with_bom.csv.bz2
----------------------------------------------------------------------
diff --git a/processing/src/test/resources/csv/csv_with_bom.csv.bz2 b/processing/src/test/resources/csv/csv_with_bom.csv.bz2
new file mode 100644
index 0000000..21da5d5
Binary files /dev/null and b/processing/src/test/resources/csv/csv_with_bom.csv.bz2 differ

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/test/resources/csv/csv_with_bom.csv.gz
----------------------------------------------------------------------
diff --git a/processing/src/test/resources/csv/csv_with_bom.csv.gz b/processing/src/test/resources/csv/csv_with_bom.csv.gz
new file mode 100644
index 0000000..e3bd12e
Binary files /dev/null and b/processing/src/test/resources/csv/csv_with_bom.csv.gz differ

Reply via email to