carbondata git commit: [CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat
Repository: carbondata Updated Branches: refs/heads/branch-1.3 ce9695633 -> a781515c5 [CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat This closes #2038 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/a781515c Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/a781515c Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/a781515c Branch: refs/heads/branch-1.3 Commit: a781515c5c06a28187cfee1ef4ca8b38085649d9 Parents: ce96956 Author: KanakaKumar Authored: Mon Mar 5 16:58:18 2018 +0530 Committer: Venkata Ramana G Committed: Wed Mar 7 21:17:41 2018 +0530 -- integration/presto/pom.xml | 5 .../loading/csvinput/CSVInputFormat.java| 6 +++- .../loading/csvinput/CSVInputFormatTest.java| 30 ++- .../src/test/resources/csv/csv_with_bom.csv | 3 ++ .../src/test/resources/csv/csv_with_bom.csv.bz2 | Bin 0 -> 129 bytes .../src/test/resources/csv/csv_with_bom.csv.gz | Bin 0 -> 110 bytes 6 files changed, 42 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/integration/presto/pom.xml -- diff --git a/integration/presto/pom.xml b/integration/presto/pom.xml index 00a397f..d0dcf4a 100644 --- a/integration/presto/pom.xml +++ b/integration/presto/pom.xml @@ -484,6 +484,11 @@ hk2-utils 2.5.0-b42 + + commons-io + commons-io + 2.4 + http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java -- diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java index 259b6da..aebaf3b 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java +++ b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java @@ -29,6 +29,7 @@ 
import org.apache.carbondata.core.util.CarbonProperties; import com.univocity.parsers.csv.CsvParser; import com.univocity.parsers.csv.CsvParserSettings; +import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang.BooleanUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; @@ -271,8 +272,11 @@ public class CSVInputFormat extends FileInputFormat [...] http://git-wip-us.apache.org/repos/asf/carbondata/blob/a781515c/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java -- diff --git a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java index 14c680e..d89f10d 100644 --- a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java +++ b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java @@ -128,6 +128,7 @@ public class CSVInputFormatTest extends TestCase { @Test public void testReadCSVFiles() throws Exception{ Configuration conf = new Configuration(); prepareConf(conf); +conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true); File output = new File("target/output_CSVInputFormatTest"); conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath()); Job job = Job.getInstance(conf, "CSVInputFormat_normal"); @@ -149,8 +150,35 @@ public class CSVInputFormatTest extends TestCase { Assert.assertTrue(job.waitForCompletion(true)); } + /** + * test read csv files encoded as UTF-8 with BOM + * @throws Exception + */ + @Test public void testReadCSVFilesWithBOM() throws Exception{ + +Configuration conf = new Configuration(); +prepareConf(conf); +conf.setBoolean(CSVInputFormat.HEADER_PRESENT, false); +File output = new File("target/output_CSVInputFormatTest_bom"); +conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath()); +Job job = 
Job.getInstance(conf, "CSVInputFormat_normal_bom"); +job.setJarByClass(CSVInputFormatTest.class); +job.setMapperClass(CSVCheckMapper.class); +job.setNumReduceTasks(0); +job.setInputFormatClass(CSVInputFormat.class); + +String inputFolder = new File("src/test/resources/csv").getCanonicalPath(); +FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv")); +FileInputFormat
carbondata git commit: [CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat
Repository: carbondata Updated Branches: refs/heads/master 9f2884a04 -> 910f26171 [CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat This closes #2038 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/910f2617 Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/910f2617 Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/910f2617 Branch: refs/heads/master Commit: 910f26171750276be5ccfe404be9d8ab0f2ead42 Parents: 9f2884a Author: KanakaKumar Authored: Mon Mar 5 16:58:18 2018 +0530 Committer: Venkata Ramana G Committed: Wed Mar 7 21:16:16 2018 +0530 -- integration/presto/pom.xml | 5 .../loading/csvinput/CSVInputFormat.java| 6 +++- .../loading/csvinput/CSVInputFormatTest.java| 30 ++- .../src/test/resources/csv/csv_with_bom.csv | 3 ++ .../src/test/resources/csv/csv_with_bom.csv.bz2 | Bin 0 -> 129 bytes .../src/test/resources/csv/csv_with_bom.csv.gz | Bin 0 -> 110 bytes 6 files changed, 42 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/integration/presto/pom.xml -- diff --git a/integration/presto/pom.xml b/integration/presto/pom.xml index aaaf175..17f5d41 100644 --- a/integration/presto/pom.xml +++ b/integration/presto/pom.xml @@ -484,6 +484,11 @@ hk2-utils 2.5.0-b42 + + commons-io + commons-io + 2.4 + http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java -- diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java index 259b6da..aebaf3b 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java +++ b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java @@ -29,6 +29,7 @@ import 
org.apache.carbondata.core.util.CarbonProperties; import com.univocity.parsers.csv.CsvParser; import com.univocity.parsers.csv.CsvParserSettings; +import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang.BooleanUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; @@ -271,8 +272,11 @@ public class CSVInputFormat extends FileInputFormat [...] http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java -- diff --git a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java index 14c680e..d89f10d 100644 --- a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java +++ b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java @@ -128,6 +128,7 @@ public class CSVInputFormatTest extends TestCase { @Test public void testReadCSVFiles() throws Exception{ Configuration conf = new Configuration(); prepareConf(conf); +conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true); File output = new File("target/output_CSVInputFormatTest"); conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath()); Job job = Job.getInstance(conf, "CSVInputFormat_normal"); @@ -149,8 +150,35 @@ public class CSVInputFormatTest extends TestCase { Assert.assertTrue(job.waitForCompletion(true)); } + /** + * test read csv files encoded as UTF-8 with BOM + * @throws Exception + */ + @Test public void testReadCSVFilesWithBOM() throws Exception{ + +Configuration conf = new Configuration(); +prepareConf(conf); +conf.setBoolean(CSVInputFormat.HEADER_PRESENT, false); +File output = new File("target/output_CSVInputFormatTest_bom"); +conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath()); +Job job = 
Job.getInstance(conf, "CSVInputFormat_normal_bom"); +job.setJarByClass(CSVInputFormatTest.class); +job.setMapperClass(CSVCheckMapper.class); +job.setNumReduceTasks(0); +job.setInputFormatClass(CSVInputFormat.class); + +String inputFolder = new File("src/test/resources/csv").getCanonicalPath(); +FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv")); +FileInputFormat.addInpu