AMBARI-18583 : Hive view : handled BOM characters in upload table feature (nitirajrathore)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/5da18ae7 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/5da18ae7 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/5da18ae7 Branch: refs/heads/branch-dev-patch-upgrade Commit: 5da18ae7896edfbcdabd6c140199e1139d414048 Parents: b537833 Author: Nitiraj Rathore <[email protected]> Authored: Mon Nov 7 20:51:17 2016 +0530 Committer: Nitiraj Rathore <[email protected]> Committed: Mon Nov 7 20:51:17 2016 +0530 ---------------------------------------------------------------------- .../hive2/resources/uploads/UploadService.java | 21 +++++++++++++-- .../hive/resources/uploads/UploadService.java | 27 ++++++++++++++++++-- 2 files changed, 44 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/5da18ae7/contrib/views/hive-next/src/main/java/org/apache/ambari/view/hive2/resources/uploads/UploadService.java ---------------------------------------------------------------------- diff --git a/contrib/views/hive-next/src/main/java/org/apache/ambari/view/hive2/resources/uploads/UploadService.java b/contrib/views/hive-next/src/main/java/org/apache/ambari/view/hive2/resources/uploads/UploadService.java index 9800c22..0826945 100644 --- a/contrib/views/hive-next/src/main/java/org/apache/ambari/view/hive2/resources/uploads/UploadService.java +++ b/contrib/views/hive-next/src/main/java/org/apache/ambari/view/hive2/resources/uploads/UploadService.java @@ -42,6 +42,8 @@ import org.apache.ambari.view.hive2.resources.uploads.query.TableInfo; import org.apache.ambari.view.hive2.utils.ServiceFormattedException; import org.apache.ambari.view.hive2.utils.SharedObjectsFactory; import org.apache.ambari.view.utils.ambari.AmbariApi; +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.io.input.ReaderInputStream; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -496,7 +498,8 @@ public class UploadService extends BaseService { LOG.info("isFirstRowHeader : {}, inputFileType : {}", isFirstRowHeader, inputFileType); - DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions); + Reader reader = getInputStreamReader(uploadedInputStream); + DataParser dataParser = new DataParser(reader, parseOptions); return dataParser.parsePreview(); } @@ -542,13 +545,27 @@ public class UploadService extends BaseService { parseOptions.setOption(ParseOptions.OPTIONS_CSV_QUOTE, csvParams.getCsvQuote()); } - DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions); + Reader reader = getInputStreamReader(uploadedInputStream); + DataParser dataParser = new DataParser(reader, parseOptions); Reader csvReader = new TableDataReader(dataParser.iterator(), header, containsEndlines); // encode column values into HEX so that \n etc dont appear in the hive table data String path = uploadIntoTable(csvReader, databaseName, tableName); return path; } + private Reader getInputStreamReader(InputStream is) throws IOException { + BOMInputStream bomInputStream = new BOMInputStream(is, + ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, + ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE + ); + if(bomInputStream.hasBOM()){ + String charSetName = bomInputStream.getBOMCharsetName(); + return new InputStreamReader(bomInputStream, charSetName); // return with the encoded charset encoding. + }else{ + return new InputStreamReader(bomInputStream); //return with default charset + } + } + private String getBasenameFromPath(String path) { String fileName = new File(path).getName(); return getBasename(fileName); http://git-wip-us.apache.org/repos/asf/ambari/blob/5da18ae7/contrib/views/hive/src/main/java/org/apache/ambari/view/hive/resources/uploads/UploadService.java ---------------------------------------------------------------------- diff --git a/contrib/views/hive/src/main/java/org/apache/ambari/view/hive/resources/uploads/UploadService.java b/contrib/views/hive/src/main/java/org/apache/ambari/view/hive/resources/uploads/UploadService.java index 7dccbd4..2dceadf 100644 --- a/contrib/views/hive/src/main/java/org/apache/ambari/view/hive/resources/uploads/UploadService.java +++ b/contrib/views/hive/src/main/java/org/apache/ambari/view/hive/resources/uploads/UploadService.java @@ -36,6 +36,8 @@ import org.apache.ambari.view.hive.resources.uploads.query.TableInfo; import org.apache.ambari.view.hive.utils.ServiceFormattedException; import org.apache.ambari.view.hive.utils.SharedObjectsFactory; import org.apache.ambari.view.utils.ambari.AmbariApi; +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.io.input.ReaderInputStream; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -464,7 +466,8 @@ public class UploadService extends BaseService { LOG.info("isFirstRowHeader : {}, inputFileType : {}", isFirstRowHeader, inputFileType); - DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions); + Reader reader = getInputStreamReader(uploadedInputStream); + DataParser dataParser = new DataParser(reader, parseOptions); return dataParser.parsePreview(); } @@ -510,13 +513,33 @@ public class UploadService extends BaseService { parseOptions.setOption(ParseOptions.OPTIONS_CSV_QUOTE, csvParams.getCsvQuote()); } - DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions); + Reader reader = getInputStreamReader(uploadedInputStream); + DataParser dataParser = new DataParser(reader, parseOptions); Reader csvReader = new TableDataReader(dataParser.iterator(), header, containsEndlines); // encode column values into HEX so that \n etc dont appear in the hive table data String path = uploadIntoTable(csvReader, databaseName, tableName); return path; } + /** + * takes care of any BOM in the stream + * @param is : the input stream + * @return : the reader from the stream + * @throws IOException + */ + private Reader getInputStreamReader(InputStream is) throws IOException { + BOMInputStream bomInputStream = new BOMInputStream(is, + ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, + ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE + ); + if(bomInputStream.hasBOM()){ + String charSetName = bomInputStream.getBOMCharsetName(); + return new InputStreamReader(bomInputStream, charSetName); // return with the encoded charset encoding. + }else{ + return new InputStreamReader(bomInputStream); //return with default charset + } + } + private String getBasenameFromPath(String path) { String fileName = new File(path).getName(); return getBasename(fileName);
