METAMODEL-250: Added support for EBCDIC files Closes #103
Project: http://git-wip-us.apache.org/repos/asf/metamodel/repo Commit: http://git-wip-us.apache.org/repos/asf/metamodel/commit/a1b9ff7f Tree: http://git-wip-us.apache.org/repos/asf/metamodel/tree/a1b9ff7f Diff: http://git-wip-us.apache.org/repos/asf/metamodel/diff/a1b9ff7f Branch: refs/heads/5.x Commit: a1b9ff7fbc22cbebd8abda60dc8954fbf58981ce Parents: 2392557 Author: Kasper Sørensen <i.am.kasper.soren...@gmail.com> Authored: Mon Aug 1 21:19:22 2016 -0700 Committer: Kasper Sørensen <i.am.kasper.soren...@gmail.com> Committed: Mon Aug 1 21:20:11 2016 -0700 ---------------------------------------------------------------------- CHANGES.md | 1 + .../fixedwidth/EbcdicConfiguration.java | 60 ++++ .../metamodel/fixedwidth/EbcdicReader.java | 75 +++++ .../fixedwidth/FixedWidthColumnSpec.java | 2 +- .../fixedwidth/FixedWidthConfiguration.java | 199 +++++++------ .../FixedWidthConfigurationReader.java | 18 +- .../fixedwidth/FixedWidthDataContext.java | 25 +- .../metamodel/fixedwidth/FixedWidthDataSet.java | 3 +- .../metamodel/fixedwidth/FixedWidthReader.java | 281 +++++++++++++++---- .../apache/metamodel/fixedwidth/EBCDICTest.java | 77 +++++ .../fixedwidth/FixedWidthConfigurationTest.java | 11 +- .../fixedwidth/FixedWidthDataContextTest.java | 3 - .../fixedwidth/FixedWidthReaderTest.java | 27 +- .../test/resources/fixed-width-2-7-10-10.ebc | 1 + 14 files changed, 572 insertions(+), 211 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/CHANGES.md ---------------------------------------------------------------------- diff --git a/CHANGES.md b/CHANGES.md index f0264c6..c0b90cc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,7 @@ * [METAMODEL-1099] - Created a new DataContextFactory SPI and a extensible registry of implementations based on ServiceLoader. * [METAMODEL-1099] - Implemented DataContextFactory SPI for connectors: JDBC, CSV, ElasticSearch + * [METAMODEL-250] - Added support for EBCDIC files (part of 'fixedwidth' module). * [METAMODEL-1103] - Fixed a bug pertaining to anchoring of wildcards in LIKE operands. * [METAMODEL-1088] - Add support for aliases in MongoDB. * [METAMODEL-1086] - Fixed encoding issue when CsvDataContext is instantiated with InputStream. http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicConfiguration.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicConfiguration.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicConfiguration.java new file mode 100644 index 0000000..389a4f8 --- /dev/null +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicConfiguration.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.metamodel.fixedwidth; + +/** + * Special fixed-width configuration for EBCDIC files. + */ +public final class EbcdicConfiguration extends FixedWidthConfiguration { + + private final boolean _skipEbcdicHeader; + private final boolean _eolPresent; + + public EbcdicConfiguration(int columnNameLineNumber, String encoding, int fixedValueWidth, + boolean failOnInconsistentLineWidth, boolean skipEbcdicHeader, boolean eolPresent) { + super(columnNameLineNumber, encoding, fixedValueWidth, failOnInconsistentLineWidth); + _skipEbcdicHeader = skipEbcdicHeader; + _eolPresent = eolPresent; + } + + public EbcdicConfiguration(int columnNameLineNumber, String encoding, int[] valueWidths, + boolean failOnInconsistentLineWidth, boolean skipEbcdicHeader, boolean eolPresent) { + super(columnNameLineNumber, null, encoding, valueWidths, failOnInconsistentLineWidth); + _skipEbcdicHeader = skipEbcdicHeader; + _eolPresent = eolPresent; + } + + /** + * Determines if the input file contains a header that should be skipped before reading records data. + * + * @return a boolean indicating whether or not to skip EBCDIC header. + */ + public boolean isSkipEbcdicHeader() { + return _skipEbcdicHeader; + } + + /** + * Determines if the input file contains new line characters. + * + * @return a boolean indicating whether or not the input contains new line characters. + */ + public boolean isEolPresent() { + return _eolPresent; + } +} http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicReader.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicReader.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicReader.java new file mode 100644 index 0000000..a7639fc --- /dev/null +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicReader.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.metamodel.fixedwidth; + +import java.io.BufferedInputStream; +import java.io.IOException; + +/** + * Reader capable of separating values based on a fixed width setting. + */ +class EbcdicReader extends FixedWidthReader { + + private final boolean _skipEbcdicHeader; + private final boolean _eolPresent; + private boolean _headerSkipped; + + public EbcdicReader(BufferedInputStream stream, String charsetName, int[] valueWidths, + boolean failOnInconsistentLineWidth, boolean skipEbcdicHeader, boolean eolPresent) { + super(stream, charsetName, valueWidths, failOnInconsistentLineWidth); + _skipEbcdicHeader = skipEbcdicHeader; + _eolPresent = eolPresent; + } + + @Override + protected void beforeReadLine() { + if (shouldSkipHeader()) { + try { + skipHeader(); + } catch (IOException e) { + throw new IllegalStateException("A problem occurred while skipping the input stream. ", e); + } + } + } + + private boolean shouldSkipHeader() { + return (_skipEbcdicHeader && !_headerSkipped); + } + + private void skipHeader() throws IOException { + _headerSkipped = true; + _stream.skip(_expectedLineLength); + } + + @Override + protected String readSingleRecordData() throws IOException { + if (_eolPresent) { + return super.readSingleRecordData(); + } else { + byte[] buffer = new byte[_expectedLineLength]; + int bytesRead = _stream.read(buffer, 0, _expectedLineLength); + + if (bytesRead < 0) { + return null; + } + + return new String(buffer, _charsetName); + } + } +} http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthColumnSpec.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthColumnSpec.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthColumnSpec.java index 65ec219..dedfbcd 100644 --- a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthColumnSpec.java +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthColumnSpec.java @@ -24,7 +24,7 @@ import org.apache.metamodel.util.HasName; * Represents the specification of a single column for a * {@link FixedWidthDataContext}. */ -public final class FixedWidthColumnSpec implements HasName { +final class FixedWidthColumnSpec implements HasName { private final String name; private final int width; http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfiguration.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfiguration.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfiguration.java index 2b2cae5..c53ff16 100644 --- a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfiguration.java +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfiguration.java @@ -31,32 +31,29 @@ import org.apache.metamodel.util.FileHelper; import org.apache.metamodel.util.HasNameMapper; /** - * Configuration of metadata about a fixed width values datacontext. + * Configuration of metadata about a fixed width values data context. */ -public final class FixedWidthConfiguration extends BaseObject implements - Serializable { +public class FixedWidthConfiguration extends BaseObject implements Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; - public static final int NO_COLUMN_NAME_LINE = 0; - public static final int DEFAULT_COLUMN_NAME_LINE = 1; + public static final int NO_COLUMN_NAME_LINE = 0; + public static final int DEFAULT_COLUMN_NAME_LINE = 1; - private final String encoding; - private final int fixedValueWidth; - private final int[] valueWidths; - private final int columnNameLineNumber; - private final boolean failOnInconsistentLineWidth; - private final ColumnNamingStrategy columnNamingStrategy; + private final String encoding; + private final int fixedValueWidth; + private final int[] valueWidths; + private final int columnNameLineNumber; + private final boolean failOnInconsistentLineWidth; + private final ColumnNamingStrategy columnNamingStrategy; - public FixedWidthConfiguration(int fixedValueWidth) { - this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, - fixedValueWidth); - } + public FixedWidthConfiguration(int fixedValueWidth) { + this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, fixedValueWidth); + } - public FixedWidthConfiguration(int[] valueWidth) { - this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, valueWidth, - false); - } + public FixedWidthConfiguration(int[] valueWidth) { + this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, valueWidth, false); + } public FixedWidthConfiguration(int columnNameLineNumber, String encoding, int fixedValueWidth) { this(columnNameLineNumber, encoding, fixedValueWidth, false); @@ -72,11 +69,11 @@ public final class FixedWidthConfiguration extends BaseObject implements this.valueWidths = new int[0]; } - public FixedWidthConfiguration(int columnNameLineNumber, String encoding, - int[] valueWidths, boolean failOnInconsistentLineWidth) { + public FixedWidthConfiguration(int columnNameLineNumber, String encoding, int[] valueWidths, + boolean failOnInconsistentLineWidth) { this(columnNameLineNumber, null, encoding, valueWidths, failOnInconsistentLineWidth); } - + public FixedWidthConfiguration(int columnNameLineNumber, ColumnNamingStrategy columnNamingStrategy, String encoding, int[] valueWidths, boolean failOnInconsistentLineWidth) { this.encoding = encoding; @@ -86,7 +83,7 @@ public final class FixedWidthConfiguration extends BaseObject implements this.columnNamingStrategy = columnNamingStrategy; this.valueWidths = valueWidths; } - + public FixedWidthConfiguration(String encoding, List<FixedWidthColumnSpec> columnSpecs) { this(encoding, columnSpecs, false); } @@ -106,84 +103,84 @@ public final class FixedWidthConfiguration extends BaseObject implements } /** - * The line number (1 based) from which to get the names of the columns. - * - * @return an int representing the line number of the column headers/names. - */ - public int getColumnNameLineNumber() { - return columnNameLineNumber; - } - - /** - * Gets a {@link ColumnNamingStrategy} to use if needed. - * @return - */ - public ColumnNamingStrategy getColumnNamingStrategy() { - if (columnNamingStrategy == null) { - return ColumnNamingStrategies.defaultStrategy(); - } + * The line number (1 based) from which to get the names of the columns. + * + * @return an int representing the line number of the column headers/names. + */ + public int getColumnNameLineNumber() { + return columnNameLineNumber; + } + + /** + * Gets a {@link ColumnNamingStrategy} to use if needed. + * @return column naming strategy + */ + public ColumnNamingStrategy getColumnNamingStrategy() { + if (columnNamingStrategy == null) { + return ColumnNamingStrategies.defaultStrategy(); + } return columnNamingStrategy; } - /** - * Gets the file encoding to use for reading the file. - * - * @return the text encoding to use for reading the file. - */ - public String getEncoding() { - return encoding; - } - - /** - * Gets the width of each value within the fixed width value file. - * - * @return the fixed width to use when parsing the file. - */ - public int getFixedValueWidth() { - return fixedValueWidth; - } - - public int[] getValueWidths() { - return valueWidths; - } - - /** - * Determines if the {@link DataSet#next()} should throw an exception in - * case of inconsistent line width in the fixed width value file. - * - * @return a boolean indicating whether or not to fail on inconsistent line - * widths. - */ - public boolean isFailOnInconsistentLineWidth() { - return failOnInconsistentLineWidth; - } - - @Override - protected void decorateIdentity(List<Object> identifiers) { - identifiers.add(columnNameLineNumber); - identifiers.add(encoding); - identifiers.add(fixedValueWidth); - identifiers.add(valueWidths); - identifiers.add(failOnInconsistentLineWidth); - } - - @Override - public String toString() { - return "FixedWidthConfiguration[encoding=" + encoding - + ", fixedValueWidth=" + fixedValueWidth + ", valueWidths=" - + Arrays.toString(valueWidths) + ", columnNameLineNumber=" - + columnNameLineNumber + ", failOnInconsistentLineWidth=" - + failOnInconsistentLineWidth + "]"; - } - - public boolean isConstantValueWidth() { - return fixedValueWidth != -1; - } - - public int getValueWidth(int columnIndex) { - if (isConstantValueWidth()) { - return fixedValueWidth; - } - return valueWidths[columnIndex]; - } + /** + * Gets the file encoding to use for reading the file. + * + * @return the text encoding to use for reading the file. + */ + public String getEncoding() { + return encoding; + } + + /** + * Gets the width of each value within the fixed width value file. + * + * @return the fixed width to use when parsing the file. + */ + public int getFixedValueWidth() { + return fixedValueWidth; + } + + public int[] getValueWidths() { + return valueWidths; + } + + /** + * Determines if the {@link DataSet#next()} should throw an exception in + * case of inconsistent line width in the fixed width value file. + * + * @return a boolean indicating whether or not to fail on inconsistent line + * widths. + */ + public boolean isFailOnInconsistentLineWidth() { + return failOnInconsistentLineWidth; + } + + @Override + protected void decorateIdentity(List<Object> identifiers) { + identifiers.add(columnNameLineNumber); + identifiers.add(encoding); + identifiers.add(fixedValueWidth); + identifiers.add(valueWidths); + identifiers.add(failOnInconsistentLineWidth); + } + + @Override + public String toString() { + return "FixedWidthConfiguration[encoding=" + encoding + + ", fixedValueWidth=" + fixedValueWidth + ", valueWidths=" + + Arrays.toString(valueWidths) + ", columnNameLineNumber=" + + columnNameLineNumber + ", failOnInconsistentLineWidth=" + + failOnInconsistentLineWidth + "]"; + } + + public boolean isConstantValueWidth() { + return fixedValueWidth != -1; + } + + public int getValueWidth(int columnIndex) { + if (isConstantValueWidth()) { + return fixedValueWidth; + } + return valueWidths[columnIndex]; + } } http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationReader.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationReader.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationReader.java index 9154e5e..264287f 100644 --- a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationReader.java +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationReader.java @@ -60,10 +60,9 @@ public class FixedWidthConfigurationReader { * "http://support.sas.com/documentation/cdl/en/etlug/67323/HTML/default/viewer.htm#p0h03yig7fp1qan1arghp3lwjqi6.htm"> * described here</a>. * - * @param encoding - * @param resource - * the format file resource - * @param failOnInconsistentLineWidth + * @param encoding the format file encoding + * @param resource the format file resource + * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not * @return a {@link FixedWidthConfiguration} object to use */ public FixedWidthConfiguration readFromSasFormatFile(String encoding, Resource resource, @@ -88,13 +87,11 @@ public class FixedWidthConfigurationReader { /** * Reads a {@link FixedWidthConfiguration} based on a SAS INPUT declaration. - * The reader method also optionally will look for a LABEL defintion for - * column naming. + * The reader method also optionally will look for a LABEL definition for column naming. * - * @param encoding - * @param resource - * the format file resource - * @param failOnInconsistentLineWidth + * @param encoding the format file encoding + * @param resource the format file resource + * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not * @return a {@link FixedWidthConfiguration} object to use */ public FixedWidthConfiguration readFromSasInputDefinition(String encoding, Resource resource, @@ -176,5 +173,4 @@ public class FixedWidthConfigurationReader { return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth); } - } http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataContext.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataContext.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataContext.java index d28a0b2..027cdab 100644 --- a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataContext.java +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataContext.java @@ -18,9 +18,9 @@ */ package org.apache.metamodel.fixedwidth; +import java.io.BufferedInputStream; import java.io.File; import java.io.InputStream; -import java.io.Reader; import org.apache.metamodel.MetaModelException; import org.apache.metamodel.QueryPostprocessDataContext; @@ -106,7 +106,7 @@ public class FixedWidthDataContext extends QueryPostprocessDataContext { /** * Gets the resource being read * - * @return + * @return a {@link Resource} object */ public Resource getResource() { return _resource; @@ -184,16 +184,23 @@ public class FixedWidthDataContext extends QueryPostprocessDataContext { private FixedWidthReader createReader() { final InputStream inputStream = _resource.read(); - final Reader fileReader = FileHelper.getReader(inputStream, _configuration.getEncoding()); final FixedWidthReader reader; - if (_configuration.isConstantValueWidth()) { - reader = new FixedWidthReader(fileReader, _configuration.getFixedValueWidth(), _configuration - .isFailOnInconsistentLineWidth()); + + if (_configuration instanceof EbcdicConfiguration) { + reader = new EbcdicReader((BufferedInputStream) inputStream, _configuration.getEncoding(), + _configuration.getValueWidths(), _configuration.isFailOnInconsistentLineWidth(), + ((EbcdicConfiguration) _configuration).isSkipEbcdicHeader(), + ((EbcdicConfiguration) _configuration).isEolPresent()); } else { - reader = new FixedWidthReader(fileReader, _configuration.getValueWidths(), _configuration - .isFailOnInconsistentLineWidth()); + if (_configuration.isConstantValueWidth()) { + reader = new FixedWidthReader(inputStream, _configuration.getEncoding(), + _configuration.getFixedValueWidth(), _configuration.isFailOnInconsistentLineWidth()); + } else { + reader = new FixedWidthReader(inputStream, _configuration.getEncoding(), + _configuration.getValueWidths(), _configuration.isFailOnInconsistentLineWidth()); + } } + return reader; } - } http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataSet.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataSet.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataSet.java index 44ce808..4f78bab 100644 --- a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataSet.java +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataSet.java @@ -98,8 +98,7 @@ class FixedWidthDataSet extends AbstractDataSet { if (columnNumber < stringValues.length) { rowValues[i] = stringValues[columnNumber]; } else { - // Ticket #125: Missing values should be enterpreted as - // null. + // Ticket #125: Missing values should be interpreted as null. rowValues[i] = null; } } http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthReader.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthReader.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthReader.java index d7a18cf..da17ff1 100644 --- a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthReader.java +++ b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthReader.java @@ -18,78 +18,235 @@ */ package org.apache.metamodel.fixedwidth; -import java.io.BufferedReader; +import java.io.BufferedInputStream; import java.io.Closeable; import java.io.IOException; -import java.io.Reader; +import java.io.InputStream; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; +import java.util.ArrayList; +import java.util.List; /** * Reader capable of separating values based on a fixed width setting. */ -final public class FixedWidthReader implements Closeable { - - private final BufferedReader _reader; - private final FixedWidthLineParser _parser; - - public FixedWidthReader(Reader reader, int fixedValueWidth, - boolean failOnInconsistentLineWidth) { - this(new BufferedReader(reader), fixedValueWidth, - failOnInconsistentLineWidth); - } - - public FixedWidthReader(BufferedReader reader, int fixedValueWidth, - boolean failOnInconsistentLineWidth) { - _reader = reader; - final FixedWidthConfiguration fixedWidthConfiguration = new FixedWidthConfiguration( - FixedWidthConfiguration.NO_COLUMN_NAME_LINE, null, fixedValueWidth, failOnInconsistentLineWidth); - _parser = new FixedWidthLineParser(fixedWidthConfiguration, -1, 0); - } - - public FixedWidthReader(Reader reader, int[] valueWidths, - boolean failOnInconsistentLineWidth) { - this(new BufferedReader(reader), valueWidths, - failOnInconsistentLineWidth); - } - - public FixedWidthReader(BufferedReader reader, int[] valueWidths, - boolean failOnInconsistentLineWidth) { - _reader = reader; - int fixedValueWidth = -1; - int expectedLineLength = 0; - if (fixedValueWidth == -1) { - for (int i = 0; i < valueWidths.length; i++) { - expectedLineLength += valueWidths[i]; - } - } - final FixedWidthConfiguration fixedWidthConfiguration = new FixedWidthConfiguration( - FixedWidthConfiguration.NO_COLUMN_NAME_LINE, null, valueWidths, failOnInconsistentLineWidth); - _parser = new FixedWidthLineParser(fixedWidthConfiguration, expectedLineLength, 0); - } - - - /*** - * Reads the next line in the file. - * - * @return an array of values in the next line, or null if the end of the - * file has been reached. - * - * @throws IllegalStateException - * if an exception occurs while reading the file. - */ - public String[] readLine() throws IllegalStateException { - String line; +class FixedWidthReader implements Closeable { + private static final int END_OF_STREAM = -1; + private static final int LINE_FEED = '\n'; + private static final int CARRIAGE_RETURN = '\r'; + + protected final String _charsetName; + private final int _fixedValueWidth; + private final int[] _valueWidths; + private int _valueIndex = 0; + private final boolean _failOnInconsistentLineWidth; + private final boolean _constantWidth; + private volatile int _rowNumber; + protected final BufferedInputStream _stream; + protected final int _expectedLineLength; + + public FixedWidthReader(InputStream stream, String charsetName, int fixedValueWidth, + boolean failOnInconsistentLineWidth) { + this(new BufferedInputStream(stream), charsetName, fixedValueWidth, failOnInconsistentLineWidth); + } + + private FixedWidthReader(BufferedInputStream stream, String charsetName, int fixedValueWidth, + boolean failOnInconsistentLineWidth) { + _stream = stream; + _charsetName = charsetName; + _fixedValueWidth = fixedValueWidth; + _failOnInconsistentLineWidth = failOnInconsistentLineWidth; + _rowNumber = 0; + _valueWidths = null; + _constantWidth = true; + _expectedLineLength = -1; + } + + public FixedWidthReader(InputStream stream, String charsetName, int[] valueWidths, + boolean failOnInconsistentLineWidth) { + this(new BufferedInputStream(stream), charsetName, valueWidths, failOnInconsistentLineWidth); + } + + FixedWidthReader(BufferedInputStream stream, String charsetName, int[] valueWidths, + boolean failOnInconsistentLineWidth) { + _stream = stream; + _charsetName = charsetName; + _fixedValueWidth = -1; + _valueWidths = valueWidths; + _failOnInconsistentLineWidth = failOnInconsistentLineWidth; + _rowNumber = 0; + _constantWidth = false; + int expectedLineLength = 0; + + for (final int _valueWidth : _valueWidths) { + expectedLineLength += _valueWidth; + } + + _expectedLineLength = expectedLineLength; + } + + /** + * This reads and returns the next record from the file. Usually, it is a line but in case the new line characters + * are not present, the length of the content depends on the column-widths setting. + * + * @return an array of values in the next line, or null if the end of the file has been reached. + * @throws IllegalStateException if an exception occurs while reading the file. + */ + public String[] readLine() throws IllegalStateException { try { - line = _reader.readLine(); - return _parser.parseLine(line); + beforeReadLine(); + _rowNumber++; + return getValues(); } catch (IOException e) { throw new IllegalStateException(e); } - } - + } + + /** + * Empty hook that enables special behavior in sub-classed readers (by overriding this method). + */ + protected void beforeReadLine() { + return; + } + + private String[] getValues() throws IOException { + final List<String> values = new ArrayList<>(); + final String singleRecordData = readSingleRecordData(); + + if (singleRecordData == null) { + return null; + } + + processSingleRecordData(singleRecordData, values); + String[] result = values.toArray(new String[values.size()]); + + if (!_failOnInconsistentLineWidth && !_constantWidth) { + result = correctResult(result); + } + + validateConsistentValue(singleRecordData, result, values.size()); + + return result; + } + + private void validateConsistentValue(String recordData, String[] result, int valuesSize) { + if (!_failOnInconsistentLineWidth) { + return; + } + + InconsistentValueWidthException inconsistentValueException = null; + + if (_constantWidth) { + if (recordData.length() % _fixedValueWidth != 0) { + inconsistentValueException = new InconsistentValueWidthException(result, recordData, _rowNumber); + } + } else if (result.length != valuesSize || recordData.length() != _expectedLineLength) { + inconsistentValueException = new InconsistentValueWidthException(result, recordData, _rowNumber); + } + + if (inconsistentValueException != null) { + throw inconsistentValueException; + } + } + + private void processSingleRecordData(final String singleRecordData, final List<String> values) { + StringBuilder nextValue = new StringBuilder(); + final CharacterIterator it = new StringCharacterIterator(singleRecordData); + _valueIndex = 0; + + for (char c = it.first(); c != CharacterIterator.DONE; c = it.next()) { + processCharacter(c, nextValue, values, singleRecordData); + } + + if (nextValue.length() > 0) { + addNewValueIfAppropriate(values, nextValue); + } + } + + String readSingleRecordData() throws IOException { + StringBuilder line = new StringBuilder(); + int ch; + + for (ch = _stream.read(); !isEndingCharacter(ch); ch = _stream.read()) { + line.append((char) ch); + } + + if (ch == CARRIAGE_RETURN) { + readLineFeedIfFollows(); + } + + return (line.length()) > 0 ? line.toString() : null; + } + + private void readLineFeedIfFollows() throws IOException { + _stream.mark(1); + + if (_stream.read() != LINE_FEED) { + _stream.reset(); + } + } + + private boolean isEndingCharacter(int ch) { + return (ch == CARRIAGE_RETURN || ch == LINE_FEED || ch == END_OF_STREAM); + } + + private void processCharacter(char c, StringBuilder nextValue, List<String> values, String recordData) { + nextValue.append(c); + final int valueWidth = getValueWidth(values, recordData); + + if (nextValue.length() == valueWidth) { + addNewValueIfAppropriate(values, nextValue); + nextValue.setLength(0); // clear the buffer + + if (_valueWidths != null) { + _valueIndex = (_valueIndex + 1) % _valueWidths.length; + } + } + } + + private int getValueWidth(List<String> values, String recordData) { + if (_constantWidth) { + return _fixedValueWidth; + } else { + if (_valueIndex >= _valueWidths.length) { + if (_failOnInconsistentLineWidth) { + String[] result = values.toArray(new String[values.size()]); + throw new InconsistentValueWidthException(result, recordData, _rowNumber + 1); + } else { + return -1; // silently ignore the inconsistency + } + } + + return _valueWidths[_valueIndex]; + } + } + + private void addNewValueIfAppropriate(List<String> values, StringBuilder nextValue) { + if (_valueWidths != null) { + if (values.size() < _valueWidths.length) { + values.add(nextValue.toString().trim()); + } + } else { + values.add(nextValue.toString().trim()); + } + } + + private String[] correctResult(String[] result) { + if (result.length != _valueWidths.length) { + String[] correctedResult = new String[_valueWidths.length]; + + for (int i = 0; i < result.length && i < _valueWidths.length; i++) { + correctedResult[i] = result[i]; + } + + result = correctedResult; + } - @Override - public void close() throws IOException { - _reader.close(); - } + return result; + } + @Override + public void close() throws IOException { + _stream.close(); + } } http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/EBCDICTest.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/EBCDICTest.java b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/EBCDICTest.java new file mode 100644 index 0000000..ea19960 --- /dev/null +++ b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/EBCDICTest.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.metamodel.fixedwidth; + +import java.io.File; + +import org.apache.metamodel.data.DataSet; +import org.apache.metamodel.schema.Schema; +import org.apache.metamodel.schema.Table; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class EBCDICTest { + private static final int[] COLUMN_WIDTHS = new int[] { 2, 7, 10, 10 }; + private static final long EXPECTED_ROWS_COUNT = 49; // 50 lines, 1. is a header + private static final String ENCODING = "IBM500"; + private static final String[] EXPECTED_ROWS = new String[] { + "Row[values=[01, name-01, surname-01, address-01]]", + "Row[values=[02, name-02, surname-02, address-02]]", + "Row[values=[03, name-03, surname-03, address-03]]", + }; + private final FixedWidthDataContext _context; + private final Table _table; + + public EBCDICTest() { + String fileName = "fixed-width-2-7-10-10.ebc"; + FixedWidthConfiguration configuration = new EbcdicConfiguration(FixedWidthConfiguration.NO_COLUMN_NAME_LINE, + ENCODING, COLUMN_WIDTHS, false, true, false); + _context = new FixedWidthDataContext(new File("src/test/resources/" + fileName), configuration); + Schema schema = _context.getDefaultSchema(); + _table = schema.getTableByName(fileName); + } + + @Test + public void testRowsCount() throws Exception { + long rows = 0; + + try (final DataSet dataSet = _context.query().from(_table).selectCount().execute()) { + if (dataSet.next()) { + Object[] values = dataSet.getRow().getValues(); + rows = (long) values[0]; + } + } + + assertEquals(EXPECTED_ROWS_COUNT, rows); + } + + @Test + public void testFirstRows() throws Exception { + int limit = EXPECTED_ROWS.length; + int i = 0; + + try (final DataSet dataSet = _context.query().from(_table).selectAll().limit(limit).execute()) { + while (dataSet.next()) { + assertEquals(EXPECTED_ROWS[i], dataSet.getRow().toString()); + i++; + } + } + } +} http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationTest.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationTest.java b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationTest.java index 8225be0..f03d633 100644 --- a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationTest.java +++ b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationTest.java @@ -18,8 +18,6 @@ */ package org.apache.metamodel.fixedwidth; -import org.apache.metamodel.fixedwidth.FixedWidthConfiguration; - import junit.framework.TestCase; public class FixedWidthConfigurationTest extends TestCase { @@ -31,14 +29,11 @@ public class FixedWidthConfigurationTest extends TestCase { } public void testEquals() throws Exception { - FixedWidthConfiguration conf1 = new FixedWidthConfiguration(1, "UTF8", - 10, true); - FixedWidthConfiguration conf2 = new FixedWidthConfiguration(1, "UTF8", - 10, true); + FixedWidthConfiguration conf1 = new FixedWidthConfiguration(1, "UTF8", 10, true); + FixedWidthConfiguration conf2 = new FixedWidthConfiguration(1, "UTF8", 10, true); assertEquals(conf1, conf2); - FixedWidthConfiguration conf3 = new FixedWidthConfiguration(1, "UTF8", - 10, false); + FixedWidthConfiguration conf3 = new FixedWidthConfiguration(1, "UTF8", 10, false); assertFalse(conf1.equals(conf3)); } } http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthDataContextTest.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthDataContextTest.java b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthDataContextTest.java index 2ac3680..7962cf6 100644 --- a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthDataContextTest.java +++ b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthDataContextTest.java @@ -25,9 +25,6 @@ import junit.framework.TestCase; import org.apache.metamodel.DataContext; import org.apache.metamodel.data.DataSet; -import org.apache.metamodel.fixedwidth.FixedWidthConfiguration; -import org.apache.metamodel.fixedwidth.FixedWidthDataContext; -import org.apache.metamodel.fixedwidth.InconsistentValueWidthException; import org.apache.metamodel.query.Query; import org.apache.metamodel.schema.Schema; import org.apache.metamodel.schema.Table; http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthReaderTest.java ---------------------------------------------------------------------- diff --git a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthReaderTest.java b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthReaderTest.java index 4d11f0e..8f40c1d 100644 --- a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthReaderTest.java +++ b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthReaderTest.java @@ -18,11 +18,9 @@ */ package org.apache.metamodel.fixedwidth; -import static org.junit.Assert.assertEquals; - -import java.io.BufferedReader; +import java.io.BufferedInputStream; import java.io.File; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; import java.util.Arrays; @@ -30,7 +28,10 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; +import static org.junit.Assert.assertEquals; + public class FixedWidthReaderTest { + private static final String CHARSET = "UTF-8"; @Rule public final ExpectedException exception = ExpectedException.none(); @@ -38,9 +39,9 @@ public class FixedWidthReaderTest { @Test public void testBufferedReader1() throws IOException { final File file = new File("src/test/resources/example_simple1.txt"); - final BufferedReader reader = new BufferedReader(new FileReader(file)); + final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file)); int[] widths = new int[] { 8, 9 }; - try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, widths, false)) { + try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, widths, false)) { final String[] line1 = fixedWidthReader.readLine(); assertEquals("[greeting, greeter]", Arrays.asList(line1).toString()); final String[] line2 = fixedWidthReader.readLine(); @@ -53,9 +54,9 @@ public class FixedWidthReaderTest { @Test public void testBufferedReader2() throws IOException { final File file = new File("src/test/resources/example_simple2.txt"); - final BufferedReader reader = new BufferedReader(new FileReader(file)); + final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file)); int[] widths = new int[] {1, 8, 9 }; - try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, widths, false)) { + try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, widths, false)) { final String[] line1 = fixedWidthReader.readLine(); assertEquals("[i, greeting, greeter]", Arrays.asList(line1).toString()); final String[] line2 = fixedWidthReader.readLine(); @@ -68,8 +69,8 @@ public class FixedWidthReaderTest { @Test public void testBufferedReader3() throws IOException { final File file = new File("src/test/resources/example_simple3.txt"); - final BufferedReader reader = new BufferedReader(new FileReader(file)); - try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, 5, false)) { + final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file)); + try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, 5, false)) { final String[] line1 = fixedWidthReader.readLine(); assertEquals("[hello]", Arrays.asList(line1).toString()); final String[] line2 = fixedWidthReader.readLine(); @@ -84,8 +85,8 @@ public class FixedWidthReaderTest { @Test public void testBufferedReaderFailOnInconsistentRows() throws IOException { final File file = new File("src/test/resources/example_simple3.txt"); - final BufferedReader reader = new BufferedReader(new FileReader(file)); - try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, 5, true)) { + final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file)); + try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, 5, true)) { final String[] line1 = fixedWidthReader.readLine(); assertEquals("[hello]", Arrays.asList(line1).toString()); final String[] line2 = fixedWidthReader.readLine(); @@ -98,6 +99,4 @@ public class FixedWidthReaderTest { final String[] line4 = fixedWidthReader.readLine(); } } - - } http://git-wip-us.apache.org/repos/asf/metamodel/blob/a1b9ff7f/fixedwidth/src/test/resources/fixed-width-2-7-10-10.ebc ---------------------------------------------------------------------- diff --git a/fixedwidth/src/test/resources/fixed-width-2-7-10-10.ebc b/fixedwidth/src/test/resources/fixed-width-2-7-10-10.ebc new file mode 100644 index 0000000..09fcc70 --- /dev/null +++ b/fixedwidth/src/test/resources/fixed-width-2-7-10-10.ebc @@ -0,0 +1 @@ +������`���������`���������`���ñ�`�ñ¢¤ï¿½ï¿½ï¿½ï¿½`�ñ����`���ò�`�ò¢¤ï¿½ï¿½ï¿½ï¿½`�ò����`���ó�`�ó¢¤ï¿½ï¿½ï¿½ï¿½`�ó����`������`�������`�ô����`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`���ð�`�ð¢¤ï¿½ï¿½ï¿½ï¿½`�������`���ñ�`�ñ¢¤ï¿½ï¿½ï¿½ï¿½`�ñ����`���ò�`�ò¢¤ï¿½ï¿½ï¿½ï¿½`�ò����`���ó�`�ó¢¤ï¿½ï¿½ï¿½ï¿½`�ó����`������`�������`�ôï¿½ï¿½ï ¿½ï¿½`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`���ð�`�ð¢¤ï¿½ï¿½ï¿½ï¿½`�������`���ñ�`�ñ¢¤ï¿½ï¿½ï¿½ï¿½`�ñ����`���ò�`�ò¢¤ï¿½ï¿½ï¿½ï¿½`�ò����`���ó�`�ó¢¤ï¿½ï¿½ï¿½ï¿½`�ó����`������`�������`�ô����`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`��������� `���������`��������`���������`���������`���ð�`�ð¢¤ï¿½ï¿½ï¿½ï¿½`�������`���ñ�`�ñ¢¤ï¿½ï¿½ï¿½ï¿½`�ñ����`���ò�`�ò¢¤ï¿½ï¿½ï¿½ï¿½`�ò����`���ó�`�ó¢¤ï¿½ï¿½ï¿½ï¿½`�ó����`������`�������`�ô����`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`���ð�`�ð¢¤ï¿½ï¿½ï¿½ï¿½`�������`���ñ�`�ñ¢¤ï¿½ï¿½ï¿½ï¿½`�ñ����`���ò�`�ò¢¤ï¿½ï¿½ï¿½ï¿½`�ò����`���ó�`ï ¿½ó¢¤ï¿½ï¿½ï¿½ï¿½`�ó����`������`�������`�ô����`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`��������`���������`���������`�� \ No newline at end of file