This is an automated email from the ASF dual-hosted git repository. desruisseaux pushed a commit to branch geoapi-4.0 in repository https://gitbox.apache.org/repos/asf/sis.git
commit 34169b0ea1727b100b3cd986679700942089a1d4 Author: Martin Desruisseaux <[email protected]> AuthorDate: Tue Nov 6 12:38:09 2018 +0100 Accept, under some conditions, spaces as multiplication symbols (e.g. "kg m**-2"). This is a departure from the UCUM specification, which does not allow spaces as multiplication symbols. We make this departure only when we think that it is non-ambiguous. --- .../java/org/apache/sis/measure/UnitFormat.java | 87 ++++++++++++++++++---- .../org/apache/sis/measure/UnitFormatTest.java | 17 +++++ 2 files changed, 89 insertions(+), 15 deletions(-) diff --git a/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java b/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java index fb18ac4..c2857df 100644 --- a/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java +++ b/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java @@ -268,6 +268,7 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma * Units associated to a given label (in addition to the system-wide {@link UnitRegistry}). * This map is the converse of {@link #unitToLabel}. The {@link Unit} instances may differ from the ones * specified by user since {@link AbstractUnit#symbol} may have been set to the label specified by the user. + * The labels may contain some characters normally not allowed in unit symbols, like white spaces. * * @see #label(Unit, String) */ @@ -467,6 +468,7 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma * for allowing the singleton {@link #INSTANCE} to parse symbols in a multi-threads environment.</p> * * @param uom the unit symbol, without leading or trailing spaces. + * @return the unit for the given name, or {@code null} if unknown.
*/ private Unit<?> fromName(String uom) { /* @@ -983,6 +985,33 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma } /** + * Parse position when text to be parsed is expected to contain nothing else than a unit symbol. + * This is used for recording whether another term (separated from the previous term by a space) + * is allowed or not. + */ + private static final class Position extends ParsePosition { + /** {@code true} if we do not expect any more content after the last term parsed. */ + boolean finished; + + /** Creates a new position initialized to the beginning of the text to parse. */ + Position() { + super(0); + } + } + + /** + * Reports that the parsing is finished and no more content should be parsed. + * This method is invoked when the last parsed term is possibly one or more words instead than unit symbols. + * The intent is to avoid trying to parse "degree minute" as "degree × minute". By contrast, this method is + * not invoked if the string to parse is "m kg**-2" because it can be interpreted as "m × kg**-2". + */ + private static void finish(final ParsePosition pos) { + if (pos instanceof Position) { + ((Position) pos).finished = true; + } + } + + /** * Parses the given text as an instance of {@code Unit}. * If the parse completes without reading the entire length of the text, an exception is thrown. * @@ -991,9 +1020,18 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma * The product operator can be either {@code '.'} (ASCII) or {@code '⋅'} (Unicode) character. * Exponent after symbol can be decimal digits as in “m2” or a superscript as in “m²”.</p> * + * <p>This method differs from {@link #parse(CharSequence, ParsePosition)} in the treatment of white spaces: + * that method with a {@link ParsePosition} argument stops parsing at the first white space, + * while this {@code parse(…)} method treats white spaces as multiplications. 
+ * The reason for this difference is that white space is normally not a valid multiplication symbol; + * it could be followed by a text which is not part of the unit symbol. + * But in the case of this {@code parse(CharSequence)} method, the whole {@code CharSequence} shall be a unit symbol. + * In such case, white spaces are less ambiguous.</p> + * * <p>The default implementation delegates to * <code>{@linkplain #parse(CharSequence, ParsePosition) parse}(symbols, new ParsePosition(0))</code> - * and verifies that all non-white characters have been parsed.</p> + * and verifies that all non-white characters have been parsed. + * Units separated by spaces are multiplied; for example "kg m**-2" is parsed as kg/m².</p> * * @param symbols the unit symbols or URI to parse. * @return the unit parsed from the specified symbols. @@ -1003,15 +1041,19 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma */ @Override public Unit<?> parse(final CharSequence symbols) throws ParserException { - final ParsePosition position = new ParsePosition(0); - final Unit<?> unit = parse(symbols, position); + final Position position = new Position(); + Unit<?> unit = parse(symbols, position); final int length = symbols.length(); - final int unrecognized = CharSequences.skipLeadingWhitespaces(symbols, position.getIndex(), length); - if (unrecognized < length) { - throw new ParserException(Errors.format(Errors.Keys.UnexpectedCharactersAfter_2, - CharSequences.trimWhitespaces(symbols, 0, unrecognized), - CharSequences.trimWhitespaces(symbols, unrecognized, length)), - symbols, unrecognized); + int unrecognized; + while ((unrecognized = CharSequences.skipLeadingWhitespaces(symbols, position.getIndex(), length)) < length) { + if (position.finished || !Character.isLetter(Character.codePointAt(symbols, unrecognized))) { + throw new ParserException(Errors.format(Errors.Keys.UnexpectedCharactersAfter_2, + CharSequences.trimWhitespaces(symbols, 0, unrecognized), + 
CharSequences.trimWhitespaces(symbols, unrecognized, length)), + symbols, unrecognized); + } + position.setIndex(unrecognized); + unit = unit.multiply(parse(symbols, position)); } return unit; } @@ -1062,6 +1104,7 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma final Unit<?> unit = Units.valueOfEPSG(Integer.parseInt(code)); if (unit != null) { position.setIndex(endOfURI); + finish(position); return unit; } } catch (NumberFormatException e) { @@ -1260,8 +1303,11 @@ search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i) i = end; // Restore the full length (until the first illegal character). } } - if (component == null) { - component = parseTerm(symbols, start, i, operation); + if (!(operation.finished = (component != null))) { + component = parseTerm(symbols, start, i, operation); // May set 'operation.finished' flag. + } + if (operation.finished) { + finish(position); // For preventing interpretation of "degree minute" as "degree × minute". } unit = operation.apply(unit, component, start); position.setIndex(endOfURI >= 0 ? endOfURI : i); @@ -1292,6 +1338,14 @@ search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i) private final CharSequence symbols; /** + * {@code true} if the parsed terms may be one or more words, possibly containing white spaces. + * In such case, the parsing should not continue after those words. + * + * @see Position#finished + */ + boolean finished; + + /** * Creates an operation initialized to {@link #NOOP}. */ Operation(final CharSequence symbols) { @@ -1353,8 +1407,7 @@ search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i) * @param symbols the complete string specified by the user. * @param lower index where to begin parsing in the {@code symbols} string. * @param upper index after the last character to parse in the {@code symbols} string. 
- * @param operation if the term will be used as multiplier or divisor of another unit, the - * operation to be applied. Otherwise {@code null}. + * @param operation the operation to be applied (e.g. the term to be parsed is a multiplier or divisor of another unit). * @return the parsed unit symbol (never {@code null}). * @throws ParserException if a problem occurred while parsing the given symbols. */ @@ -1368,6 +1421,7 @@ search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i) * symbols. If no explicit label was found, check for symbols and names known to this UnitFormat instance. */ Unit<?> unit = labelToUnit.get(uom); + operation.finished = (unit != null); if (unit == null) { unit = Prefixes.getUnit(uom); if (unit == null) { @@ -1394,8 +1448,9 @@ search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i) if (s >= 0) { final int next = CharSequences.skipLeadingWhitespaces(uom, s, length); if (next < length && AbstractUnit.isSymbolChar(uom.codePointAt(next))) { + operation.finished = true; // For preventing attempt to continue parsing after "100 feet". 
multiplier = Double.parseDouble(uom.substring(0, s)); - return parseTerm(uom, s, length, null).multiply(multiplier); + return parseTerm(uom, s, length, new Operation(uom)).multiply(multiplier); } } multiplier = parseMultiplicationFactor(uom); @@ -1464,13 +1519,14 @@ search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i) } final String symbol = uom.substring(CharSequences.skipLeadingWhitespaces(uom, 0, i), i); unit = labelToUnit.get(symbol); + operation.finished = (unit != null); if (unit == null) { unit = Prefixes.getUnit(symbol); } if (unit != null) { int numerator = power.numerator; int denominator = power.denominator; - if (numerator < 0 && operation != null && operation.invert()) { + if (numerator < 0 && operation.invert()) { numerator = -numerator; } if (numerator != 1) unit = unit.pow (numerator); @@ -1483,6 +1539,7 @@ search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i) * At this point, we have determined that the label is not a known unit symbol. * It may be a unit name, in which case the label is not case-sensitive anymore. */ + operation.finished = true; unit = fromName(uom); if (unit == null) { if (CharSequences.regionMatches(symbols, lower, UNITY, true)) { diff --git a/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java b/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java index fc711b7..be982bf 100644 --- a/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java +++ b/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java @@ -482,6 +482,23 @@ public final strictfp class UnitFormatTest extends TestCase { } /** + * Tests parsing of symbols containing terms separated by spaces. + * This is valid only when using {@link UnitFormat#parse(CharSequence)}. 
+ */ + public void testParseTermsSeparatedBySpace() { + final UnitFormat f = new UnitFormat(Locale.UK); + assertSame(Units.METRES_PER_SECOND, f.parse("m s**-1")); + try { + f.parse("degree minute"); + fail("Should not accept unknown sentence even if each individual word is known."); + } catch (ParserException e) { + final String message = e.getMessage(); + assertTrue(message, message.contains("degree")); + assertTrue(message, message.contains("minute")); + } + } + + /** * Tests parsing of symbols composed of terms combined by arithmetic operations (e.g. "m/s"). */ @Test
