Repository: orc Updated Branches: refs/heads/master 411e633a2 -> 51b6b6ce3
ORC-308. Add function to get subtypes by name. Fixes #221 Signed-off-by: Owen O'Malley <omal...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/51b6b6ce Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/51b6b6ce Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/51b6b6ce Branch: refs/heads/master Commit: 51b6b6ce3c63e3da326978fa016f72247b9f6c0d Parents: 411e633 Author: Owen O'Malley <omal...@apache.org> Authored: Tue Feb 27 16:02:16 2018 -0800 Committer: Owen O'Malley <omal...@apache.org> Committed: Fri Mar 2 10:37:11 2018 -0800 ---------------------------------------------------------------------- .../java/org/apache/orc/TypeDescription.java | 123 ++++++++++++++++++- .../org/apache/orc/TestTypeDescription.java | 83 +++++++++++++ 2 files changed, 205 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/51b6b6ce/java/core/src/java/org/apache/orc/TypeDescription.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 7f4d241..86d88ff 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -275,7 +275,7 @@ public class TypeDescription } else { while (source.position < source.length) { char ch = source.value.charAt(source.position); - if (!Character.isLetterOrDigit(ch) && ch != '.' && ch != '_') { + if (!Character.isLetterOrDigit(ch) && ch != '_') { break; } source.position += 1; @@ -925,4 +925,125 @@ public class TypeDescription return prev.findSubtype(goal); } } + + /** + * Split a compound name into parts separated by '.'. 
+   * @param source the string to parse into simple names
+   * @return a list of simple names from the source
+   */
+  private static List<String> splitName(StringPosition source) {
+    List<String> result = new ArrayList<>();
+    do {
+      result.add(parseName(source));
+    } while (consumeChar(source, '.'));
+    return result;
+  }
+
+  private static final Pattern INTEGER_PATTERN = Pattern.compile("^[0-9]+$");
+
+  /**
+   * Find a subtype of this schema from a name read at the current position
+   * of the source.
+   * A name that is a single run of digits is handed to findSubtype(int);
+   * any other name is resolved one part at a time, starting at this type.
+   * @param source the text and position to read the name from
+   * @return the subtype selected by the name
+   * @throws IllegalArgumentException if a part of the name does not select
+   *   a child of the type it is applied to
+   */
+  private TypeDescription findSubtype(StringPosition source) {
+    List<String> names = splitName(source);
+    if (names.size() == 1 && INTEGER_PATTERN.matcher(names.get(0)).matches()) {
+      return findSubtype(Integer.parseInt(names.get(0)));
+    }
+    TypeDescription current = this;
+    // Each part of the name selects one child of the type reached so far.
+    for (String first : names) {
+      switch (current.category) {
+        case STRUCT: {
+          int posn = current.fieldNames.indexOf(first);
+          if (posn == -1) {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString());
+          }
+          current = current.children.get(posn);
+          break;
+        }
+        case LIST:
+          if (first.equals("_elem")) {
+            current = current.getChildren().get(0);
+          } else {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString());
+          }
+          break;
+        case MAP:
+          if (first.equals("_key")) {
+            current = current.getChildren().get(0);
+          } else if (first.equals("_value")) {
+            current = current.getChildren().get(1);
+          } else {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString());
+          }
+          break;
+        case UNION: {
+          try {
+            int posn = Integer.parseInt(first);
+            // An index outside the union is reported through the same path
+            // as a part that is not a number at all.
+            if (posn < 0 || posn >= current.getChildren().size()) {
+              throw new NumberFormatException("off end of union");
+            }
+            current = current.getChildren().get(posn);
+          } catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString(), e);
+          }
+          break;
+        }
+        default:
+          // Every other category is rejected: no name can select below it.
+          throw new IllegalArgumentException("Field " + first +
+              " not found in " + current.toString());
+      }
+    }
+    return current;
+  }
+
+  /**
+   * Find a subtype of this schema by name.
+   * If the name is a simple integer, it will be used as a column number.
+   * Otherwise, this routine will recursively search for the name.
+   * <ul>
+   *   <li>Struct fields are selected by name.</li>
+   *   <li>List children are selected by "_elem".</li>
+   *   <li>Map children are selected by "_key" or "_value".</li>
+   *   <li>Union children are selected by number starting at 0.</li>
+   * </ul>
+   * Names are separated by '.'.
+   * @param columnName the name to search for
+   * @return the subtype
+   * @throws IllegalArgumentException if a part of the name does not select
+   *   a subtype, or if text remains after the name has been parsed
+   */
+  public TypeDescription findSubtype(String columnName) {
+    StringPosition source = new StringPosition(columnName);
+    TypeDescription result = findSubtype(source);
+    // The whole string must be consumed; anything left over is an error.
+    if (source.position != source.length) {
+      throw new IllegalArgumentException("Remaining text in parsing field name "
+          + source);
+    }
+    return result;
+  }
+
+  /**
+   * Find a list of subtypes from a string, including the empty list.
+   *
+   * Each column name is separated by ','.
+ * @param columnNameList the column names, separated by ','; may be empty + * @return the list of subtypes that correspond to the column names, in the order the names were given + * @throws IllegalArgumentException if a ',' is missing between two names or a name does not select a subtype + */ + public List<TypeDescription> findSubtypes(String columnNameList) { + StringPosition source = new StringPosition(columnNameList); + List<TypeDescription> result = new ArrayList<>(); + boolean needComma = false; + while (source.position != source.length) { + if (needComma) { + if (!consumeChar(source, ',')) { + throw new IllegalArgumentException("Comma expected in list of column" + + " names at " + source); + } + } else { + needComma = true; + } + result.add(findSubtype(source)); + } + return result; + } } http://git-wip-us.apache.org/repos/asf/orc/blob/51b6b6ce/java/core/src/test/org/apache/orc/TestTypeDescription.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java index 404c9d0..cb2e8d7 100644 --- a/java/core/src/test/org/apache/orc/TestTypeDescription.java +++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java @@ -18,6 +18,7 @@ package org.apache.orc; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import org.junit.Rule; import org.junit.Test; @@ -220,4 +221,86 @@ public class TestTypeDescription { assertEquals(0, type.getId()); assertEquals(2, leaf.getId()); } + + @Test + public void testFindSubtype() { + TypeDescription type = TypeDescription.fromString( + "struct<a:int," + + "b:struct<c:array<int>,d:map<string,struct<e:string>>>," + + "f:string," + + "g:uniontype<string,int>>"); + assertEquals(0, type.findSubtype("0").getId()); + assertEquals(1, type.findSubtype("a").getId()); + assertEquals(2, type.findSubtype("b").getId()); + assertEquals(3, type.findSubtype("b.c").getId()); + assertEquals(4, type.findSubtype("b.c._elem").getId()); + assertEquals(5, type.findSubtype("b.d").getId()); + assertEquals(6, 
type.findSubtype("b.d._key").getId()); + assertEquals(7, type.findSubtype("b.d._value").getId()); + assertEquals(8, type.findSubtype("b.d._value.e").getId()); + assertEquals(9, type.findSubtype("f").getId()); + assertEquals(10, type.findSubtype("g").getId()); + assertEquals(11, type.findSubtype("g.0").getId()); + assertEquals(12, type.findSubtype("g.1").getId()); + } + + @Test + public void testBadFindSubtype() { + TypeDescription type = TypeDescription.fromString( + "struct<a:int," + + "b:struct<c:array<int>,d:map<string,struct<e:string>>>," + + "f:string," + + "g:uniontype<string,int>>"); + try { + type.findSubtype("13"); + assertTrue(false); + } catch (IllegalArgumentException e) { + // PASS + } + try { + type.findSubtype("aa"); + assertTrue(false); + } catch (IllegalArgumentException e) { + // PASS + } + try { + type.findSubtype("b.a"); + assertTrue(false); + } catch (IllegalArgumentException e) { + // PASS + } + try { + type.findSubtype("g.2"); + assertTrue(false); + } catch (IllegalArgumentException e) { + // PASS + } + try { + type.findSubtype("b.c.d"); + assertTrue(false); + } catch (IllegalArgumentException e) { + // PASS + } + } + + @Test + public void testFindSubtypes() { + TypeDescription type = TypeDescription.fromString( + "struct<a:int," + + "b:struct<c:array<int>,d:map<string,struct<e:string>>>," + + "f:string," + + "g:uniontype<string,int>>"); + List<TypeDescription> results = type.findSubtypes("a"); + assertEquals(1, results.size()); + assertEquals(1, results.get(0).getId()); + + results = type.findSubtypes("b.d._value.e,3,g.0"); + assertEquals(3, results.size()); + assertEquals(8, results.get(0).getId()); + assertEquals(3, results.get(1).getId()); + assertEquals(11, results.get(2).getId()); + + results = type.findSubtypes(""); + assertEquals(0, results.size()); + } }