This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new 18eedab47 PARQUET-1942: Bump Arrow to 14.0.1 (#1193)
18eedab47 is described below
commit 18eedab47d5f1b23e4781abcf58ea0e4d5529adb
Author: Fokko Driesprong <[email protected]>
AuthorDate: Thu Nov 16 09:43:01 2023 +0100
PARQUET-1942: Bump Arrow to 14.0.1 (#1193)
* Bump Arrow to 14.0.0
* Remove deprecated OriginalType
* Add exclusion
* Update
parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/Map3Levels.java
Co-authored-by: Gang Wu <[email protected]>
* Update
parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/Map3Levels.java
Co-authored-by: Gang Wu <[email protected]>
* Update
parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/Map3Levels.java
Co-authored-by: Gang Wu <[email protected]>
* Update parquet-arrow/pom.xml
---------
Co-authored-by: Gang Wu <[email protected]>
---
parquet-arrow/pom.xml | 2 +-
.../apache/parquet/arrow/schema/List3Levels.java | 4 +-
.../schema/{List3Levels.java => Map3Levels.java} | 61 ++++---
.../parquet/arrow/schema/SchemaConverter.java | 105 +++++++++++-
.../apache/parquet/arrow/schema/SchemaMapping.java | 41 ++++-
.../parquet/arrow/schema/TestSchemaConverter.java | 190 +++++++++++----------
.../parquet/schema/LogicalTypeAnnotation.java | 8 +
pom.xml | 3 +
8 files changed, 290 insertions(+), 124 deletions(-)
diff --git a/parquet-arrow/pom.xml b/parquet-arrow/pom.xml
index ec2909b19..499889439 100644
--- a/parquet-arrow/pom.xml
+++ b/parquet-arrow/pom.xml
@@ -33,7 +33,7 @@
<url>https://parquet.apache.org</url>
<properties>
- <arrow.version>0.10.0</arrow.version>
+ <arrow.version>14.0.1</arrow.version>
</properties>
<dependencies>
diff --git
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java
index cf21cb19b..f49b970cc 100644
---
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java
+++
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java
@@ -21,7 +21,7 @@ package org.apache.parquet.arrow.schema;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
import org.apache.parquet.schema.GroupType;
-import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.Type;
/**
@@ -41,7 +41,7 @@ class List3Levels {
* @param list the Parquet List
*/
public List3Levels(GroupType list) {
- if (list.getOriginalType() != OriginalType.LIST || list.getFields().size()
!= 1) {
+ if (list.getLogicalTypeAnnotation() != LogicalTypeAnnotation.listType() ||
list.getFields().size() != 1) {
throw new IllegalArgumentException("invalid list type: " + list);
}
this.list = list;
diff --git
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/Map3Levels.java
similarity index 57%
copy from
parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java
copy to
parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/Map3Levels.java
index cf21cb19b..035faed6e 100644
---
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java
+++
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/Map3Levels.java
@@ -18,60 +18,69 @@
*/
package org.apache.parquet.arrow.schema;
-import static org.apache.parquet.schema.Type.Repetition.REPEATED;
-
import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Type;
+import static org.apache.parquet.schema.Type.Repetition.REPEATED;
+
/**
- * Represents a standard 3 levels Parquet list
- * (can be null, can contain nulls)
- * - optional list
- * - repeated content
- * - optional element
+ * Represents a standard 3 levels Parquet map
+ * - optional map
+ * - repeated key_value
+ * - required key, optional value
*/
-class List3Levels {
- private final GroupType list;
+class Map3Levels {
+ private final GroupType map;
private final GroupType repeated;
- private final Type element;
+ private final Type key;
+ private final Type value;
/**
- * Will validate the structure of the list
- * @param list the Parquet List
+ * Will validate the structure of the map
+ * @param map the Parquet map
*/
- public List3Levels(GroupType list) {
- if (list.getOriginalType() != OriginalType.LIST || list.getFields().size()
!= 1) {
- throw new IllegalArgumentException("invalid list type: " + list);
+ public Map3Levels(GroupType map) {
+ if (map.getLogicalTypeAnnotation() != LogicalTypeAnnotation.mapType() ||
map.getFields().size() != 1) {
+ throw new IllegalArgumentException("invalid map type: " + map);
}
- this.list = list;
- Type repeatedField = list.getFields().get(0);
- if (repeatedField.isPrimitive() || !repeatedField.isRepetition(REPEATED)
|| repeatedField.asGroupType().getFields().size() != 1) {
- throw new IllegalArgumentException("invalid list type: " + list);
+ this.map = map;
+ Type repeatedField = map.getFields().get(0);
+ if (repeatedField.isPrimitive() || !repeatedField.isRepetition(REPEATED)
|| repeatedField.asGroupType().getFields().size() != 2) {
+ throw new IllegalArgumentException("invalid map key: " + map);
}
this.repeated = repeatedField.asGroupType();
- this.element = repeated.getFields().get(0);
+ this.key = repeated.getFields().get(0);
+ this.value = repeated.getFields().get(1);
}
/**
- * @return the root list element (an optional group with one child)
+ * @return the root map element (an optional group with one repeated key_value child)
*/
- public GroupType getList() {
- return list;
+ public GroupType getMap() {
+ return map;
}
/**
- * @return repeated level, single child of list
+ * @return repeated level, single child of map
*/
public GroupType getRepeated() {
return repeated;
}
+ /**
+ * @return the key level
+ */
+ public Type getKey() {
+ return key;
+ }
+
/**
* @return the value level, second child of repeated.
*/
- public Type getElement() {
- return element;
+ public Type getValue() {
+ return value;
}
}
diff --git
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
index 6275ca37b..bde391000 100644
---
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
+++
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
@@ -42,6 +42,7 @@ import static
org.apache.parquet.schema.Type.Repetition.REPEATED;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Optional;
@@ -64,6 +65,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp;
import org.apache.arrow.vector.types.pojo.ArrowType.Union;
import org.apache.arrow.vector.types.pojo.ArrowType.Utf8;
import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.parquet.arrow.schema.SchemaMapping.ListTypeMapping;
import org.apache.parquet.arrow.schema.SchemaMapping.PrimitiveTypeMapping;
@@ -158,6 +160,11 @@ public class SchemaConverter {
return createListTypeMapping();
}
+ @Override
+ public TypeMapping visit(ArrowType.LargeList largeList) {
+ return createListTypeMapping();
+ }
+
@Override
public TypeMapping
visit(org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList type) {
return createListTypeMapping();
@@ -179,6 +186,17 @@ public class SchemaConverter {
return new UnionTypeMapping(field, addToBuilder(parquetTypes,
Types.buildGroup(OPTIONAL)).named(fieldName), parquetTypes);
}
+ @Override
+ public TypeMapping visit(ArrowType.Map map) {
+ if (children.size() != 2) {
+ throw new IllegalArgumentException("Map fields must have exactly two
children: " + field);
+ }
+ TypeMapping keyChild = fromArrow(children.get(0), "key");
+ TypeMapping valueChild = fromArrow(children.get(1), "value");
+ GroupType groupType =
Types.optionalMap().key(keyChild.getParquetType()).value(valueChild.getParquetType()).named(fieldName);
+ return new SchemaMapping.MapTypeMapping(field, new
Map3Levels(groupType), keyChild, valueChild);
+ }
+
@Override
public TypeMapping visit(Int type) {
boolean signed = type.getIsSigned();
@@ -214,11 +232,21 @@ public class SchemaConverter {
return primitive(BINARY, stringType());
}
+ @Override
+ public TypeMapping visit(ArrowType.LargeUtf8 largeUtf8) {
+ return primitive(BINARY, stringType());
+ }
+
@Override
public TypeMapping visit(Binary type) {
return primitive(BINARY);
}
+ @Override
+ public TypeMapping visit(ArrowType.LargeBinary largeBinary) {
+ return primitive(BINARY);
+ }
+
@Override
public TypeMapping visit(Bool type) {
return primitive(BOOLEAN);
@@ -289,6 +317,16 @@ public class SchemaConverter {
return primitiveFLBA(12,
LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance());
}
+ @Override
+ public TypeMapping visit(ArrowType.Duration duration) {
+ return primitiveFLBA(12,
LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance());
+ }
+
+ @Override
+ public TypeMapping visit(ArrowType.ExtensionType type) {
+ return ArrowTypeVisitor.super.visit(type);
+ }
+
@Override
public TypeMapping visit(ArrowType.FixedSizeBinary fixedSizeBinary) {
return primitive(BINARY);
@@ -358,7 +396,7 @@ public class SchemaConverter {
if (repetition == REPEATED) {
// case where we have a repeated field that is not in a List/Map
TypeMapping child = fromParquet(type, null, REQUIRED);
- Field arrowField = new Field(name, false, new ArrowType.List(),
asList(child.getArrowField()));
+ Field arrowField = new Field(name, FieldType.notNullable(new
ArrowType.List()), Collections.singletonList(child.getArrowField()));
return new RepeatedTypeMapping(arrowField, type, child);
}
if (type.isPrimitive()) {
@@ -376,8 +414,14 @@ public class SchemaConverter {
private TypeMapping fromParquetGroup(GroupType type, String name) {
LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
if (logicalType == null) {
+ final FieldType field;
+ if (type.isRepetition(OPTIONAL)) {
+ field = FieldType.nullable(new Struct());
+ } else {
+ field = FieldType.notNullable(new Struct());
+ }
List<TypeMapping> typeMappings = fromParquet(type.getFields());
- Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new
Struct(), fields(typeMappings));
+ Field arrowField = new Field(name, field, fields(typeMappings));
return new StructTypeMapping(arrowField, type, typeMappings);
} else {
return logicalType.accept(new
LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<TypeMapping>() {
@@ -385,9 +429,17 @@ public class SchemaConverter {
public Optional<TypeMapping>
visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
List3Levels list3Levels = new List3Levels(type);
TypeMapping child = fromParquet(list3Levels.getElement(), null,
list3Levels.getElement().getRepetition());
- Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new
ArrowType.List(), asList(child.getArrowField()));
+ Field arrowField = new Field(name, FieldType.nullable(new
ArrowType.List()), Collections.singletonList(child.getArrowField()));
return of(new ListTypeMapping(arrowField, list3Levels, child));
}
+ @Override
+ public Optional<TypeMapping>
visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
+ Map3Levels map3levels = new Map3Levels(type);
+ TypeMapping keyType = fromParquet(map3levels.getKey(), null,
map3levels.getKey().getRepetition());
+ TypeMapping valueType = fromParquet(map3levels.getValue(), null,
map3levels.getValue().getRepetition());
+ Field arrowField = new Field(name, FieldType.nullable(new
ArrowType.Map(false)), asList(keyType.getArrowField(),
valueType.getArrowField()));
+ return of(new SchemaMapping.MapTypeMapping(arrowField, map3levels,
keyType, valueType));
+ }
}).orElseThrow(() -> new UnsupportedOperationException("Unsupported type
" + type));
}
}
@@ -401,7 +453,12 @@ public class SchemaConverter {
return type.getPrimitiveTypeName().convert(new
PrimitiveType.PrimitiveTypeNameConverter<TypeMapping, RuntimeException>() {
private TypeMapping field(ArrowType arrowType) {
- Field field = new Field(name, type.isRepetition(OPTIONAL), arrowType,
null);
+ final Field field;
+ if (type.isRepetition(OPTIONAL)) {
+ field = Field.nullable(name, arrowType);
+ } else {
+ field = Field.notNullable(name, arrowType);
+ }
return new PrimitiveTypeMapping(field, type);
}
@@ -607,6 +664,11 @@ public class SchemaConverter {
return createListTypeMapping(type);
}
+ @Override
+ public TypeMapping visit(ArrowType.LargeList largeList) {
+ return createListTypeMapping(largeList);
+ }
+
@Override
public TypeMapping
visit(org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList type) {
return createListTypeMapping(type);
@@ -639,6 +701,26 @@ public class SchemaConverter {
return new UnionTypeMapping(arrowField, groupType,
map(arrowField.getChildren(), groupType.getFields()));
}
+ @Override
+ public TypeMapping visit(ArrowType.Map map) {
+ if (arrowField.getChildren().size() != 2) {
+ throw new IllegalArgumentException("Invalid map type: " + map);
+ }
+ if (parquetField.isPrimitive()) {
+ throw new IllegalArgumentException("Parquet type not a group: " +
parquetField);
+ }
+ Map3Levels map3levels = new Map3Levels(parquetField.asGroupType());
+ if (arrowField.getChildren().size() != 2) {
+ throw new IllegalArgumentException("invalid arrow map: " +
arrowField);
+ }
+ Field keyChild = arrowField.getChildren().get(0);
+ Field valueChild = arrowField.getChildren().get(1);
+ return new SchemaMapping.MapTypeMapping(arrowField, map3levels,
+ map(keyChild, map3levels.getKey()),
+ map(valueChild, map3levels.getValue())
+ );
+ }
+
@Override
public TypeMapping visit(Int type) {
return primitive();
@@ -654,11 +736,21 @@ public class SchemaConverter {
return primitive();
}
+ @Override
+ public TypeMapping visit(ArrowType.LargeUtf8 largeUtf8) {
+ return primitive();
+ }
+
@Override
public TypeMapping visit(Binary type) {
return primitive();
}
+ @Override
+ public TypeMapping visit(ArrowType.LargeBinary largeBinary) {
+ return primitive();
+ }
+
@Override
public TypeMapping visit(Bool type) {
return primitive();
@@ -689,6 +781,11 @@ public class SchemaConverter {
return primitive();
}
+ @Override
+ public TypeMapping visit(ArrowType.Duration duration) {
+ return primitive();
+ }
+
@Override
public TypeMapping visit(ArrowType.FixedSizeBinary fixedSizeBinary) {
return primitive();
diff --git
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaMapping.java
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaMapping.java
index cbb04cec7..5bb11e504 100644
---
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaMapping.java
+++
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaMapping.java
@@ -71,6 +71,7 @@ public class SchemaMapping {
T visit(StructTypeMapping structTypeMapping);
T visit(UnionTypeMapping unionTypeMapping);
T visit(ListTypeMapping listTypeMapping);
+ T visit(MapTypeMapping mapTypeMapping);
T visit(RepeatedTypeMapping repeatedTypeMapping);
}
@@ -81,7 +82,7 @@ public class SchemaMapping {
private final Field arrowField;
private final Type parquetType;
- private List<TypeMapping> children;
+ private final List<TypeMapping> children;
TypeMapping(Field arrowField, Type parquetType, List<TypeMapping>
children) {
super();
@@ -111,7 +112,7 @@ public class SchemaMapping {
*/
public static class PrimitiveTypeMapping extends TypeMapping {
public PrimitiveTypeMapping(Field arrowField, PrimitiveType parquetType) {
- super(arrowField, parquetType, Collections.<TypeMapping>emptyList());
+ super(arrowField, parquetType, Collections.emptyList());
}
@Override
@@ -156,7 +157,7 @@ public class SchemaMapping {
private final TypeMapping child;
public ListTypeMapping(Field arrowField, List3Levels list3Levels,
TypeMapping child) {
- super(arrowField, list3Levels.getList(), asList(child));
+ super(arrowField, list3Levels.getList(),
Collections.singletonList(child));
this.list3Levels = list3Levels;
this.child = child;
if (list3Levels.getElement() != child.getParquetType()) {
@@ -178,6 +179,40 @@ public class SchemaMapping {
}
}
+ /**
+ * mapping of a Map type and standard 3-level Map annotated Parquet type
+ */
+ public static class MapTypeMapping extends TypeMapping {
+ private final Map3Levels map3levels;
+ private final TypeMapping key;
+ private final TypeMapping value;
+
+ public MapTypeMapping(Field arrowField, Map3Levels map3levels, TypeMapping
key, TypeMapping value) {
+ super(arrowField, map3levels.getMap(), asList(key, value));
+ this.map3levels = map3levels;
+ this.key = key;
+ this.value = value;
+ }
+
+ public Map3Levels getMap3Levels() {
+ return map3levels;
+ }
+
+ public TypeMapping getKey() {
+ return this.key;
+ }
+
+ public TypeMapping getValue() {
+ return this.value;
+ }
+
+ @Override
+ public <T> T accept(TypeMappingVisitor<T> visitor) {
+ return visitor.visit(this);
+ }
+ }
+
+
/**
* mapping of a List type and repeated Parquet field (non-list annotated)
*/
diff --git
a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
index 764621a13..f3296085d 100644
---
a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
+++
b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
@@ -22,24 +22,6 @@ import static java.util.Arrays.asList;
import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS;
import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS;
import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType;
-import static org.apache.parquet.schema.OriginalType.DATE;
-import static org.apache.parquet.schema.OriginalType.DECIMAL;
-import static org.apache.parquet.schema.OriginalType.INTERVAL;
-import static org.apache.parquet.schema.OriginalType.INT_16;
-import static org.apache.parquet.schema.OriginalType.INT_32;
-import static org.apache.parquet.schema.OriginalType.INT_64;
-import static org.apache.parquet.schema.OriginalType.INT_8;
-import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS;
-import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MICROS;
-import static org.apache.parquet.schema.OriginalType.TIME_MILLIS;
-import static org.apache.parquet.schema.OriginalType.TIME_MICROS;
-import static org.apache.parquet.schema.OriginalType.UINT_16;
-import static org.apache.parquet.schema.OriginalType.UINT_32;
-import static org.apache.parquet.schema.OriginalType.UINT_64;
-import static org.apache.parquet.schema.OriginalType.UINT_8;
-import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
@@ -59,6 +41,7 @@ import org.apache.arrow.vector.types.TimeUnit;
import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.parquet.arrow.schema.SchemaMapping.ListTypeMapping;
import org.apache.parquet.arrow.schema.SchemaMapping.PrimitiveTypeMapping;
@@ -68,6 +51,7 @@ import
org.apache.parquet.arrow.schema.SchemaMapping.TypeMapping;
import org.apache.parquet.arrow.schema.SchemaMapping.TypeMappingVisitor;
import org.apache.parquet.arrow.schema.SchemaMapping.UnionTypeMapping;
import org.apache.parquet.example.Paper;
+import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;
@@ -80,7 +64,11 @@ import org.junit.Test;
public class TestSchemaConverter {
private static Field field(String name, boolean nullable, ArrowType type,
Field... children) {
- return new Field(name, nullable, type, asList(children));
+ if (nullable) {
+ return new Field(name, FieldType.nullable(type), asList(children));
+ } else {
+ return new Field(name, FieldType.notNullable(type), asList(children));
+ }
}
private static Field field(String name, ArrowType type, Field... children) {
@@ -100,27 +88,30 @@ public class TestSchemaConverter {
field("j", new ArrowType.Timestamp(TimeUnit.MILLISECOND, null)),
field("k", new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC")),
field("l", new ArrowType.Timestamp(TimeUnit.MICROSECOND, null)),
- field("m", new ArrowType.Interval(IntervalUnit.DAY_TIME))
+ field("m", new ArrowType.Interval(IntervalUnit.DAY_TIME)),
+ field("e", new ArrowType.Map(false), field(null, false, new
ArrowType.Date(DateUnit.DAY)), field(null, true, new ArrowType.Utf8()))
));
+
private final MessageType complexParquetSchema = Types.buildMessage()
- .addField(Types.optional(INT32).as(INT_8).named("a"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(8)).named("a"))
.addField(Types.optionalGroup()
- .addField(Types.optional(INT32).as(INT_16).named("c"))
- .addField(Types.optional(BINARY).as(UTF8).named("d"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(16)).named("c"))
+
.addField(Types.optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("d"))
.named("b"))
.addField(Types.optionalList().
- setElementType(Types.optional(INT32).as(DATE).named("element"))
+
setElementType(Types.optional(INT32).as(LogicalTypeAnnotation.dateType()).named("element"))
.named("e"))
.addField(Types.optionalList().
- setElementType(Types.optional(INT32).as(DATE).named("element"))
+
setElementType(Types.optional(INT32).as(LogicalTypeAnnotation.dateType()).named("element"))
.named("f"))
.addField(Types.optional(FLOAT).named("g"))
- .addField(Types.optional(INT64).as(timestampType(true, MILLIS)).named("h"))
- .addField(Types.optional(INT64).as(timestampType(true, NANOS)).named("i"))
- .addField(Types.optional(INT64).as(timestampType(false,
MILLIS)).named("j"))
- .addField(Types.optional(INT64).as(timestampType(true, MICROS)).named("k"))
- .addField(Types.optional(INT64).as(timestampType(false,
MICROS)).named("l"))
-
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("m"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MILLIS)).named("h"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
NANOS)).named("i"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(false,
MILLIS)).named("j"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MICROS)).named("k"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(false,
MICROS)).named("l"))
+
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(LogicalTypeAnnotation.intervalType()).named("m"))
+
.addField(Types.optionalMap().key(Types.optional(INT32).as(LogicalTypeAnnotation.dateType()).named("key")).value(Types.optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("value")).named("e"))
.named("root");
private final Schema allTypesArrowSchema = new Schema(asList(
@@ -168,29 +159,29 @@ public class TestSchemaConverter {
.addField(Types.optionalGroup()
.addField(Types.optional(BINARY).named("ea"))
.named("e"))
- .addField(Types.optional(INT32).as(INT_8).named("f"))
- .addField(Types.optional(INT32).as(INT_16).named("f1"))
- .addField(Types.optional(INT32).as(INT_32).named("f2"))
- .addField(Types.optional(INT64).as(INT_64).named("f3"))
- .addField(Types.optional(INT32).as(UINT_8).named("f4"))
- .addField(Types.optional(INT32).as(UINT_16).named("f5"))
- .addField(Types.optional(INT32).as(UINT_32).named("f6"))
- .addField(Types.optional(INT64).as(UINT_64).named("f7"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(8)).named("f"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(16)).named("f1"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(32)).named("f2"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.intType(64)).named("f3"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(8,
false)).named("f4"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(16,
false)).named("f5"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(32,
false)).named("f6"))
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.intType(64,
false)).named("f7"))
.addField(Types.optional(FLOAT).named("g"))
.addField(Types.optional(DOUBLE).named("g1"))
- .addField(Types.optional(BINARY).as(UTF8).named("h"))
+
.addField(Types.optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("h"))
.addField(Types.optional(BINARY).named("i"))
.addField(Types.optional(BOOLEAN).named("j"))
-
.addField(Types.optional(INT32).as(DECIMAL).precision(5).scale(5).named("k"))
-
.addField(Types.optional(INT64).as(DECIMAL).precision(15).scale(5).named("k1"))
-
.addField(Types.optional(BINARY).as(DECIMAL).precision(25).scale(5).named("k2"))
- .addField(Types.optional(INT32).as(DATE).named("l"))
- .addField(Types.optional(INT32).as(timeType(false, MILLIS)).named("m"))
- .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("n"))
-
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("o"))
-
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("o1"))
- .addField(Types.optional(INT64).as(timeType(false, NANOS)).named("p"))
- .addField(Types.optional(INT64).as(timestampType(true, NANOS)).named("q"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.decimalType(5,
5)).named("k"))
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.decimalType(5,
15)).named("k1"))
+ .addField(Types.optional(BINARY).as(LogicalTypeAnnotation.decimalType(5,
25)).named("k2"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.dateType()).named("l"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.timeType(false,
MILLIS)).named("m"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MILLIS)).named("n"))
+
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(LogicalTypeAnnotation.intervalType()).named("o"))
+
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(LogicalTypeAnnotation.intervalType()).named("o1"))
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.timeType(false,
NANOS)).named("p"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
NANOS)).named("q"))
.named("root");
private final Schema supportedTypesArrowSchema = new Schema(asList(
@@ -226,27 +217,27 @@ public class TestSchemaConverter {
.addField(Types.optionalList().
setElementType(Types.optional(BINARY).named("element"))
.named("c"))
- .addField(Types.optional(INT32).as(INT_8).named("e"))
- .addField(Types.optional(INT32).as(INT_16).named("e1"))
- .addField(Types.optional(INT32).as(INT_32).named("e2"))
- .addField(Types.optional(INT64).as(INT_64).named("e3"))
- .addField(Types.optional(INT32).as(UINT_8).named("e4"))
- .addField(Types.optional(INT32).as(UINT_16).named("e5"))
- .addField(Types.optional(INT32).as(UINT_32).named("e6"))
- .addField(Types.optional(INT64).as(UINT_64).named("e7"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(8)).named("e"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(16)).named("e1"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(32)).named("e2"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.intType(64)).named("e3"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(8,
false)).named("e4"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(16,
false)).named("e5"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.intType(32,
false)).named("e6"))
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.intType(64,
false)).named("e7"))
.addField(Types.optional(FLOAT).named("f"))
.addField(Types.optional(DOUBLE).named("f1"))
- .addField(Types.optional(BINARY).as(UTF8).named("g"))
+
.addField(Types.optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("g"))
.addField(Types.optional(BINARY).named("h"))
.addField(Types.optional(BOOLEAN).named("i"))
-
.addField(Types.optional(INT32).as(DECIMAL).precision(5).scale(5).named("j"))
-
.addField(Types.optional(INT64).as(DECIMAL).precision(15).scale(5).named("j1"))
-
.addField(Types.optional(BINARY).as(DECIMAL).precision(25).scale(5).named("j2"))
- .addField(Types.optional(INT32).as(DATE).named("k"))
- .addField(Types.optional(INT32).as(TIME_MILLIS).named("l"))
- .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("m"))
- .addField(Types.optional(INT64).as(timeType(true, NANOS)).named("n"))
- .addField(Types.optional(INT64).as(timestampType(true, NANOS)).named("o"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.decimalType(5,
5)).named("j"))
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.decimalType(5,
15)).named("j1"))
+ .addField(Types.optional(BINARY).as(LogicalTypeAnnotation.decimalType(5,
25)).named("j2"))
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.dateType()).named("k"))
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.timeType(false,
MILLIS)).named("l"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MILLIS)).named("m"))
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.timeType(true,
NANOS)).named("n"))
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
NANOS)).named("o"))
.named("root");
private final Schema paperArrowSchema = new Schema(asList(
@@ -268,30 +259,30 @@ public class TestSchemaConverter {
)
));
- private SchemaConverter converter = new SchemaConverter();
+ private final SchemaConverter converter = new SchemaConverter();
@Test
- public void testComplexArrowToParquet() throws IOException {
+ public void testComplexArrowToParquet() {
MessageType parquet =
converter.fromArrow(complexArrowSchema).getParquetSchema();
Assert.assertEquals(complexParquetSchema.toString(), parquet.toString());
// easier to read
Assert.assertEquals(complexParquetSchema, parquet);
}
@Test
- public void testAllArrowToParquet() throws IOException {
+ public void testAllArrowToParquet() {
MessageType parquet =
converter.fromArrow(allTypesArrowSchema).getParquetSchema();
Assert.assertEquals(allTypesParquetSchema.toString(), parquet.toString());
// easier to read
Assert.assertEquals(allTypesParquetSchema, parquet);
}
@Test
- public void testSupportedParquetToArrow() throws IOException {
+ public void testSupportedParquetToArrow() {
Schema arrow =
converter.fromParquet(supportedTypesParquetSchema).getArrowSchema();
assertEquals(supportedTypesArrowSchema, arrow);
}
@Test
- public void testRepeatedParquetToArrow() throws IOException {
+ public void testRepeatedParquetToArrow() {
Schema arrow = converter.fromParquet(Paper.schema).getArrowSchema();
assertEquals(paperArrowSchema, arrow);
}
@@ -303,8 +294,6 @@ public class TestSchemaConverter {
/**
* for more pinpointed error on what is different
- * @param left
- * @param right
*/
private void compareFields(List<Field> left, List<Field> right) {
Assert.assertEquals(left + "\n" + right, left.size(), right.size());
@@ -318,7 +307,7 @@ public class TestSchemaConverter {
}
@Test
- public void testAllMap() throws IOException {
+ public void testAllMap() {
SchemaMapping map = converter.map(allTypesArrowSchema,
allTypesParquetSchema);
Assert.assertEquals("p, s<p>, l<p>, l<p>, u<p>, p, p, p, p, p, p, p, p, p,
p, p, p, p, p, p, p, p, p, p, p, p, p, p", toSummaryString(map));
}
@@ -356,6 +345,11 @@ public class TestSchemaConverter {
return "l";
}
+ @Override
+ public String visit(SchemaMapping.MapTypeMapping mapTypeMapping) {
+ return "m";
+ }
+
@Override
public String visit(RepeatedTypeMapping repeatedTypeMapping) {
return "r";
@@ -388,7 +382,7 @@ public class TestSchemaConverter {
field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
))).getParquetSchema();
Assert.assertEquals(expected,
- Types.buildMessage().addField(Types.optional(INT32).as(timeType(false,
MILLIS)).named("a")).named("root"));
+
Types.buildMessage().addField(Types.optional(INT32).as(LogicalTypeAnnotation.timeType(false,
MILLIS)).named("a")).named("root"));
}
@Test
@@ -397,13 +391,13 @@ public class TestSchemaConverter {
field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
))).getParquetSchema();
Assert.assertEquals(expected,
- Types.buildMessage().addField(Types.optional(INT64).as(timeType(false,
MICROS)).named("a")).named("root"));
+
Types.buildMessage().addField(Types.optional(INT64).as(LogicalTypeAnnotation.timeType(false,
MICROS)).named("a")).named("root"));
}
@Test
public void testParquetInt32TimeMillisToArrow() {
MessageType parquet = Types.buildMessage()
-
.addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root");
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.timeType(false,
MILLIS)).named("a")).named("root");
Schema expected = new Schema(asList(
field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
));
@@ -413,7 +407,7 @@ public class TestSchemaConverter {
@Test
public void testParquetInt64TimeMicrosToArrow() {
MessageType parquet = Types.buildMessage()
-
.addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root");
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.timeType(false,
MICROS)).named("a")).named("root");
Schema expected = new Schema(asList(
field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
));
@@ -430,10 +424,30 @@ public class TestSchemaConverter {
Assert.assertEquals(expected,
converter.fromParquet(parquet).getArrowSchema());
}
+ @Test
+ public void testParquetMapToArrow() {
+ GroupType mapType = Types.requiredMap()
+ .key(INT32)
+ .optionalValue(INT64)
+ .named("myMap");
+ MessageType parquet = Types.buildMessage()
+ .addField(mapType).named("root");
+ Schema expected = new Schema(asList(
+ field("myMap", new ArrowType.Map(false),
+ field(null, false, new ArrowType.Int(32, true)),
+ field(null, true, new ArrowType.Int(64, true))
+ )
+ ));
+ SchemaMapping mapping = converter.fromParquet(parquet);
+ Schema actual = mapping.getArrowSchema();
+
+ Assert.assertEquals(expected, actual);
+ }
+
@Test
public void testParquetFixedBinaryToArrowDecimal() {
MessageType parquet = Types.buildMessage()
-
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(5).as(DECIMAL).precision(8).scale(2).named("a")).named("root");
+
.addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(5).as(LogicalTypeAnnotation.decimalType(2,
8)).named("a")).named("root");
Schema expected = new Schema(asList(
field("a", new ArrowType.Decimal(8, 2))
));
@@ -464,13 +478,13 @@ public class TestSchemaConverter {
@Test(expected = IllegalStateException.class)
public void testParquetInt64TimeMillisToArrow() {
converter.fromParquet(Types.buildMessage()
-
.addField(Types.optional(INT64).as(TIME_MILLIS).named("a")).named("root"));
+ .addField(Types.optional(INT64).as(LogicalTypeAnnotation.timeType(false,
MILLIS)).named("a")).named("root"));
}
@Test(expected = IllegalStateException.class)
public void testParquetInt32TimeMicrosToArrow() {
converter.fromParquet(Types.buildMessage()
-
.addField(Types.optional(INT32).as(TIME_MICROS).named("a")).named("root"));
+ .addField(Types.optional(INT32).as(LogicalTypeAnnotation.timeType(false,
MICROS)).named("a")).named("root"));
}
@Test(expected = UnsupportedOperationException.class)
@@ -485,7 +499,7 @@ public class TestSchemaConverter {
MessageType expected = converter.fromArrow(new Schema(asList(
field("a", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"))
))).getParquetSchema();
- Assert.assertEquals(expected,
Types.buildMessage().addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("a")).named("root"));
+ Assert.assertEquals(expected,
Types.buildMessage().addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MILLIS)).named("a")).named("root"));
}
@Test
@@ -493,13 +507,13 @@ public class TestSchemaConverter {
MessageType expected = converter.fromArrow(new Schema(asList(
field("a", new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"))
))).getParquetSchema();
- Assert.assertEquals(expected,
Types.buildMessage().addField(Types.optional(INT64).as(TIMESTAMP_MICROS).named("a")).named("root"));
+ Assert.assertEquals(expected,
Types.buildMessage().addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MICROS)).named("a")).named("root"));
}
@Test
public void testParquetInt64TimestampMillisToArrow() {
MessageType parquet = Types.buildMessage()
-
.addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("a")).named("root");
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MILLIS)).named("a")).named("root");
Schema expected = new Schema(asList(
field("a", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"))
));
@@ -509,7 +523,7 @@ public class TestSchemaConverter {
@Test
public void testParquetInt64TimestampMicrosToArrow() {
MessageType parquet = Types.buildMessage()
-
.addField(Types.optional(INT64).as(TIMESTAMP_MICROS).named("a")).named("root");
+
.addField(Types.optional(INT64).as(LogicalTypeAnnotation.timestampType(true,
MICROS)).named("a")).named("root");
Schema expected = new Schema(asList(
field("a", new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"))
));
@@ -519,12 +533,12 @@ public class TestSchemaConverter {
@Test(expected = IllegalStateException.class)
public void testParquetInt32TimestampMillisToArrow() {
converter.fromParquet(Types.buildMessage()
-
.addField(Types.optional(INT32).as(TIMESTAMP_MILLIS).named("a")).named("root"));
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.timestampType(false,
MILLIS)).named("a")).named("root"));
}
@Test(expected = IllegalStateException.class)
public void testParquetInt32TimestampMicrosToArrow() {
converter.fromParquet(Types.buildMessage()
-
.addField(Types.optional(INT32).as(TIMESTAMP_MICROS).named("a")).named("root"));
+
.addField(Types.optional(INT32).as(LogicalTypeAnnotation.timestampType(false,
MICROS)).named("a")).named("root"));
}
}
diff --git
a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java
b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java
index c4e50f292..bdd4f1a5e 100644
---
a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java
+++
b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java
@@ -277,6 +277,10 @@ public abstract class LogicalTypeAnnotation {
return new TimestampLogicalTypeAnnotation(isAdjustedToUTC, unit);
}
+ public static IntLogicalTypeAnnotation intType(final int bitWidth) {
+ return new IntLogicalTypeAnnotation(bitWidth, true);
+ }
+
public static IntLogicalTypeAnnotation intType(final int bitWidth, final
boolean isSigned) {
Preconditions.checkArgument(
bitWidth == 8 || bitWidth == 16 || bitWidth == 32 || bitWidth == 64,
@@ -285,6 +289,10 @@ public abstract class LogicalTypeAnnotation {
return new IntLogicalTypeAnnotation(bitWidth, isSigned);
}
+ public static IntervalLogicalTypeAnnotation intervalType() {
+ return new IntervalLogicalTypeAnnotation();
+ }
+
public static JsonLogicalTypeAnnotation jsonType() {
return JsonLogicalTypeAnnotation.INSTANCE;
}
diff --git a/pom.xml b/pom.xml
index 5c1f9f98b..e6d9e4e3c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -554,6 +554,9 @@
<exclude>org.apache.parquet.hadoop.thrift.TBaseWriteSupport#setThriftClass(org.apache.parquet.conf.ParquetConfiguration,java.lang.Class)</exclude>
<exclude>org.apache.parquet.proto.ProtoParquetReader#builder(org.apache.hadoop.fs.Path,boolean)</exclude>
<exclude>org.apache.parquet.proto.ProtoParquetReader#builder(org.apache.parquet.io.InputFile,boolean)</exclude>
+
+ <!-- Due to the removal of deprecated methods -->
+ <exclude>org.apache.parquet.arrow.schema.SchemaMapping</exclude>
</excludes>
</parameter>
</configuration>