[
https://issues.apache.org/jira/browse/PARQUET-1285?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16465612#comment-16465612
]
ASF GitHub Bot commented on PARQUET-1285:
-----------------------------------------
xhochy closed pull request #469: PARQUET-1285: [Java] SchemaConverter should
not convert from TimeUnit.SECOND and TimeUnit.NANOSECOND of Arrow
URL: https://github.com/apache/parquet-mr/pull/469
This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign (forked) pull request
once it is merged, the diff is reproduced below for the sake of provenance:
diff --git
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
index 1d69c4523..f298558ac 100644
---
a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
+++
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
@@ -28,6 +28,7 @@
import static org.apache.parquet.schema.OriginalType.INT_8;
import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS;
import static org.apache.parquet.schema.OriginalType.TIME_MILLIS;
+import static org.apache.parquet.schema.OriginalType.TIME_MICROS;
import static org.apache.parquet.schema.OriginalType.UINT_16;
import static org.apache.parquet.schema.OriginalType.UINT_32;
import static org.apache.parquet.schema.OriginalType.UINT_64;
@@ -49,6 +50,7 @@
import org.apache.arrow.vector.types.DateUnit;
import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.TimeUnit;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor;
import org.apache.arrow.vector.types.pojo.ArrowType.Binary;
@@ -245,7 +247,14 @@ public TypeMapping visit(Date type) {
@Override
public TypeMapping visit(Time type) {
- return primitive(INT32, TIME_MILLIS);
+ int bitWidth = type.getBitWidth();
+ TimeUnit timeUnit = type.getUnit();
+ if (bitWidth == 32 && timeUnit == TimeUnit.MILLISECOND) {
+ return primitive(INT32, TIME_MILLIS);
+ } else if (bitWidth == 64 && timeUnit == TimeUnit.MICROSECOND) {
+ return primitive(INT64, TIME_MICROS);
+ }
+ throw new UnsupportedOperationException("Unsupported type " + type);
}
@Override
@@ -407,11 +416,11 @@ public TypeMapping convertINT32(PrimitiveTypeName
primitiveTypeName) throws Runt
case DATE:
return field(new ArrowType.Date(DateUnit.DAY));
case TIMESTAMP_MICROS:
- return field(new
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"));
+ return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"));
case TIMESTAMP_MILLIS:
- return field(new
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC"));
+ return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"));
case TIME_MILLIS:
- return field(new
ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 32));
+ return field(new ArrowType.Time(TimeUnit.MILLISECOND, 32));
default:
case TIME_MICROS:
case INT_64:
@@ -456,11 +465,12 @@ public TypeMapping convertINT64(PrimitiveTypeName
primitiveTypeName) throws Runt
case DATE:
return field(new ArrowType.Date(DateUnit.DAY));
case TIMESTAMP_MICROS:
- return field(new
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"));
+ return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"));
case TIMESTAMP_MILLIS:
- return field(new
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC"));
- default:
+ return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"));
case TIME_MICROS:
+ return field(new ArrowType.Time(TimeUnit.MICROSECOND, 64));
+ default:
case UTF8:
case ENUM:
case BSON:
diff --git
a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
index 654f773f9..4c3da35f1 100644
---
a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
+++
b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
@@ -28,6 +28,7 @@
import static org.apache.parquet.schema.OriginalType.INT_8;
import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS;
import static org.apache.parquet.schema.OriginalType.TIME_MILLIS;
+import static org.apache.parquet.schema.OriginalType.TIME_MICROS;
import static org.apache.parquet.schema.OriginalType.UINT_16;
import static org.apache.parquet.schema.OriginalType.UINT_32;
import static org.apache.parquet.schema.OriginalType.UINT_64;
@@ -43,11 +44,12 @@
import java.io.IOException;
import java.util.List;
-import org.apache.arrow.vector.types.IntervalUnit;
-import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.DateUnit;
import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.IntervalUnit;
+import org.apache.arrow.vector.types.TimeUnit;
+import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
@@ -86,7 +88,7 @@ private static Field field(String name, ArrowType type,
Field... children) {
field("e", new ArrowType.List(), field(null, new
ArrowType.Date(DateUnit.DAY))),
field("f", new ArrowType.FixedSizeList(1), field(null, new
ArrowType.Date(DateUnit.DAY))),
field("g", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
- field("h", new
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")),
+ field("h", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")),
field("i", new ArrowType.Interval(IntervalUnit.DAY_TIME))
));
private final MessageType complexParquetSchema = Types.buildMessage()
@@ -129,11 +131,12 @@ private static Field field(String name, ArrowType type,
Field... children) {
field("k1", new ArrowType.Decimal(15, 5)),
field("k2", new ArrowType.Decimal(25, 5)),
field("l", new ArrowType.Date(DateUnit.DAY)),
- field("m", new
ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.SECOND, 32)),
- field("n", new
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")),
+ field("m", new ArrowType.Time(TimeUnit.MILLISECOND, 32)),
+ field("n", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")),
field("o", new ArrowType.Interval(IntervalUnit.DAY_TIME)),
field("o1", new ArrowType.Interval(IntervalUnit.YEAR_MONTH))
));
+
private final MessageType allTypesParquetSchema = Types.buildMessage()
.addField(Types.optional(BINARY).named("a"))
.addField(Types.optionalGroup()
@@ -191,8 +194,8 @@ private static Field field(String name, ArrowType type,
Field... children) {
field("j1", new ArrowType.Decimal(15, 5)),
field("j2", new ArrowType.Decimal(25, 5)),
field("k", new ArrowType.Date(DateUnit.DAY)),
- field("l", new
ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 32)),
- field("m", new
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC"))
+ field("l", new ArrowType.Time(TimeUnit.MILLISECOND, 32)),
+ field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"))
));
private final MessageType supportedTypesParquetSchema = Types.buildMessage()
@@ -348,4 +351,66 @@ public void testRepeatedMap() throws IOException {
SchemaMapping map = converter.map(paperArrowSchema, Paper.schema);
Assert.assertEquals("p, s<r<p>, r<p>>, r<s<r<s<p, p>>, p>>",
toSummaryString(map));
}
+
+ @Test(expected = UnsupportedOperationException.class)
+ public void testArrowTimeSecondToParquet() {
+ converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.SECOND, 32))
+ ))).getParquetSchema();
+ }
+
+ @Test
+ public void testArrowTimeMillisecondToParquet() {
+ MessageType expected = converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
+ ))).getParquetSchema();
+ Assert.assertEquals(expected,
Types.buildMessage().addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root"));
+ }
+
+ @Test
+ public void testArrowTimeMicrosecondToParquet() {
+ MessageType expected = converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
+ ))).getParquetSchema();
+ Assert.assertEquals(expected,
Types.buildMessage().addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root"));
+ }
+
+ @Test(expected = UnsupportedOperationException.class)
+ public void testArrowTimeNanosecondToParquet() {
+ converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.NANOSECOND, 64))
+ ))).getParquetSchema();
+ }
+
+ @Test
+ public void testParquetInt32TimeMillisToArrow() {
+ MessageType parquet = Types.buildMessage()
+
.addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root");
+ Schema expected = new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
+ ));
+ Assert.assertEquals(expected,
converter.fromParquet(parquet).getArrowSchema());
+ }
+
+ @Test
+ public void testParquetInt64TimeMicrosToArrow() {
+ MessageType parquet = Types.buildMessage()
+
.addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root");
+ Schema expected = new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
+ ));
+ Assert.assertEquals(expected,
converter.fromParquet(parquet).getArrowSchema());
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testParquetInt64TimeMillisToArrow() {
+ converter.fromParquet(Types.buildMessage()
+
.addField(Types.optional(INT64).as(TIME_MILLIS).named("a")).named("root"));
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testParquetInt32TimeMicrosToArrow() {
+ converter.fromParquet(Types.buildMessage()
+
.addField(Types.optional(INT32).as(TIME_MICROS).named("a")).named("root"));
+ }
}
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Java] SchemaConverter should not convert from TimeUnit.SECOND and
> TimeUnit.NANOSECOND of Arrow
> -----------------------------------------------------------------------------------------------
>
> Key: PARQUET-1285
> URL: https://issues.apache.org/jira/browse/PARQUET-1285
> Project: Parquet
> Issue Type: Bug
> Components: parquet-mr
> Reporter: Masayuki Takahashi
> Priority: Minor
> Fix For: 1.10.0
>
>
> Arrow's 'Time' type is defined as follows:
> {code:java}
> { "name" : "time", "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND",
> "bitWidth": /* integer: 32 or 64 */ }{code}
> [http://arrow.apache.org/docs/metadata.html]
>
> However, Parquet only supports the 'TIME_MILLIS' and 'TIME_MICROS' time types.
> [https://github.com/Apache/parquet-format/blob/master/LogicalTypes.md]
> Therefore, SchemaConverter should reject Arrow Time types that use
> TimeUnit.SECOND or TimeUnit.NANOSECOND instead of converting them to Parquet.
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)