git commit: [SQL] Add test case with workaround for reading partitioned Avro files

marmbrus Wed, 10 Sep 2014 20:58:28 -0700

Repository: spark
Updated Branches:
  refs/heads/master 79cdb9b64 -> 84e2c8bfe



[SQL] Add test case with workaround for reading partitioned Avro files

In order to read from partitioned Avro files we also need to set the 
`SERDEPROPERTIES`, since `TBLPROPERTIES` are not passed to the SerDe during 
initialization. This PR simply adds a test to make sure we don't break this workaround.

Author: Michael Armbrust <mich...@databricks.com>

Closes #2340 from marmbrus/avroPartitioned and squashes the following commits:

6b969d6 [Michael Armbrust] fix style
fea2124 [Michael Armbrust] Add test case with workaround for reading 
partitioned avro files.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84e2c8bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84e2c8bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84e2c8bf

Branch: refs/heads/master
Commit: 84e2c8bfe41837baf2aeffa9741e4dbd14351981
Parents: 79cdb9b
Author: Michael Armbrust <mich...@databricks.com>
Authored: Wed Sep 10 20:57:38 2014 -0700
Committer: Michael Armbrust <mich...@databricks.com>
Committed: Wed Sep 10 20:57:38 2014 -0700

----------------------------------------------------------------------
 .../org/apache/spark/sql/hive/TestHive.scala    | 69 +++++++++++++++++++-
 ...AvroSerDe-0-e4501461c855cc9071a872a64186c3de |  8 +++
 .../sql/hive/execution/HiveSerDeSuite.scala     |  2 +
 3 files changed, 78 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/84e2c8bf/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
index a013f3f..6974f3e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
@@ -269,7 +269,74 @@ class TestHiveContext(sc: SparkContext) extends 
HiveContext(sc) {
          |)
        """.stripMargin.cmd,
       s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/episodes.avro")}' 
INTO TABLE episodes".cmd
-    )
+    ),
+    // THIS TABLE IS NOT THE SAME AS THE HIVE TEST TABLE episodes_partitioned 
AS DYNAMIC PARTITIONING
+    // IS NOT YET SUPPORTED
+    TestTable("episodes_part",
+      s"""CREATE TABLE episodes_part (title STRING, air_date STRING, doctor 
INT)
+         |PARTITIONED BY (doctor_pt INT)
+         |ROW FORMAT SERDE '${classOf[AvroSerDe].getCanonicalName}'
+         |STORED AS
+         |INPUTFORMAT '${classOf[AvroContainerInputFormat].getCanonicalName}'
+         |OUTPUTFORMAT '${classOf[AvroContainerOutputFormat].getCanonicalName}'
+         |TBLPROPERTIES (
+         |  'avro.schema.literal'='{
+         |    "type": "record",
+         |    "name": "episodes",
+         |    "namespace": "testing.hive.avro.serde",
+         |    "fields": [
+         |      {
+         |          "name": "title",
+         |          "type": "string",
+         |          "doc": "episode title"
+         |      },
+         |      {
+         |          "name": "air_date",
+         |          "type": "string",
+         |          "doc": "initial date"
+         |      },
+         |      {
+         |          "name": "doctor",
+         |          "type": "int",
+         |          "doc": "main actor playing the Doctor in episode"
+         |      }
+         |    ]
+         |  }'
+         |)
+       """.stripMargin.cmd,
+      // WORKAROUND: Required to pass schema to SerDe for partitioned tables.
+      // TODO: Pass this automatically from the table to partitions.
+      s"""
+         |ALTER TABLE episodes_part SET SERDEPROPERTIES (
+         |  'avro.schema.literal'='{
+         |    "type": "record",
+         |    "name": "episodes",
+         |    "namespace": "testing.hive.avro.serde",
+         |    "fields": [
+         |      {
+         |          "name": "title",
+         |          "type": "string",
+         |          "doc": "episode title"
+         |      },
+         |      {
+         |          "name": "air_date",
+         |          "type": "string",
+         |          "doc": "initial date"
+         |      },
+         |      {
+         |          "name": "doctor",
+         |          "type": "int",
+         |          "doc": "main actor playing the Doctor in episode"
+         |      }
+         |    ]
+         |  }'
+         |)
+        """.stripMargin.cmd,
+      s"""
+        INSERT OVERWRITE TABLE episodes_part PARTITION (doctor_pt=1)
+        SELECT title, air_date, doctor FROM episodes
+      """.cmd
+      )
   )
 
   hiveQTestUtilTables.foreach(registerTestTable)

http://git-wip-us.apache.org/repos/asf/spark/blob/84e2c8bf/sql/hive/src/test/resources/golden/Read
 Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/Read Partitioned with 
AvroSerDe-0-e4501461c855cc9071a872a64186c3de 
b/sql/hive/src/test/resources/golden/Read Partitioned with 
AvroSerDe-0-e4501461c855cc9071a872a64186c3de
new file mode 100644
index 0000000..49c8434
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/Read Partitioned with 
AvroSerDe-0-e4501461c855cc9071a872a64186c3de     
@@ -0,0 +1,8 @@
+The Eleventh Hour      3 April 2010    11      1
+The Doctor's Wife      14 May 2011     11      1
+Horror of Fang Rock    3 September 1977        4       1
+An Unearthly Child     23 November 1963        1       1
+The Mysterious Planet  6 September 1986        6       1
+Rose   26 March 2005   9       1
+The Power of the Daleks        5 November 1966 2       1
+Castrolava     4 January 1982  5       1

http://git-wip-us.apache.org/repos/asf/spark/blob/84e2c8bf/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
index 8bc7238..7486bfa 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
@@ -37,4 +37,6 @@ class HiveSerDeSuite extends HiveComparisonTest with 
BeforeAndAfterAll {
   createQueryTest("Read with RegexSerDe", "SELECT * FROM sales")
 
   createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")
+
+  createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM 
episodes_part")
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

git commit: [SQL] Add test case with workaround for reading partitioned Avro files

Reply via email to