Github user liancheng commented on a diff in the pull request:

    https://github.com/apache/spark/pull/5339#discussion_r27711275
  
    --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala ---
    @@ -390,6 +392,116 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
     
         sql("DROP TABLE ms_convert")
       }
    +
    +  test("Caching converted data source Parquet Relations") {
    +    def checkCached(tableIdentifier: catalog.QualifiedTableName): Unit = {
    +      // Converted test_parquet should be cached.
    +      catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match {
    +        case null => fail("Converted test_parquet should be cached.")
    +        case logical @ LogicalRelation(parquetRelation: ParquetRelation2) => // OK
    +        case other =>
    +          fail(
    +            "The cached test_parquet should be a Parquet Relation. " +
    +              s"However, $other is returned from the cache.")
    +      }
    +    }
    +
    +    sql("DROP TABLE IF EXISTS test_insert_parquet")
    +    sql("DROP TABLE IF EXISTS test_parquet_partitioned_cache_test")
    +
    +    sql(
    +      """
    +        |create table test_insert_parquet
    +        |(
    +        |  intField INT,
    +        |  stringField STRING
    +        |)
    +        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    +        |STORED AS
    +        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    +        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    +      """.stripMargin)
    +
    +    var tableIdentifier = catalog.QualifiedTableName("default", "test_insert_parquet")
    +
    +    // First, make sure the converted test_parquet is not cached.
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
    +    // Table lookup will make the table cached.
    +    table("test_insert_parquet")
    +    checkCached(tableIdentifier)
    +    // For an insert into a non-partitioned table, we will do the conversion,
    +    // so the converted test_insert_parquet should be cached.
    +    invalidateTable("test_insert_parquet")
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
    +    sql(
    +      """
    +        |INSERT INTO TABLE test_insert_parquet
    +        |select a, b from jt
    +      """.stripMargin)
    +    checkCached(tableIdentifier)
    +    // Make sure we can read the data.
    +    checkAnswer(
    +      sql("select * from test_insert_parquet"),
    +      sql("select a, b from jt").collect())
    +    // Invalidate the cache.
    +    invalidateTable("test_insert_parquet")
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
    +
    +    // Create a partitioned table.
    +    sql(
    +      """
    +        |create table test_parquet_partitioned_cache_test
    +        |(
    +        |  intField INT,
    +        |  stringField STRING
    +        |)
    +        |PARTITIONED BY (date string)
    +        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    +        |STORED AS
    +        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    +        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    +      """.stripMargin)
    +
    +    tableIdentifier = catalog.QualifiedTableName("default", "test_parquet_partitioned_cache_test")
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
    +    sql(
    +      """
    +        |INSERT INTO TABLE test_parquet_partitioned_cache_test
    +        |PARTITION (date='2015-04-01')
    +        |select a, b from jt
    +      """.stripMargin)
    +    // Right now, insert into a partitioned Parquet table is not supported by the
    +    // data source API, so we expect the table not to be cached.
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
    +    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
    +    sql(
    +      """
    +        |INSERT INTO TABLE test_parquet_partitioned_cache_test
    +        |PARTITION (date='2015-04-02')
    +        |select a, b from jt
    +      """.stripMargin)
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
    +    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
    --- End diff --
    
    This should be unnecessary since we are in the `ParquetDataSourceOnMetastoreSuite`.
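    
    For context on why: suites like `ParquetDataSourceOnMetastoreSuite` normally pin this flag once for the whole suite and restore it afterwards, which is what makes a per-test reset redundant. A minimal sketch of that pattern (the exact body here is an assumption, not copied from this PR):
    
        // Hypothetical sketch: the suite is assumed to pin the data source API
        // flag around all of its tests, so individual tests need not restore it.
        class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
          private var originalConf: String = _
    
          override def beforeAll(): Unit = {
            super.beforeAll()
            // Remember the caller's setting, then force the data source API on.
            originalConf = conf.getConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
            conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
          }
    
          override def afterAll(): Unit = {
            // Restore whatever the flag was before the suite ran.
            conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
            super.afterAll()
          }
        }
    
    Relying on the suite-level restore is also safer: a reset at the end of the test body never runs if an earlier assertion fails, which would leave the flag flipped for subsequent tests.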

