Github user liancheng commented on a diff in the pull request:

    https://github.com/apache/spark/pull/9652#discussion_r44945464
  
    --- Diff: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
 ---
    @@ -1393,4 +1401,58 @@ class JsonSuite extends QueryTest with 
SharedSQLContext with TestJsonData {
           )
         }
       }
    +
    +  test("SPARK-11544 test pathfilter") {
    +    def filterFunc(file: File): Boolean = {
    +      val name = file.getCanonicalFile.getName()
    +      val ret = !(name.startsWith(".")  || name.startsWith("_"))
    +      ret.asInstanceOf[Boolean]
    +    }
    +
    +    val clonedConf = new Configuration(hadoopConfiguration)
    +    try {
    +      val dir = Utils.createTempDir()
    +      dir.delete()
    +      val path = dir.getCanonicalPath
    +      val emps = Seq(
    +        Row(1, "Emp-1"),
    +        Row(2, "Emp-2")
    +      )
    +
    +      val empRows = sqlContext.sparkContext.parallelize(emps, 1)
    +      val empSchema = StructType(
    +        Seq(
    +          StructField("id", IntegerType, true),
    +          StructField("name", StringType, true)
    +        )
    +      )
    +      val empDF = sqlContext.createDataFrame(empRows, empSchema)
    +      empDF.write.json(path)
    +      val files = dir.listFiles().filter(filterFunc(_))
    +      files.map(_.renameTo(new File(path + File.separator + 
"pathmap.tmp")))
    +      FileUtils.copyFile(new File(path + File.separator + "pathmap.tmp"),
    +        new File(path + File.separator + "data.json"))
    +
    +      // Both the files should be read and count should be 4
    +      val empDF2 = sqlContext.read.json(path)
    +      empDF2.registerTempTable("jsonTable1")
    +      checkAnswer(
    +        sql("select count(*) from jsonTable1"),
    +        Row(4)
    +      )
    +      // After applying the path filter, only one file should be read and 
the count should be 2
    +      
sqlContext.sparkContext.hadoopConfiguration.setClass("mapreduce.input.pathFilter.class",
    +        classOf[TmpFileFilter], classOf[PathFilter])
    +      val empDF3 = sqlContext.read.json(path)
    +      empDF3.registerTempTable("jsonTable2")
    +      checkAnswer(
    +        sql("select count(*) from jsonTable2"),
    +        Row(2)
    +      )
    +    } finally {
    +      // Hadoop 1 doesn't have `Configuration.unset`
    +      hadoopConfiguration.clear()
    +      clonedConf.asScala.foreach(entry => 
hadoopConfiguration.set(entry.getKey, entry.getValue))
    +    }
    +  }
    --- End diff --
    
    Some suggestions for this test case:
    
    1. Use a testing path filter that is easier to verify
    2. The contents of the testing DataFrame are irrelevant, so make it as 
simple as possible
    3. You may make this test suite extend `SQLTestUtils`, which contains 
a helper method named `withTempPath`, so that you don't need to create the 
temporary path manually.
    
    Basically, you can end up with a much simpler test case like this:
    
    ```scala
    class TestFileFilter extends PathFilter {
      override def accept(path: Path): Boolean = path.getParent.getName != "p=2"
    }
    
    // ...
    
    withTempPath { dir =>
      val path = dir.getCanonicalPath
    
      val df = sqlContext.range(2)
      df.write.json(path + "/p=1")
      df.write.json(path + "/p=2")
      assert(sqlContext.read.json(path).count() === 4)
    
      val clonedConf = new Configuration(hadoopConfiguration)
      try {
        hadoopConfiguration.setClass(
          "mapreduce.input.pathFilter.class",
          classOf[TestFileFilter],
          classOf[PathFilter])
        assert(sqlContext.read.json(path).count() === 2)
      } finally {
        // Hadoop 1 doesn't have `Configuration.unset`
        hadoopConfiguration.clear()
        clonedConf.asScala.foreach(entry => 
hadoopConfiguration.set(entry.getKey, entry.getValue)) 
      }
    }
    ```
    
    (I didn't compile or run the above code, but you get the idea.)


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to