Hi, I am trying to filter a large table with 3 columns. Spark SQL might be a
good choice, but I want to do it without SQL. The goal is to filter bigtable
against multiple clauses. I filtered bigtable 3 times: the first filtering
took about 50 seconds, but the second and third filter transformations took
only about 5 seconds each. I wonder if this is because of lazy evaluation,
but I had already evaluated my RDD by parsing it when I first read it with
sc.textFile and then counting it (roughly as sketched below).
Running times:
t1 => 50 seconds
t2 => 5 seconds
t3 => 4 seconds
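
For reference, bigtable was built roughly like this before any of the filters
ran; the parsing and grouping details are simplified, and parseTriple and the
input path are placeholders for my actual code:

    // Read the raw triples, parse each line, group them by subject, and
    // materialize the result with cache() + count() so the parsing work is
    // not repeated by the later filter jobs.
    // parseTriple is a placeholder that turns one line into
    // (subject, (predicate, object)).
    val bigtable = sc.textFile("hdfs:///path/to/triples")
      .map(parseTriple)
      .groupByKey()
      .mapValues(_.toArray)
      .cache()
    bigtable.count()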

    import org.apache.spark.broadcast.Broadcast

    // Triple patterns to match: (predicate, object) pairs; terms starting
    // with "?" are variables and match anything on that side.
    val clause = List(
      ("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>", "<www.ssu.ac.kr#GraduateStudent>"),
      ("<www.ssu.ac.kr#memberOf>", "?Z"),
      ("<www.ssu.ac.kr#undergraduateDegreeFrom>", "?Y")
    )

    val bcastedSubj: Broadcast[String] = sc.broadcast("?X")
    val bcastedCls: Broadcast[List[(String, String)]] = sc.broadcast(clause)

    val t0 = System.currentTimeMillis()

    // Keep only the (subject, group) pairs whose group satisfies every clause.
    val subgraph1 = bigtable.mapPartitions(
      iterator => {
        val bcls = bcastedCls.value
        val bsubj = bcastedSubj.value
        val n = bcls.length   // number of clauses; a local val, not a driver-side var

        iterator.filter { case (s, grp) =>
          // A constant subject must match exactly; a "?" subject matches any subject.
          if (!bsubj.startsWith("?") && !bsubj.equals(s)) false
          else {
            val m = grp.length
            var flag1 = true
            var k = 0
            while (k < n) {
              // flag2: does some (predicate, object) in this group satisfy clause k?
              var flag2 = false
              var l = 0
              while (l < m) {
                if (grp(l)._1.equals(bcls(k)._1) && grp(l)._2.equals(bcls(k)._2)) flag2 = true
                else if (bcls(k)._1.startsWith("?") && grp(l)._2.equals(bcls(k)._2)) flag2 = true
                else if (bcls(k)._2.startsWith("?") && grp(l)._1.equals(bcls(k)._1)) flag2 = true
                l += 1
              }
              if (!flag2) flag1 = false
              k += 1
            }
            flag1
          }
        }
      }, preservesPartitioning = true).cache()
    val num1 = subgraph1.count()
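
For completeness, this is roughly how t1 was obtained; the second and third
filters that produce t2 and t3 repeat the same mapPartitions + count pattern,
each with its own broadcast clause list (not shown here):

    // Elapsed time for the first filter + count (this is the ~50 second step).
    val t1 = (System.currentTimeMillis() - t0) / 1000.0
    println(s"subgraph1: $num1 rows in $t1 s")

    // subgraph2 and subgraph3 are built the same way with other clause lists,
    // and counting them takes only about 5 and 4 seconds respectively.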


