HeartSaVioR commented on a change in pull request #26416: [SPARK-29779][CORE] Compact old event log files and cleanup URL: https://github.com/apache/spark/pull/26416#discussion_r347165730
########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/history/SQLEventFilterBuilder.scala ########## @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.history + +import scala.collection.mutable + +import org.apache.spark.deploy.history.{EventFilter, EventFilterBuilder, JobEventFilter} +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler._ +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.ui._ +import org.apache.spark.sql.streaming.StreamingQueryListener + +/** + * This class tracks live SQL executions, and pass the list to the [[SQLLiveEntitiesEventFilter]] + * to help SQLLiveEntitiesEventFilter to filter out finished SQL executions as well as relevant + * jobs (+ stages/tasks/RDDs). Unlike BasicEventFilterBuilder, it doesn't concern about the status + * of individual job - it only concerns whether SQL execution is finished or not. 
+ */ +private[spark] class SQLEventFilterBuilder extends SparkListener with EventFilterBuilder { + private val _liveExecutionToJobs = new mutable.HashMap[Long, mutable.Set[Int]] + private val _jobToStages = new mutable.HashMap[Int, Seq[Int]] + private val _stageToTasks = new mutable.HashMap[Int, mutable.Set[Long]] + private val _stageToRDDs = new mutable.HashMap[Int, Seq[Int]] + private val stages = new mutable.HashSet[Int] + + def liveExecutionToJobs: Map[Long, Set[Int]] = _liveExecutionToJobs.mapValues(_.toSet).toMap + def jobToStages: Map[Int, Seq[Int]] = _jobToStages.toMap + def stageToTasks: Map[Int, Set[Long]] = _stageToTasks.mapValues(_.toSet).toMap + def stageToRDDs: Map[Int, Seq[Int]] = _stageToRDDs.toMap + + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + val executionIdString = jobStart.properties.getProperty(SQLExecution.EXECUTION_ID_KEY) + if (executionIdString == null) { Review comment: https://github.com/databricks/scala-style-guide#return-statements > Use return as a guard to simplify control flow without adding a level of indentation If I understand the style guide correctly, an early return, especially one placed near the top of the method, is not a bad pattern that needs to be changed. This is a matter of individual preference, so it should be checked against the style guide rather than forcing one's own preference. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
