zentol commented on a change in pull request #15311: URL: https://github.com/apache/flink/pull/15311#discussion_r603100189
########## File path: flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/exceptionhistory/ExceptionHistoryEntryExtractor.java ########## @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.scheduler.exceptionhistory; + +import org.apache.flink.runtime.executiongraph.ExecutionJobVertex; +import org.apache.flink.runtime.executiongraph.ExecutionVertex; +import org.apache.flink.runtime.jobgraph.JobVertexID; +import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID; +import org.apache.flink.runtime.taskmanager.TaskManagerLocation; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.SerializedThrowable; +import org.apache.flink.util.function.QuadFunction; + +import java.util.Map; + +/** + * {@code ExceptionHistoryEntryExtractor} extracts all the necessary information from given + * executions to create corresponding {@link RootExceptionHistoryEntry RootExceptionHistoryEntries}. + */ +public class ExceptionHistoryEntryExtractor { + + /** + * Extracts a {@link RootExceptionHistoryEntry} based on the passed local failure information. + * + * @param executionJobVertices The {@link ExecutionJobVertex} instances registry. 
+ * @param failedExecutionVertexId The {@link ExecutionVertexID} referring to the {@link + * ExecutionVertex} that is the root of the failure. + * @param otherAffectedVertices The {@code ExecutionVertexID}s of other affected {@code + * ExecutionVertices} that, if failed as well, would be added as concurrent failures. + * @return The {@code RootExceptionHistoryEntry}. + * @throws IllegalArgumentException if one of the passed {@code ExecutionVertexID}s cannot be + * resolved into an {@code ExecutionVertex}. + * @throws IllegalArgumentException if the {@code failedExecutionVertexID} refers to an {@code + * ExecutionVertex} that didn't fail. + */ + public RootExceptionHistoryEntry extractLocalFailure( + Map<JobVertexID, ExecutionJobVertex> executionJobVertices, + ExecutionVertexID failedExecutionVertexId, + Iterable<ExecutionVertexID> otherAffectedVertices) { + final ExecutionVertex rootCauseExecutionVertex = + getExecutionVertex(executionJobVertices, failedExecutionVertexId); + + final RootExceptionHistoryEntry root = + createLocalExceptionHistoryEntry( + RootExceptionHistoryEntry::new, rootCauseExecutionVertex); + + for (ExecutionVertexID otherExecutionVertexId : otherAffectedVertices) { + final ExecutionVertex executionVertex = + getExecutionVertex(executionJobVertices, otherExecutionVertexId); + if (executionVertex.getFailureInfo().isPresent()) { + root.add( Review comment: why are we mutating the entry instead of first collecting all concurrent failures? ########## File path: flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/exceptionhistory/ExceptionHistoryEntry.java ########## @@ -40,45 +39,20 @@ @Nullable private final String failingTaskName; @Nullable private final ArchivedTaskManagerLocation taskManagerLocation; - /** - * Creates a {@code ExceptionHistoryEntry} representing a global failure from the passed {@code - * Throwable} and timestamp. - * - * @param cause The reason for the failure. - * @param timestamp The time the failure was caught. 
- * @return The {@code ExceptionHistoryEntry} instance. - */ - public static ExceptionHistoryEntry fromGlobalFailure(Throwable cause, long timestamp) { - return new ExceptionHistoryEntry(cause, timestamp, null, null); - } - - /** - * Creates a {@code ExceptionHistoryEntry} representing a local failure using the passed - * information. - * - * @param execution The {@link AccessExecution} that caused the failure. - * @param failingTaskName The name of the task the {@code execution} is connected to. - * @return The {@code ExceptionHistoryEntry} instance. - */ - public static ExceptionHistoryEntry fromFailedExecution( - AccessExecution execution, String failingTaskName) { - ErrorInfo failureInfo = - execution - .getFailureInfo() - .orElseThrow( - () -> - new IllegalArgumentException( - "The passed Execution does not provide a failureCause.")); - return new ExceptionHistoryEntry( - failureInfo.getException(), - failureInfo.getTimestamp(), + @VisibleForTesting Review comment: this is incorrect ########## File path: flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/exceptionhistory/RootExceptionHistoryEntry.java ########## @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.runtime.scheduler.exceptionhistory; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.runtime.taskmanager.TaskManagerLocation; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.Collection; + +/** + * {@code RootExceptionHistoryEntry} extending {@link ExceptionHistoryEntry} by providing a list of + * {@code ExceptionHistoryEntry} instances to store concurrently caught failures. + */ +public class RootExceptionHistoryEntry extends ExceptionHistoryEntry { + + private static final long serialVersionUID = -7647332765867297434L; + + private final Collection<ExceptionHistoryEntry> concurrentExceptions = new ArrayList<>(); + + /** + * Creates a {@code ExceptionHistoryEntry} representing a global failure from the passed {@code + * Throwable} and timestamp. + * + * @param cause The reason for the failure. + * @param timestamp The time the failure was caught. + * @return The {@code ExceptionHistoryEntry} instance. + */ + @VisibleForTesting + public static RootExceptionHistoryEntry fromGlobalFailure(Throwable cause, long timestamp) { + return new RootExceptionHistoryEntry(cause, timestamp, null, null); + } + + @VisibleForTesting + public RootExceptionHistoryEntry( + Throwable cause, + long timestamp, + @Nullable String failingTaskName, + @Nullable TaskManagerLocation taskManagerLocation) { + super(cause, timestamp, failingTaskName, taskManagerLocation); + } + + public void add(ExceptionHistoryEntry concurrentException) { Review comment: ```suggestion void add(ExceptionHistoryEntry concurrentException) { ``` Ideally we get rid of it though. ########## File path: flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/exceptionhistory/RootExceptionHistoryEntry.java ########## @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.scheduler.exceptionhistory; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.runtime.taskmanager.TaskManagerLocation; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.Collection; + +/** + * {@code RootExceptionHistoryEntry} extending {@link ExceptionHistoryEntry} by providing a list of + * {@code ExceptionHistoryEntry} instances to store concurrently caught failures. + */ +public class RootExceptionHistoryEntry extends ExceptionHistoryEntry { + + private static final long serialVersionUID = -7647332765867297434L; + + private final Collection<ExceptionHistoryEntry> concurrentExceptions = new ArrayList<>(); + + /** + * Creates a {@code ExceptionHistoryEntry} representing a global failure from the passed {@code + * Throwable} and timestamp. + * + * @param cause The reason for the failure. + * @param timestamp The time the failure was caught. + * @return The {@code ExceptionHistoryEntry} instance. + */ + @VisibleForTesting + public static RootExceptionHistoryEntry fromGlobalFailure(Throwable cause, long timestamp) { Review comment: it was surprising that there is no `fromLocalFailure` flavor, that would also enforce the task name / location to not be null. 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org
