[ https://issues.apache.org/jira/browse/GOBBLIN-2211?focusedWorklogId=976234&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-976234 ]
ASF GitHub Bot logged work on GOBBLIN-2211: ------------------------------------------- Author: ASF GitHub Bot Created on: 25/Jul/25 09:19 Start Date: 25/Jul/25 09:19 Worklog Time Spent: 10m Work Description: abhishekmjain commented on code in PR #4121: URL: https://github.com/apache/gobblin/pull/4121#discussion_r2230606590 ########## gobblin-runtime/src/main/java/org/apache/gobblin/runtime/ErrorClassifier.java: ########## @@ -0,0 +1,256 @@ +package org.apache.gobblin.runtime; + +import java.io.IOException; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import javax.inject.Inject; +import lombok.extern.slf4j.Slf4j; + +import org.apache.gobblin.configuration.Category; +import org.apache.gobblin.configuration.ErrorPatternProfile; +import org.apache.gobblin.metastore.ErrorPatternStore; +import org.apache.gobblin.runtime.troubleshooter.Issue; +import org.apache.gobblin.runtime.troubleshooter.IssueSeverity; +import org.apache.gobblin.service.ServiceConfigKeys; +import org.apache.gobblin.util.ConfigUtils; + +import com.typesafe.config.Config; + + +/** + * Classifies issues by matching their summary description to error patterns and categories. + * Categorisation is based on regex patterns and their associated categories. + * Each category has an associated priority value. + */ +@Slf4j +public class ErrorClassifier { + private final List<CompiledErrorPattern> errorIssues; + private final Map<String, Category> categoryMap; + private ErrorPatternStore errorStore = null; + + private final int maxErrorsInFinalError; + private static final String DEFAULT_CODE = "T0000"; + private Category defaultCategory = null; + + /** + * Loads all error issues and categories from the store into memory. + */ + @Inject + public ErrorClassifier(ErrorPatternStore store, Config config) + throws IOException { + this.errorStore = store; + + this.maxErrorsInFinalError = + ConfigUtils.getInt(config, ServiceConfigKeys.ERROR_CLASSIFICATION_MAX_ERRORS_IN_FINAL_KEY, + ServiceConfigKeys.DEFAULT_ERROR_CLASSIFICATION_MAX_ERRORS_IN_FINAL); + + //Obtaining Categories must be done before getting ErrorIssues, as it is used in ordering ErrorIssues by category priority. + this.categoryMap = new HashMap<>(); + for (Category cat : this.errorStore.getAllErrorCategories()) { + categoryMap.put(cat.getCategoryName(), cat); + } + + this.errorIssues = new ArrayList<>(); + for (ErrorPatternProfile issue : this.errorStore.getAllErrorIssuesOrderedByCategoryPriority()) { + errorIssues.add(new CompiledErrorPattern(issue)); + } + + List<String> regexList = new ArrayList<>(); + for (CompiledErrorPattern pei : errorIssues) { + regexList.add(pei.issue.getDescriptionRegex()); + } + + this.defaultCategory = this.errorStore.getDefaultCategory(); + } + + /** + * Returns the highest priority Category matching the given summary, or defaultCategory if initialised, or null if none match. + */ + public Category classify(String summary) { + if (summary == null) { + return null; + } + Category highest = null; + for (CompiledErrorPattern pei : errorIssues) { + if (pei.matches(summary)) { + Category cat = categoryMap.get(pei.getCategoryName()); + if (cat == null) { + continue; + } + if (highest == null || cat.getPriority() < highest.getPriority()) { + highest = cat; + } + } + } + if (highest == null) { + return defaultCategory != null ? defaultCategory : null; + } + return highest; + } + + /** + * Classifies a list of issues and returns the highest priority category with its matched issues. + * If no issues match, returns null. + * If defaultCategory is set, it will be used for unmatched issues. + */ + public Issue classifyEarlyStopWithDefault(List<Issue> issues) { + if (issues == null || issues.isEmpty()) { + return null; + } + + ClassificationResult result = performClassification(issues); + + if (result.highestCategoryName == null) { + return null; + } + + return buildFinalIssue(result.highestCategoryName, result.categoryToIssues); + } + + private ClassificationResult performClassification(List<Issue> issues) { + ClassificationResult result = new ClassificationResult(); + + for (Issue issue : issues) { + classifySingleIssue(issue, result); + } + + applyDefaultCategoryIfNeeded(result); + return result; + } + + private void classifySingleIssue(Issue issue, ClassificationResult result) { + Category matchedCategory = findBestMatchingCategory(issue, result.highestPriority); + + if (matchedCategory != null) { + addMatchedIssue(issue, matchedCategory, result); + } else { + addUnmatchedIssue(issue, result); + } + } + + private Category findBestMatchingCategory(Issue issue, Integer currentHighestPriority) { + for (CompiledErrorPattern pei : errorIssues) { + Category cat = categoryMap.get(pei.getCategoryName()); + if (cat == null) { + continue; + } + + // Early stop optimization - skip categories with lower priority + if (currentHighestPriority != null && cat.getPriority() > currentHighestPriority) { + break; + } + + if (pei.matches(issue.getSummary())) { + return cat; + } + } + return null; + } + + private void addMatchedIssue(Issue issue, Category category, ClassificationResult result) { + result.categoryToIssues.computeIfAbsent(category.getCategoryName(), k -> new ArrayList<>()).add(issue); + + updateHighestPriorityIfNeeded(category, result); + } + + private void addUnmatchedIssue(Issue issue, ClassificationResult result) { + result.unmatched.add(issue); + + // Initialize default priority only once when we encounter the first unmatched issue + if (result.defaultPriority == null && defaultCategory != null) { + result.defaultPriority = defaultCategory.getPriority(); + // Only update highest priority if no category has been matched yet OR if default category has higher priority (lower number) + if (result.highestPriority == null || defaultCategory.getPriority() < result.highestPriority) { + result.highestPriority = result.defaultPriority; + result.highestCategoryName = defaultCategory.getCategoryName(); + } + } + } + + private void updateHighestPriorityIfNeeded(Category category, ClassificationResult result) { + if (result.highestPriority == null || category.getPriority() < result.highestPriority) { + result.highestPriority = category.getPriority(); + result.highestCategoryName = category.getCategoryName(); + } + } + + private void applyDefaultCategoryIfNeeded(ClassificationResult result) { + boolean shouldUseDefault = result.highestPriority != null && result.highestPriority.equals(result.defaultPriority) + && !result.unmatched.isEmpty() && defaultCategory != null; + + if (shouldUseDefault) { + result.highestCategoryName = defaultCategory.getCategoryName(); + + for (Issue issue : result.unmatched) { + result.categoryToIssues.computeIfAbsent(defaultCategory.getCategoryName(), k -> new ArrayList<>()).add(issue); + } + } + } + + private Issue buildFinalIssue(String categoryName, Map<String, List<Issue>> categoryToIssues) { + List<Issue> matchedIssues = categoryToIssues.get(categoryName); + String details = buildDetailsString(matchedIssues); + + return Issue.builder().summary("Category: " + categoryName).details(details).severity(IssueSeverity.ERROR) + .time(ZonedDateTime.now()).code(DEFAULT_CODE).sourceClass(null).exceptionClass(null).properties(null).build(); + } + + private String buildDetailsString(List<Issue> issues) { + List<String> summaries = new ArrayList<>(); + int limit = Math.min(maxErrorsInFinalError, issues.size()); + + for (int i = 0; i < limit; i++) { + summaries.add(issues.get(i).getSummary()); + } + + return String.join(" || ", summaries); + } + + /** + * Helper class that stores the result of issue classification, including matched categories, unmatched issues, and priority information. + */ + private static class ClassificationResult { + Map<String, List<Issue>> categoryToIssues = new HashMap<>(); + List<Issue> unmatched = new ArrayList<>(); + Integer highestPriority = null; Review Comment: Since we allowed defaultCategory to be null, this is no longer applicable Issue Time Tracking ------------------- Worklog Id: (was: 976234) Time Spent: 5.5h (was: 5h 20m) > Implement Error Classification based on execution issues > -------------------------------------------------------- > > Key: GOBBLIN-2211 > URL: https://issues.apache.org/jira/browse/GOBBLIN-2211 > Project: Apache Gobblin > Issue Type: Bug > Components: gobblin-service > Reporter: Abhishek Jain > Assignee: Abhishek Tiwari > Priority: Major > Time Spent: 5.5h > Remaining Estimate: 0h > > Implement Error Classification to categorize the failure reason based on > issues encountered. -- This message was sent by Atlassian Jira (v8.20.10#820010)