abhishekmjain commented on code in PR #4121: URL: https://github.com/apache/gobblin/pull/4121#discussion_r2230606590
########## gobblin-runtime/src/main/java/org/apache/gobblin/runtime/ErrorClassifier.java: ########## @@ -0,0 +1,256 @@ +package org.apache.gobblin.runtime; + +import java.io.IOException; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import javax.inject.Inject; +import lombok.extern.slf4j.Slf4j; + +import org.apache.gobblin.configuration.Category; +import org.apache.gobblin.configuration.ErrorPatternProfile; +import org.apache.gobblin.metastore.ErrorPatternStore; +import org.apache.gobblin.runtime.troubleshooter.Issue; +import org.apache.gobblin.runtime.troubleshooter.IssueSeverity; +import org.apache.gobblin.service.ServiceConfigKeys; +import org.apache.gobblin.util.ConfigUtils; + +import com.typesafe.config.Config; + + +/** + * Classifies issues by matching their summary description to error patterns and categories. + * Categorisation is based on regex patterns and their associated categories. + * Each category has an associated priority value. + */ +@Slf4j +public class ErrorClassifier { + private final List<CompiledErrorPattern> errorIssues; + private final Map<String, Category> categoryMap; + private ErrorPatternStore errorStore = null; + + private final int maxErrorsInFinalError; + private static final String DEFAULT_CODE = "T0000"; + private Category defaultCategory = null; + + /** + * Loads all error issues and categories from the store into memory. 
+ */ + @Inject + public ErrorClassifier(ErrorPatternStore store, Config config) + throws IOException { + this.errorStore = store; + + this.maxErrorsInFinalError = + ConfigUtils.getInt(config, ServiceConfigKeys.ERROR_CLASSIFICATION_MAX_ERRORS_IN_FINAL_KEY, + ServiceConfigKeys.DEFAULT_ERROR_CLASSIFICATION_MAX_ERRORS_IN_FINAL); + + //Obtaining Categories must be done before getting ErrorIssues, as it is used in ordering ErrorIssues by category priority. + this.categoryMap = new HashMap<>(); + for (Category cat : this.errorStore.getAllErrorCategories()) { + categoryMap.put(cat.getCategoryName(), cat); + } + + this.errorIssues = new ArrayList<>(); + for (ErrorPatternProfile issue : this.errorStore.getAllErrorIssuesOrderedByCategoryPriority()) { + errorIssues.add(new CompiledErrorPattern(issue)); + } + + List<String> regexList = new ArrayList<>(); + for (CompiledErrorPattern pei : errorIssues) { + regexList.add(pei.issue.getDescriptionRegex()); + } + + this.defaultCategory = this.errorStore.getDefaultCategory(); + } + + /** + * Returns the highest priority Category matching the given summary, or defaultCategory if initialised, or null if none match. + */ + public Category classify(String summary) { + if (summary == null) { + return null; + } + Category highest = null; + for (CompiledErrorPattern pei : errorIssues) { + if (pei.matches(summary)) { + Category cat = categoryMap.get(pei.getCategoryName()); + if (cat == null) { + continue; + } + if (highest == null || cat.getPriority() < highest.getPriority()) { + highest = cat; + } + } + } + if (highest == null) { + return defaultCategory != null ? defaultCategory : null; + } + return highest; + } + + /** + * Classifies a list of issues and returns the highest priority category with its matched issues. + * If no issues match, returns null. + * If defaultCategory is set, it will be used for unmatched issues. 
+ */ + public Issue classifyEarlyStopWithDefault(List<Issue> issues) { + if (issues == null || issues.isEmpty()) { + return null; + } + + ClassificationResult result = performClassification(issues); + + if (result.highestCategoryName == null) { + return null; + } + + return buildFinalIssue(result.highestCategoryName, result.categoryToIssues); + } + + private ClassificationResult performClassification(List<Issue> issues) { + ClassificationResult result = new ClassificationResult(); + + for (Issue issue : issues) { + classifySingleIssue(issue, result); + } + + applyDefaultCategoryIfNeeded(result); + return result; + } + + private void classifySingleIssue(Issue issue, ClassificationResult result) { + Category matchedCategory = findBestMatchingCategory(issue, result.highestPriority); + + if (matchedCategory != null) { + addMatchedIssue(issue, matchedCategory, result); + } else { + addUnmatchedIssue(issue, result); + } + } + + private Category findBestMatchingCategory(Issue issue, Integer currentHighestPriority) { + for (CompiledErrorPattern pei : errorIssues) { + Category cat = categoryMap.get(pei.getCategoryName()); + if (cat == null) { + continue; + } + + // Early stop optimization - skip categories with lower priority + if (currentHighestPriority != null && cat.getPriority() > currentHighestPriority) { + break; + } + + if (pei.matches(issue.getSummary())) { + return cat; + } + } + return null; + } + + private void addMatchedIssue(Issue issue, Category category, ClassificationResult result) { + result.categoryToIssues.computeIfAbsent(category.getCategoryName(), k -> new ArrayList<>()).add(issue); + + updateHighestPriorityIfNeeded(category, result); + } + + private void addUnmatchedIssue(Issue issue, ClassificationResult result) { + result.unmatched.add(issue); + + // Initialize default priority only once when we encounter the first unmatched issue + if (result.defaultPriority == null && defaultCategory != null) { + result.defaultPriority = 
defaultCategory.getPriority(); + // Only update highest priority if no category has been matched yet OR if default category has higher priority (lower number) + if (result.highestPriority == null || defaultCategory.getPriority() < result.highestPriority) { + result.highestPriority = result.defaultPriority; + result.highestCategoryName = defaultCategory.getCategoryName(); + } + } + } + + private void updateHighestPriorityIfNeeded(Category category, ClassificationResult result) { + if (result.highestPriority == null || category.getPriority() < result.highestPriority) { + result.highestPriority = category.getPriority(); + result.highestCategoryName = category.getCategoryName(); + } + } + + private void applyDefaultCategoryIfNeeded(ClassificationResult result) { + boolean shouldUseDefault = result.highestPriority != null && result.highestPriority.equals(result.defaultPriority) + && !result.unmatched.isEmpty() && defaultCategory != null; + + if (shouldUseDefault) { + result.highestCategoryName = defaultCategory.getCategoryName(); + + for (Issue issue : result.unmatched) { + result.categoryToIssues.computeIfAbsent(defaultCategory.getCategoryName(), k -> new ArrayList<>()).add(issue); + } + } + } + + private Issue buildFinalIssue(String categoryName, Map<String, List<Issue>> categoryToIssues) { + List<Issue> matchedIssues = categoryToIssues.get(categoryName); + String details = buildDetailsString(matchedIssues); + + return Issue.builder().summary("Category: " + categoryName).details(details).severity(IssueSeverity.ERROR) + .time(ZonedDateTime.now()).code(DEFAULT_CODE).sourceClass(null).exceptionClass(null).properties(null).build(); + } + + private String buildDetailsString(List<Issue> issues) { + List<String> summaries = new ArrayList<>(); + int limit = Math.min(maxErrorsInFinalError, issues.size()); + + for (int i = 0; i < limit; i++) { + summaries.add(issues.get(i).getSummary()); + } + + return String.join(" || ", summaries); + } + + /** + * Helper class that stores the 
result of issue classification, including matched categories, unmatched issues, and priority information. + */ + private static class ClassificationResult { + Map<String, List<Issue>> categoryToIssues = new HashMap<>(); + List<Issue> unmatched = new ArrayList<>(); + Integer highestPriority = null; Review Comment: Since we allowed defaultCategory to be null, this is no longer applicable. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@gobblin.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org