abhishekmjain commented on code in PR #4121:
URL: https://github.com/apache/gobblin/pull/4121#discussion_r2223297454


##########
gobblin-runtime/src/main/java/org/apache/gobblin/runtime/troubleshooter/InMemoryMultiContextIssueRepository.java:
##########
@@ -46,6 +46,7 @@
 public class InMemoryMultiContextIssueRepository extends AbstractIdleService 
implements MultiContextIssueRepository {
   private final LRUMap<String, InMemoryIssueRepository> contextIssues;
   private final Configuration configuration;
+  private String _contextId;

Review Comment:
   Is this needed?



##########
gobblin-metastore/src/main/java/org/apache/gobblin/metastore/MysqlErrorPatternStore.java:
##########
@@ -279,64 +302,46 @@ public List<ErrorPatternProfile> 
getErrorPatternsByCategory(String categoryName)
   }
 
   @Override
-  public Category getDefaultCategory()
+  public ErrorCategory getDefaultCategory()
       throws IOException {
-    if (hasIsDefaultColumn()) {
-      Category cat = getDefaultCategoryFromIsDefault();
+    // 1. Try to use the configured default category name if set
+    if (configuredDefaultCategoryName != null && 
!configuredDefaultCategoryName.trim().isEmpty()) {
+      ErrorCategory cat = getErrorCategory(configuredDefaultCategoryName);
       if (cat != null) {
         return cat;
+      } else {
+        // Throw exception if configured category doesn't exist
+        throw new IOException(String.format(
+            "Configured default category '%s' not found in database",
+            configuredDefaultCategoryName));
       }
     }
-    // Fallback to previous logic: category with the highest priority (lowest 
priority number)
-    return getHighestPriorityCategory();
-  }
 
-  /**
-   * Returns the category with the highest priority, i.e. the lowest priority 
value (ascending order).
-   */
-  private Category getHighestPriorityCategory()
-      throws IOException {
-    try (Connection conn = dataSource.getConnection(); PreparedStatement ps = 
conn.prepareStatement(
-        String.format(GET_HIGHEST_ERROR_CATEGORY_STATEMENT, 
errorCategoriesTable))) {
-      try (ResultSet rs = ps.executeQuery()) {
-        if (rs.next()) {
-          return new Category(rs.getString(1), rs.getInt(2));
-        }
-      }
-    } catch (SQLException e) {
-      throw new IOException("Failed to get category", e);
+    // 2. No configuration provided - use lowest priority category as fallback
+    ErrorCategory lowestPriorityCategory = getLowestPriorityCategory();
+    if (lowestPriorityCategory == null) {
+      throw new IOException("No error categories found in database");
     }
-    return null;
-  }
 
-  /**
-   * Checks if the is_default column exists in the categories table.
-   */
-  private boolean hasIsDefaultColumn()
-      throws IOException {
-    try (Connection conn = dataSource.getConnection()) {
-      try (ResultSet rs = conn.getMetaData().getColumns(null, null, 
errorCategoriesTable, "is_default")) {
-        return rs.next();
-      }
-    } catch (SQLException e) {
-      throw new IOException("Failed to check for is_default column", e);
-    }
+    log.debug("No default category configured, using lowest priority category: 
{}",

Review Comment:
   `log.info`



##########
gobblin-metastore/src/main/java/org/apache/gobblin/metastore/MysqlErrorPatternStore.java:
##########
@@ -0,0 +1,364 @@
+package org.apache.gobblin.metastore;
+
+import org.apache.gobblin.broker.SharedResourcesBrokerFactory;
+import org.apache.gobblin.configuration.ConfigurationKeys;
+import org.apache.gobblin.configuration.ErrorPatternProfile;
+import org.apache.gobblin.configuration.Category;
+
+import com.typesafe.config.Config;
+
+import org.apache.gobblin.util.ConfigUtils;
+
+import javax.sql.DataSource;
+
+import javax.inject.Inject;
+import lombok.extern.slf4j.Slf4j;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+
+/**
+ * MySQL-backed implementation of IssueStore.
+ *
+ * Expected table schemas:
+ *
+ * 1. error_summary_regex_store
+ *    - description_regex: VARCHAR(255) NOT NULL UNIQUE
+ *    - error_category_name: VARCHAR(255) NOT NULL
+ *
+ * 2. error_categories
+ *    - error_category_name: VARCHAR(255) PRIMARY KEY
+ *    - priority: INT UNIQUE NOT NULL
+ *    - is_default: BOOLEAN (optional, not compulsory; used if present to 
indicate the default category)
+ *
+ * This class provides methods to primarily retrieve error regex patterns and 
error categories.
+ * There are also methods to add and delete, which should be used with 
caution, and retrieve error patterns and categories.
+ */
+@Slf4j
+public class MysqlErrorPatternStore implements ErrorPatternStore {
+  private final DataSource dataSource;
+  private final String errorRegexSummaryStoreTable;
+  private final String errorCategoriesTable;
+  public static final String CONFIG_PREFIX = "MysqlErrorPatternStore";
+
+  private static final int DEFAULT_MAX_CHARACTERS_IN_SQL_DESCRIPTION_REGEX = 
2000;
+  private static final int DEFAULT_MAX_CHARACTERS_IN_SQL_CATEGORY_NAME = 255;
+  private final int maxCharactersInSqlDescriptionRegex;
+  private final int maxCharactersInSqlCategoryName;
+
+  private static final String CREATE_ERROR_REGEX_SUMMARY_STORE_TABLE_STATEMENT 
=
+      "CREATE TABLE IF NOT EXISTS %s (" + "  description_regex VARCHAR(%d) NOT 
NULL UNIQUE, "
+          + "  error_category_name VARCHAR(%d) NOT NULL" + ")";
+
+  private static final String CREATE_ERROR_CATEGORIES_TABLE_NAME =
+      "CREATE TABLE IF NOT EXISTS %s (" + " error_category_name VARCHAR(%d) 
PRIMARY KEY, priority INT UNIQUE NOT NULL" + " )";
+
+  private static final String INSERT_ERROR_CATEGORY_STATEMENT = "INSERT INTO 
%s (error_category_name, priority) "
+      + "VALUES (?, ?) ON DUPLICATE KEY UPDATE priority=VALUES(priority)";
+
+  private static final String GET_ERROR_CATEGORY_STATEMENT =
+      "SELECT error_category_name, priority FROM %s WHERE error_category_name 
= ?";
+
+  private static final String GET_ALL_ERROR_CATEGORIES_STATEMENT =
+      "SELECT error_category_name, priority FROM %s ORDER BY priority ASC";
+
+  private static final String INSERT_ERROR_REGEX_SUMMARY_STATEMENT =
+      "INSERT INTO %s (description_regex, error_category_name) "
+          + "VALUES (?, ?) ON DUPLICATE KEY UPDATE 
error_category_name=VALUES(error_category_name)";
+
+  private static final String DELETE_ERROR_REGEX_SUMMARY_STATEMENT = "DELETE 
FROM %s WHERE description_regex = ?";
+
+  private static final String GET_ERROR_REGEX_SUMMARY_STATEMENT =
+      "SELECT description_regex, error_category_name FROM %s WHERE 
description_regex   = ?";
+
+  private static final String GET_ALL_ERROR_REGEX_SUMMARIES_STATEMENT =
+      "SELECT description_regex, error_category_name FROM %s";
+
+  private static final String GET_DEFAULT_CATEGORY_STATEMENT =
+      "SELECT error_category_name, priority FROM %s WHERE is_default = TRUE 
ORDER BY priority DESC";
+
+  private static final String GET_HIGHEST_ERROR_CATEGORY_STATEMENT =
+      "SELECT error_category_name, priority FROM %s ORDER BY priority ASC 
LIMIT 1";
+
+  private static final String 
GET_ALL_ERROR_ISSUES_ORDERED_BY_CATEGORY_PRIORITY_STATEMENT =
+      "SELECT e.description_regex, e.error_category_name FROM %s e "
+          + "JOIN %s c ON e.error_category_name = c.error_category_name "
+          + "ORDER BY c.priority ASC, e.description_regex ASC";

Review Comment:
   My bad, the selection wasn't highlighted. I meant the ordering for 
`e.description_regex` in particular. Ordering for `c.priority` is required.



##########
gobblin-runtime/src/main/java/org/apache/gobblin/runtime/troubleshooter/InMemoryMultiContextIssueRepository.java:
##########
@@ -60,6 +61,7 @@ public InMemoryMultiContextIssueRepository(Configuration 
configuration) {
   @Override
   public synchronized List<Issue> getAll(String contextId)
       throws TroubleshooterException {
+    _contextId = contextId;

Review Comment:
   this can be removed too



##########
gobblin-api/src/main/java/org/apache/gobblin/service/ServiceConfigKeys.java:
##########
@@ -167,4 +167,7 @@ public class ServiceConfigKeys {
   public static final int DEFAULT_ERROR_CLASSIFICATION_MAX_ERRORS_IN_FINAL = 
10;
   public static final String ERROR_CLASSIFICATION_MAX_ERRORS_TO_PROCESS_KEY = 
GOBBLIN_SERVICE_PREFIX + "errorClassification.maxErrorsToProcess";
   public static final int DEFAULT_ERROR_CLASSIFICATION_MAX_ERRORS_TO_PROCESS = 
1000;
+  public static final String ERROR_CLASSIFICATION_DEFAULT_PRIORITY_ENABLE_KEY 
= ServiceConfigKeys.GOBBLIN_SERVICE_PREFIX + 
"errorClassification.defaultPriorityValue";

Review Comment:
   rename to `ERROR_CLASSIFICATION_DEFAULT_PRIORITY_KEY`



##########
gobblin-metastore/src/main/java/org/apache/gobblin/metastore/MysqlErrorPatternStore.java:
##########
@@ -279,64 +302,46 @@ public List<ErrorPatternProfile> 
getErrorPatternsByCategory(String categoryName)
   }
 
   @Override
-  public Category getDefaultCategory()
+  public ErrorCategory getDefaultCategory()
       throws IOException {
-    if (hasIsDefaultColumn()) {
-      Category cat = getDefaultCategoryFromIsDefault();
+    // 1. Try to use the configured default category name if set
+    if (configuredDefaultCategoryName != null && 
!configuredDefaultCategoryName.trim().isEmpty()) {

Review Comment:
   nit: we can use `StringUtils.isNullOrEmpty()` here



##########
gobblin-docs/sources/QueryBasedSource.md:
##########
@@ -2,7 +2,7 @@
 
 # Introduction
 
[`QueryBasedSource`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/QueryBasedSource.java)
-represents a category of sources whose data is pulled by sending queries. A 
dataset of a source is identified as a
+represents a errorCategory of sources whose data is pulled by sending queries. 
A dataset of a source is identified as a

Review Comment:
   unintended change?



##########
gobblin-metastore/src/main/java/org/apache/gobblin/metastore/MysqlErrorPatternStore.java:
##########
@@ -279,64 +302,46 @@ public List<ErrorPatternProfile> 
getErrorPatternsByCategory(String categoryName)
   }
 
   @Override
-  public Category getDefaultCategory()
+  public ErrorCategory getDefaultCategory()
       throws IOException {
-    if (hasIsDefaultColumn()) {
-      Category cat = getDefaultCategoryFromIsDefault();
+    // 1. Try to use the configured default category name if set
+    if (configuredDefaultCategoryName != null && 
!configuredDefaultCategoryName.trim().isEmpty()) {
+      ErrorCategory cat = getErrorCategory(configuredDefaultCategoryName);
       if (cat != null) {
         return cat;
+      } else {
+        // Throw exception if configured category doesn't exist
+        throw new IOException(String.format(
+            "Configured default category '%s' not found in database",
+            configuredDefaultCategoryName));
       }
     }
-    // Fallback to previous logic: category with the highest priority (lowest 
priority number)
-    return getHighestPriorityCategory();
-  }
 
-  /**
-   * Returns the category with the highest priority, i.e. the lowest priority 
value (ascending order).
-   */
-  private Category getHighestPriorityCategory()
-      throws IOException {
-    try (Connection conn = dataSource.getConnection(); PreparedStatement ps = 
conn.prepareStatement(
-        String.format(GET_HIGHEST_ERROR_CATEGORY_STATEMENT, 
errorCategoriesTable))) {
-      try (ResultSet rs = ps.executeQuery()) {
-        if (rs.next()) {
-          return new Category(rs.getString(1), rs.getInt(2));
-        }
-      }
-    } catch (SQLException e) {
-      throw new IOException("Failed to get category", e);
+    // 2. No configuration provided - use lowest priority category as fallback
+    ErrorCategory lowestPriorityCategory = getLowestPriorityCategory();
+    if (lowestPriorityCategory == null) {
+      throw new IOException("No error categories found in database");
     }
-    return null;
-  }
 
-  /**
-   * Checks if the is_default column exists in the categories table.
-   */
-  private boolean hasIsDefaultColumn()
-      throws IOException {
-    try (Connection conn = dataSource.getConnection()) {
-      try (ResultSet rs = conn.getMetaData().getColumns(null, null, 
errorCategoriesTable, "is_default")) {
-        return rs.next();
-      }
-    } catch (SQLException e) {
-      throw new IOException("Failed to check for is_default column", e);
-    }
+    log.debug("No default category configured, using lowest priority category: 
{}",
+        lowestPriorityCategory.getCategoryName());
+    return lowestPriorityCategory;
   }
 
   /**
-   * Returns the default category using is_default column, or null if not 
found.
+   * Returns the category with the lowest priority, i.e. the highes priority 
value (descending order).

Review Comment:
   `highes` -> `highest`



##########
gobblin-runtime/src/main/java/org/apache/gobblin/runtime/ErrorClassifier.java:
##########
@@ -0,0 +1,256 @@
+package org.apache.gobblin.runtime;
+
+import java.io.IOException;
+import java.time.ZonedDateTime;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import javax.inject.Inject;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.gobblin.configuration.Category;
+import org.apache.gobblin.configuration.ErrorPatternProfile;
+import org.apache.gobblin.metastore.ErrorPatternStore;
+import org.apache.gobblin.runtime.troubleshooter.Issue;
+import org.apache.gobblin.runtime.troubleshooter.IssueSeverity;
+import org.apache.gobblin.service.ServiceConfigKeys;
+import org.apache.gobblin.util.ConfigUtils;
+
+import com.typesafe.config.Config;
+
+
+/**
+ * Classifies issues by matching their summary description to error patterns 
and categories.
+ * Categorisation is based on regex patterns and their associated categories.
+ * Each category has an associated priority value.
+ */
+@Slf4j
+public class ErrorClassifier {
+  private final List<CompiledErrorPattern> errorIssues;
+  private final Map<String, Category> categoryMap;
+  private ErrorPatternStore errorStore = null;
+
+  private final int maxErrorsInFinalError;
+  private static final String DEFAULT_CODE = "T0000";
+  private Category defaultCategory = null;
+
+  /**
+   * Loads all error issues and categories from the store into memory.
+   */
+  @Inject
+  public ErrorClassifier(ErrorPatternStore store, Config config)
+      throws IOException {
+    this.errorStore = store;
+
+    this.maxErrorsInFinalError =
+        ConfigUtils.getInt(config, 
ServiceConfigKeys.ERROR_CLASSIFICATION_MAX_ERRORS_IN_FINAL_KEY,
+            
ServiceConfigKeys.DEFAULT_ERROR_CLASSIFICATION_MAX_ERRORS_IN_FINAL);
+
+    //Obtaining Categories must be done before getting ErrorIssues, as it is 
used in ordering ErrorIssues by category priority.

Review Comment:
   Is it because of the categories map used 
[here](https://github.com/apache/gobblin/pull/4121/files/ff86626a8ab6e9d08201be9460e82bf21491caf8..01fc9a6be99b349c6dfbf0c5ab31424f7499c981#diff-a15d3c69e89634f073087cb9265d059fb22fd68713d91b1362b5d917a900f2baR169)?
   
   These two are two different category maps, so their objects would be 
independent. One is for the pattern store whereas the other is for error 
classifier, unless there is some sharing of these maps that I may have missed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@gobblin.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to