yuqi1129 commented on code in PR #3933:
URL: https://github.com/apache/gravitino/pull/3933#discussion_r1699281151


##########
spark-connector/spark-common/src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/SparkSQLRegressionTest.java:
##########
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import org.apache.logging.log4j.util.Strings;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInstance;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The entrypoint to run SparkSQL regression test, you could run it like: 
./gradlew
+ * :spark-connector:spark-3.4:test --tests
+ * 
"com.datastrato.gravitino.spark.connector.integration.test.sql.SparkSQLRegressionTest"
 or specify
+ * a config file explicitly: ./gradlew :spark-connector:spark-3.4:test --tests
+ * 
"com.datastrato.gravitino.spark.connector.integration.test.sql.SparkSQLRegressionTest"
+ * -PconfigFile=/xxx/xx
+ */
+@Tag("gravitino-docker-it")

Review Comment:
   gravitino-docker-test



##########
spark-connector/spark-common/src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/SparkTestConfig.java:
##########
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.gravitino.Config;
+import org.apache.gravitino.config.ConfigBuilder;
+import org.apache.gravitino.config.ConfigEntry;
+import org.junit.platform.commons.util.StringUtils;
+
+public class SparkTestConfig extends Config {
+  private static final String DEFAULT_BASE_DIR =
+      Paths.get(
+              System.getenv("GRAVITINO_ROOT_DIR"),
+              "spark-connector",
+              "spark-common",
+              "src",
+              "test",
+              "resources")
+          .toString();
+
+  private static final ConfigEntry<String> TEST_BASE_DIR =
+      new ConfigBuilder("gravitino.test.sql.dir")

Review Comment:
   gravitino.test.sql.dir -> gravitino.spark.test.dir



##########
spark-connector/spark-common/src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/SparkQueryRunner.java:
##########
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+import com.google.common.collect.Maps;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.gravitino.Catalog;
+import org.apache.gravitino.client.GravitinoAdminClient;
+import org.apache.gravitino.client.GravitinoMetalake;
+import org.apache.gravitino.integration.test.container.ContainerSuite;
+import org.apache.gravitino.integration.test.container.HiveContainer;
+import org.apache.gravitino.integration.test.util.AbstractIT;
+import org.apache.gravitino.spark.connector.GravitinoSparkConfig;
+import org.apache.gravitino.spark.connector.iceberg.IcebergPropertiesConstants;
+import org.apache.gravitino.spark.connector.plugin.GravitinoSparkPlugin;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.Assertions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Run and check the correctness of the SparkSQLs */
+public class SparkQueryRunner {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(SparkSQLRegressionTest.class);
+  private static final String HIVE_CATALOG_NAME = "hive";
+  private static final String ICEBERG_CATALOG_NAME = "iceberg";
+
+  private SparkSession sparkSession;
+  private String gravitinoUri;
+  private String metalakeName;
+  private String warehouse;
+  private boolean regenerateGoldenFiles;

Review Comment:
   what are `golden` files?



##########
spark-connector/spark-common/src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/SparkQueryRunner.java:
##########
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+import com.google.common.collect.Maps;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.gravitino.Catalog;
+import org.apache.gravitino.client.GravitinoAdminClient;
+import org.apache.gravitino.client.GravitinoMetalake;
+import org.apache.gravitino.integration.test.container.ContainerSuite;
+import org.apache.gravitino.integration.test.container.HiveContainer;
+import org.apache.gravitino.integration.test.util.AbstractIT;
+import org.apache.gravitino.spark.connector.GravitinoSparkConfig;
+import org.apache.gravitino.spark.connector.iceberg.IcebergPropertiesConstants;
+import org.apache.gravitino.spark.connector.plugin.GravitinoSparkPlugin;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.Assertions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Run and check the correctness of the SparkSQLs */
+public class SparkQueryRunner {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(SparkSQLRegressionTest.class);
+  private static final String HIVE_CATALOG_NAME = "hive";
+  private static final String ICEBERG_CATALOG_NAME = "iceberg";
+
+  private SparkSession sparkSession;
+  private String gravitinoUri;
+  private String metalakeName;
+  private String warehouse;
+  private boolean regenerateGoldenFiles;
+  // catalogType -> catalogName
+  private Map<CatalogType, String> catalogs = new HashMap<>();
+  private boolean isGravitinoEnvSetup;
+  private String dataDir;
+
+  private static final ContainerSuite containerSuite = 
ContainerSuite.getInstance();
+
+  public SparkQueryRunner(SparkTestConfig sparkTestConfig) {
+    this.regenerateGoldenFiles = sparkTestConfig.generateGoldenFiles();
+    this.isGravitinoEnvSetup = sparkTestConfig.isGravitinoEnvSetUp();
+    String baseDir = sparkTestConfig.getBaseDir();
+    // translate to file:///xx/xx
+    this.dataDir = Paths.get(baseDir, "data").toUri().toString();
+    this.metalakeName = sparkTestConfig.getMetalakeName();
+    if (isGravitinoEnvSetup) {
+      try {
+        setupGravitinoEnv();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    } else {
+      this.gravitinoUri = sparkTestConfig.getGravitinoUri();
+      this.warehouse = sparkTestConfig.getWarehouseLocation();
+    }
+    initSparkEnv();
+
+    catalogs.put(CatalogType.HIVE, HIVE_CATALOG_NAME);
+    catalogs.put(CatalogType.ICEBERG, ICEBERG_CATALOG_NAME);
+    catalogs.put(CatalogType.UNKNOWN, HIVE_CATALOG_NAME);
+  }
+
+  public void runQuery(TestCaseGroup sqlTestCaseGroup) throws IOException {
+    useCatalog(sqlTestCaseGroup.getCatalogType());
+    try {
+      if (sqlTestCaseGroup.prepareFile != null) {
+        runQuery(sqlTestCaseGroup.prepareFile);
+      }
+      sqlTestCaseGroup.testCases.forEach(
+          testCase -> {
+            try {
+              runTestCase(testCase);
+            } catch (IOException e) {
+              throw new RuntimeException(e);
+            }
+          });
+    } finally {
+      if (sqlTestCaseGroup.cleanupFile != null) {
+        runQuery(sqlTestCaseGroup.cleanupFile);
+      }
+    }
+  }
+
+  public void close() throws Exception {
+    if (sparkSession != null) {
+      sparkSession.close();
+      sparkSession = null;
+    }
+    if (isGravitinoEnvSetup) {
+      closeGravitinoEnv();
+    }
+  }
+
+  private void setupGravitinoEnv() throws Exception {
+    // start hive and hdfs
+    containerSuite.startHiveContainer();
+    String hiveMetastoreUri =
+        String.format(
+            "thrift://%s:%d",
+            containerSuite.getHiveContainer().getContainerIpAddress(),
+            HiveContainer.HIVE_METASTORE_PORT);
+    this.warehouse =
+        String.format(
+            "hdfs://%s:%d/user/hive/warehouse",
+            containerSuite.getHiveContainer().getContainerIpAddress(),
+            HiveContainer.HDFS_DEFAULTFS_PORT);
+
+    // start Gravitino server
+    AbstractIT.startIntegrationTest();
+    int gravitinoPort = AbstractIT.getGravitinoServerPort();
+    this.gravitinoUri = String.format("http://127.0.0.1:%d", gravitinoPort);
+
+    // init metalake and catalog
+    GravitinoAdminClient client = AbstractIT.getGravitinoClient();
+    client.createMetalake(metalakeName, "", Collections.emptyMap());
+    GravitinoMetalake metalake = client.loadMetalake(metalakeName);
+    metalake.createCatalog(
+        HIVE_CATALOG_NAME,
+        Catalog.Type.RELATIONAL,
+        "hive",
+        "",
+        getHiveCatalogConfigs(hiveMetastoreUri));
+    metalake.createCatalog(
+        ICEBERG_CATALOG_NAME,
+        Catalog.Type.RELATIONAL,
+        "lakehouse-iceberg",
+        "",
+        getIcebergCatalogConfigs(hiveMetastoreUri));
+
+    client.close();
+  }
+
+  private Map<String, String> getHiveCatalogConfigs(String hiveMetastoreUri) {
+    Map<String, String> catalogProperties = Maps.newHashMap();
+    catalogProperties.put(GravitinoSparkConfig.GRAVITINO_HIVE_METASTORE_URI, 
hiveMetastoreUri);
+    return catalogProperties;
+  }
+
+  protected Map<String, String> getIcebergCatalogConfigs(String 
hiveMetastoreUri) {
+    Map<String, String> catalogProperties = Maps.newHashMap();
+    catalogProperties.put(
+        IcebergPropertiesConstants.GRAVITINO_ICEBERG_CATALOG_BACKEND,
+        IcebergPropertiesConstants.ICEBERG_CATALOG_BACKEND_HIVE);
+    catalogProperties.put(
+        IcebergPropertiesConstants.GRAVITINO_ICEBERG_CATALOG_WAREHOUSE, 
warehouse);
+    catalogProperties.put(
+        IcebergPropertiesConstants.GRAVITINO_ICEBERG_CATALOG_URI, 
hiveMetastoreUri);
+    return catalogProperties;
+  }
+
+  private void closeGravitinoEnv() throws Exception {
+    AbstractIT.stopIntegrationTest();
+  }
+
+  private void writeQueryOutput(Path outputFile, List<QueryOutput> 
queryOutputs)
+      throws IOException {
+    String goldenOutput =
+        "-- Automatically generated by Gravitino Spark SQL test\n"
+            + 
queryOutputs.stream().map(QueryOutput::toString).collect(Collectors.joining("\n\n\n"))
+            + "\n";
+    stringToFile(outputFile, goldenOutput);
+  }
+
+  private List<String> getQueriesFromFile(Path file) throws IOException {
+    String input = fileToString(file);
+    Pair<List<String>, List<String>> pair = splitCommentsAndCodes(input);
+    List<String> queries = pair.getRight();
+    queries = splitWithSemicolon(queries);
+    queries = cleanAndFilterQueries(queries);
+    return queries;
+  }
+
+  private void runTestCase(TestCase testCase) throws IOException {
+    LOG.info("Run test case:{}", testCase.toString());
+    List<String> queries = getQueriesFromFile(testCase.getTestFile());
+    List<QueryOutput> queryOutputs = runTestQueries(queries, true);
+    if (regenerateGoldenFiles) {
+      writeQueryOutput(testCase.getTestOutputFile(), queryOutputs);
+    }
+    List<QueryOutput> expectedOutputs = 
getExpectedOutputs(testCase.getTestOutputFile());
+
+    Assertions.assertEquals(
+        expectedOutputs.size(), queryOutputs.size(), "Query size not match for 
test: " + testCase);
+
+    for (int i = 0; i < expectedOutputs.size(); i++) {
+      QueryOutput queryOutput = queryOutputs.get(i);
+      QueryOutput expectedOutput = expectedOutputs.get(i);
+      Assertions.assertEquals(
+          expectedOutput.getSql(), queryOutput.getSql(), "SQL not match for 
test: " + testCase);
+      Assertions.assertEquals(
+          expectedOutput.getSchema(),
+          queryOutput.getSchema(),
+          "SQL schema not match for test: " + testCase);
+      Assertions.assertEquals(
+          expectedOutput.getOutput(),
+          queryOutput.getOutput(),
+          "SQL output not match for test: " + testCase + ", sql: " + 
expectedOutput.getSql());
+    }
+  }
+
+  private List<QueryOutput> runTestQueries(List<String> queries, boolean 
catchException) {
+    SparkSession localSparkSession = sparkSession;
+    return queries.stream()
+        .map(
+            query -> {
+              Pair<String, List<String>> pair;
+              if (catchException) {
+                pair =
+                    SQLQueryTestHelper.handleExceptions(
+                        () -> 
SQLQueryTestHelper.getNormalizedResult(localSparkSession, query));
+              } else {
+                pair = 
SQLQueryTestHelper.getNormalizedResult(localSparkSession, query);
+              }
+              String schema = pair.getLeft();
+              String output =
+                  pair.getRight().stream()
+                      .collect(Collectors.joining("\n"))
+                      .replaceAll("\\s+$", "");
+              return new QueryOutput(query, schema, output);
+            })
+        .collect(Collectors.toList());
+  }
+
+  private List<QueryOutput> getExpectedOutputs(Path outputPath) throws 
IOException {
+    String goldenOutput = fileToString(outputPath);
+    String[] segments = goldenOutput.split("-- !query.*\\n");
+
+    List<QueryOutput> expectedQueryOutputs = new ArrayList<>();
+    // the first segment is comment, skip it
+    for (int i = 0; i * 3 + 3 < segments.length; i++) {
+      QueryOutput queryOutput =
+          new QueryOutput(
+              segments[i * 3 + 1].trim(),
+              segments[i * 3 + 2].trim(),
+              segments[i * 3 + 3].trim().replaceAll("\\s+$", ""));
+      expectedQueryOutputs.add(queryOutput);
+    }
+    return expectedQueryOutputs;
+  }
+
+  private static File stringToFile(Path path, String str) throws IOException {
+    File file = path.toFile();
+    try (PrintWriter out = new PrintWriter(file, 
StandardCharsets.UTF_8.toString())) {
+      out.write(str);
+    }
+    return file;
+  }
+
+  private String fileToString(Path filePath) throws IOException {
+    return new String(Files.readAllBytes(filePath), StandardCharsets.UTF_8);
+  }

Review Comment:
       FileUtils.writeStringToFile();
       FileUtils.readFileToString()
   
   



##########
spark-connector/spark-common/src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/CatalogType.java:
##########
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+public enum CatalogType {
+  HIVE,
+  ICEBERG,
+  UNKNOWN;
+
+  public static CatalogType fromString(String str) {
+    if (str == null) {
+      return UNKNOWN;
+    }
+    switch (str.toLowerCase()) {
+      case "hive":
+        return HIVE;
+      case "lakehouse-iceberg":
+        return ICEBERG;
+      default:
+        return UNKNOWN;
+    }

Review Comment:
   ```suggestion
   for (CatalogType type : CatalogType.values()) {
       if (type.name().equals(str.toUpperCase())) {
           //....
       }
   }
   ```



##########
spark-connector/spark-common/src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/CatalogType.java:
##########
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+public enum CatalogType {
+  HIVE,
+  ICEBERG,
+  UNKNOWN;
+
+  public static CatalogType fromString(String str) {
+    if (str == null) {
+      return UNKNOWN;
+    }
+    switch (str.toLowerCase()) {
+      case "hive":
+        return HIVE;
+      case "lakehouse-iceberg":
+        return ICEBERG;
+      default:
+        return UNKNOWN;
+    }
+  }
+
+  public static CatalogType merge(CatalogType parentCatalogType, CatalogType 
childCatalogType) {

Review Comment:
   What is the meaning of `merge`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to