Re: [PR] [#3760] improvement(client-python): Add Docker env and PyGVFS Integration tests [gravitino]

via GitHub Wed, 03 Jul 2024 02:58:10 -0700


jerryshao commented on code in PR #3876:
URL: https://github.com/apache/gravitino/pull/3876#discussion_r1663916270



##########
clients/client-python/tests/integration/integration_test_env.py:
##########
@@ -118,3 +120,79 @@ def tearDownClass(cls):
 
         if gravitino_server_running:
             logger.error("Can't stop Gravitino server!")
+
+    @classmethod
+    def restart_server(cls):
+        logger.info("Restarting Gravitino server...")
+        gravitino_home = os.environ.get("GRAVITINO_HOME")
+        gravitino_startup_script = os.path.join(gravitino_home, 
"bin/gravitino.sh")
+        if not os.path.exists(gravitino_startup_script):
+            raise GravitinoRuntimeException(
+                f"Can't find Gravitino startup script: 
{gravitino_startup_script}, "
+                "Please execute `./gradlew compileDistribution -x test` in the 
Gravitino "
+                "project root directory."
+            )
+
+        # Restart Gravitino Server
+        env_vars = os.environ.copy()
+        env_vars["HADOOP_USER_NAME"] = "datastrato"
+        result = subprocess.run(
+            [gravitino_startup_script, "restart"],
+            env=env_vars,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.stdout:
+            logger.info("stdout: %s", result.stdout)
+        if result.stderr:
+            logger.info("stderr: %s", result.stderr)
+
+        if not check_gravitino_server_status():
+            raise GravitinoRuntimeException("ERROR: Can't start Gravitino 
server!")
+
+    @classmethod
+    def _append_server_hadoop_conf(cls, config):
+        logger.info("Append server hadoop conf.")
+        gravitino_home = os.environ.get("GRAVITINO_HOME")
+        if gravitino_home is None:
+            raise GravitinoRuntimeException("Cannot find GRAVITINO_HOME env.")
+        hadoop_conf_path = f"{gravitino_home}/catalogs/hadoop/conf/hadoop.conf"
+        if not os.path.exists(hadoop_conf_path):
+            raise GravitinoRuntimeException(
+                f"Hadoop conf file is not found at `{hadoop_conf_path}`."
+            )
+
+        with open(hadoop_conf_path, mode="a", encoding="utf-8") as f:
+            for key, value in config.items():
+                f.write(f"\n{key} = {value}")
+
+    @classmethod
+    def _reset_server_hadoop_conf(cls, config):
+        logger.info("Reset server hadoop conf.")
+        gravitino_home = os.environ.get("GRAVITINO_HOME")
+        if gravitino_home is None:
+            raise GravitinoRuntimeException("Cannot find GRAVITINO_HOME env.")
+        hadoop_conf_path = f"{gravitino_home}/catalogs/hadoop/conf/hadoop.conf"
+        if not os.path.exists(hadoop_conf_path):
+            raise GravitinoRuntimeException(
+                f"Hadoop conf file is not found at `{hadoop_conf_path}`."
+            )
+        filtered_lines = []
+        with open(hadoop_conf_path, mode="r", encoding="utf-8") as file:
+            lines = file.readlines()
+            for line in lines:
+                line = line.strip()
+                find_config = False
+                for key, value in config.items():
+                    key_to_delete = f"{key} = {value}"
+                    if line.startswith(key_to_delete):
+                        find_config = True
+                        break
+                if find_config:
+                    continue
+                filtered_lines.append(line)
+
+        with open(hadoop_conf_path, mode="w", encoding="utf-8") as file:

Review Comment:
   Do you need to trunk the file before writing? Otherwise, I guess it will 
have some trailing contents if the `filtered_lines` cannot overwrite the whole 
content.



##########
clients/client-python/tests/integration/base_hadoop_env.py:
##########
@@ -0,0 +1,89 @@
+"""
+Copyright 2024 Datastrato Pvt Ltd.
+This software is licensed under the Apache License version 2.
+"""
+
+import logging
+import os
+import shutil
+import subprocess
+import tarfile
+
+import requests
+
+from gravitino.exceptions.gravitino_runtime_exception import 
GravitinoRuntimeException
+
+logger = logging.getLogger(__name__)
+
+HADOOP_VERSION = "2.7.3"
+HADOOP_PACK_NAME = f"hadoop-{HADOOP_VERSION}.tar.gz"
+HADOOP_DIR_NAME = f"hadoop-{HADOOP_VERSION}"
+HADOOP_DOWNLOAD_URL = 
f"https://archive.apache.org/dist/hadoop/core/hadoop-{HADOOP_VERSION}/{HADOOP_PACK_NAME}";
+LOCAL_BASE_DIR = "/tmp/gravitino"
+LOCAL_HADOOP_DIR = f"{LOCAL_BASE_DIR}/python_its/hadoop"
+
+
+class BaseHadoopEnvironment:
+
+    @classmethod
+    def init_hadoop_env(cls):
+        # download hadoop pack and unzip
+        if not os.path.exists(LOCAL_HADOOP_DIR):
+            os.makedirs(LOCAL_HADOOP_DIR)
+        cls._download_and_unzip_hadoop_pack()
+        # configure hadoop env
+        cls._configure_hadoop_environment()
+
+    @classmethod
+    def clear_hadoop_env(cls):
+        try:
+            shutil.rmtree(LOCAL_BASE_DIR)
+        except OSError as e:
+            logger.warning("Failed to delete directory '%s': %s", 
LOCAL_BASE_DIR, e)
+
+    @classmethod
+    def _download_and_unzip_hadoop_pack(cls):
+        logger.info("Download and unzip hadoop pack from %s.", 
HADOOP_DOWNLOAD_URL)
+        local_tar_path = f"{LOCAL_HADOOP_DIR}/{HADOOP_PACK_NAME}"

Review Comment:
   I would suggest to download to a temporary unique folder, after successfully 
downloaded and unzipped, then move to the destination you wanted, that will 
increase the robustness.



##########
clients/client-python/tests/integration/hdfs_container.py:
##########
@@ -0,0 +1,143 @@
+"""
+Copyright 2024 Datastrato Pvt Ltd.
+This software is licensed under the Apache License version 2.
+"""
+
+import asyncio
+import logging
+import os
+import time
+
+import docker
+from docker import types as tp
+from docker.errors import NotFound
+
+from gravitino.exceptions.gravitino_runtime_exception import 
GravitinoRuntimeException
+
+logger = logging.getLogger(__name__)
+
+
+async def check_hdfs_status(hive_container):

Review Comment:
   Can we rename "hdfs" to "hive" in this file, so later on when we implement 
tabular APIs, we can leverage this class to initialize hive container.



##########
clients/client-python/tests/integration/integration_test_env.py:
##########
@@ -118,3 +120,79 @@ def tearDownClass(cls):
 
         if gravitino_server_running:
             logger.error("Can't stop Gravitino server!")
+
+    @classmethod
+    def restart_server(cls):
+        logger.info("Restarting Gravitino server...")
+        gravitino_home = os.environ.get("GRAVITINO_HOME")
+        gravitino_startup_script = os.path.join(gravitino_home, 
"bin/gravitino.sh")
+        if not os.path.exists(gravitino_startup_script):
+            raise GravitinoRuntimeException(
+                f"Can't find Gravitino startup script: 
{gravitino_startup_script}, "
+                "Please execute `./gradlew compileDistribution -x test` in the 
Gravitino "
+                "project root directory."
+            )
+
+        # Restart Gravitino Server
+        env_vars = os.environ.copy()
+        env_vars["HADOOP_USER_NAME"] = "datastrato"
+        result = subprocess.run(
+            [gravitino_startup_script, "restart"],
+            env=env_vars,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.stdout:
+            logger.info("stdout: %s", result.stdout)
+        if result.stderr:
+            logger.info("stderr: %s", result.stderr)
+
+        if not check_gravitino_server_status():
+            raise GravitinoRuntimeException("ERROR: Can't start Gravitino 
server!")
+
+    @classmethod
+    def _append_server_hadoop_conf(cls, config):
+        logger.info("Append server hadoop conf.")
+        gravitino_home = os.environ.get("GRAVITINO_HOME")
+        if gravitino_home is None:
+            raise GravitinoRuntimeException("Cannot find GRAVITINO_HOME env.")
+        hadoop_conf_path = f"{gravitino_home}/catalogs/hadoop/conf/hadoop.conf"
+        if not os.path.exists(hadoop_conf_path):
+            raise GravitinoRuntimeException(
+                f"Hadoop conf file is not found at `{hadoop_conf_path}`."
+            )
+
+        with open(hadoop_conf_path, mode="a", encoding="utf-8") as f:
+            for key, value in config.items():
+                f.write(f"\n{key} = {value}")
+
+    @classmethod
+    def _reset_server_hadoop_conf(cls, config):
+        logger.info("Reset server hadoop conf.")
+        gravitino_home = os.environ.get("GRAVITINO_HOME")
+        if gravitino_home is None:
+            raise GravitinoRuntimeException("Cannot find GRAVITINO_HOME env.")
+        hadoop_conf_path = f"{gravitino_home}/catalogs/hadoop/conf/hadoop.conf"
+        if not os.path.exists(hadoop_conf_path):
+            raise GravitinoRuntimeException(
+                f"Hadoop conf file is not found at `{hadoop_conf_path}`."
+            )
+        filtered_lines = []
+        with open(hadoop_conf_path, mode="r", encoding="utf-8") as file:
+            lines = file.readlines()
+            for line in lines:
+                line = line.strip()
+                find_config = False
+                for key, value in config.items():
+                    key_to_delete = f"{key} = {value}"
+                    if line.startswith(key_to_delete):
+                        find_config = True
+                        break
+                if find_config:
+                    continue
+                filtered_lines.append(line)

Review Comment:
   The complexity of this method is n^2, maybe we can read the file into memory 
as dict and filter them in memory.



##########
clients/client-python/tests/integration/integration_test_env.py:
##########
@@ -118,3 +120,79 @@ def tearDownClass(cls):
 
         if gravitino_server_running:
             logger.error("Can't stop Gravitino server!")
+
+    @classmethod
+    def restart_server(cls):
+        logger.info("Restarting Gravitino server...")
+        gravitino_home = os.environ.get("GRAVITINO_HOME")
+        gravitino_startup_script = os.path.join(gravitino_home, 
"bin/gravitino.sh")
+        if not os.path.exists(gravitino_startup_script):
+            raise GravitinoRuntimeException(
+                f"Can't find Gravitino startup script: 
{gravitino_startup_script}, "
+                "Please execute `./gradlew compileDistribution -x test` in the 
Gravitino "
+                "project root directory."
+            )
+
+        # Restart Gravitino Server
+        env_vars = os.environ.copy()
+        env_vars["HADOOP_USER_NAME"] = "datastrato"
+        result = subprocess.run(
+            [gravitino_startup_script, "restart"],
+            env=env_vars,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.stdout:
+            logger.info("stdout: %s", result.stdout)
+        if result.stderr:
+            logger.info("stderr: %s", result.stderr)
+
+        if not check_gravitino_server_status():
+            raise GravitinoRuntimeException("ERROR: Can't start Gravitino 
server!")
+
+    @classmethod
+    def _append_server_hadoop_conf(cls, config):

Review Comment:
   It is actually not server hadoop conf, it is catalog hadoop conf, right? So 
maybe we should rename this method and below.



##########
clients/client-python/tests/integration/hdfs_container.py:
##########
@@ -0,0 +1,143 @@
+"""
+Copyright 2024 Datastrato Pvt Ltd.
+This software is licensed under the Apache License version 2.
+"""
+
+import asyncio
+import logging
+import os
+import time
+
+import docker
+from docker import types as tp
+from docker.errors import NotFound
+
+from gravitino.exceptions.gravitino_runtime_exception import 
GravitinoRuntimeException
+
+logger = logging.getLogger(__name__)
+
+
+async def check_hdfs_status(hive_container):
+    retry_limit = 15
+    for _ in range(retry_limit):
+        try:
+            command_and_args = ["bash", "/tmp/check-status.sh"]
+            exec_result = hive_container.exec_run(command_and_args)
+            if exec_result.exit_code != 0:
+                message = (
+                    f"Command {command_and_args} exited with 
{exec_result.exit_code}"
+                )
+                logger.warning(message)
+                logger.warning("output: %s", exec_result.output)
+                output_status_command = ["hdfs", "dfsadmin", "-report"]
+                exec_result = hive_container.exec_run(output_status_command)
+                logger.info("HDFS report, output: %s", exec_result.output)
+            else:
+                logger.info("HDFS startup successfully!")
+                return True
+        except Exception as e:
+            logger.error(
+                "Exception occurred while checking HDFS container status: %s", 
e
+            )
+        time.sleep(10)
+    return False
+
+
+async def check_hdfs_container_status(hive_container):
+    timeout_sec = 150
+    try:
+        result = await asyncio.wait_for(
+            check_hdfs_status(hive_container), timeout=timeout_sec
+        )
+        assert result is True, "HDFS container startup failed!"
+    except asyncio.TimeoutError as e:
+        raise GravitinoRuntimeException(
+            "Timeout occurred while waiting for checking HDFS container 
status."
+        ) from e
+
+
+class HDFSContainer:
+    _docker_client = None
+    _container = None
+    _network = None
+    _ip = ""
+    _network_name = "python-net"
+    _container_name = "python-hdfs"
+
+    def __init__(self):
+        self._docker_client = docker.from_env()
+        self._create_networks()
+        try:
+            container = 
self._docker_client.containers.get(self._container_name)
+            if container is not None:
+                if container.status == "running":
+                    container.kill()
+                container.remove()

Review Comment:
   What is the reason to kill and remove the container. I think we can reuse 
the container if it is already running, like Java testcontainer's behavior, so 
that we don't have to close and restart container when we have multiple test 
classes, WDYT?
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [#3760] improvement(client-python): Add Docker env and PyGVFS Integration tests [gravitino]

Reply via email to