(impala) 01/04: IMPALA-13015: Dataload fails due to concurrency issue with test.jceks

boroknagyz Mon, 06 May 2024 09:50:23 -0700

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch branch-4.4.0
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 5045f19b5374678c10888376955f2ff5e360ae5b
Author: Abhishek Rawat <[email protected]>
AuthorDate: Mon Apr 22 16:29:48 2024 -0700

    IMPALA-13015: Dataload fails due to concurrency issue with test.jceks
    
    Move the 'hadoop credential' command used for creating test.jceks to
    testdata/bin/create-load-data.sh. Previously it was in bin/load-data.py,
    which is called in parallel, and this was causing failures due to race
    conditions.
    
    Testing:
    - Ran JniFrontendTest#testGetSecretFromKeyStore after data loading and
    the test ran clean.
    
    Change-Id: I7fbeffc19f2b78c19fee9acf7f96466c8f4f9bcd
    Reviewed-on: http://gerrit.cloudera.org:8080/21346
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
    (cherry picked from commit f620e5d5c0bbdb0fd97bac31c7b7439cd13c6d08)
---
 bin/load-data.py                 | 15 ---------------
 testdata/bin/create-load-data.sh | 10 ++++++++++
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/bin/load-data.py b/bin/load-data.py
index 57ad313de..729dcb95b 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -87,7 +87,6 @@ WORKLOAD_DIR = options.workload_dir
 DATASET_DIR = options.dataset_dir
 TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')
 AVRO_SCHEMA_DIR = "avro_schemas"
-TESTDATA_JCEKS_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/jceks')
 
 GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\
                       "--workload=%s --scale_factor=%s --verbose"
@@ -300,14 +299,6 @@ def hive_exec_query_files_parallel(thread_pool, query_files, step_name):
   exec_query_files_parallel(thread_pool, query_files, 'hive', step_name)
 
 
-def exec_hadoop_credential_cmd(secret_key, secret, provider_path, exit_on_error=True):
-  cmd = ("%s credential create %s -value %s -provider %s"
-      % (HADOOP_CMD, secret_key, secret, provider_path))
-  LOG.info("Executing Hadoop command: " + cmd)
-  exec_cmd(cmd, error_msg="Error executing Hadoop command, exiting",
-      exit_on_error=exit_on_error)
-
-
 def main():
   logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
   LOG.setLevel(logging.DEBUG)
@@ -317,12 +308,6 @@ def main():
   #
   LOG.debug(' '.join(sys.argv))
 
-  jceks_path = TESTDATA_JCEKS_DIR + "/test.jceks"
-  if os.path.exists(jceks_path):
-    os.remove(jceks_path)
-  exec_hadoop_credential_cmd("openai-api-key-secret", "secret",
-      "localjceks://file" + jceks_path)
-
   all_workloads = available_workloads(WORKLOAD_DIR)
   workloads = []
   if options.workloads is None:
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 3571af516..011576495 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -546,6 +546,16 @@ function warm-up-hive {
   $HIVE_CMD -e "insert overwrite table hive_warm_up_tbl values (1);"
 }
 
+# IMPALA-13015, IMPALA-13026: This should be called during serial phase of data load.
+function create-hadoop-credential {
+  rm -f ${IMPALA_HOME}/testdata/jceks/test.jceks
+  hadoop credential create "openai-api-key-secret" -value "secret" -provider \
+    "localjceks://file/${IMPALA_HOME}/testdata/jceks/test.jceks"
+}
+
+run-step "Creating hadoop credential" create-hadoop-credential.log \
+    create-hadoop-credential
+
 # For kerberized clusters, use kerberos
 if ${CLUSTER_DIR}/admin is_kerberized; then
   LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"

(impala) 01/04: IMPALA-13015: Dataload fails due to concurrency issue with test.jceks

Reply via email to