This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch branch-4.4.0
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 5045f19b5374678c10888376955f2ff5e360ae5b Author: Abhishek Rawat <[email protected]> AuthorDate: Mon Apr 22 16:29:48 2024 -0700 IMPALA-13015: Dataload fails due to concurrency issue with test.jceks Move 'hadoop credential' command used for creating test.jceks to testdata/bin/create-load-data.sh. Earlier it was in bin/load-data.py which is called in parallel and was causing failures due to race conditions. Testing: - Ran JniFrontendTest#testGetSecretFromKeyStore after data loading and test ran clean. Change-Id: I7fbeffc19f2b78c19fee9acf7f96466c8f4f9bcd Reviewed-on: http://gerrit.cloudera.org:8080/21346 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> (cherry picked from commit f620e5d5c0bbdb0fd97bac31c7b7439cd13c6d08) --- bin/load-data.py | 15 --------------- testdata/bin/create-load-data.sh | 10 ++++++++++ 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/bin/load-data.py b/bin/load-data.py index 57ad313de..729dcb95b 100755 --- a/bin/load-data.py +++ b/bin/load-data.py @@ -87,7 +87,6 @@ WORKLOAD_DIR = options.workload_dir DATASET_DIR = options.dataset_dir TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin') AVRO_SCHEMA_DIR = "avro_schemas" -TESTDATA_JCEKS_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/jceks') GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\ "--workload=%s --scale_factor=%s --verbose" @@ -300,14 +299,6 @@ def hive_exec_query_files_parallel(thread_pool, query_files, step_name): exec_query_files_parallel(thread_pool, query_files, 'hive', step_name) -def exec_hadoop_credential_cmd(secret_key, secret, provider_path, exit_on_error=True): - cmd = ("%s credential create %s -value %s -provider %s" - % (HADOOP_CMD, secret_key, secret, provider_path)) - LOG.info("Executing Hadoop command: " + cmd) - exec_cmd(cmd, error_msg="Error executing Hadoop command, exiting", - exit_on_error=exit_on_error) - - def main(): 
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%H:%M:%S') LOG.setLevel(logging.DEBUG) @@ -317,12 +308,6 @@ def main(): # LOG.debug(' '.join(sys.argv)) - jceks_path = TESTDATA_JCEKS_DIR + "/test.jceks" - if os.path.exists(jceks_path): - os.remove(jceks_path) - exec_hadoop_credential_cmd("openai-api-key-secret", "secret", - "localjceks://file" + jceks_path) - all_workloads = available_workloads(WORKLOAD_DIR) workloads = [] if options.workloads is None: diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index 3571af516..011576495 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -546,6 +546,16 @@ function warm-up-hive { $HIVE_CMD -e "insert overwrite table hive_warm_up_tbl values (1);" } +# IMPALA-13015, IMPALA-13026: This should be called during serial phase of data load. +function create-hadoop-credential { + rm -f ${IMPALA_HOME}/testdata/jceks/test.jceks + hadoop credential create "openai-api-key-secret" -value "secret" -provider \ + "localjceks://file/${IMPALA_HOME}/testdata/jceks/test.jceks" +} + +run-step "Creating hadoop credential" create-hadoop-credential.log \ + create-hadoop-credential + # For kerberized clusters, use kerberos if ${CLUSTER_DIR}/admin is_kerberized; then LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"
