Repository: impala
Updated Branches:
  refs/heads/2.x 8dd465f69 -> 41f1050fb
IMPALA-6899: Optimize the HDFS commands used in dataload

HDFS commandline calls can be expensive due to JVM startup and other costs.
Since most HDFS commandline calls can take multiple paths, one way to reduce
execution time is to consolidate multiple HDFS commands into a single HDFS
call.

Since HDFS put commands will follow symbolic links and can copy recursively,
this can allow for further consolidation by creating the full directory
structure and copying it in a single HDFS call.

This does several of these optimizations throughout the dataload codepath.
It saves a few seconds here and there:
  Loading Hive Builtins: 1:10 -> 0:30
  Loading custom schemas: 0:35 -> 0:20
  Loading Hive UDFs: 0:45 -> 0:25

Conflicts:
  testdata/bin/copy-udfs-udas.sh - conflict due to "Loosen hive-exec.jar glob pattern..."

Change-Id: I0934353329dc7312394fc4457ab8db2a272c6282
Reviewed-on: http://gerrit.cloudera.org:8080/10120
Reviewed-by: Philip Zeyliger <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
(cherry picked from commit da363a99a4b1afff91600c71650e26932be9350a)
Reviewed-on: http://gerrit.cloudera.org:8080/10167
Reviewed-by: Joe McDonnell <[email protected]>

Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/41f1050f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/41f1050f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/41f1050f

Branch: refs/heads/2.x
Commit: 41f1050fbe69da12a03b1da2ecfb8dc7241335b7
Parents: 8dd465f
Author: Joe McDonnell <[email protected]>
Authored: Wed Dec 20 10:53:43 2017 -0800
Committer: Impala Public Jenkins <[email protected]>
Committed: Wed Apr 25 01:22:40 2018 +0000

----------------------------------------------------------------------
 testdata/bin/copy-udfs-udas.sh     |  48 ++++++------
 testdata/bin/create-load-data.sh   | 134 ++++++++++++++++----------------
 testdata/bin/load-hive-builtins.sh |  81 ++++++++++---------
 3 files changed, 136 insertions(+), 127
deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/41f1050f/testdata/bin/copy-udfs-udas.sh ---------------------------------------------------------------------- diff --git a/testdata/bin/copy-udfs-udas.sh b/testdata/bin/copy-udfs-udas.sh index adf64a0..54fcaee 100755 --- a/testdata/bin/copy-udfs-udas.sh +++ b/testdata/bin/copy-udfs-udas.sh @@ -77,27 +77,29 @@ fi # impala-hive-udfs.jar # test-udfs.ll # udf/uda samples (.so/.ll) -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/testutil/libTestUdas.so"\ - "${FILESYSTEM_PREFIX}/test-warehouse" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/testutil/libTestUdfs.so"\ - "${FILESYSTEM_PREFIX}/test-warehouse" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/testutil/libTestUdfs.so"\ - "${FILESYSTEM_PREFIX}/test-warehouse/libTestUdfs.SO" -hadoop fs -mkdir -p "${FILESYSTEM_PREFIX}/test-warehouse/udf_test" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/testutil/libTestUdfs.so"\ - "${FILESYSTEM_PREFIX}/test-warehouse/udf_test/libTestUdfs.so" -hadoop fs -put -f "${HIVE_HOME}/lib/hive-exec-${IMPALA_HIVE_VERSION}.jar"\ - "${FILESYSTEM_PREFIX}/test-warehouse/hive-exec.jar" -hadoop fs -put -f "${IMPALA_HOME}/tests/test-hive-udfs/target/test-hive-udfs-1.0.jar"\ - "${FILESYSTEM_PREFIX}/test-warehouse/impala-hive-udfs.jar" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/testutil/test-udfs.ll"\ - "${FILESYSTEM_PREFIX}/test-warehouse" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/udf_samples/libudfsample.so"\ - "${FILESYSTEM_PREFIX}/test-warehouse" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/udf_samples/udf-sample.ll"\ - "${FILESYSTEM_PREFIX}/test-warehouse" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/udf_samples/libudasample.so"\ - "${FILESYSTEM_PREFIX}/test-warehouse" -hadoop fs -put -f "${IMPALA_HOME}/be/build/latest/udf_samples/uda-sample.ll"\ - "${FILESYSTEM_PREFIX}/test-warehouse" + +# Using a single HDFS command 
only works if the files already have the same names +# and directory structure that we want in HDFS. Create directories and symbolic links +# to make that possible. +UDF_TMP_DIR=$(mktemp -d) + +ln -s "${IMPALA_HOME}/be/build/latest/testutil/libTestUdas.so" "${UDF_TMP_DIR}" +ln -s "${IMPALA_HOME}/be/build/latest/testutil/libTestUdfs.so" "${UDF_TMP_DIR}" +ln -s "${IMPALA_HOME}/be/build/latest/testutil/libTestUdfs.so" \ + "${UDF_TMP_DIR}/libTestUdfs.SO" +mkdir "${UDF_TMP_DIR}/udf_test" +ln -s "${IMPALA_HOME}/be/build/latest/testutil/libTestUdfs.so" "${UDF_TMP_DIR}/udf_test" +ln -s "${HIVE_HOME}/lib/hive-exec-${IMPALA_HIVE_VERSION}.jar" "${UDF_TMP_DIR}/hive-exec.jar" +ln -s "${IMPALA_HOME}/tests/test-hive-udfs/target/test-hive-udfs-1.0.jar" \ + "${UDF_TMP_DIR}/impala-hive-udfs.jar" +ln -s "${IMPALA_HOME}/be/build/latest/testutil/test-udfs.ll" "${UDF_TMP_DIR}" +ln -s "${IMPALA_HOME}/be/build/latest/udf_samples/libudfsample.so" "${UDF_TMP_DIR}" +ln -s "${IMPALA_HOME}/be/build/latest/udf_samples/udf-sample.ll" "${UDF_TMP_DIR}" +ln -s "${IMPALA_HOME}/be/build/latest/udf_samples/libudasample.so" "${UDF_TMP_DIR}" +ln -s "${IMPALA_HOME}/be/build/latest/udf_samples/uda-sample.ll" "${UDF_TMP_DIR}" + +hadoop fs -put -f "${UDF_TMP_DIR}"/* "${FILESYSTEM_PREFIX}/test-warehouse" + +# Remove temporary directory +rm -r ${UDF_TMP_DIR} echo "Done copying udf/uda libraries." 
http://git-wip-us.apache.org/repos/asf/impala/blob/41f1050f/testdata/bin/create-load-data.sh ---------------------------------------------------------------------- diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index 51ba449..fcb7e69 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -134,32 +134,39 @@ echo "CM_HOST=${CM_HOST:-}" echo "REMOTE_LOAD=${REMOTE_LOAD:-}" function load-custom-schemas { + # HDFS commandline calls are slow, so consolidate the manipulation into + # as few calls as possible by populating a temporary directory with the + # appropriate structure and copying it in a single call. + TMP_DIR=$(mktemp -d) + + # Cleanup old schemas dir + hadoop fs -rm -r -f /test-warehouse/schemas SCHEMA_SRC_DIR=${IMPALA_HOME}/testdata/data/schemas - SCHEMA_DEST_DIR=/test-warehouse/schemas - # clean the old schemas directory. - hadoop fs -rm -r -f ${SCHEMA_DEST_DIR} - hadoop fs -mkdir ${SCHEMA_DEST_DIR} - hadoop fs -put $SCHEMA_SRC_DIR/zipcode_incomes.parquet ${SCHEMA_DEST_DIR}/ - hadoop fs -put $SCHEMA_SRC_DIR/alltypestiny.parquet ${SCHEMA_DEST_DIR}/ - hadoop fs -put $SCHEMA_SRC_DIR/enum ${SCHEMA_DEST_DIR}/ - hadoop fs -put $SCHEMA_SRC_DIR/malformed_decimal_tiny.parquet ${SCHEMA_DEST_DIR}/ - hadoop fs -put $SCHEMA_SRC_DIR/decimal.parquet ${SCHEMA_DEST_DIR}/ - hadoop fs -put $SCHEMA_SRC_DIR/nested/modern_nested.parquet ${SCHEMA_DEST_DIR}/ - hadoop fs -put $SCHEMA_SRC_DIR/nested/legacy_nested.parquet ${SCHEMA_DEST_DIR}/ + SCHEMA_TMP_DIR="${TMP_DIR}/schemas" + mkdir ${SCHEMA_TMP_DIR} + mkdir ${SCHEMA_TMP_DIR}/enum + ln -s ${SCHEMA_SRC_DIR}/zipcode_incomes.parquet ${SCHEMA_TMP_DIR} + ln -s ${SCHEMA_SRC_DIR}/alltypestiny.parquet ${SCHEMA_TMP_DIR} + ln -s ${SCHEMA_SRC_DIR}/enum/* ${SCHEMA_TMP_DIR}/enum + ln -s ${SCHEMA_SRC_DIR}/malformed_decimal_tiny.parquet ${SCHEMA_TMP_DIR} + ln -s ${SCHEMA_SRC_DIR}/decimal.parquet ${SCHEMA_TMP_DIR} + ln -s ${SCHEMA_SRC_DIR}/nested/modern_nested.parquet 
${SCHEMA_TMP_DIR} + ln -s ${SCHEMA_SRC_DIR}/nested/legacy_nested.parquet ${SCHEMA_TMP_DIR} # CHAR and VARCHAR tables written by Hive - hadoop fs -mkdir -p /test-warehouse/chars_formats_avro_snap/ - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.avro \ - /test-warehouse/chars_formats_avro_snap - hadoop fs -mkdir -p /test-warehouse/chars_formats_parquet/ - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.parquet \ - /test-warehouse/chars_formats_parquet - hadoop fs -mkdir -p /test-warehouse/chars_formats_orc_def/ - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.orc \ - /test-warehouse/chars_formats_orc_def - hadoop fs -mkdir -p /test-warehouse/chars_formats_text/ - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.txt \ - /test-warehouse/chars_formats_text + mkdir -p ${TMP_DIR}/chars_formats_avro_snap \ + ${TMP_DIR}/chars_formats_parquet \ + ${TMP_DIR}/chars_formats_text \ + ${TMP_DIR}/chars_formats_orc_def + + ln -s ${IMPALA_HOME}/testdata/data/chars-formats.avro ${TMP_DIR}/chars_formats_avro_snap + ln -s ${IMPALA_HOME}/testdata/data/chars-formats.parquet ${TMP_DIR}/chars_formats_parquet + ln -s ${IMPALA_HOME}/testdata/data/chars-formats.orc ${TMP_DIR}/chars_formats_orc_def + ln -s ${IMPALA_HOME}/testdata/data/chars-formats.txt ${TMP_DIR}/chars_formats_text + + hadoop fs -put -f ${TMP_DIR}/* /test-warehouse + + rm -r ${TMP_DIR} } function load-data { @@ -257,8 +264,7 @@ function load-aux-workloads { function copy-auth-policy { echo COPYING AUTHORIZATION POLICY FILE - hadoop fs -rm -f ${FILESYSTEM_PREFIX}/test-warehouse/authz-policy.ini - hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini \ + hadoop fs -put -f ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini \ ${FILESYSTEM_PREFIX}/test-warehouse/ } @@ -267,20 +273,17 @@ function copy-and-load-dependent-tables { # TODO: The multi-format table will move these files. So we need to copy them to a # temporary location for that table to use. 
Should find a better way to handle this. echo COPYING AND LOADING DATA FOR DEPENDENT TABLES - hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat - hadoop fs -rm -r -f /tmp/alltypes_rc - hadoop fs -rm -r -f /tmp/alltypes_seq - hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009 - hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009 + hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat \ + /tmp/alltypes_rc /tmp/alltypes_seq + hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009 \ + /tmp/alltypes_rc/year=2009 hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009 hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009 # Create a hidden file in AllTypesSmall - hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden - hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden - hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ + hadoop fs -cp -f /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ /test-warehouse/alltypessmall/year=2009/month=1/_hidden - hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ + hadoop fs -cp -f /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ /test-warehouse/alltypessmall/year=2009/month=1/.hidden # In case the data is updated by a non-super user, make sure the user can write @@ -299,8 +302,7 @@ function copy-and-load-dependent-tables { # # See: logs/data_loading/copy-and-load-dependent-tables.log) # See also: IMPALA-4345 - hadoop fs -chmod -R 777 /tmp/alltypes_rc - hadoop fs -chmod -R 777 /tmp/alltypes_seq + hadoop fs -chmod -R 777 /tmp/alltypes_rc /tmp/alltypes_seq # For tables that rely on loading data from local fs test-wareload-house # TODO: Find a good way to integrate this with the normal data loading scripts @@ -351,33 +353,31 @@ function load-custom-data { hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/ - # IMPALA-694: data 
file produced by parquet-mr version 1.2.5-cdh4.5.0 - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \ - /test-warehouse/bad_parquet_parquet - - # Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary) - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \ - /test-warehouse/bad_parquet_parquet - - # IMPALA-720: data file produced by parquet-mr with multiple row groups - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \ - /test-warehouse/bad_parquet_parquet - - # IMPALA-1401: data file produced by Hive 13 containing page statistics with long min/max - # string values - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/long_page_header.parquet \ - /test-warehouse/bad_parquet_parquet + # Load specialized parquet files into functional_parquet.bad_parquet + # bad_parquet_data.parquet - IMPALA-694: data file produced by + # parquet-mr version 1.2.5-cdh4.5.0 + # repeated_values.parquet - Data file produced by parquet-mr with repeated values + # (produces 0 bit width dictionary) + # multiple_rowgroups.parquet - IMPALA-720: data file produced by parquet-mr + # with multiple row groups + # long_page_header.parquet - IMPALA-1401: data file produced by Hive 13 containing + # page statistics with long min/max string values + hadoop fs -put -f \ + ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \ + ${IMPALA_HOME}/testdata/data/repeated_values.parquet \ + ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \ + ${IMPALA_HOME}/testdata/data/long_page_header.parquet \ + /test-warehouse/bad_parquet_parquet # IMPALA-3732: parquet files with corrupt strings - local parq_file - for parq_file in dict-encoded-negative-len.parq plain-encoded-negative-len.parq; do - hadoop fs -put -f ${IMPALA_HOME}/testdata/bad_parquet_data/$parq_file \ - /test-warehouse/bad_parquet_strings_negative_len_parquet - done - for parq_file in dict-encoded-out-of-bounds.parq 
plain-encoded-out-of-bounds.parq; do - hadoop fs -put -f ${IMPALA_HOME}/testdata/bad_parquet_data/$parq_file \ - /test-warehouse/bad_parquet_strings_out_of_bounds_parquet - done + hadoop fs -put -f \ + ${IMPALA_HOME}/testdata/bad_parquet_data/dict-encoded-negative-len.parq \ + ${IMPALA_HOME}/testdata/bad_parquet_data/plain-encoded-negative-len.parq \ + /test-warehouse/bad_parquet_strings_negative_len_parquet + hadoop fs -put -f \ + ${IMPALA_HOME}/testdata/bad_parquet_data/dict-encoded-out-of-bounds.parq \ + ${IMPALA_HOME}/testdata/bad_parquet_data/plain-encoded-out-of-bounds.parq \ + /test-warehouse/bad_parquet_strings_out_of_bounds_parquet # Remove all index files in this partition. hadoop fs -rm -f /test-warehouse/alltypes_text_lzo/year=2009/month=1/*.lzo.index @@ -421,24 +421,26 @@ function custom-post-load-steps { # Set both read and execute permissions because accessing the contents of a directory on # the local filesystem requires the x permission (while on HDFS it requires the r # permission). - hadoop fs -chmod -R 555 ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=1 - hadoop fs -chmod -R 555 ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=3 + hadoop fs -chmod -R 555 \ + ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=1 \ + ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=3 fi + hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet \ + ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_sixblocks_parquet \ + ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_one_row_group_parquet + #IMPALA-1881: data file produced by hive with multiple blocks. 
- hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet hadoop fs -Ddfs.block.size=1048576 -put -f \ ${IMPALA_HOME}/testdata/LineItemMultiBlock/000000_0 \ ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet # IMPALA-2466: Add more tests to the HDFS Parquet scanner (Added after IMPALA-1881) - hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_sixblocks_parquet && \ hadoop fs -Ddfs.block.size=1048576 -put -f \ ${IMPALA_HOME}/testdata/LineItemMultiBlock/lineitem_sixblocks.parquet \ ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_sixblocks_parquet # IMPALA-2466: Add more tests to the HDFS Parquet scanner (this has only one row group) - hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_one_row_group_parquet && \ hadoop fs -Ddfs.block.size=1048576 -put -f \ ${IMPALA_HOME}/testdata/LineItemMultiBlock/lineitem_one_row_group.parquet \ ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_one_row_group_parquet http://git-wip-us.apache.org/repos/asf/impala/blob/41f1050f/testdata/bin/load-hive-builtins.sh ---------------------------------------------------------------------- diff --git a/testdata/bin/load-hive-builtins.sh b/testdata/bin/load-hive-builtins.sh index 29d9b7f..061d42b 100755 --- a/testdata/bin/load-hive-builtins.sh +++ b/testdata/bin/load-hive-builtins.sh @@ -23,41 +23,46 @@ trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0) . 
${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 # TODO: remove this once we understand why Hive looks in HDFS for many of its jars -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${HIVE_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${HIVE_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -put ${HIVE_HOME}/lib/*.jar ${FILESYSTEM_PREFIX}${HIVE_HOME}/lib/ - -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${HBASE_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${HBASE_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -put ${HBASE_HOME}/lib/*.jar ${FILESYSTEM_PREFIX}${HBASE_HOME}/lib/ - -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/common/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/common/ -${HADOOP_HOME}/bin/hadoop fs -put ${HADOOP_HOME}/share/hadoop/common/*.jar \ - ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/common/ -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/common/lib/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/common/lib/ -${HADOOP_HOME}/bin/hadoop fs -put ${HADOOP_HOME}/share/hadoop/common/lib/*.jar \ - ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/common/lib/ -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/mapreduce/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/mapreduce/ -${HADOOP_HOME}/bin/hadoop fs -put ${HADOOP_HOME}/share/hadoop/mapreduce/*.jar \ - ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/mapreduce/ -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/tools/lib -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/tools/lib -${HADOOP_HOME}/bin/hadoop fs -put ${HADOOP_HOME}/share/hadoop/tools/lib/*.jar \ - ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/tools/lib/ - 
-${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${HADOOP_LZO}/build -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${HADOOP_LZO}/build -${HADOOP_HOME}/bin/hadoop fs -put ${HADOOP_LZO}/build/hadoop-lzo*.jar \ - ${FILESYSTEM_PREFIX}${HADOOP_LZO}/build/ - -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${SENTRY_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${SENTRY_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -put ${SENTRY_HOME}/lib/*.jar ${FILESYSTEM_PREFIX}${SENTRY_HOME}/lib/ - -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${FILESYSTEM_PREFIX}${IMPALA_HOME}/thirdparty/postgresql-jdbc/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}${IMPALA_HOME}/thirdparty/postgresql-jdbc/ -${HADOOP_HOME}/bin/hadoop fs -put ${POSTGRES_JDBC_DRIVER} \ - ${FILESYSTEM_PREFIX}${IMPALA_HOME}/thirdparty/postgresql-jdbc/ + +# Remove all directories in one command for efficiency +${HADOOP_HOME}/bin/hadoop fs -rm -skipTrash -r -f ${FILESYSTEM_PREFIX}${HIVE_HOME}/lib/ \ + ${FILESYSTEM_PREFIX}${HBASE_HOME}/lib/ \ + ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/common/ \ + ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/mapreduce/ \ + ${FILESYSTEM_PREFIX}${HADOOP_HOME}/share/hadoop/tools/lib \ + ${FILESYSTEM_PREFIX}${HADOOP_LZO}/build \ + ${FILESYSTEM_PREFIX}${SENTRY_HOME}/lib/ \ + ${FILESYSTEM_PREFIX}${IMPALA_HOME}/thirdparty/postgresql-jdbc/ + +TMP_DIR=$(mktemp -d) + +# Create the directory structure to copy over +mkdir -p ${TMP_DIR}/${HIVE_HOME}/lib \ + ${TMP_DIR}/${HBASE_HOME}/lib \ + ${TMP_DIR}/${HADOOP_HOME}/share/hadoop/common/lib \ + ${TMP_DIR}/${HADOOP_HOME}/share/hadoop/mapreduce \ + ${TMP_DIR}/${HADOOP_HOME}/share/hadoop/tools/lib \ + ${TMP_DIR}/${HADOOP_LZO}/build \ + ${TMP_DIR}/${SENTRY_HOME}/lib \ + ${TMP_DIR}/${IMPALA_HOME}/thirdparty/postgresql-jdbc/ + +# Add symbolic links to files in the appropriate places +ln -s ${HIVE_HOME}/lib/*.jar ${TMP_DIR}/${HIVE_HOME}/lib +ln -s ${HBASE_HOME}/lib/*.jar 
${TMP_DIR}/${HBASE_HOME}/lib +ln -s ${HADOOP_HOME}/share/hadoop/common/*.jar \ + ${TMP_DIR}/${HADOOP_HOME}/share/hadoop/common +ln -s ${HADOOP_HOME}/share/hadoop/common/lib/*.jar \ + ${TMP_DIR}/${HADOOP_HOME}/share/hadoop/common/lib +ln -s ${HADOOP_HOME}/share/hadoop/mapreduce/*.jar \ + ${TMP_DIR}/${HADOOP_HOME}/share/hadoop/mapreduce +ln -s ${HADOOP_HOME}/share/hadoop/tools/lib/*.jar \ + ${TMP_DIR}/${HADOOP_HOME}/share/hadoop/tools/lib +ln -s ${HADOOP_LZO}/build/hadoop-lzo*.jar ${TMP_DIR}/${HADOOP_LZO}/build +ln -s ${SENTRY_HOME}/lib/*.jar ${TMP_DIR}/${SENTRY_HOME}/lib +# This is the only item that uses a different path +# TODO: why is this path different? +ln -s ${POSTGRES_JDBC_DRIVER} ${TMP_DIR}/${IMPALA_HOME}/thirdparty/postgresql-jdbc + +${HADOOP_HOME}/bin/hadoop fs -put ${TMP_DIR}/* ${FILESYSTEM_PREFIX}/ + +rm -r ${TMP_DIR}
