AHeise commented on a change in pull request #10460: [FLINK-14574] Fixing writing with plugin s3 filesystem.
URL: https://github.com/apache/flink/pull/10460#discussion_r354853724
##########
File path: flink-end-to-end-tests/test-scripts/common_s3_operations.sh
##########

@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+###################################
+# Starts a docker container with the AWS CLI.
+# Keeps it busy so that it can be used to issue several commands.
+#
+# This caching mechanism is used to improve the performance of s3_get_number_of_lines_by_prefix.
+#
+# Globals:
+#   TEST_INFRA_DIR
+# Exports:
+#   AWSCLI_CONTAINER_ID
+###################################
+function aws_cli_start() {
+  export AWSCLI_CONTAINER_ID=$(docker run -d \
+    --network host \
+    --mount type=bind,source="$TEST_INFRA_DIR",target=/hostdir \
+    -e AWS_REGION -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
+    --entrypoint python \
+    -it banst/awscli)
+
+  while [[ "$(docker inspect -f '{{.State.Running}}' "$AWSCLI_CONTAINER_ID")" != "true" ]]; do
+    sleep 0.1
+  done
+  on_exit aws_cli_stop
+}
+
+###################################
+# Stops the docker container of the AWS CLI.
+#
+# Globals:
+#   AWSCLI_CONTAINER_ID
+###################################
+function aws_cli_stop() {
+  docker kill "$AWSCLI_CONTAINER_ID"
+  docker rm "$AWSCLI_CONTAINER_ID"
+  export AWSCLI_CONTAINER_ID=
+}
+
+# always (re)start the container while sourcing, so that AWSCLI_CONTAINER_ID is available from the parent script
+if [[ $AWSCLI_CONTAINER_ID ]]; then
+  aws_cli_stop
+fi
+aws_cli_start
+
+###################################
+# Runs an aws command on the previously started container.
+#
+# Globals:
+#   AWSCLI_CONTAINER_ID
+###################################
+function aws_cli() {
+  local endpoint=""
+  if [[ $S3_ENDPOINT ]]; then
+    endpoint="--endpoint-url $S3_ENDPOINT"
+  fi
+  docker exec "$AWSCLI_CONTAINER_ID" aws $endpoint "$@"
+}
+
+###################################
+# Downloads s3 objects to a folder by full path prefix.
+#
+# Globals:
+#   IT_CASE_S3_BUCKET
+#   TEST_INFRA_DIR
+# Arguments:
+#   $1 - local path of the folder to save the files to
+#   $2 - s3 key full path prefix
+#   $3 - s3 file name prefix w/o directory to filter files by name (optional)
+#   $4 - recursive?
+# Returns:
+#   None
+###################################
+function s3_get_by_full_path_and_filename_prefix() {
+  local args=
+  if [[ $3 ]]; then
+    args=" --exclude '*' --include '*/${3}[!/]*'"
+  fi
+  if [[ "$4" == true ]]; then
+    args="$args --recursive"
+  fi
+  aws_cli s3 cp "s3://$IT_CASE_S3_BUCKET/$2" "/hostdir/${1#$TEST_INFRA_DIR}"$args
+}
+
+###################################
+# Deletes s3 objects by full path prefix.
+#
+# Globals:
+#   IT_CASE_S3_BUCKET
+# Arguments:
+#   $1 - s3 key full path prefix
+# Returns:
+#   None
+###################################
+function s3_delete_by_full_path_prefix() {
+  aws_cli s3 rm "s3://$IT_CASE_S3_BUCKET/$1" --recursive
+}
+
+###################################
+# Counts the number of lines in the files of s3 objects filtered by prefix.
+# The lines have to be simple to comply with the CSV format
+# because SQL is used to query the s3 objects.
+#
+# Globals:
+#   IT_CASE_S3_BUCKET
+# Arguments:
+#   $1 - s3 key prefix
+#   $2 - s3 file name prefix w/o directory to filter files by name (optional)
+# Returns:
+#   number of lines in the part files
+###################################
+function s3_get_number_of_lines_by_prefix() {
+  local file_prefix="${2-}"
+
+  # find all files that have the given prefix
+  parts=$(aws_cli s3api list-objects --bucket "$IT_CASE_S3_BUCKET" --prefix "$1" |
+    docker run -i stedolan/jq -r '[.Contents[].Key] | join(" ")')
+
+  # in parallel (N tasks), query the number of lines and store the results in a file named lines
+  N=10
+  echo "0" >lines
+  # turn off job control, so that there is no noise when starting/finishing bg tasks
+  old_state=$(set +o)
+  set +m
+  for part in $parts; do
+    if [[ $(basename "${part}") == $file_prefix* ]]; then
+      ((i = i % N))
+      ((i++ == 0)) && wait
+      aws_cli s3api select-object-content --bucket "$IT_CASE_S3_BUCKET" --key "$part" \
+        --expression "select count(*) from s3object" --expression-type "SQL" \
+        --input-serialization='{"CSV": {}}' --output-serialization='{"CSV": {}}' /dev/stdout >>lines &
+    fi
+  done
+  wait
+  # restore old settings
+  eval "$old_state"
+  # add up the number of lines of each part
+  paste -sd+ lines | bc
+}
+
+###################################
+# Removes the temporary prefix in the e2e bucket.
+#
+# Globals:
+#   OUT

Review comment:
   inlined into tests.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

With regards,
Apache Git Services
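For context, a minimal usage sketch (not part of the PR diff above) of how a test script might consume these helpers after sourcing common_s3_operations.sh. The S3_PREFIX and OUTPUT_PATH values, the "part-" file name filter, and the TEST_DATA_DIR variable are assumptions for illustration only.

#!/usr/bin/env bash
# Hypothetical example: reuse the long-running awscli container for several S3 calls.
source "$(dirname "$0")"/common_s3_operations.sh   # sourcing starts the awscli container once

S3_PREFIX="temp/test-data-$(date +%s)"   # assumed temporary key prefix in the e2e bucket
OUTPUT_PATH="$TEST_DATA_DIR/out"         # assumed local download target under TEST_INFRA_DIR

# every aws_cli invocation reuses the same container, avoiding per-call docker startup cost
aws_cli s3 ls "s3://$IT_CASE_S3_BUCKET/$S3_PREFIX"

# download the "part-" files written under the prefix and count their lines
s3_get_by_full_path_and_filename_prefix "$OUTPUT_PATH" "$S3_PREFIX" "part-" true
LINES=$(s3_get_number_of_lines_by_prefix "$S3_PREFIX" "part-")
echo "number of lines written to S3: $LINES"

# clean up the temporary prefix (the former cleanup helper is now inlined into the tests)
s3_delete_by_full_path_prefix "$S3_PREFIX"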
