AHeise commented on a change in pull request #10460: [FLINK-14574] Fixing writing with plugin s3 filesystem.
URL: https://github.com/apache/flink/pull/10460#discussion_r354833199
 
 

 ##########
 File path: flink-end-to-end-tests/test-scripts/common_s3_operations.sh
 ##########
 @@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+###################################
+# Starts a docker container running the AWS CLI.
+# Keeps the container running so that it can be used to issue several commands.
+#
+# Reusing one long-running container improves the performance of s3_get_number_of_lines_by_prefix.
+#
+# Globals:
+#   TEST_INFRA_DIR
+# Exports:
+#   AWSCLI_CONTAINER_ID
+###################################
+function aws_cli_start() {
+  export AWSCLI_CONTAINER_ID=$(docker run -d \
+    --network host \
+    --mount type=bind,source="$TEST_INFRA_DIR",target=/hostdir \
+    -e AWS_REGION -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
+    --entrypoint python \
+    -it banst/awscli)
+
+  while [[ "$(docker inspect -f {{.State.Running}} "$AWSCLI_CONTAINER_ID")" 
-ne "true" ]]; do
+    sleep 0.1
+  done
+  on_exit aws_cli_stop
+}
+
+###################################
+# Stops the docker container of the aws cli.
+#
+# Globals:
+#   AWSCLI_CONTAINER_ID
+###################################
+function aws_cli_stop() {
+  docker kill "$AWSCLI_CONTAINER_ID"
+  docker rm "$AWSCLI_CONTAINER_ID"
+  export AWSCLI_CONTAINER_ID=
+}
+
+# Always start the container while sourcing, so that AWSCLI_CONTAINER_ID is available to the parent script.
+if [[ $AWSCLI_CONTAINER_ID ]]; then
+  aws_cli_stop
+fi
+aws_cli_start
+
+###################################
+# Runs an AWS CLI command in the previously started container.
+#
+# Globals:
+#   AWSCLI_CONTAINER_ID
+#   S3_ENDPOINT (optional)
+###################################
+function aws_cli() {
+  local endpoint=""
+  if [[ $S3_ENDPOINT ]]; then
+    endpoint="--endpoint-url $S3_ENDPOINT"
+  fi
+  docker exec "$AWSCLI_CONTAINER_ID" aws $endpoint "$@"
+}
+
+###################################
+# Downloads s3 objects to a local folder by full path prefix.
+#
+# Globals:
+#   IT_CASE_S3_BUCKET
+#   TEST_INFRA_DIR
+# Arguments:
+#   $1 - local path to save folder with files
+#   $2 - s3 key full path prefix
+#   $3 - s3 file name prefix w/o directory to filter files by name (optional)
+#   $4 - recursive?
+# Returns:
+#   None
+###################################
+function s3_get_by_full_path_and_filename_prefix() {
+  local args=
+  if [[ $3 ]]; then
+    args=" --exclude '*' --include '*/${3}[!/]*'"
+  fi
+  if [[ "$4" == true ]]; then
+    args="$args --recursive"
+  fi
+  aws_cli s3 cp "s3://$IT_CASE_S3_BUCKET/$2" "/hostdir/${1#$TEST_INFRA_DIR}"$args
+}
+
+###################################
+# Deletes s3 objects by full path prefix.
+#
+# Globals:
+#   IT_CASE_S3_BUCKET
+# Arguments:
+#   $1 - s3 key full path prefix
+# Returns:
+#   None
+###################################
+function s3_delete_by_full_path_prefix() {
+  aws_cli s3 rm "s3://$IT_CASE_S3_BUCKET/$1" --recursive
+}
+
+###################################
+# Counts the number of lines in the s3 objects filtered by prefix.
+# The lines have to be simple to comply with the CSV format
+# because SQL is used to query the s3 objects.
+#
+# Globals:
+#   IT_CASE_S3_BUCKET
+# Arguments:
+#   $1 - s3 key prefix
+#   $2 - s3 file name prefix w/o directory to filter files by name (optional)
+# Returns:
+#   the total number of lines across the matching part files
+###################################
+function s3_get_number_of_lines_by_prefix() {
+  local file_prefix="${2-}"
+
+  # find all files that have the given prefix
+  parts=$(aws_cli s3api list-objects --bucket "$IT_CASE_S3_BUCKET" --prefix "$1" |
+    docker run -i stedolan/jq -r '[.Contents[].Key] | join(" ")')
+
+  # in parallel (N tasks at a time), query the number of lines and store the result in a file named "lines"
+  N=10
+  echo "0" >lines
+  # turn off job control, so that there is no noise when starting/finishing bg tasks
+  old_state=$(set +o)
+  set +m
+  for part in $parts; do
+    if [[ $(basename "${part}") == $file_prefix* ]]; then
+      ((i = i % N))
+      ((i++ == 0)) && wait
 
 Review comment:
   Yes, exactly. It's the easiest way to fire off stuff in parallel; a FIFO queue with leases would be more complex to implement.
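   For reference, a minimal standalone sketch of that batching idiom (not part of the PR; N, the dummy part names, and the sleep placeholder are purely illustrative): every N-th iteration waits for the previous batch of background jobs, so at most N run concurrently.

       #!/usr/bin/env bash
       # Sketch of the "i % N" batching idiom used in the loop above.
       # At most N background jobs run at a time; every N-th iteration
       # waits for the whole previous batch before starting the next one.
       N=4
       i=0
       for part in part-00 part-01 part-02 part-03 part-04 part-05; do  # hypothetical inputs
         ((i = i % N))
         ((i++ == 0)) && wait   # start of a new batch: wait for the previous one to finish
         sleep 1 &              # stand-in for the real per-object query
       done
       wait                     # wait for the final, possibly partial, batch

   The trade-off is that each batch only finishes as fast as its slowest job; a FIFO queue with leases would keep all N slots busy, at the cost of extra bookkeeping.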

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services
