This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 11e66523d6070957f84c1fdbba3e26ecf3888d74 Author: Joe McDonnell <[email protected]> AuthorDate: Thu Sep 29 14:36:34 2022 -0700 IMPALA-11526: Install en_US.UTF-8 locale into docker images In IMPALA-11492, ExprTest.Utf8MaskTest was failing on some configurations because the en_US.UTF-8 locale was missing. Since the Docker images don't contain en_US.UTF-8, they are subject to the same bug. This was confirmed by adding test cases to the test_utf8_strings.py end-to-end test and running it in the dockerized tests. This adds the appropriate language pack to the list of packages installed for the Docker build. Testing: - This adds end-to-end tests to test_utf8_strings.py covering the same cases that were failing in ExprTest.Utf8MaskTest. They failed without the added language packs, and now succeed. Change-Id: I353f257b3cb6d45f7d0a28f7d5319fdb457e6e3d Reviewed-on: http://gerrit.cloudera.org:8080/19080 Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Laszlo Gaal <[email protected]> --- bin/bootstrap_system.sh | 2 +- docker/daemon_entrypoint.sh | 21 +++++- docker/install_os_packages.sh | 17 +++++ .../queries/QueryTest/utf8-string-functions.test | 80 ++++++++++++++++++++++ 4 files changed, 118 insertions(+), 2 deletions(-) diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh index d637106cb..45c61c34e 100755 --- a/bin/bootstrap_system.sh +++ b/bin/bootstrap_system.sh @@ -268,7 +268,7 @@ redhat sudo yum install -y curl gawk gcc gcc-c++ git krb5-devel krb5-server \ wget vim-common nscd cmake fuse-devel zlib-devel \ psmisc lsof openssh-server redhat-lsb java-1.8.0-openjdk-devel \ java-1.8.0-openjdk-src python3-devel python3-setuptools net-tools \ - langpacks-en + langpacks-en glibc-langpack-en # Enable the Powertools repo for snappy-devel on RedHat 8 redhat8 sudo yum install -y dnf-plugins-core diff --git a/docker/daemon_entrypoint.sh b/docker/daemon_entrypoint.sh index a62cc81f5..08deadcab 100755 --- a/docker/daemon_entrypoint.sh +++ 
b/docker/daemon_entrypoint.sh @@ -176,7 +176,26 @@ fi # Set ulimit core file size 0. ulimit -c 0 +# The UTF-8 masking functions rely on the presence of en_US.utf8. Make sure +# it is present. +if locale -a | grep en_US.utf8 ; then + echo "en_US.utf8 is present" +else + echo "ERROR: en_US.utf8 locale is not present." + exit 1 +fi + # Set a UTF-8 locale to enable upper/lower/initcap functions with UTF-8 mode. -export LC_ALL=C.UTF-8 +# Use C.UTF-8 (aka C.utf8) if it is available, and fall back to en_US.utf8 if not +# +# Distributions can show either C.UTF-8 or C.utf8 in "locale -a", match either one +if locale -a | grep -e "^C.UTF-8" -e "^C.utf8" ; then + # C.UTF-8 and C.utf8 are interchangeable as a setting for LC_ALL. + export LC_ALL=C.UTF-8 +else + # Presence of en_US.utf8 was verified above + export LC_ALL=en_US.utf8 +fi +echo "LC_ALL: ${LC_ALL}" exec "$@" diff --git a/docker/install_os_packages.sh b/docker/install_os_packages.sh index e05fdaa68..f11fe94b4 100755 --- a/docker/install_os_packages.sh +++ b/docker/install_os_packages.sh @@ -92,6 +92,7 @@ if [[ $DISTRIBUTION == Ubuntu ]]; then fi apt-get install -y \ krb5-user \ + language-pack-en \ libsasl2-2 \ libsasl2-modules \ libsasl2-modules-gssapi-mit \ @@ -122,6 +123,16 @@ elif [[ $DISTRIBUTION == Redhat ]]; then krb5-workstation \ openldap-devel \ tzdata + + # UTF-8 masking functions require the presence of en_US.utf8. + # Install the appropriate language packs. Redhat/Centos 7 come + # with en_US.utf8, so there is no need to install anything. + if ! grep 'release 7\.' /etc/redhat-release; then + yum install -y --disableplugin=subscription-manager \ + glibc-langpack-en \ + langpacks-en + fi + if $INSTALL_DEBUG_TOOLS ; then echo "Installing extra debug tools" yum install -y --disableplugin=subscription-manager \ @@ -137,6 +148,12 @@ elif [[ $DISTRIBUTION == Redhat ]]; then fi fi +# Verify en_US.utf8 is present +if ! locale -a | grep en_US.utf8 ; then + echo "ERROR: en_US.utf8 locale is not present." 
+ exit 1 +fi + # To minimize the size for the Docker image, clean up any unnecessary files. if [[ $DISTRIBUTION == Ubuntu ]]; then apt-get clean diff --git a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test index 8d607c95e..9417e5ad4 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test +++ b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test @@ -181,6 +181,86 @@ select mask('SQL引擎', 'x', 'x', 'x', 'x'), STRING,STRING,STRING,STRING,STRING ==== ---- QUERY +set utf8_mode=true; +select mask('abcd áäèü ABCD ÁÄÈÜ'); +---- RESULTS: RAW_STRING +'xxxx xxxx XXXX XXXX' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask('Ich möchte ein Bier. Tschüss'); +---- RESULTS: RAW_STRING +'Xxx xxxxxx xxx Xxxx. Xxxxxxx' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask('Hungarian áéíöóőüúű ÁÉÍÖÓŐÜÚŰ'); +---- RESULTS: RAW_STRING +'Xxxxxxxxx xxxxxxxxx XXXXXXXXX' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask('German äöüß ÄÖÜẞ'); +---- RESULTS: RAW_STRING +'Xxxxxx xxxx XXXX' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask('French àâæçéèêëïîôœùûüÿ ÀÂÆÇÉÈÊËÏÎÔŒÙÛÜŸ'); +---- RESULTS: RAW_STRING +'Xxxxxx xxxxxxxxxxxxxxxx XXXXXXXXXXXXXXXX' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask('Greek αβξδ άέήώ ΑΒΞΔ ΆΈΉΏ 1234'); +---- RESULTS: RAW_STRING +'Xxxxx xxxx xxxx XXXX XXXX nnnn' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask_first_n('áéíöóőüúű'); +---- RESULTS: RAW_STRING +'xxxxóőüúű' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask_show_first_n('áéíöóőüúű'); +---- RESULTS: RAW_STRING +'áéíöxxxxx' +---- TYPES +STRING +==== +---- QUERY +set utf8_mode=true; +select mask_last_n('áéíöóőüúű'); +---- RESULTS: RAW_STRING +'áéíöóxxxx' +---- TYPES +STRING 
+==== +---- QUERY +set utf8_mode=true; +select mask_show_last_n('áéíöóőüúű') +---- RESULTS: RAW_STRING +'xxxxxőüúű' +---- TYPES +STRING +==== +---- QUERY set utf8_mode=false; select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ'); ---- RESULTS: RAW_STRING
