This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 30ade84  ARROW-2071: [Python] Fix test slowness on Travis-CI
30ade84 is described below

commit 30ade84465f0c3b6d21833813bcdc0d6cf352cd8
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Feb 5 16:48:52 2018 -0500

    ARROW-2071: [Python] Fix test slowness on Travis-CI
    
    The pytorch build from the "soumith" channel can be buggy and fail 
to import with a symbol error.  In that case, avoid attempting "import torch" 
repeatedly, as doing so spends CPU time in the dynamic loader.
    
    Also lighten the plasma tests when run with Valgrind.
    
    The main offending job is down to 29 minutes, with 2 minutes being taken by 
the Python tests.
    
    Author: Antoine Pitrou <[email protected]>
    
    Closes #1561 from pitrou/ARROW-2071-fix-travis-ci-tests-slowness and 
squashes the following commits:
    
    da67a053 [Antoine Pitrou] ARROW-2071: [Python] Fix test slowness on 
Travis-CI
---
 ci/travis_before_script_cpp.sh             |  5 ++-
 ci/travis_build_parquet_cpp.sh             |  2 +-
 ci/travis_install_clang_tools.sh           |  4 +--
 ci/travis_install_conda.sh                 |  2 +-
 ci/travis_lint.sh                          |  2 +-
 ci/travis_script_python.sh                 |  9 +++---
 python/pyarrow/tests/test_plasma.py        | 49 +++++++++++++++++-------------
 python/pyarrow/tests/test_serialization.py | 19 +++++++-----
 8 files changed, 51 insertions(+), 41 deletions(-)

diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh
index 7c1d726..7725c56 100755
--- a/ci/travis_before_script_cpp.sh
+++ b/ci/travis_before_script_cpp.sh
@@ -52,11 +52,10 @@ if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then
 
   # HACK(wesm): We started experiencing OpenSSL failures when Miniconda was
   # updated sometime on October 2 or October 3
-  conda update -y -p $CPP_TOOLCHAIN ca-certificates -c defaults
+  conda update -y -q -p $CPP_TOOLCHAIN ca-certificates -c defaults
 fi
 
-
-mkdir $ARROW_CPP_BUILD_DIR
+mkdir -p $ARROW_CPP_BUILD_DIR
 pushd $ARROW_CPP_BUILD_DIR
 
 CMAKE_COMMON_FLAGS="\
diff --git a/ci/travis_build_parquet_cpp.sh b/ci/travis_build_parquet_cpp.sh
index 4330a31..4b6370e 100755
--- a/ci/travis_build_parquet_cpp.sh
+++ b/ci/travis_build_parquet_cpp.sh
@@ -28,7 +28,7 @@ export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
 PARQUET_DIR=$TRAVIS_BUILD_DIR/parquet
 mkdir -p $PARQUET_DIR
 
-git clone https://github.com/apache/parquet-cpp.git $PARQUET_DIR
+git clone -q https://github.com/apache/parquet-cpp.git $PARQUET_DIR
 
 pushd $PARQUET_DIR
 mkdir build-dir
diff --git a/ci/travis_install_clang_tools.sh b/ci/travis_install_clang_tools.sh
index bad1e73..d0108ad 100755
--- a/ci/travis_install_clang_tools.sh
+++ b/ci/travis_install_clang_tools.sh
@@ -20,5 +20,5 @@
 wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key|sudo apt-key add -
 sudo apt-add-repository -y \
      "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main"
-sudo apt-get update
-sudo apt-get install clang-4.0 clang-format-4.0 clang-tidy-4.0
+sudo apt-get update -q
+sudo apt-get install -q clang-4.0 clang-format-4.0 clang-tidy-4.0
diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh
old mode 100644
new mode 100755
index 3faa548..caec9bb
--- a/ci/travis_install_conda.sh
+++ b/ci/travis_install_conda.sh
@@ -25,7 +25,7 @@ else
   
MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh";
 fi
 
-wget -O miniconda.sh $MINICONDA_URL
+wget --no-verbose -O miniconda.sh $MINICONDA_URL
 
 source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh
 mkdir -p $CONDA_PKGS_DIRS
diff --git a/ci/travis_lint.sh b/ci/travis_lint.sh
index 6a2a0be..096170a 100755
--- a/ci/travis_lint.sh
+++ b/ci/travis_lint.sh
@@ -33,7 +33,7 @@ fi
 popd
 
 # Fail fast on style checks
-sudo pip install flake8
+sudo pip install -q flake8
 
 PYTHON_DIR=$TRAVIS_BUILD_DIR/python
 
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index 7c896df..4187e75 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -53,6 +53,7 @@ if [ "$PYTHON_VERSION" != "2.7" ] || [ $TRAVIS_OS_NAME != 
"osx" ]; then
 fi
 
 # Build C++ libraries
+mkdir -p $ARROW_CPP_BUILD_DIR
 pushd $ARROW_CPP_BUILD_DIR
 
 # Clear out prior build files
@@ -77,21 +78,21 @@ popd
 pushd $ARROW_PYTHON_DIR
 
 if [ "$PYTHON_VERSION" == "2.7" ]; then
-  pip install futures
+  pip install -q futures
 fi
 
 export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE
 
-pip install -r requirements.txt
+pip install -q -r requirements.txt
 python setup.py build_ext --with-parquet --with-plasma --with-orc\
-       install --single-version-externally-managed --record=record.text
+       install -q --single-version-externally-managed --record=record.text
 popd
 
 python -c "import pyarrow.parquet"
 python -c "import pyarrow.plasma"
 python -c "import pyarrow.orc"
 
-if [ $TRAVIS_OS_NAME == "linux" ]; then
+if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then
   export PLASMA_VALGRIND=1
 fi
 
diff --git a/python/pyarrow/tests/test_plasma.py 
b/python/pyarrow/tests/test_plasma.py
index 9ea6476..27556e6 100644
--- a/python/pyarrow/tests/test_plasma.py
+++ b/python/pyarrow/tests/test_plasma.py
@@ -30,7 +30,9 @@ import time
 import pyarrow as pa
 import pandas as pd
 
-DEFAULT_PLASMA_STORE_MEMORY = 10 ** 9
+
+DEFAULT_PLASMA_STORE_MEMORY = 10 ** 8
+USE_VALGRIND = os.getenv("PLASMA_VALGRIND") == "1"
 
 
 def random_name():
@@ -71,7 +73,7 @@ def create_object_with_id(client, object_id, data_size, 
metadata_size,
     return memory_buffer, metadata
 
 
-def create_object(client, data_size, metadata_size, seal=True):
+def create_object(client, data_size, metadata_size=0, seal=True):
     object_id = random_object_id()
     memory_buffer, metadata = create_object_with_id(client, object_id,
                                                     data_size, metadata_size,
@@ -158,7 +160,7 @@ class TestPlasmaClient(object):
         import pyarrow.plasma as plasma
         # Start Plasma store.
         plasma_store_name, self.p = start_plasma_store(
-            use_valgrind=os.getenv("PLASMA_VALGRIND") == "1",
+            use_valgrind=USE_VALGRIND,
             use_one_memory_mapped_file=use_one_memory_mapped_file)
         # Connect to Plasma.
         self.plasma_client = plasma.connect(plasma_store_name, "", 64)
@@ -202,7 +204,7 @@ class TestPlasmaClient(object):
             assert memory_buffer[i] == i % 256
 
     def test_create_with_metadata(self):
-        for length in range(1000):
+        for length in range(0, 1000, 3):
             # Create an object id string.
             object_id = random_object_id()
             # Create a random metadata string.
@@ -246,7 +248,7 @@ class TestPlasmaClient(object):
                 assert False
 
     def test_get(self):
-        num_object_ids = 100
+        num_object_ids = 60
         # Test timing out of get with various timeouts.
         for timeout in [0, 10, 100, 1000]:
             object_ids = [random_object_id() for _ in range(num_object_ids)]
@@ -390,29 +392,30 @@ class TestPlasmaClient(object):
                 # For some reason the above didn't throw an exception, so fail.
                 assert False
 
+        PERCENT = DEFAULT_PLASMA_STORE_MEMORY // 100
+
         # Create a list to keep some of the buffers in scope.
         memory_buffers = []
-        _, memory_buffer, _ = create_object(self.plasma_client, 5 * 10 ** 8, 0)
+        _, memory_buffer, _ = create_object(self.plasma_client, 50 * PERCENT)
         memory_buffers.append(memory_buffer)
-        # Remaining space is 5 * 10 ** 8. Make sure that we can't create an
-        # object of size 5 * 10 ** 8 + 1, but we can create one of size
-        # 2 * 10 ** 8.
-        assert_create_raises_plasma_full(self, 5 * 10 ** 8 + 1)
-        _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0)
+        # Remaining space is 50%. Make sure that we can't create an
+        # object of size 50% + 1, but we can create one of size 20%.
+        assert_create_raises_plasma_full(self, 50 * PERCENT + 1)
+        _, memory_buffer, _ = create_object(self.plasma_client, 20 * PERCENT)
         del memory_buffer
-        _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0)
+        _, memory_buffer, _ = create_object(self.plasma_client, 20 * PERCENT)
         del memory_buffer
-        assert_create_raises_plasma_full(self, 5 * 10 ** 8 + 1)
+        assert_create_raises_plasma_full(self, 50 * PERCENT + 1)
 
-        _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0)
+        _, memory_buffer, _ = create_object(self.plasma_client, 20 * PERCENT)
         memory_buffers.append(memory_buffer)
-        # Remaining space is 3 * 10 ** 8.
-        assert_create_raises_plasma_full(self, 3 * 10 ** 8 + 1)
+        # Remaining space is 30%.
+        assert_create_raises_plasma_full(self, 30 * PERCENT + 1)
 
-        _, memory_buffer, _ = create_object(self.plasma_client, 10 ** 8, 0)
+        _, memory_buffer, _ = create_object(self.plasma_client, 10 * PERCENT)
         memory_buffers.append(memory_buffer)
-        # Remaining space is 2 * 10 ** 8.
-        assert_create_raises_plasma_full(self, 2 * 10 ** 8 + 1)
+        # Remaining space is 20%.
+        assert_create_raises_plasma_full(self, 20 * PERCENT + 1)
 
     def test_contains(self):
         fake_object_ids = [random_object_id() for _ in range(100)]
@@ -645,10 +648,14 @@ class TestPlasmaClient(object):
         del b7
         assert client.evict(2000) == 996 + 995 + 994
 
+    # Mitigate valgrind-induced slowness
+    SUBSCRIBE_TEST_SIZES = ([1, 10, 100, 1000] if USE_VALGRIND
+                            else [1, 10, 100, 1000, 10000])
+
     def test_subscribe(self):
         # Subscribe to notifications from the Plasma Store.
         self.plasma_client.subscribe()
-        for i in [1, 10, 100, 1000, 10000]:
+        for i in self.SUBSCRIBE_TEST_SIZES:
             object_ids = [random_object_id() for _ in range(i)]
             metadata_sizes = [np.random.randint(1000) for _ in range(i)]
             data_sizes = [np.random.randint(1000) for _ in range(i)]
@@ -670,7 +677,7 @@ class TestPlasmaClient(object):
         # plasma_client2 to make sure that all used objects will get evicted
         # properly.
         self.plasma_client2.subscribe()
-        for i in [1, 10, 100, 1000, 10000]:
+        for i in self.SUBSCRIBE_TEST_SIZES:
             object_ids = [random_object_id() for _ in range(i)]
             # Add 1 to the sizes to make sure we have nonzero object sizes.
             metadata_sizes = [np.random.randint(1000) + 1 for _ in range(i)]
diff --git a/python/pyarrow/tests/test_serialization.py 
b/python/pyarrow/tests/test_serialization.py
index 3b1324b..20c195a 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -28,15 +28,19 @@ import sys
 import pyarrow as pa
 import numpy as np
 
+try:
+    import torch
+except ImportError:
+    torch = None
+    # Blacklist the module in case `import torch` is costly before
+    # failing (ARROW-2071)
+    sys.modules['torch'] = None
+
 
 def assert_equal(obj1, obj2):
-    try:
-        import torch
-        if torch.is_tensor(obj1) and torch.is_tensor(obj2):
-            assert torch.equal(obj1, obj2)
-            return
-    except ImportError:
-        pass
+    if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2):
+        assert torch.equal(obj1, obj2)
+        return
     module_numpy = (type(obj1).__module__ == np.__name__ or
                     type(obj2).__module__ == np.__name__)
     if module_numpy:
@@ -346,7 +350,6 @@ def test_datetime_serialization(large_buffer):
 
 def test_torch_serialization(large_buffer):
     pytest.importorskip("torch")
-    import torch
 
     serialization_context = pa.default_serialization_context()
     pa.register_torch_serialization_handlers(serialization_context)

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to