[ https://issues.apache.org/jira/browse/ARROW-2071?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16352973#comment-16352973 ]
ASF GitHub Bot commented on ARROW-2071: --------------------------------------- wesm closed pull request #1561: ARROW-2071: [Python] Fix test slowness on Travis-CI URL: https://github.com/apache/arrow/pull/1561 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 7c1d726d4..7725c560c 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -52,11 +52,10 @@ if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then # HACK(wesm): We started experiencing OpenSSL failures when Miniconda was # updated sometime on October 2 or October 3 - conda update -y -p $CPP_TOOLCHAIN ca-certificates -c defaults + conda update -y -q -p $CPP_TOOLCHAIN ca-certificates -c defaults fi - -mkdir $ARROW_CPP_BUILD_DIR +mkdir -p $ARROW_CPP_BUILD_DIR pushd $ARROW_CPP_BUILD_DIR CMAKE_COMMON_FLAGS="\ diff --git a/ci/travis_build_parquet_cpp.sh b/ci/travis_build_parquet_cpp.sh index 4330a31a0..4b6370ea7 100755 --- a/ci/travis_build_parquet_cpp.sh +++ b/ci/travis_build_parquet_cpp.sh @@ -28,7 +28,7 @@ export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN PARQUET_DIR=$TRAVIS_BUILD_DIR/parquet mkdir -p $PARQUET_DIR -git clone https://github.com/apache/parquet-cpp.git $PARQUET_DIR +git clone -q https://github.com/apache/parquet-cpp.git $PARQUET_DIR pushd $PARQUET_DIR mkdir build-dir diff --git a/ci/travis_install_clang_tools.sh b/ci/travis_install_clang_tools.sh index bad1e73d2..d0108ad37 100755 --- a/ci/travis_install_clang_tools.sh +++ b/ci/travis_install_clang_tools.sh @@ -20,5 +20,5 @@ wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key|sudo apt-key add - sudo apt-add-repository -y \ "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main" -sudo apt-get update -sudo apt-get install 
clang-4.0 clang-format-4.0 clang-tidy-4.0 +sudo apt-get update -q +sudo apt-get install -q clang-4.0 clang-format-4.0 clang-tidy-4.0 diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh old mode 100644 new mode 100755 index 3faa54881..caec9bb33 --- a/ci/travis_install_conda.sh +++ b/ci/travis_install_conda.sh @@ -25,7 +25,7 @@ else MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh" fi -wget -O miniconda.sh $MINICONDA_URL +wget --no-verbose -O miniconda.sh $MINICONDA_URL source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh mkdir -p $CONDA_PKGS_DIRS diff --git a/ci/travis_lint.sh b/ci/travis_lint.sh index 6a2a0be18..096170a9e 100755 --- a/ci/travis_lint.sh +++ b/ci/travis_lint.sh @@ -33,7 +33,7 @@ fi popd # Fail fast on style checks -sudo pip install flake8 +sudo pip install -q flake8 PYTHON_DIR=$TRAVIS_BUILD_DIR/python diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 7c896df9c..4187e7541 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -53,6 +53,7 @@ if [ "$PYTHON_VERSION" != "2.7" ] || [ $TRAVIS_OS_NAME != "osx" ]; then fi # Build C++ libraries +mkdir -p $ARROW_CPP_BUILD_DIR pushd $ARROW_CPP_BUILD_DIR # Clear out prior build files @@ -77,21 +78,21 @@ popd pushd $ARROW_PYTHON_DIR if [ "$PYTHON_VERSION" == "2.7" ]; then - pip install futures + pip install -q futures fi export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE -pip install -r requirements.txt +pip install -q -r requirements.txt python setup.py build_ext --with-parquet --with-plasma --with-orc\ - install --single-version-externally-managed --record=record.text + install -q --single-version-externally-managed --record=record.text popd python -c "import pyarrow.parquet" python -c "import pyarrow.plasma" python -c "import pyarrow.orc" -if [ $TRAVIS_OS_NAME == "linux" ]; then +if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then export PLASMA_VALGRIND=1 fi diff --git a/python/pyarrow/tests/test_plasma.py 
b/python/pyarrow/tests/test_plasma.py index 9ea647667..27556e60d 100644 --- a/python/pyarrow/tests/test_plasma.py +++ b/python/pyarrow/tests/test_plasma.py @@ -30,7 +30,9 @@ import pyarrow as pa import pandas as pd -DEFAULT_PLASMA_STORE_MEMORY = 10 ** 9 + +DEFAULT_PLASMA_STORE_MEMORY = 10 ** 8 +USE_VALGRIND = os.getenv("PLASMA_VALGRIND") == "1" def random_name(): @@ -71,7 +73,7 @@ def create_object_with_id(client, object_id, data_size, metadata_size, return memory_buffer, metadata -def create_object(client, data_size, metadata_size, seal=True): +def create_object(client, data_size, metadata_size=0, seal=True): object_id = random_object_id() memory_buffer, metadata = create_object_with_id(client, object_id, data_size, metadata_size, @@ -158,7 +160,7 @@ def setup_method(self, test_method): import pyarrow.plasma as plasma # Start Plasma store. plasma_store_name, self.p = start_plasma_store( - use_valgrind=os.getenv("PLASMA_VALGRIND") == "1", + use_valgrind=USE_VALGRIND, use_one_memory_mapped_file=use_one_memory_mapped_file) # Connect to Plasma. self.plasma_client = plasma.connect(plasma_store_name, "", 64) @@ -202,7 +204,7 @@ def test_create(self): assert memory_buffer[i] == i % 256 def test_create_with_metadata(self): - for length in range(1000): + for length in range(0, 1000, 3): # Create an object id string. object_id = random_object_id() # Create a random metadata string. @@ -246,7 +248,7 @@ def test_create_existing(self): assert False def test_get(self): - num_object_ids = 100 + num_object_ids = 60 # Test timing out of get with various timeouts. for timeout in [0, 10, 100, 1000]: object_ids = [random_object_id() for _ in range(num_object_ids)] @@ -390,29 +392,30 @@ def assert_create_raises_plasma_full(unit_test, size): # For some reason the above didn't throw an exception, so fail. assert False + PERCENT = DEFAULT_PLASMA_STORE_MEMORY // 100 + # Create a list to keep some of the buffers in scope. 
memory_buffers = [] - _, memory_buffer, _ = create_object(self.plasma_client, 5 * 10 ** 8, 0) + _, memory_buffer, _ = create_object(self.plasma_client, 50 * PERCENT) memory_buffers.append(memory_buffer) - # Remaining space is 5 * 10 ** 8. Make sure that we can't create an - # object of size 5 * 10 ** 8 + 1, but we can create one of size - # 2 * 10 ** 8. - assert_create_raises_plasma_full(self, 5 * 10 ** 8 + 1) - _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0) + # Remaining space is 50%. Make sure that we can't create an + # object of size 50% + 1, but we can create one of size 20%. + assert_create_raises_plasma_full(self, 50 * PERCENT + 1) + _, memory_buffer, _ = create_object(self.plasma_client, 20 * PERCENT) del memory_buffer - _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0) + _, memory_buffer, _ = create_object(self.plasma_client, 20 * PERCENT) del memory_buffer - assert_create_raises_plasma_full(self, 5 * 10 ** 8 + 1) + assert_create_raises_plasma_full(self, 50 * PERCENT + 1) - _, memory_buffer, _ = create_object(self.plasma_client, 2 * 10 ** 8, 0) + _, memory_buffer, _ = create_object(self.plasma_client, 20 * PERCENT) memory_buffers.append(memory_buffer) - # Remaining space is 3 * 10 ** 8. - assert_create_raises_plasma_full(self, 3 * 10 ** 8 + 1) + # Remaining space is 30%. + assert_create_raises_plasma_full(self, 30 * PERCENT + 1) - _, memory_buffer, _ = create_object(self.plasma_client, 10 ** 8, 0) + _, memory_buffer, _ = create_object(self.plasma_client, 10 * PERCENT) memory_buffers.append(memory_buffer) - # Remaining space is 2 * 10 ** 8. - assert_create_raises_plasma_full(self, 2 * 10 ** 8 + 1) + # Remaining space is 20%. 
+ assert_create_raises_plasma_full(self, 20 * PERCENT + 1) def test_contains(self): fake_object_ids = [random_object_id() for _ in range(100)] @@ -645,10 +648,14 @@ def test_evict(self): del b7 assert client.evict(2000) == 996 + 995 + 994 + # Mitigate valgrind-induced slowness + SUBSCRIBE_TEST_SIZES = ([1, 10, 100, 1000] if USE_VALGRIND + else [1, 10, 100, 1000, 10000]) + def test_subscribe(self): # Subscribe to notifications from the Plasma Store. self.plasma_client.subscribe() - for i in [1, 10, 100, 1000, 10000]: + for i in self.SUBSCRIBE_TEST_SIZES: object_ids = [random_object_id() for _ in range(i)] metadata_sizes = [np.random.randint(1000) for _ in range(i)] data_sizes = [np.random.randint(1000) for _ in range(i)] @@ -670,7 +677,7 @@ def test_subscribe_deletions(self): # plasma_client2 to make sure that all used objects will get evicted # properly. self.plasma_client2.subscribe() - for i in [1, 10, 100, 1000, 10000]: + for i in self.SUBSCRIBE_TEST_SIZES: object_ids = [random_object_id() for _ in range(i)] # Add 1 to the sizes to make sure we have nonzero object sizes. 
metadata_sizes = [np.random.randint(1000) + 1 for _ in range(i)] diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 3b1324beb..20c195a4b 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -28,15 +28,19 @@ import pyarrow as pa import numpy as np +try: + import torch +except ImportError: + torch = None + # Blacklist the module in case `import torch` is costly before + # failing (ARROW-2071) + sys.modules['torch'] = None + def assert_equal(obj1, obj2): - try: - import torch - if torch.is_tensor(obj1) and torch.is_tensor(obj2): - assert torch.equal(obj1, obj2) - return - except ImportError: - pass + if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2): + assert torch.equal(obj1, obj2) + return module_numpy = (type(obj1).__module__ == np.__name__ or type(obj2).__module__ == np.__name__) if module_numpy: @@ -346,7 +350,6 @@ def test_datetime_serialization(large_buffer): def test_torch_serialization(large_buffer): pytest.importorskip("torch") - import torch serialization_context = pa.default_serialization_context() pa.register_torch_serialization_handlers(serialization_context) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Reduce runtime of builds in Travis CI > ---------------------------------------------- > > Key: ARROW-2071 > URL: https://issues.apache.org/jira/browse/ARROW-2071 > Project: Apache Arrow > Issue Type: Improvement > Components: Python > Reporter: Wes McKinney > Assignee: Antoine Pitrou > Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > For some reason, recently each Python build has been taking about 15 minutes > to run. 
I speculate this is due to VM thrashing caused by reduced resources on the Travis CI workers, related to the problem I fixed in ARROW-2062.
> We should experiment, but it seems like perhaps this can be fixed either by:
> * Reducing the size of the Plasma store on Travis CI
> * Disabling valgrind in Plasma tests
> The slowness could be caused by something else, though, so we should investigate (and have pytest report slow tests in the logs)
--
This message was sent by Atlassian JIRA (v7.6.3#76005)