This is an automated email from the ASF dual-hosted git repository. altay pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push: new a84c5b0 Restrict pyvcf dependency to Python 2 and link Jira issues tracking Py3 limitations. (#7773) a84c5b0 is described below commit a84c5b08179117005c11ed0559057a6ccff00f8e Author: tvalentyn <tvalen...@users.noreply.github.com> AuthorDate: Fri Feb 8 14:08:29 2019 -0800 Restrict pyvcf dependency to Python 2 and link Jira issues tracking Py3 limitations. (#7773) * Restrict pyvcf dependency to Python 2 and link Jira issues tracking Py3 limitations. * Skip all VCF IO tests so that we don't need to install pyvcf on Python 3. * Don't import VCF on Python 3. --- sdks/python/apache_beam/io/vcfio.py | 10 ++- sdks/python/apache_beam/io/vcfio_test.py | 91 +---------------------- sdks/python/container/base_image_requirements.txt | 4 +- sdks/python/setup.py | 6 +- 4 files changed, 17 insertions(+), 94 deletions(-) diff --git a/sdks/python/apache_beam/io/vcfio.py b/sdks/python/apache_beam/io/vcfio.py index 59e470f..0ce76bd 100644 --- a/sdks/python/apache_beam/io/vcfio.py +++ b/sdks/python/apache_beam/io/vcfio.py @@ -23,7 +23,9 @@ The 4.2 spec is available at https://samtools.github.io/hts-specs/VCFv4.2.pdf. 
from __future__ import absolute_import import logging +import sys import traceback +import warnings from builtins import next from builtins import object from collections import namedtuple @@ -32,8 +34,6 @@ from future.utils import iteritems from past.builtins import long from past.builtins import unicode -import vcf - from apache_beam.coders import coders from apache_beam.io import filebasedsource from apache_beam.io.filesystem import CompressionTypes @@ -41,6 +41,12 @@ from apache_beam.io.iobase import Read from apache_beam.io.textio import _TextSource as TextSource from apache_beam.transforms import PTransform +if sys.version_info[0] < 3: + import vcf +else: + warnings.warn("VCF IO will support Python 3 after migration to Nucleus, " + "see: BEAM-5628.") + __all__ = ['ReadFromVcf', 'Variant', 'VariantCall', 'VariantInfo', 'MalformedVcfRecord'] diff --git a/sdks/python/apache_beam/io/vcfio_test.py b/sdks/python/apache_beam/io/vcfio_test.py index b3f912b..9a4b793 100644 --- a/sdks/python/apache_beam/io/vcfio_test.py +++ b/sdks/python/apache_beam/io/vcfio_test.py @@ -93,6 +93,9 @@ def _count_equals_to(expected_count): return _count_equal +@unittest.skipIf(sys.version_info[0] == 3, + 'VCF io will be ported to Python 3 after switch to Nucleus. 
' + 'See BEAM-5628') class VcfSourceTest(unittest.TestCase): # Distribution should skip tests that need VCF files due to large size @@ -229,10 +232,6 @@ class VcfSourceTest(unittest.TestCase): return (malformed_vcf_records, malformed_header_lines) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_sort_variants(self): sorted_variants = [ Variant(reference_name='a', start=20, end=22), @@ -244,10 +243,6 @@ class VcfSourceTest(unittest.TestCase): for permutation in permutations(sorted_variants): self.assertEqual(sorted(permutation), sorted_variants) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_variant_equality(self): base_variant = Variant(reference_name='a', start=20, end=22, reference_bases='a', alternate_bases=['g', 't'], @@ -274,10 +269,6 @@ class VcfSourceTest(unittest.TestCase): self.assertNotEqual(base_variant, missing_field) @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing') - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_read_single_file_large(self): test_data_conifgs = [ {'file': 'valid-4.0.vcf', 'num_records': 5}, @@ -292,10 +283,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(config['num_records'], len(read_data)) @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing') - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_read_file_pattern_large(self): read_data = self._read_records( os.path.join(get_full_dir(), 'valid-*.vcf')) @@ -304,10 +291,6 @@ class 
VcfSourceTest(unittest.TestCase): os.path.join(get_full_dir(), 'valid-*.vcf.gz')) self.assertEqual(9900, len(read_data_gz)) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_single_file_no_records(self): self.assertEqual( [], self._create_temp_file_and_read_records([''])) @@ -316,10 +299,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual( [], self._create_temp_file_and_read_records(_SAMPLE_HEADER_LINES)) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_single_file_verify_details(self): variant_1, vcf_line_1 = self._get_sample_variant_1() read_data = self._create_temp_file_and_read_records( @@ -333,10 +312,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(3, len(read_data)) self._assert_variants_equal([variant_1, variant_2, variant_3], read_data) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_file_pattern_verify_details(self): variant_1, vcf_line_1 = self._get_sample_variant_1() variant_2, vcf_line_2 = self._get_sample_variant_2() @@ -351,10 +326,6 @@ class VcfSourceTest(unittest.TestCase): self._assert_variants_equal([variant_1, variant_2, variant_3], read_data) @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing') - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_read_after_splitting(self): file_name = get_full_file_path('valid-4.1-large.vcf') source = VcfSource(file_name) @@ -369,10 +340,6 @@ class VcfSourceTest(unittest.TestCase): 
split_records.extend(source_test_utils.read_from_source(*source_info)) self.assertEqual(9882, len(split_records)) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_invalid_file(self): invalid_file_contents = self._get_invalid_file_contents() for content in chain(*invalid_file_contents): @@ -384,10 +351,6 @@ class VcfSourceTest(unittest.TestCase): self._create_temp_vcf_file(content, tempdir) self._read_records(os.path.join(tempdir.get_path(), '*.vcf')) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_allow_malformed_records(self): invalid_records, invalid_headers = self._get_invalid_file_contents() @@ -406,10 +369,6 @@ class VcfSourceTest(unittest.TestCase): self._read_records(self._create_temp_vcf_file(content, tempdir), allow_malformed_records=True) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_no_samples(self): header_line = '#CHROM POS ID REF ALT QUAL FILTER INFO\n' record_line = '19 123 . G A . PASS AF=0.2' @@ -422,10 +381,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(1, len(read_data)) self.assertEqual(expected_variant, read_data[0]) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_no_info(self): record_line = 'chr19 123 . . . . . . GT . .' 
expected_variant = Variant(reference_name='chr19', start=122, end=123) @@ -438,10 +393,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(1, len(read_data)) self.assertEqual(expected_variant, read_data[0]) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_info_numbers_and_types(self): info_headers = [ '##INFO=<ID=HA,Number=A,Type=String,Description="StringInfo_A">\n', @@ -475,10 +426,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(2, len(read_data)) self._assert_variants_equal([variant_1, variant_2], read_data) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_end_info_key(self): phaseset_header_line = ( '##INFO=<ID=END,Number=1,Type=Integer,Description="End of record.">\n') @@ -497,10 +444,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(2, len(read_data)) self._assert_variants_equal([variant_1, variant_2], read_data) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_custom_phaseset(self): phaseset_header_line = ( '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phaseset">\n') @@ -524,10 +467,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(2, len(read_data)) self._assert_variants_equal([variant_1, variant_2], read_data) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_format_numbers(self): format_headers = [ '##FORMAT=<ID=FU,Number=.,Type=String,Description="Format_variable">\n', @@ -551,10 +490,6 @@ class VcfSourceTest(unittest.TestCase): self.assertEqual(1, len(read_data)) 
self.assertEqual(expected_variant, read_data[0]) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_pipeline_read_single_file(self): with TempDir() as tempdir: file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES + @@ -565,10 +500,6 @@ class VcfSourceTest(unittest.TestCase): pipeline.run() @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing') - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_pipeline_read_single_file_large(self): pipeline = TestPipeline() pcoll = pipeline | 'Read' >> ReadFromVcf( @@ -577,10 +508,6 @@ class VcfSourceTest(unittest.TestCase): pipeline.run() @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing') - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_pipeline_read_file_pattern_large(self): pipeline = TestPipeline() pcoll = pipeline | 'Read' >> ReadFromVcf( @@ -588,10 +515,6 @@ class VcfSourceTest(unittest.TestCase): assert_that(pcoll, _count_equals_to(9900)) pipeline.run() - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_read_reentrant_without_splitting(self): with TempDir() as tempdir: file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES + @@ -599,10 +522,6 @@ class VcfSourceTest(unittest.TestCase): source = VcfSource(file_name) source_test_utils.assert_reentrant_reads_succeed((source, None, None)) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See 
BEAM-5628') def test_read_reentrant_after_splitting(self): with TempDir() as tempdir: file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES + @@ -613,10 +532,6 @@ class VcfSourceTest(unittest.TestCase): source_test_utils.assert_reentrant_reads_succeed( (splits[0].source, splits[0].start_position, splits[0].stop_position)) - @unittest.skipIf(sys.version_info[0] == 3 and - os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', - 'VCF io will only be ported after switch to Nucleus ' - 'See BEAM-5628') def test_dynamic_work_rebalancing(self): with TempDir() as tempdir: file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES + diff --git a/sdks/python/container/base_image_requirements.txt b/sdks/python/container/base_image_requirements.txt index 032a9ca..bf311d2 100644 --- a/sdks/python/container/base_image_requirements.txt +++ b/sdks/python/container/base_image_requirements.txt @@ -40,7 +40,7 @@ pyarrow==0.11.1 pydot==1.2.4 pyparsing==2.3.1 pytz==2018.4 -pyvcf==0.6.8 +pyvcf==0.6.8;python_version<"3.0" pyyaml==3.12 typing==3.6.1 @@ -49,7 +49,7 @@ nose==1.3.7 # GCP extra features google-apitools==0.5.26 -googledatastore==7.0.1 +googledatastore==7.0.1;python_version<"3.0" google-cloud-pubsub==0.39.0 google-cloud-bigquery==1.6.0 proto-google-cloud-datastore-v1==0.90.4 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c44c97e..8b23605 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -116,12 +116,13 @@ REQUIRED_PACKAGES = [ 'oauth2client>=2.0.1,<4', # grpcio 1.8.1 and above requires protobuf 3.5.0.post1. 'protobuf>=3.5.0.post1,<4', - # pyarrow is not supported on Windows for Python 2 [BEAM-6287] + # [BEAM-6287] pyarrow is not supported on Windows for Python 2 ('pyarrow>=0.11.1,<0.12.0; python_version >= "3.0" or ' 'platform_system != "Windows"'), 'pydot>=1.2.0,<1.3', 'pytz>=2018.3', - 'pyvcf>=0.6.8,<0.7.0', + # [BEAM-5628] Beam VCF IO is not supported in Python 3. 
+ 'pyvcf>=0.6.8,<0.7.0; python_version < "3.0"', 'pyyaml>=3.12,<4.0.0', 'typing>=3.6.0,<3.7.0; python_version < "3.5.0"', ] @@ -139,6 +140,7 @@ GCP_REQUIREMENTS = [ # google-apitools 0.5.23 and above has important Python 3 supports. 'google-apitools>=0.5.26,<0.5.27', 'proto-google-cloud-datastore-v1>=0.90.0,<=0.90.4', + # [BEAM-4543] Datastore IO is not supported in Python 3. 'googledatastore>=7.0.1,<7.1; python_version < "3.0"', 'google-cloud-pubsub==0.39.0', # GCP packages required by tests