[ https://issues.apache.org/jira/browse/BEAM-1874?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17230722#comment-17230722 ]

Saman Vaisipour commented on BEAM-1874:
---------------------------------------

We are encountering this issue with the two most recent Python SDKs (2.24.0 and 2.25.0):

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 649, in do_work
    work_executor.execute()
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 179, in execute
    op.start()
  File "dataflow_worker/shuffle_operations.py", line 63, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 64, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 79, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 80, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 84, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "apache_beam/runners/worker/operations.py", line 356, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "dataflow_worker/shuffle_operations.py", line 261, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
  File "dataflow_worker/shuffle_operations.py", line 268, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
  File "apache_beam/runners/worker/operations.py", line 356, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 703, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 704, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1279, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1374, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 703, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 704, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1279, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1374, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 703, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 704, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1294, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 446, in raise_with_traceback
    raise exc.with_traceback(traceback)
  File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1347, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "/opt/gcp_variant_transforms/src/gcp_variant_transforms/beam_io/vcfio.py", line 305, in _read_records
    for record in record_iterator:
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/vcf_parser.py", line 385, in __next__
    text_line = self._next_non_empty_line(self._text_lines)
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/vcf_parser.py", line 337, in _next_non_empty_line
    text_line = next(iterator).strip()
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 147, in read_records
    record = file_to_read.readline()
  File "/usr/local/lib/python3.7/site-packages/apache_beam/io/filesystem.py", line 292, in readline
    self._fetch_to_internal_buffer(self._read_size // 2)
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 66, in _fetch_to_internal_buffer
    self._fetch_and_decompress_data_to_buffer(num_bytes)
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 177, in _fetch_and_decompress_data_to_buffer
    self._complete_last_line()
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 208, in _complete_last_line
    self._block.end, self._block.end + self._read_size)
  File "/usr/local/lib/python3.7/site-packages/apache_beam/io/gcp/gcsio.py", line 600, in get_range
    self._downloader.GetRange(start, end - 1)
  File "/usr/local/lib/python3.7/site-packages/apitools/base/py/transfer.py", line 471, in GetRange
    progress, end_byte = self.__NormalizeStartEnd(start, end)
  File "/usr/local/lib/python3.7/site-packages/apitools/base/py/transfer.py", line 344, in __NormalizeStartEnd
    '[start=%d, total_size=%d]' % (start, self.total_size))
apitools.base.py.exceptions.TransferInvalidError: Cannot have start index greater than total size [start=2441668794, total_size=2441668794] [while running 'ReadVariants/ReadBlock']
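
The same TransferInvalidError is also reproducible with a plain ReadFromText pipeline, per the original report below. A minimal reproduction sketch, assuming a gzip object whose GCS metadata sets Content-Encoding: gzip (the bucket and object names here are placeholders, not from this report):

    import apache_beam as beam

    # With Content-Encoding: gzip set on the object, GCS transcodes on
    # download, so the number of bytes served no longer matches the stored
    # (compressed) total_size that apitools validates range requests against,
    # and a read starting at total_size raises TransferInvalidError.
    with beam.Pipeline() as p:
        _ = (p
             | 'Read' >> beam.io.ReadFromText('gs://my-bucket/my-file.gz')
             | 'Print' >> beam.Map(print))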

> Google Cloud Storage TextIO read fails with gz-files having Content-Encoding: 
> gzip header
> -----------------------------------------------------------------------------------------
>
>                 Key: BEAM-1874
>                 URL: https://issues.apache.org/jira/browse/BEAM-1874
>             Project: Beam
>          Issue Type: Bug
>          Components: sdk-py-core
>    Affects Versions: 0.6.0
>            Reporter: Samuli Holopainen
>            Priority: P3
>
> We have gzipped text files in Google Cloud Storage that have the following 
> metadata headers set:
> Content-Encoding: gzip
> Content-Type: application/octet-stream
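> Such metadata is typically attached at upload time; a minimal sketch with the google-cloud-storage client (bucket and file names are hypothetical, not from this report):
>
>     from google.cloud import storage
>
>     # Upload a gzip file with the same metadata headers described above.
>     client = storage.Client()
>     blob = client.bucket('my-bucket').blob('my-file.gz')
>     blob.content_encoding = 'gzip'  # enables decompressive transcoding on download
>     blob.upload_from_filename('my-file.gz', content_type='application/octet-stream')
>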
> Trying to read these with apache_beam.io.ReadFromText yields the following 
> error:
> ERROR:root:Exception while fetching 341565 bytes from position 0 of gs://...-c72fa25a-5d8a-4801-a0b4-54b58c4723ce.gz: Cannot have start index greater than total size
> Traceback (most recent call last):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 585, in _fetch_to_queue
>     value = func(*args)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 610, in _get_segment
>     downloader.GetRange(start, end)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py", line 477, in GetRange
>     progress, end_byte = self.__NormalizeStartEnd(start, end)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py", line 340, in __NormalizeStartEnd
>     'Cannot have start index greater than total size')
> TransferInvalidError: Cannot have start index greater than total size
> WARNING:root:Task failed: Traceback (most recent call last):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py", line 300, in __call__
>     result = evaluator.finish_bundle()
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 206, in finish_bundle
>     bundles = _read_values_to_bundles(reader)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 196, in _read_values_to_bundles
>     read_result = [GlobalWindows.windowed_value(e) for e in reader]
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/concat_source.py", line 79, in read
>     range_tracker.sub_range_tracker(source_ix)):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 155, in read_records
>     read_buffer)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 245, in _read_record
>     sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 190, in _find_separator_bounds
>     file_to_read, read_buffer, current_pos + 1):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 212, in _try_to_ensure_num_bytes_in_buffer
>     read_data = file_to_read.read(self._buffer_size)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py", line 460, in read
>     self._fetch_to_internal_buffer(num_bytes)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py", line 420, in _fetch_to_internal_buffer
>     buf = self._file.read(self._read_size)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 472, in read
>     return self._read_inner(size=size, readline=False)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 516, in _read_inner
>     self._fetch_next_if_buffer_exhausted()
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 577, in _fetch_next_if_buffer_exhausted
>     raise exn
> TransferInvalidError: Cannot have start index greater than total size
> After removing the Content-Encoding header the read works fine.
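> A hedged sketch of that workaround using the google-cloud-storage client (bucket and object names are placeholders; clearing the header disables GCS decompressive transcoding, so the served bytes line up with the stored size again):
>
>     from google.cloud import storage
>
>     # Drop the Content-Encoding header from the object's metadata so the
>     # raw gzip bytes are served and range reads match total_size.
>     client = storage.Client()
>     blob = client.bucket('my-bucket').blob('my-file.gz')
>     blob.content_encoding = None
>     blob.patch()  # persists the metadata change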


