[
https://issues.apache.org/jira/browse/BEAM-1874?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17230722#comment-17230722
]
Saman Vaisipour commented on BEAM-1874:
---------------------------------------
We are encountering this issue with the two most recent Python SDKs (2.24.0 and 2.25.0):
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 649, in do_work
    work_executor.execute()
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 179, in execute
    op.start()
  File "dataflow_worker/shuffle_operations.py", line 63, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 64, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 79, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 80, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "dataflow_worker/shuffle_operations.py", line 84, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
  File "apache_beam/runners/worker/operations.py", line 356, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "dataflow_worker/shuffle_operations.py", line 261, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
  File "dataflow_worker/shuffle_operations.py", line 268, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
  File "apache_beam/runners/worker/operations.py", line 356, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 703, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 704, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1279, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1374, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 703, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 704, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1279, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1374, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "apache_beam/runners/worker/operations.py", line 218, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 703, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 704, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1294, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 446, in raise_with_traceback
    raise exc.with_traceback(traceback)
  File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1347, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "/opt/gcp_variant_transforms/src/gcp_variant_transforms/beam_io/vcfio.py", line 305, in _read_records
    for record in record_iterator:
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/vcf_parser.py", line 385, in __next__
    text_line = self._next_non_empty_line(self._text_lines)
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/vcf_parser.py", line 337, in _next_non_empty_line
    text_line = next(iterator).strip()
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 147, in read_records
    record = file_to_read.readline()
  File "/usr/local/lib/python3.7/site-packages/apache_beam/io/filesystem.py", line 292, in readline
    self._fetch_to_internal_buffer(self._read_size // 2)
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 66, in _fetch_to_internal_buffer
    self._fetch_and_decompress_data_to_buffer(num_bytes)
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 177, in _fetch_and_decompress_data_to_buffer
    self._complete_last_line()
  File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/beam_io/bgzf.py", line 208, in _complete_last_line
    self._block.end, self._block.end + self._read_size)
  File "/usr/local/lib/python3.7/site-packages/apache_beam/io/gcp/gcsio.py", line 600, in get_range
    self._downloader.GetRange(start, end - 1)
  File "/usr/local/lib/python3.7/site-packages/apitools/base/py/transfer.py", line 471, in GetRange
    progress, end_byte = self.__NormalizeStartEnd(start, end)
  File "/usr/local/lib/python3.7/site-packages/apitools/base/py/transfer.py", line 344, in __NormalizeStartEnd
    '[start=%d, total_size=%d]' % (start, self.total_size))
apitools.base.py.exceptions.TransferInvalidError: Cannot have start index greater than total size [start=2441668794, total_size=2441668794] [while running 'ReadVariants/ReadBlock']
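The failing frame is gcsio's get_range requesting a byte range that starts exactly at total_size. That appears to match the known interaction with GCS decompressive transcoding: when an object carries Content-Encoding: gzip, GCS serves decompressed bytes on download while the object's reported size remains the compressed size, so offset-based reads can run past the reported end. A minimal workaround sketch, assuming the google-cloud-storage client (bucket and object names below are placeholders, not our real inputs), is to strip the header before the pipeline reads the files:

from google.cloud import storage

# Sketch: clear Content-Encoding so GCS stops transcoding on download.
# "my-bucket" and the object path are placeholders.
client = storage.Client()
blob = client.bucket("my-bucket").get_blob("path/to/file.vcf.gz")
if blob is not None and blob.content_encoding == "gzip":
    blob.content_encoding = None
    blob.patch()  # metadata-only update; the object bytes are untouched

The gsutil equivalent should be setmeta with an empty header value, e.g. gsutil setmeta -h "Content-Encoding:" 'gs://my-bucket/path/*.gz', which removes the header without rewriting the objects.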
> Google Cloud Storage TextIO read fails with gz-files having Content-Encoding: gzip header
> -----------------------------------------------------------------------------------------
>
> Key: BEAM-1874
> URL: https://issues.apache.org/jira/browse/BEAM-1874
> Project: Beam
> Issue Type: Bug
> Components: sdk-py-core
> Affects Versions: 0.6.0
> Reporter: Samuli Holopainen
> Priority: P3
>
> We have gzipped text files in Google Cloud Storage that have the following
> metadata headers set:
> Content-Encoding: gzip
> Content-Type: application/octet-stream
> Trying to read these with apache_beam.io.ReadFromText yields the following
> error:
> ERROR:root:Exception while fetching 341565 bytes from position 0 of gs://...-c72fa25a-5d8a-4801-a0b4-54b58c4723ce.gz: Cannot have start index greater than total size
> Traceback (most recent call last):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 585, in _fetch_to_queue
>     value = func(*args)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 610, in _get_segment
>     downloader.GetRange(start, end)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py", line 477, in GetRange
>     progress, end_byte = self.__NormalizeStartEnd(start, end)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py", line 340, in __NormalizeStartEnd
>     'Cannot have start index greater than total size')
> TransferInvalidError: Cannot have start index greater than total size
> WARNING:root:Task failed: Traceback (most recent call last):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py", line 300, in __call__
>     result = evaluator.finish_bundle()
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 206, in finish_bundle
>     bundles = _read_values_to_bundles(reader)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 196, in _read_values_to_bundles
>     read_result = [GlobalWindows.windowed_value(e) for e in reader]
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/concat_source.py", line 79, in read
>     range_tracker.sub_range_tracker(source_ix)):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 155, in read_records
>     read_buffer)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 245, in _read_record
>     sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 190, in _find_separator_bounds
>     file_to_read, read_buffer, current_pos + 1):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 212, in _try_to_ensure_num_bytes_in_buffer
>     read_data = file_to_read.read(self._buffer_size)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py", line 460, in read
>     self._fetch_to_internal_buffer(num_bytes)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py", line 420, in _fetch_to_internal_buffer
>     buf = self._file.read(self._read_size)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 472, in read
>     return self._read_inner(size=size, readline=False)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 516, in _read_inner
>     self._fetch_next_if_buffer_exhausted()
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 577, in _fetch_next_if_buffer_exhausted
>     raise exn
> TransferInvalidError: Cannot have start index greater than total size
> After removing the Content-Encoding header, the read works fine.
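For reference, a minimal read along the lines of what fails above would look like this (the path is a placeholder; GZIP compression is inferred from the .gz extension):

import apache_beam as beam

# Hypothetical repro sketch; the gs:// path is a placeholder. The read
# succeeds when the object has no Content-Encoding header and fails once it
# is set to gzip, since GCS then transcodes the download and the served
# bytes no longer line up with the object size used for range requests.
with beam.Pipeline() as p:
    lines = p | "Read" >> beam.io.ReadFromText("gs://my-bucket/data.gz")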