[ https://issues.apache.org/jira/browse/BEAM-1874?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Kenneth Knowles updated BEAM-1874:
----------------------------------
Description:
We have gzipped text files in Google Cloud Storage that have the following
metadata headers set:
{code}
Content-Encoding: gzip
Content-Type: application/octet-stream
{code}
Trying to read these with apache_beam.io.ReadFromText yields the following
error:
{code}
ERROR:root:Exception while fetching 341565 bytes from position 0 of
gs://...-c72fa25a-5d8a-4801-a0b4-54b58c4723ce.gz: Cannot have start index
greater than total size
Traceback (most recent call last):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 585, in _fetch_to_queue
value = func(*args)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 610, in _get_segment
downloader.GetRange(start, end)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py",
line 477, in GetRange
progress, end_byte = self.__NormalizeStartEnd(start, end)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py",
line 340, in __NormalizeStartEnd
'Cannot have start index greater than total size')
TransferInvalidError: Cannot have start index greater than total size
WARNING:root:Task failed: Traceback (most recent call last):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py",
line 300, in __call__
result = evaluator.finish_bundle()
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py",
line 206, in finish_bundle
bundles = _read_values_to_bundles(reader)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py",
line 196, in _read_values_to_bundles
read_result = [GlobalWindows.windowed_value(e) for e in reader]
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/concat_source.py",
line 79, in read
range_tracker.sub_range_tracker(source_ix)):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 155, in read_records
read_buffer)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 245, in _read_record
sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 190, in _find_separator_bounds
file_to_read, read_buffer, current_pos + 1):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 212, in _try_to_ensure_num_bytes_in_buffer
read_data = file_to_read.read(self._buffer_size)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py",
line 460, in read
self._fetch_to_internal_buffer(num_bytes)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py",
line 420, in _fetch_to_internal_buffer
buf = self._file.read(self._read_size)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 472, in read
return self._read_inner(size=size, readline=False)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 516, in _read_inner
self._fetch_next_if_buffer_exhausted()
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 577, in _fetch_next_if_buffer_exhausted
raise exn
TransferInvalidError: Cannot have start index greater than total size
{code}
After removing the Content-Encoding header the read works fine.
was:
We have gzipped text files in Google Cloud Storage that have the following
metadata headers set:
Content-Encoding: gzip
Content-Type: application/octet-stream
Trying to read these with apache_beam.io.ReadFromText yields the following
error:
ERROR:root:Exception while fetching 341565 bytes from position 0 of
gs://...-c72fa25a-5d8a-4801-a0b4-54b58c4723ce.gz: Cannot have start index
greater than total size
Traceback (most recent call last):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 585, in _fetch_to_queue
value = func(*args)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 610, in _get_segment
downloader.GetRange(start, end)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py",
line 477, in GetRange
progress, end_byte = self.__NormalizeStartEnd(start, end)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py",
line 340, in __NormalizeStartEnd
'Cannot have start index greater than total size')
TransferInvalidError: Cannot have start index greater than total size
WARNING:root:Task failed: Traceback (most recent call last):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py",
line 300, in __call__
result = evaluator.finish_bundle()
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py",
line 206, in finish_bundle
bundles = _read_values_to_bundles(reader)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py",
line 196, in _read_values_to_bundles
read_result = [GlobalWindows.windowed_value(e) for e in reader]
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/concat_source.py",
line 79, in read
range_tracker.sub_range_tracker(source_ix)):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 155, in read_records
read_buffer)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 245, in _read_record
sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 190, in _find_separator_bounds
file_to_read, read_buffer, current_pos + 1):
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
line 212, in _try_to_ensure_num_bytes_in_buffer
read_data = file_to_read.read(self._buffer_size)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py",
line 460, in read
self._fetch_to_internal_buffer(num_bytes)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py",
line 420, in _fetch_to_internal_buffer
buf = self._file.read(self._read_size)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 472, in read
return self._read_inner(size=size, readline=False)
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 516, in _read_inner
self._fetch_next_if_buffer_exhausted()
File
"/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
line 577, in _fetch_next_if_buffer_exhausted
raise exn
TransferInvalidError: Cannot have start index greater than total size
After removing the Content-Encoding header the read works fine.
> Google Cloud Storage TextIO read fails with gz-files having Content-Encoding:
> gzip header
> -----------------------------------------------------------------------------------------
>
> Key: BEAM-1874
> URL: https://issues.apache.org/jira/browse/BEAM-1874
> Project: Beam
> Issue Type: Bug
> Components: sdk-py-core
> Affects Versions: 0.6.0
> Reporter: Samuli Holopainen
> Priority: P1
>
> We have gzipped text files in Google Cloud Storage that have the following
> metadata headers set:
> {code}
> Content-Encoding: gzip
> Content-Type: application/octet-stream
> {code}
> Trying to read these with apache_beam.io.ReadFromText yields the following
> error:
> {code}
> ERROR:root:Exception while fetching 341565 bytes from position 0 of
> gs://...-c72fa25a-5d8a-4801-a0b4-54b58c4723ce.gz: Cannot have start index
> greater than total size
> Traceback (most recent call last):
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
> line 585, in _fetch_to_queue
> value = func(*args)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
> line 610, in _get_segment
> downloader.GetRange(start, end)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py",
> line 477, in GetRange
> progress, end_byte = self.__NormalizeStartEnd(start, end)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py",
> line 340, in __NormalizeStartEnd
> 'Cannot have start index greater than total size')
> TransferInvalidError: Cannot have start index greater than total size
> WARNING:root:Task failed: Traceback (most recent call last):
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py",
> line 300, in __call__
> result = evaluator.finish_bundle()
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py",
> line 206, in finish_bundle
> bundles = _read_values_to_bundles(reader)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py",
> line 196, in _read_values_to_bundles
> read_result = [GlobalWindows.windowed_value(e) for e in reader]
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/concat_source.py",
> line 79, in read
> range_tracker.sub_range_tracker(source_ix)):
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
> line 155, in read_records
> read_buffer)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
> line 245, in _read_record
> sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
> line 190, in _find_separator_bounds
> file_to_read, read_buffer, current_pos + 1):
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py",
> line 212, in _try_to_ensure_num_bytes_in_buffer
> read_data = file_to_read.read(self._buffer_size)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py",
> line 460, in read
> self._fetch_to_internal_buffer(num_bytes)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py",
> line 420, in _fetch_to_internal_buffer
> buf = self._file.read(self._read_size)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
> line 472, in read
> return self._read_inner(size=size, readline=False)
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
> line 516, in _read_inner
> self._fetch_next_if_buffer_exhausted()
> File
> "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py",
> line 577, in _fetch_next_if_buffer_exhausted
> raise exn
> TransferInvalidError: Cannot have start index greater than total size
> {code}
> After removing the Content-Encoding header the read works fine.
--
This message was sent by Atlassian Jira (v8.20.1#820001)