[ https://issues.apache.org/jira/browse/BEAM-1874?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17545775#comment-17545775 ]

Loic M commented on BEAM-1874:
------------------------------

I ran into a very similar issue with version 2.39.0, the only difference being
that the error message reports the size of the file I am trying to read as 0.

 

Traceback (most recent call last):
  File "/home/lixoloic/.pyenv/versions/3.8.5/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/lixoloic/.pyenv/versions/3.8.5/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/lixoloic/sensor-data-ingestion/beam_pipelines/improve_data_ingestion.py", line 126, in <module>
    run(
  File "/home/lixoloic/sensor-data-ingestion/beam_pipelines/improve_data_ingestion.py", line 101, in run
    (
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/pipeline.py", line 596, in __exit__
    self.result = self.run()
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/pipeline.py", line 573, in run
    return self.runner.run_pipeline(self, self._options)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/direct/direct_runner.py", line 131, in run_pipeline
    return runner.run_pipeline(pipeline, options)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 199, in run_pipeline
    self._latest_run_result = self.run_via_runner_api(
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 210, in run_via_runner_api
    return self.run_stages(stage_context, stages)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 395, in run_stages
    stage_results = self._run_stage(
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 660, in _run_stage
    self._run_bundle(
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 783, in _run_bundle
    result, splits = bundle_manager.process_bundle(
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 1094, in process_bundle
    result_future = self._worker_handler.control_conn.push(process_bundle_req)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/worker_handlers.py", line 378, in push
    response = self.worker.do_instruction(request)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py", line 580, in do_instruction
    return getattr(self, request_type)(
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py", line 618, in process_bundle
    bundle_processor.process_bundle(instruction_id))
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/worker/bundle_processor.py", line 995, in process_bundle
    input_op_by_transform_id[element.transform_id].process_encoded(
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/runners/worker/bundle_processor.py", line 221, in process_encoded
    self.output(decoded_value)
  File "apache_beam/runners/worker/operations.py", line 346, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 348, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 817, in apache_beam.runners.worker.operations.SdfProcessSizedElements.process
  File "apache_beam/runners/worker/operations.py", line 826, in apache_beam.runners.worker.operations.SdfProcessSizedElements.process
  File "apache_beam/runners/common.py", line 1206, in apache_beam.runners.common.DoFnRunner.process_with_sized_restriction
  File "apache_beam/runners/common.py", line 698, in apache_beam.runners.common.PerWindowInvoker.invoke_process
  File "apache_beam/runners/common.py", line 836, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
  File "apache_beam/runners/common.py", line 1334, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/textio.py", line 221, in read_records
    record, num_bytes_to_next_record = self._read_record(file_to_read,
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/textio.py", line 366, in _read_record
    sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/textio.py", line 287, in _find_separator_bounds
    if not self._try_to_ensure_num_bytes_in_buffer(
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/textio.py", line 333, in _try_to_ensure_num_bytes_in_buffer
    read_data = file_to_read.read(self._buffer_size)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/filesystem.py", line 265, in read
    self._fetch_to_internal_buffer(num_bytes)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/filesystem.py", line 223, in _fetch_to_internal_buffer
    buf = self._file.read(self._read_size)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/filesystemio.py", line 109, in readinto
    data = self._downloader.get_range(start, end)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apache_beam/io/gcp/gcsio.py", line 698, in get_range
    self._downloader.GetRange(start, end - 1)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apitools/base/py/transfer.py", line 484, in GetRange
    progress, end_byte = self.__NormalizeStartEnd(start, end)
  File "/home/lixoloic/.cache/pypoetry/virtualenvs/sensor-data-ingestion-qope4Qf2-py3.8/lib/python3.8/site-packages/apitools/base/py/transfer.py", line 342, in __NormalizeStartEnd
    raise exceptions.TransferInvalidError(
apitools.base.py.exceptions.TransferInvalidError: Cannot have start index greater than total size [start=0, total_size=0]
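
For reference, a minimal sketch of the kind of pipeline that hits this code path (the bucket and file pattern below are hypothetical; any .gz object stored with a Content-Encoding: gzip header should reproduce it on the DirectRunner):

{code}
import apache_beam as beam

# Hypothetical input: gzipped text files whose GCS metadata carries
# Content-Encoding: gzip.
INPUT_PATTERN = "gs://my-bucket/sensor-data/*.gz"

# beam.Pipeline() with no arguments runs on the DirectRunner.
with beam.Pipeline() as p:
    (
        p
        | "Read" >> beam.io.ReadFromText(INPUT_PATTERN)  # compression inferred from the .gz suffix
        | "Print" >> beam.Map(print)
    )
{code}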

> Google Cloud Storage TextIO read fails with gz-files having Content-Encoding: gzip header
> -----------------------------------------------------------------------------------------
>
>                 Key: BEAM-1874
>                 URL: https://issues.apache.org/jira/browse/BEAM-1874
>             Project: Beam
>          Issue Type: Bug
>          Components: sdk-py-core
>    Affects Versions: 0.6.0
>            Reporter: Samuli Holopainen
>            Priority: P3
>
> We have gzipped text files in Google Cloud Storage that have the following metadata headers set:
> {code}
> Content-Encoding: gzip
> Content-Type: application/octet-stream
> {code}
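> An object commonly ends up in this state when it is uploaded with decompressive transcoding in mind. A hypothetical sketch with the google-cloud-storage client (bucket and object names made up):
> {code}
> from google.cloud import storage
>
> blob = storage.Client().bucket("my-bucket").blob("data.gz")
> blob.content_encoding = "gzip"  # stored as the Content-Encoding metadata header
> blob.upload_from_filename("data.gz", content_type="application/octet-stream")
> {code}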
> Trying to read these with apache_beam.io.ReadFromText yields the following error:
> {code}
> ERROR:root:Exception while fetching 341565 bytes from position 0 of gs://...-c72fa25a-5d8a-4801-a0b4-54b58c4723ce.gz: Cannot have start index greater than total size
> Traceback (most recent call last):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 585, in _fetch_to_queue
>     value = func(*args)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 610, in _get_segment
>     downloader.GetRange(start, end)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py", line 477, in GetRange
>     progress, end_byte = self.__NormalizeStartEnd(start, end)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apitools/base/py/transfer.py", line 340, in __NormalizeStartEnd
>     'Cannot have start index greater than total size')
> TransferInvalidError: Cannot have start index greater than total size
> WARNING:root:Task failed: Traceback (most recent call last):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py", line 300, in __call__
>     result = evaluator.finish_bundle()
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 206, in finish_bundle
>     bundles = _read_values_to_bundles(reader)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 196, in _read_values_to_bundles
>     read_result = [GlobalWindows.windowed_value(e) for e in reader]
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/concat_source.py", line 79, in read
>     range_tracker.sub_range_tracker(source_ix)):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 155, in read_records
>     read_buffer)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 245, in _read_record
>     sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 190, in _find_separator_bounds
>     file_to_read, read_buffer, current_pos + 1):
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/textio.py", line 212, in _try_to_ensure_num_bytes_in_buffer
>     read_data = file_to_read.read(self._buffer_size)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py", line 460, in read
>     self._fetch_to_internal_buffer(num_bytes)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/fileio.py", line 420, in _fetch_to_internal_buffer
>     buf = self._file.read(self._read_size)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 472, in read
>     return self._read_inner(size=size, readline=False)
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 516, in _read_inner
>     self._fetch_next_if_buffer_exhausted()
>   File "/Users/samuli.holopainen/miniconda2/envs/python-dataflow/lib/python2.7/site-packages/apache_beam/io/gcp/gcsio.py", line 577, in _fetch_next_if_buffer_exhausted
>     raise exn
> TransferInvalidError: Cannot have start index greater than total size
> {code}
> After removing the Content-Encoding header the read works fine.
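> A minimal sketch of clearing that header in place with the google-cloud-storage client (names again hypothetical):
> {code}
> from google.cloud import storage
>
> blob = storage.Client().bucket("my-bucket").blob("data.gz")
> blob.reload()                    # fetch the current metadata
> print(blob.content_encoding)     # "gzip" before the change
> blob.content_encoding = None     # drop the header
> blob.patch()                     # persist the metadata change
> {code}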



--
This message was sent by Atlassian Jira
(v8.20.7#820007)
