This is an automated email from the ASF dual-hosted git repository.

damccorm pushed a commit to branch users/damccorm/extras-cp
in repository https://gitbox.apache.org/repos/asf/beam.git

commit 9e4cbde2649f330f8fc57f43b43361a1a11b3eba
Author: Danny McCormick <[email protected]>
AuthorDate: Wed Nov 26 15:53:20 2025 -0500

    split hdfs into extra (#36773)
    
    * split hdfs into extra
    
    * CHANGES
    
    * tox
    
    * try/catch
    
    * test fixes
    
    * add to coverage tasks
---
 CHANGES.md                                          |  2 +-
 sdks/python/apache_beam/io/hadoopfilesystem.py      | 11 +++++++++--
 sdks/python/apache_beam/io/hadoopfilesystem_test.py |  7 +++++++
 sdks/python/setup.py                                |  2 +-
 sdks/python/tox.ini                                 |  9 +++++----
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 68af5a342d7..5dd07aab92b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -81,7 +81,7 @@ Now Beam has full support for Milvus integration including 
Milvus enrichment and
 
 ## Breaking Changes
 
-* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).
+* (Python) Some Python dependencies have been split out into extras. To ensure 
all previously installed dependencies are installed, when installing Beam you 
can `pip install apache-beam[gcp,interactive,yaml,redis,hadoop,tfrecord]`, 
though most users will not need all of these extras 
([#34554](https://github.com/apache/beam/issues/34554)).
 
 ## Deprecations
 
diff --git a/sdks/python/apache_beam/io/hadoopfilesystem.py 
b/sdks/python/apache_beam/io/hadoopfilesystem.py
index cf488c228a2..3287644eed8 100644
--- a/sdks/python/apache_beam/io/hadoopfilesystem.py
+++ b/sdks/python/apache_beam/io/hadoopfilesystem.py
@@ -26,8 +26,6 @@ import posixpath
 import re
 from typing import BinaryIO  # pylint: disable=unused-import
 
-import hdfs
-
 from apache_beam.io import filesystemio
 from apache_beam.io.filesystem import BeamIOError
 from apache_beam.io.filesystem import CompressedFile
@@ -37,6 +35,11 @@ from apache_beam.io.filesystem import FileSystem
 from apache_beam.options.pipeline_options import HadoopFileSystemOptions
 from apache_beam.options.pipeline_options import PipelineOptions
 
+try:
+  import hdfs
+except ImportError:
+  hdfs = None
+
 __all__ = ['HadoopFileSystem']
 
 _HDFS_PREFIX = 'hdfs:/'
@@ -108,6 +111,10 @@ class HadoopFileSystem(FileSystem):
     See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
     """
     super().__init__(pipeline_options)
+    if hdfs is None:
+      raise ImportError(
+          'Failed to import hdfs. You can ensure it is '
+          'installed by installing the hadoop beam extra')
     logging.getLogger('hdfs.client').setLevel(logging.WARN)
     if pipeline_options is None:
       raise ValueError('pipeline_options is not set')
diff --git a/sdks/python/apache_beam/io/hadoopfilesystem_test.py 
b/sdks/python/apache_beam/io/hadoopfilesystem_test.py
index 8c21effc882..eb0925224dd 100644
--- a/sdks/python/apache_beam/io/hadoopfilesystem_test.py
+++ b/sdks/python/apache_beam/io/hadoopfilesystem_test.py
@@ -32,6 +32,11 @@ from apache_beam.io.filesystem import BeamIOError
 from apache_beam.options.pipeline_options import HadoopFileSystemOptions
 from apache_beam.options.pipeline_options import PipelineOptions
 
+try:
+  import hdfs as actual_hdfs
+except ImportError:
+  actual_hdfs = None
+
 
 class FakeFile(io.BytesIO):
   """File object for FakeHdfs"""
@@ -201,6 +206,7 @@ class FakeHdfs(object):
 
 
 @parameterized_class(('full_urls', ), [(False, ), (True, )])
[email protected](actual_hdfs is None, "hdfs extra not installed")
 class HadoopFileSystemTest(unittest.TestCase):
   def setUp(self):
     self._fake_hdfs = FakeHdfs()
@@ -607,6 +613,7 @@ class HadoopFileSystemTest(unittest.TestCase):
     self.assertFalse(self.fs.exists(url2))
 
 
[email protected](actual_hdfs is None, "hdfs extra not installed")
 class HadoopFileSystemRuntimeValueProviderTest(unittest.TestCase):
   """Tests pipeline_options, in the form of a
   RuntimeValueProvider.runtime_options object."""
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index 289433f9ea5..b700d796983 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -379,7 +379,6 @@ if __name__ == '__main__':
           # TODO(https://github.com/grpc/grpc/issues/37710): Unpin grpc
           
'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0;
 python_version <= "3.12"',  # pylint: disable=line-too-long
           'grpcio>=1.67.0; python_version >= "3.13"',
-          'hdfs>=2.1.0,<3.0.0',
           'httplib2>=0.8,<0.23.0',
           'jsonpickle>=3.0.0,<4.0.0',
           # numpy can have breaking changes in minor versions.
@@ -563,6 +562,7 @@ if __name__ == '__main__':
               # `--update` / `-U` flag to replace the dask release brought in
               # by distributed.
           ],
+          'hadoop': ['hdfs>=2.1.0,<3.0.0'],
           'yaml': [
               'docstring-parser>=0.15,<1.0',
               'jinja2>=3.0,<3.2',
diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini
index da0932728b2..431cd186c1b 100644
--- a/sdks/python/tox.ini
+++ b/sdks/python/tox.ini
@@ -33,7 +33,7 @@ pip_pre = True
 # allow apps that support color to use it.
 passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_*,ALLOYDB_PASSWORD
 # Set [] options for pip installation of apache-beam tarball.
-extras = test,dataframe,redis,tfrecord,yaml
+extras = test,dataframe,hadoop,redis,tfrecord,yaml
 # Don't warn that these commands aren't installed.
 allowlist_externals =
   false
@@ -97,8 +97,8 @@ install_command = {envbindir}/python.exe {envbindir}/pip.exe 
install --retries 1
 list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze
 
 [testenv:py{310,311,312,313}-cloud]
-; extras = test,gcp,interactive,dataframe,aws,azure,redis
-extras = test,gcp,interactive,dataframe,aws,azure
+; extras = test,gcp,interactive,dataframe,aws,azure
+extras = test,hadoop,gcp,interactive,dataframe,aws,azure
 commands =
   python apache_beam/examples/complete/autocomplete_test.py
   bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}"
@@ -173,7 +173,7 @@ setenv =
   TC_SLEEP_TIME = {env:TC_SLEEP_TIME:1}
 
 # NOTE: we could add ml_test to increase the collected code coverage metrics, 
but it would make the suite slower.
-extras = test,gcp,interactive,dataframe,aws,redis
+extras = test,hadoop,gcp,interactive,dataframe,aws,redis
 commands =
   bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" 
"--cov-report=xml --cov=. --cov-append"
 
@@ -228,6 +228,7 @@ deps =
   holdup==1.8.0
 extras =
   gcp
+  hadoop
 allowlist_externals =
   bash
   echo

Reply via email to