This is an automated email from the ASF dual-hosted git repository.
kaxilnaik pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 2c26b15 Make `pandas` an optional core dependency (#17575)
2c26b15 is described below
commit 2c26b15a8087cb8a81eb19fedbc768bd6da92df7
Author: Kaxil Naik <[email protected]>
AuthorDate: Fri Aug 13 00:07:50 2021 +0100
Make `pandas` an optional core dependency (#17575)
We only use `pandas` in `DbApiHook.get_pandas_df`. Not all users use it,
and
while `pandas` now ships many pre-compiled packages, it can still take
forever when
it needs to be compiled.
So for first-time users this can be a turn-off. If pandas is already
installed, this
will work fine; if not, users have the option to run `pip install
apache-airflow[pandas]`
closes #12500
---
BREEZE.rst | 12 ++++++------
CONTRIBUTING.rst | 4 ++--
Dockerfile | 2 +-
INSTALL | 4 ++--
UPDATING.md | 13 +++++++++++++
airflow/executors/celery_executor.py | 6 +++++-
airflow/hooks/dbapi.py | 5 ++++-
airflow/utils/json.py | 12 ++++++++----
docs/apache-airflow/extra-packages-ref.rst | 2 ++
setup.cfg | 3 ---
setup.py | 8 ++++++--
11 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/BREEZE.rst b/BREEZE.rst
index 683d2fb..f86f6f8 100644
--- a/BREEZE.rst
+++ b/BREEZE.rst
@@ -1315,8 +1315,8 @@ This is the current syntax for `./breeze <./breeze>`_:
Production image:
async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,
-
http,ldap,google,google_auth,microsoft.azure,mysql,postgres,redis,sendgrid,sftp,
- slack,ssh,statsd,virtualenv
+
http,ldap,google,google_auth,microsoft.azure,mysql,pandas,postgres,redis,sendgrid,
+ sftp,slack,ssh,statsd,virtualenv
--image-tag TAG
Additional tag in the image.
@@ -1914,8 +1914,8 @@ This is the current syntax for `./breeze <./breeze>`_:
Production image:
async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,
-
http,ldap,google,google_auth,microsoft.azure,mysql,postgres,redis,sendgrid,sftp,
- slack,ssh,statsd,virtualenv
+
http,ldap,google,google_auth,microsoft.azure,mysql,pandas,postgres,redis,sendgrid,
+ sftp,slack,ssh,statsd,virtualenv
--image-tag TAG
Additional tag in the image.
@@ -2501,8 +2501,8 @@ This is the current syntax for `./breeze <./breeze>`_:
Production image:
async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,
-
http,ldap,google,google_auth,microsoft.azure,mysql,postgres,redis,sendgrid,sftp,
- slack,ssh,statsd,virtualenv
+
http,ldap,google,google_auth,microsoft.azure,mysql,pandas,postgres,redis,sendgrid,
+ sftp,slack,ssh,statsd,virtualenv
--image-tag TAG
Additional tag in the image.
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index d8e2c43..8874a8d 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -593,8 +593,8 @@ devel_all, devel_ci, devel_hadoop, dingding, discord, doc,
docker, druid, elasti
facebook, ftp, gcp, gcp_api, github_enterprise, google, google_auth, grpc,
hashicorp, hdfs, hive,
http, imap, jdbc, jenkins, jira, kerberos, kubernetes, ldap, leveldb,
microsoft.azure,
microsoft.mssql, microsoft.psrp, microsoft.winrm, mongo, mssql, mysql, neo4j,
odbc, openfaas,
-opsgenie, oracle, pagerduty, papermill, password, pinot, plexus, postgres,
presto, qds, qubole,
-rabbitmq, redis, s3, salesforce, samba, segment, sendgrid, sentry, sftp,
singularity, slack,
+opsgenie, oracle, pagerduty, pandas, papermill, password, pinot, plexus,
postgres, presto, qds,
+qubole, rabbitmq, redis, s3, salesforce, samba, segment, sendgrid, sentry,
sftp, singularity, slack,
snowflake, spark, sqlite, ssh, statsd, tableau, telegram, trino, vertica,
virtualenv, webhdfs,
winrm, yandex, zendesk
diff --git a/Dockerfile b/Dockerfile
index 782e5b4..847ad5a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,7 +34,7 @@
# much smaller.
#
ARG AIRFLOW_VERSION="2.2.0.dev0"
-ARG
AIRFLOW_EXTRAS="async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,ldap,google,google_auth,microsoft.azure,mysql,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv"
+ARG
AIRFLOW_EXTRAS="async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,ldap,google,google_auth,microsoft.azure,mysql,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv"
ARG ADDITIONAL_AIRFLOW_EXTRAS=""
ARG ADDITIONAL_PYTHON_DEPS=""
diff --git a/INSTALL b/INSTALL
index 47f48c3..d938ffb 100644
--- a/INSTALL
+++ b/INSTALL
@@ -97,8 +97,8 @@ devel_all, devel_ci, devel_hadoop, dingding, discord, doc,
docker, druid, elasti
facebook, ftp, gcp, gcp_api, github_enterprise, google, google_auth, grpc,
hashicorp, hdfs, hive,
http, imap, jdbc, jenkins, jira, kerberos, kubernetes, ldap, leveldb,
microsoft.azure,
microsoft.mssql, microsoft.psrp, microsoft.winrm, mongo, mssql, mysql, neo4j,
odbc, openfaas,
-opsgenie, oracle, pagerduty, papermill, password, pinot, plexus, postgres,
presto, qds, qubole,
-rabbitmq, redis, s3, salesforce, samba, segment, sendgrid, sentry, sftp,
singularity, slack,
+opsgenie, oracle, pagerduty, pandas, papermill, password, pinot, plexus,
postgres, presto, qds,
+qubole, rabbitmq, redis, s3, salesforce, samba, segment, sendgrid, sentry,
sftp, singularity, slack,
snowflake, spark, sqlite, ssh, statsd, tableau, telegram, trino, vertica,
virtualenv, webhdfs,
winrm, yandex, zendesk
diff --git a/UPDATING.md b/UPDATING.md
index b18dff5..a0cd1d6 100644
--- a/UPDATING.md
+++ b/UPDATING.md
@@ -73,6 +73,19 @@ https://developers.google.com/style/inclusive-documentation
-->
+### `pandas` is now an optional dependency
+
+Previously `pandas` was a core requirement, so when you ran `pip install
apache-airflow` it looked for the `pandas`
+library and installed it if it did not exist.
+
+If you want to install `pandas` compatible with Airflow, you can use
`[pandas]` extra while
+installing Airflow, example for Python 3.8 and Airflow 2.1.2:
+
+```shell
+pip install -U "apache-airflow[pandas]==2.1.2" \
+  --constraint
"https://raw.githubusercontent.com/apache/airflow/constraints-2.1.2/constraints-3.8.txt"
+```
+
### Dummy trigger rule has been deprecated
`TriggerRule.DUMMY` is replaced by `TriggerRule.ALWAYS`.
diff --git a/airflow/executors/celery_executor.py
b/airflow/executors/celery_executor.py
index e3fc398..56edb6e 100644
--- a/airflow/executors/celery_executor.py
+++ b/airflow/executors/celery_executor.py
@@ -183,7 +183,6 @@ def on_celery_import_modules(*args, **kwargs):
doesn't matter, but for short tasks this starts to be a noticeable impact.
"""
import jinja2.ext # noqa: F401
- import numpy # noqa: F401
import airflow.jobs.local_task_job
import airflow.macros
@@ -192,6 +191,11 @@ def on_celery_import_modules(*args, **kwargs):
import airflow.operators.subdag # noqa: F401
try:
+ import numpy # noqa: F401
+ except ImportError:
+ pass
+
+ try:
import kubernetes.client # noqa: F401
except ImportError:
pass
diff --git a/airflow/hooks/dbapi.py b/airflow/hooks/dbapi.py
index bac75a2..4156500 100644
--- a/airflow/hooks/dbapi.py
+++ b/airflow/hooks/dbapi.py
@@ -129,7 +129,10 @@ class DbApiHook(BaseHook):
:param kwargs: (optional) passed into pandas.io.sql.read_sql method
:type kwargs: dict
"""
- from pandas.io import sql as psql
+ try:
+ from pandas.io import sql as psql
+ except ImportError:
+ raise Exception("pandas library not installed, run: pip install
'apache-airflow[pandas]'.")
with closing(self.get_conn()) as conn:
return psql.read_sql(sql, con=conn, params=parameters, **kwargs)
diff --git a/airflow/utils/json.py b/airflow/utils/json.py
index 5847ef4..d859fd1 100644
--- a/airflow/utils/json.py
+++ b/airflow/utils/json.py
@@ -19,10 +19,14 @@
from datetime import date, datetime
from decimal import Decimal
-import numpy as np
from flask.json import JSONEncoder
try:
+ import numpy as np
+except ImportError:
+ np = None
+
+try:
from kubernetes.client import models as k8s
except ImportError:
k8s = None
@@ -51,7 +55,7 @@ class AirflowJsonEncoder(JSONEncoder):
# Technically lossy due to floating point errors, but the best we
# can do without implementing a custom encode function.
return float(obj)
- elif isinstance(
+ elif np is not None and isinstance(
obj,
(
np.int_,
@@ -68,9 +72,9 @@ class AirflowJsonEncoder(JSONEncoder):
),
):
return int(obj)
- elif isinstance(obj, np.bool_):
+ elif np is not None and isinstance(obj, np.bool_):
return bool(obj)
- elif isinstance(
+ elif np is not None and isinstance(
obj, (np.float_, np.float16, np.float32, np.float64, np.complex_,
np.complex64, np.complex128)
):
return float(obj)
diff --git a/docs/apache-airflow/extra-packages-ref.rst
b/docs/apache-airflow/extra-packages-ref.rst
index ae88728..dba5e1c 100644
--- a/docs/apache-airflow/extra-packages-ref.rst
+++ b/docs/apache-airflow/extra-packages-ref.rst
@@ -62,6 +62,8 @@ python dependencies for the provided package.
+---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+
| leveldb | ``pip install 'apache-airflow[leveldb]'`` |
Required for use leveldb extra in google provider |
+---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+
+| pandas | ``pip install 'apache-airflow[pandas]'`` |
Install Pandas library compatible with Airflow |
++---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+
| password | ``pip install 'apache-airflow[password]'`` |
Password authentication for users |
+---------------------+-----------------------------------------------------+----------------------------------------------------------------------------+
| rabbitmq | ``pip install 'apache-airflow[rabbitmq]'`` |
RabbitMQ support as a Celery backend |
diff --git a/setup.cfg b/setup.cfg
index d3c5f57..69ad425 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -126,9 +126,6 @@ install_requires =
numpy;python_version>="3.7"
# Required by vendored-in connexion
openapi-spec-validator>=0.2.4
- # Pandas stopped releasing 3.6 binaries for 1.2.* series.
- pandas>=0.17.1, <1.2;python_version<"3.7"
- pandas>=0.17.1, <2.0;python_version>="3.7"
pendulum~=2.0
pep562~=1.0;python_version<"3.7"
psutil>=4.2.0, <6.0.0
diff --git a/setup.py b/setup.py
index 3b6650f..801721f 100644
--- a/setup.py
+++ b/setup.py
@@ -395,6 +395,9 @@ oracle = [
pagerduty = [
'pdpyras>=4.1.2,<5',
]
+pandas = [
+ 'pandas>=0.17.1, <2.0',
+]
papermill = [
'papermill[all]>=1.2.1',
'scrapbook[all]',
@@ -535,7 +538,7 @@ devel = [
'yamllint',
]
-devel_minreq = cgroups + devel + doc + kubernetes + mysql + password
+devel_minreq = cgroups + devel + doc + kubernetes + mysql + pandas + password
devel_hadoop = devel_minreq + hdfs + hive + kerberos + presto + webhdfs
# Dict of all providers which are part of the Apache Airflow repository
together with their requirements
@@ -636,6 +639,7 @@ CORE_EXTRAS_REQUIREMENTS: Dict[str, List[str]] = {
'kerberos': kerberos,
'ldap': ldap,
'leveldb': leveldb,
+ 'pandas': pandas,
'password': password,
'rabbitmq': rabbitmq,
'sentry': sentry,
@@ -765,7 +769,7 @@ _all_requirements = list({req for extras_reqs in
EXTRAS_REQUIREMENTS.values() fo
EXTRAS_REQUIREMENTS["all"] = _all_requirements
# All db user extras here
-EXTRAS_REQUIREMENTS["all_dbs"] = all_dbs
+EXTRAS_REQUIREMENTS["all_dbs"] = all_dbs + pandas
# This can be simplified to devel_hadoop + _all_requirements due to inclusions
# but we keep it for explicit sake. We are de-duplicating it anyway.