This is an automated email from the ASF dual-hosted git repository. kaxilnaik pushed a commit to branch v2-10-test in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/v2-10-test by this push: new 80a19904606 Remove Scarf tracking (#45865) (#45941) 80a19904606 is described below commit 80a19904606ecbfd1a6158cb733fbb7161f4d35f Author: Kaxil Naik <kaxiln...@apache.org> AuthorDate: Thu Jan 23 03:16:17 2025 +0530 Remove Scarf tracking (#45865) (#45941) --- README.md | 3 - RELEASE_NOTES.rst | 11 +- airflow/cli/commands/scheduler_command.py | 3 - airflow/config_templates/config.yml | 22 ---- airflow/reproducible_build.yaml | 4 +- airflow/settings.py | 7 -- airflow/utils/usage_data_collection.py | 123 --------------------- docs/apache-airflow/faq.rst | 23 ---- .../installation/installing-from-pypi.rst | 6 - tests/core/test_settings.py | 25 +---- tests/utils/test_usage_data_collection.py | 104 ----------------- 11 files changed, 8 insertions(+), 323 deletions(-) diff --git a/README.md b/README.md index 8da91a71f9a..0273ecbddb2 100644 --- a/README.md +++ b/README.md @@ -534,6 +534,3 @@ The CI infrastructure for Apache Airflow has been sponsored by: <a href="https://astronomer.io"><img src="https://assets2.astronomer.io/logos/logoForLIGHTbackground.png" alt="astronomer.io" width="250px"></a> <a href="https://aws.amazon.com/opensource/"><img src="docs/integration-logos/aws/aws-cloud-alt_light...@4x.png" alt="AWS OpenSource" width="130px"></a> - -<!-- telemetry/analytics pixel: --> -<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=1b5a5e3c-da81-42f5-befa-42d836bf1b54" alt="Tracking Pixel" /> diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index cb7572626b4..9ff9aeb5c4f 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -223,6 +223,11 @@ Airflow 2.10.0 (2024-08-15) Significant Changes ^^^^^^^^^^^^^^^^^^^ +Scarf based telemetry: Airflow now collect telemetry data (#39510) +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +Airflow integrates Scarf to collect basic usage data during operation. Deployments can opt-out of data collection by +setting the ``[usage_data_collection]enabled`` option to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. + Datasets no longer trigger inactive DAGs (#38891) """"""""""""""""""""""""""""""""""""""""""""""""" @@ -271,12 +276,6 @@ Previously known as hybrid executors, this new feature allows Airflow to use mul to use a specific executor that suits its needs best. A single DAG can contain tasks all using different executors. Please see the Airflow documentation for more details. Note: This feature is still experimental. See `documentation on Executor <https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/executor/index.html#using-multiple-executors-concurrently>`_ for a more detailed description. -Scarf based telemetry: Does Airflow collect any telemetry data? (#39510) -"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -Airflow integrates Scarf to collect basic usage data during operation. Deployments can opt-out of data collection by setting the ``[usage_data_collection]enabled`` option to False, or the SCARF_ANALYTICS=false environment variable. -See `FAQ on this <https://airflow.apache.org/docs/apache-airflow/stable/faq.html#does-airflow-collect-any-telemetry-data>`_ for more information. - - New Features """""""""""" - AIP-61 Hybrid Execution (`AIP-61 <https://github.com/apache/airflow/pulls?q=is%3Apr+label%3Aarea%3Ahybrid-executors+is%3Aclosed+milestone%3A%22Airflow+2.10.0%22>`_) diff --git a/airflow/cli/commands/scheduler_command.py b/airflow/cli/commands/scheduler_command.py index 96cfe1e2852..37fd399d2e0 100644 --- a/airflow/cli/commands/scheduler_command.py +++ b/airflow/cli/commands/scheduler_command.py @@ -33,7 +33,6 @@ from airflow.utils import cli as cli_utils from airflow.utils.cli import process_subdir from airflow.utils.providers_configuration_loader import providers_configuration_loaded from airflow.utils.scheduler_health import serve_health_check -from airflow.utils.usage_data_collection import usage_data_collection log = logging.getLogger(__name__) @@ -54,8 +53,6 @@ def scheduler(args: Namespace): """Start Airflow Scheduler.""" print(settings.HEADER) - usage_data_collection() - run_command_with_daemon_option( args=args, process_name="scheduler", diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index 613c5e3394a..06f51aff786 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -2735,25 +2735,3 @@ sensors: type: float example: ~ default: "604800" -usage_data_collection: - description: | - Airflow integrates `Scarf <https://about.scarf.sh/>`__ to collect basic platform and usage data - during operation. This data assists Airflow maintainers in better understanding how Airflow is used. - Insights gained from this telemetry are critical for prioritizing patches, minor releases, and - security fixes. Additionally, this information supports key decisions related to the development road map. - Check the FAQ doc for more information on what data is collected. - - Deployments can opt-out of analytics by setting the ``enabled`` option - to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. - Individual users can easily opt-out of analytics in various ways documented in the - `Scarf Do Not Track docs <https://docs.scarf.sh/gateway/#do-not-track>`__. - - options: - enabled: - description: | - Enable or disable usage data collection and sending. - version_added: 2.10.0 - type: boolean - example: ~ - default: "True" - see_also: ":ref:`Usage data collection FAQ <usage-data-collection>`" diff --git a/airflow/reproducible_build.yaml b/airflow/reproducible_build.yaml index 9f7c6d5a100..eef20e83cad 100644 --- a/airflow/reproducible_build.yaml +++ b/airflow/reproducible_build.yaml @@ -1,2 +1,2 @@ -release-notes-hash: 7be47e2ddbbe1bfbd0d3f572d2b7800a -source-date-epoch: 1736532824 +release-notes-hash: 4c64543422c2823b475306f5e634d598 +source-date-epoch: 1737575461 diff --git a/airflow/settings.py b/airflow/settings.py index 7e9626d788f..85d56d4be85 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -800,13 +800,6 @@ def initialize(): atexit.register(dispose_orm) -def is_usage_data_collection_enabled() -> bool: - """Check if data collection is enabled.""" - return conf.getboolean("usage_data_collection", "enabled", fallback=True) and ( - os.getenv("SCARF_ANALYTICS", "").strip().lower() != "false" - ) - - # Const stuff KILOBYTE = 1024 diff --git a/airflow/utils/usage_data_collection.py b/airflow/utils/usage_data_collection.py deleted file mode 100644 index 3bdfb180fa9..00000000000 --- a/airflow/utils/usage_data_collection.py +++ /dev/null @@ -1,123 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -This module is for management of Airflow's usage data collection. - -This module is not part of the public interface and is subject to change at any time. - -:meta private: -""" - -from __future__ import annotations - -import os -import platform -from urllib.parse import urlencode - -import httpx -from packaging.version import parse - -from airflow import __version__ as airflow_version, settings -from airflow.configuration import conf - - -def usage_data_collection(): - if not settings.is_usage_data_collection_enabled(): - return - - # Exclude pre-releases and dev versions - if _version_is_prerelease(airflow_version): - return - - # Exclude CI environments - if _is_ci_environ(): - return - - scarf_domain = "https://apacheairflow.gateway.scarf.sh/scheduler" - - try: - platform_sys, arch = get_platform_info() - - params = { - "version": airflow_version, - "python_version": get_python_version(), - "platform": platform_sys, - "arch": arch, - "database": get_database_name(), - "db_version": get_database_version(), - "executor": get_executor(), - } - - query_string = urlencode(params) - scarf_url = f"{scarf_domain}?{query_string}" - - httpx.get(scarf_url, timeout=5.0) - except Exception: - pass - - -def _version_is_prerelease(version: str) -> bool: - return parse(version).is_prerelease - - -def _is_ci_environ() -> bool: - """Return True if running in any known CI environment.""" - if os.getenv("CI") == "true": - # Generic CI variable set by many CI systems (GH Actions, Travis, GitLab, CircleCI, Jenkins, Heroku) - return True - - # Other CI variables set by specific CI systems - ci_env_vars = { - "CIRCLECI", # CircleCI - "CODEBUILD_BUILD_ID", # AWS CodeBuild - "GITHUB_ACTIONS", # GitHub Actions - "GITLAB_CI", # GitLab CI - "JENKINS_URL", # Jenkins - "TF_BUILD", # Azure Pipelines - "TRAVIS", # Travis CI - } - - return any(var in os.environ for var in ci_env_vars) - - -def get_platform_info() -> tuple[str, str]: - return platform.system(), platform.machine() - - -def get_database_version() -> str: - if settings.engine is None: - return "None" - - version_info = settings.engine.dialect.server_version_info - # Example: (1, 2, 3) -> "1.2" (cut only major+minor w/o patch) - return ".".join(map(str, version_info[0:2])) if version_info else "None" - - -def get_database_name() -> str: - if settings.engine is None: - return "None" - return settings.engine.dialect.name - - -def get_executor() -> str: - return conf.get("core", "EXECUTOR") - - -def get_python_version() -> str: - # Cut only major+minor from the python version string (e.g. 3.10.12 --> 3.10) - return ".".join(platform.python_version().split(".")[0:2]) diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 6021ba514ad..7cef74c5f44 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -522,26 +522,3 @@ This means ``explicit_defaults_for_timestamp`` is disabled in your mysql server #. Set ``explicit_defaults_for_timestamp = 1`` under the ``mysqld`` section in your ``my.cnf`` file. #. Restart the Mysql server. - -Does Airflow collect any telemetry data? ----------------------------------------- - -.. _usage-data-collection: - -Airflow integrates `Scarf <https://about.scarf.sh/>`__ to collect basic usage data during operation. -This data assists Airflow maintainers in better understanding how Airflow is used. -Insights gained from this data are helpful for prioritizing patches, minor releases, and -security fixes. Additionally, this information supports key decisions related to the development road map. - -Deployments can opt-out of data collection by setting the :ref:`[usage_data_collection] enabled <config:usage_data_collection__enabled>` -option to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. -Individual users can easily opt-out of analytics in various ways documented in the -`Scarf Do Not Track docs <https://docs.scarf.sh/gateway/#do-not-track>`__. - -The telemetry data collected is limited to the following: - -- Airflow version -- Python version -- Operating system & machine architecture -- Executor -- Metadata DB type & its version diff --git a/docs/apache-airflow/installation/installing-from-pypi.rst b/docs/apache-airflow/installation/installing-from-pypi.rst index 8c689da5e1f..a62d87d9054 100644 --- a/docs/apache-airflow/installation/installing-from-pypi.rst +++ b/docs/apache-airflow/installation/installing-from-pypi.rst @@ -330,12 +330,6 @@ dependencies compatible with just airflow core at the moment Airflow was release # For example: https://raw.githubusercontent.com/apache/airflow/constraints-|version|/constraints-no-providers-3.8.txt pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" - -.. note:: - - Airflow uses `Scarf <https://about.scarf.sh/>`__ to collect basic usage data during operation. - Check the :ref:`Usage data collection FAQ <usage-data-collection>` for more information about the data collected and how to opt-out. - Troubleshooting ''''''''''''''' diff --git a/tests/core/test_settings.py b/tests/core/test_settings.py index 483ef24e25f..3a0c33b08b6 100644 --- a/tests/core/test_settings.py +++ b/tests/core/test_settings.py @@ -31,7 +31,7 @@ from airflow.__main__ import configure_internal_api from airflow.api_internal.internal_api_call import InternalApiConfig from airflow.configuration import conf from airflow.exceptions import AirflowClusterPolicyViolation, AirflowConfigException -from airflow.settings import _ENABLE_AIP_44, TracebackSession, is_usage_data_collection_enabled +from airflow.settings import _ENABLE_AIP_44, TracebackSession from airflow.utils.session import create_session from tests.test_utils.config import conf_vars @@ -368,26 +368,3 @@ def test_create_session_ctx_mgr_no_call_methods(mock_new, clear_internal_api): assert session == m method_calls = [x[0] for x in m.method_calls] assert method_calls == [] # commit and close not called when using internal API - - -@pytest.mark.parametrize( - "env_var, conf_setting, is_enabled", - [ - ("false", "True", False), # env forces disable - ("false", "False", False), # Both force disable - ("False ", "False", False), # Both force disable - ("true", "True", True), # Both enable - ("true", "False", False), # Conf forces disable - (None, "True", True), # Default env, conf enables - (None, "False", False), # Default env, conf disables - ], -) -def test_usage_data_collection_disabled(env_var, conf_setting, is_enabled, clear_internal_api): - conf_patch = conf_vars({("usage_data_collection", "enabled"): conf_setting}) - - if env_var is not None: - with conf_patch, patch.dict(os.environ, {"SCARF_ANALYTICS": env_var}): - assert is_usage_data_collection_enabled() == is_enabled - else: - with conf_patch: - assert is_usage_data_collection_enabled() == is_enabled diff --git a/tests/utils/test_usage_data_collection.py b/tests/utils/test_usage_data_collection.py deleted file mode 100644 index 143bce39eca..00000000000 --- a/tests/utils/test_usage_data_collection.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -from __future__ import annotations - -import platform -from unittest import mock - -import pytest - -from airflow import __version__ as airflow_version -from airflow.configuration import conf -from airflow.utils.usage_data_collection import ( - get_database_version, - get_python_version, - usage_data_collection, -) - - -@pytest.mark.parametrize("is_enabled, is_prerelease", [(False, True), (True, True)]) -@mock.patch("httpx.get") -def test_scarf_analytics_disabled(mock_get, is_enabled, is_prerelease): - with mock.patch("airflow.settings.is_usage_data_collection_enabled", return_value=is_enabled), mock.patch( - "airflow.utils.usage_data_collection._version_is_prerelease", return_value=is_prerelease - ): - usage_data_collection() - mock_get.assert_not_called() - - -@mock.patch("airflow.settings.is_usage_data_collection_enabled", return_value=True) -@mock.patch("airflow.utils.usage_data_collection._version_is_prerelease", return_value=False) -@mock.patch("airflow.utils.usage_data_collection._is_ci_environ", return_value=False) -@mock.patch("airflow.utils.usage_data_collection.get_database_version", return_value="12.3") -@mock.patch("airflow.utils.usage_data_collection.get_database_name", return_value="postgres") -@mock.patch("httpx.get") -def test_scarf_analytics( - mock_get, - mock_is_usage_data_collection_enabled, - mock_version_is_ci, - mock_version_is_prerelease, - get_database_version, - get_database_name, -): - platform_sys = platform.system() - platform_machine = platform.machine() - python_version = get_python_version() - executor = conf.get("core", "EXECUTOR") - scarf_endpoint = "https://apacheairflow.gateway.scarf.sh/scheduler" - usage_data_collection() - - expected_scarf_url = ( - f"{scarf_endpoint}?version={airflow_version}" - f"&python_version={python_version}" - f"&platform={platform_sys}" - f"&arch={platform_machine}" - f"&database=postgres" - f"&db_version=12.3" - f"&executor={executor}" - ) - - mock_get.assert_called_once_with(expected_scarf_url, timeout=5.0) - - -@pytest.mark.skip_if_database_isolation_mode -@pytest.mark.db_test -@pytest.mark.parametrize( - "version_info, expected_version", - [ - ((1, 2, 3), "1.2"), # Normal version tuple - (None, "None"), # No version info available - ((1,), "1"), # Single element version tuple - ((1, 2, 3, "beta", 4), "1.2"), # Complex version tuple with strings - ], -) -def test_get_database_version(version_info, expected_version): - with mock.patch("airflow.settings.engine.dialect.server_version_info", new=version_info): - assert get_database_version() == expected_version - - -@pytest.mark.parametrize( - "version_info, expected_version", - [ - ("1.2.3", "1.2"), # Normal version - ("4", "4"), # Single element version - ("1.2.3.beta4", "1.2"), # Complex version tuple with strings - ], -) -def test_get_python_version(version_info, expected_version): - with mock.patch("platform.python_version", return_value=version_info): - assert get_python_version() == expected_version