This is an automated email from the ASF dual-hosted git repository.

mobuchowski pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new fb6511212e docs: Update whole OpenLineage Provider docs. (#37620)
fb6511212e is described below

commit fb6511212e6b5d552b69fdd05cb8c9501cc1ab18
Author: Kacper Muda <[email protected]>
AuthorDate: Wed Feb 28 15:13:40 2024 +0100

    docs: Update whole OpenLineage Provider docs. (#37620)
---
 airflow/providers/openlineage/provider.yaml        |  37 +-
 .../configurations-ref.rst                         |   3 +
 .../guides/developer.rst                           | 412 +++++++++++++++++++--
 .../guides/structure.rst                           |  51 ++-
 .../guides/user.rst                                | 244 ++++++++++--
 .../apache-airflow-providers-openlineage/index.rst |  28 +-
 .../macros.rst                                     |   6 +-
 .../supported_classes.rst                          |  15 +-
 8 files changed, 693 insertions(+), 103 deletions(-)

diff --git a/airflow/providers/openlineage/provider.yaml 
b/airflow/providers/openlineage/provider.yaml
index 5ba4a0ee32..6871e05e8d 100644
--- a/airflow/providers/openlineage/provider.yaml
+++ b/airflow/providers/openlineage/provider.yaml
@@ -58,65 +58,68 @@ config:
   openlineage:
     description: |
       This section applies settings for OpenLineage integration.
-      For backwards compatibility with `openlineage-python` one can still use
-      `openlineage.yml` file or `OPENLINEAGE_` environment variables. However, 
below
-      configuration takes precedence over those.
-      More in documentation - 
https://openlineage.io/docs/client/python#configuration.
+      More about configuration and its precedence can be found at
+      
https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/guides/user.html#transport-setup
     options:
       disabled:
         description: |
-          Set this to true if you don't want OpenLineage to emit events.
+          Disable sending events without uninstalling the OpenLineage Provider 
by setting this to true.
         type: boolean
         example: ~
         default: "False"
         version_added: ~
       disabled_for_operators:
         description: |
-          Semicolon separated string of Airflow Operator names to disable
+          Exclude some Operators from emitting OpenLineage events by passing a 
string of semicolon separated
+          full import paths of Operators to disable.
         type: string
         example: 
"airflow.operators.bash.BashOperator;airflow.operators.python.PythonOperator"
         default: ""
         version_added: 1.1.0
       namespace:
         description: |
-          OpenLineage namespace
+          Set namespace that the lineage data belongs to, so that if you use 
multiple OpenLineage producers,
+          events coming from them will be logically separated.
         version_added: ~
         type: string
-        example: "food_delivery"
+        example: "my_airflow_instance_1"
         default: ~
       extractors:
         description: |
-          Semicolon separated paths to custom OpenLineage extractors.
+          Register custom OpenLineage Extractors by passing a string of 
semicolon separated full import paths.
         type: string
         example: full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass
         default: ~
         version_added: ~
       config_path:
         description: |
-          Path to YAML config. This provides backwards compatibility to pass 
config as
-          `openlineage.yml` file.
+          Specify the path to the YAML configuration file.
+          This ensures backwards compatibility with passing config through the 
`openlineage.yml` file.
         version_added: ~
         type: string
-        example: ~
+        example: "full/path/to/openlineage.yml"
         default: ""
       transport:
         description: |
-          OpenLineage Client transport configuration. It should contain type
-          and additional options per each type.
+          Pass OpenLineage Client transport configuration as JSON string. It 
should contain type of the
+          transport and additional options (different for each transport 
type). For more details see:
+          https://openlineage.io/docs/client/python/#built-in-transport-types
 
           Currently supported types are:
 
             * HTTP
             * Kafka
             * Console
+            * File
         type: string
-        example: '{"type": "http", "url": "http://localhost:5000"}'
+        example: '{"type": "http", "url": "http://localhost:5000", "endpoint": 
"api/v1/lineage"}'
         default: ""
         version_added: ~
       disable_source_code:
         description: |
-          If disabled, OpenLineage events do not contain source code of 
particular
-          operators, like PythonOperator.
+          Disable the inclusion of source code in OpenLineage events by 
setting this to `true`.
+          By default, several Operators (e.g. Python, Bash) will include their 
source code in the events
+          unless disabled.
         default: ~
         example: ~
         type: boolean
diff --git a/docs/apache-airflow-providers-openlineage/configurations-ref.rst 
b/docs/apache-airflow-providers-openlineage/configurations-ref.rst
index 5885c9d91b..2e8c5e2ad6 100644
--- a/docs/apache-airflow-providers-openlineage/configurations-ref.rst
+++ b/docs/apache-airflow-providers-openlineage/configurations-ref.rst
@@ -15,4 +15,7 @@
     specific language governing permissions and limitations
     under the License.
 
+
+.. _configuration:openlineage:
+
 .. include:: ../exts/includes/providers-configurations-ref.rst
diff --git a/docs/apache-airflow-providers-openlineage/guides/developer.rst 
b/docs/apache-airflow-providers-openlineage/guides/developer.rst
index 360dc2616b..035774e69a 100644
--- a/docs/apache-airflow-providers-openlineage/guides/developer.rst
+++ b/docs/apache-airflow-providers-openlineage/guides/developer.rst
@@ -23,6 +23,42 @@
 Implementing OpenLineage in Operators
 -------------------------------------
 
+OpenLineage makes adding lineage to your data pipelines easy through support 
of direct modification of Airflow Operators.
+When it's possible to modify the Operator, adding lineage extraction can be as 
easy as adding a single method to it.
+See :ref:`openlineage_methods:openlineage` for more details.
+
+There might be some Operators that you can not modify (f.e. third party 
providers), but still want the lineage to be extracted from them.
+To handle this situation, OpenLineage allows you to provide custom Extractor 
for any Operator.
+See :ref:`custom_extractors:openlineage` for more details.
+
+If all of the above can not be implemented, as a fallback, there is a way to 
manually annotate lineage.
+Airflow allows Operators to track lineage by specifying the input and outputs 
of the Operators via inlets and outlets.
+See :ref:`inlets_outlets:openlineage` for more details.
+
+.. _extraction_precedence:openlineage:
+
+Extraction precedence
+=====================
+
+As there are multiple possible ways of implementing OpenLineage support for 
the Operator,
+it's important to keep in mind the order in which OpenLineage looks for 
lineage data:
+
+1. **Extractor** - check if there is a custom Extractor specified for Operator 
class name. Any custom Extractor registered by the user will take precedence 
over default Extractors defined in Airflow Provider source code (f.e. 
BashExtractor).
+2. **OpenLineage methods** - if there is no Extractor explicitly specified for 
Operator class name, DefaultExtractor is used, that looks for OpenLineage 
methods in Operator.
+3. **Inlets and Outlets** - if there are no OpenLineage methods defined in the 
Operator, inlets and outlets are checked.
+
+If all the above options are missing, no lineage data is extracted from the 
Operator. You will still receive OpenLineage events
+enriched with things like general Airflow facets, proper event time and type, 
but the inputs/outputs will be empty
+and Operator-specific facets will be missing.
+
+.. _openlineage_methods:openlineage:
+
+OpenLineage methods
+===================
+
+This approach is recommended when dealing with your own Operators, where you 
can directly implement OpenLineage methods.
+When dealing with Operators that you can not modify (f.e. third party 
providers), but still want the lineage to be extracted from them, see 
:ref:`custom_extractors:openlineage`.
+
 OpenLineage defines a few methods for implementation in Operators. Those are 
referred to as OpenLineage methods.
 
 .. code-block:: python
@@ -55,11 +91,11 @@ Instead of returning complete OpenLineage event, the 
provider defines ``Operator
 OpenLineage integration itself takes care to enrich it with things like 
general Airflow facets, proper event time and type, creating proper OpenLineage 
RunEvent.
 
 How to properly implement OpenLineage methods?
-==============================================
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-There are a couple of things worth noting when implementing OpenLineage in 
operators.
+There are a couple of things worth noting when implementing OpenLineage in 
Operators.
 
-First, do not import OpenLineage methods on top-level, but in OL method itself.
+First, do not import OpenLineage-related objects on top-level, but in OL 
method itself.
 This allows users to use your provider even if they do not have OpenLineage 
provider installed.
 
 Second important point is to make sure your provider returns 
OpenLineage-compliant dataset names.
@@ -80,9 +116,35 @@ before execute, there might be no point in writing 
``_on_start`` method.
 Similarly, if there's no relevant failure data - or the failure conditions are 
unknown,
 implementing ``get_openlineage_facets_on_failure`` is probably not worth it.
 
-Here's example of properly implemented ``get_openlineage_facets_on_complete`` 
method, for ``GcsToGcsOperator``.
+How to test OpenLineage methods?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Unit testing OpenLineage integration in Operators is very similar to testing 
Operators itself.
+Objective of those tests is making sure the ``get_openlineage_*`` methods 
return proper ``OperatorLineage``
+data structure with relevant fields filled. It's recommended to mock any 
external calls.
+Authors of tests need to remember the condition of calling different OL 
methods is different.
+``get_openlineage_facets_on_start`` is called before ``execute``, and as such, 
must not depend on values
+that are set there.
+
+See :ref:`local_troubleshooting:openlineage` for details on how to 
troubleshoot OpenLineage locally.
 
-.. code-block::
+There is no existing framework for system testing OpenLineage integration, but 
the easiest way it can be achieved is
+by comparing emitted events (f.e. with ``FileTransport``) against expected 
ones.
+Objective of author of OpenLineage system test is to provide expected 
dictionary of event keys.
+Event keys identify the event sent from a particular Operator and method: they have 
structure ``<dag_id>.<task_id>.event.<event_type>``;
+it's always possible to identify a particular event sent from a particular task 
this way.
+The provided event structure does not have to contain all the fields that are 
in the resulting event.
+Only the fields provided by test author can be compared; this allows to check 
only for fields particular
+test cares about. It also allows to skip fields that are (semi) randomly 
generated, like ``runId`` or ``eventTime``,
+or just always the same in context of OpenLineage in Airflow, like 
``producer``.
+
+Example
+^^^^^^^
+
+Here's example of properly implemented ``get_openlineage_facets_on_complete`` 
method, for `GcsToGcsOperator 
<https://github.com/apache/airflow/blob/main/airflow/providers/google/cloud/transfers/gcs_to_gcs.py>`_.
+As there is some processing made in ``execute`` method, and there is no 
relevant failure data, implementing this single method is enough.
+
+.. code-block::  python
 
     def get_openlineage_facets_on_complete(self, task_instance):
         """
@@ -104,26 +166,330 @@ Here's example of properly implemented 
``get_openlineage_facets_on_complete`` me
             ],
         )
 
+For more examples of implemented OpenLineage methods, check out the source 
code of :ref:`supported_classes:openlineage`.
 
-How to add tests to OpenLineage integration?
-============================================
+.. _custom_extractors:openlineage:
 
-Unit testing OpenLineage integration in operators is very similar to testing 
operators itself.
-Objective of those tests is making sure the ``get_openlineage_*`` methods 
return proper ``OperatorLineage``
-data structure with relevant fields filled. It's recommended to mock any 
external calls.
-Authors of tests need to remember the condition of calling different OL 
methods is different.
-``get_openlineage_facets_on_start`` is called before ``execute``, and as such, 
must not depend on values
-that are set there.
+Custom Extractors
+=================
 
-System testing OpenLineage integration relies on the existing system test 
framework.
-There is special ``VariableTransport`` that gathers OpenLineage events in 
Airflow database,
-and ``OpenLineageTestOperator`` that compares those events to expected ones. 
Objective of author
-of OpenLineage system test is to provide expected dictionary of event keys and 
events to ``OpenLineageTestOperator``.
+This approach is recommended when dealing with Operators that you can not 
modify (f.e. third party providers), but still want the lineage to be extracted 
from them.
+If you want to extract lineage from your own Operators, you may prefer 
directly implementing OpenLineage methods as described in 
:ref:`openlineage_methods:openlineage`.
 
-Event keys identify event send from particular operator and method: they have 
structure ``<dag_id>.<task_id>.event.<event_type>``;
-it's always possible to identify particular event send from particular task 
this way.
+This approach works by detecting which Airflow Operators your DAG is using, 
and extracting lineage data from them using the corresponding Extractor class.
 
-The provided event structure does not have to contain all the fields that are 
in the resulting event.
-Only the fields provided by test author are compared; this allows to check 
only for fields particular
-test cares about. It also allows to skip fields that are (semi) randomly 
generated, like ``runId`` or ``eventTime``,
-or just always the same in context of OpenLineage in Airflow, like 
``producer``.
+Interface
+^^^^^^^^^
+
+Custom Extractors have to derive from :class:`BaseExtractor 
<airflow.providers.openlineage.extractors.base.BaseExtractor>`
+and implement at least two methods: ``_execute_extraction`` and 
``get_operator_classnames``.
+
+``BaseExtractor`` defines two methods: ``extract`` and ``extract_on_complete``, 
that are called and used to provide actual lineage data.
+The difference is that ``extract`` is called before Operator's ``execute`` 
method, while ``extract_on_complete`` is called after.
+By default, ``extract`` calls ``_execute_extraction`` method implemented in 
custom Extractor, and ``extract_on_complete``
+calls the ``extract`` method. If you want to provide some additional 
information available after the task execution, you can
+override ``extract_on_complete`` method. This can be used to extract any 
additional information that the Operator
+sets on its own properties. A good example is ``SnowflakeOperator`` that sets 
``query_ids`` after execution.
+
+The ``get_operator_classnames`` is a classmethod that is used to provide list 
of Operators that your Extractor can get lineage from.
+
+For example:
+
+.. code-block::  python
+
+    @classmethod
+    def get_operator_classnames(cls) -> List[str]:
+      return ['PostgresOperator']
+
+If the name of the Operator matches one of the names on the list, the 
Extractor will be instantiated - with Operator
+provided in the Extractor's ``self.operator`` property - and both ``extract`` 
and ``extract_on_complete`` methods will be called.
+
+Both methods return ``OperatorLineage`` structure:
+
+.. code-block::  python
+
+    @define
+    class OperatorLineage:
+        """Structure returned from lineage extraction."""
+
+        inputs: list[Dataset] = Factory(list)
+        outputs: list[Dataset] = Factory(list)
+        run_facets: dict[str, BaseFacet] = Factory(dict)
+        job_facets: dict[str, BaseFacet] = Factory(dict)
+
+
+Inputs and outputs are lists of plain OpenLineage datasets 
(`openlineage.client.run.Dataset`).
+
+``run_facets`` and ``job_facets`` are dictionaries of optional RunFacets and 
JobFacets that would be attached to the job - for example,
+you might want to attach ``SqlJobFacet`` if your Operator is executing SQL.
+
+To learn more about facets in OpenLineage see :ref:`custom_facets:openlineage`.
+
+Registering Custom Extractor
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+OpenLineage integration does not know that you've provided an Extractor unless 
you register it.
+
+It can be done by using ``extractors`` option in Airflow configuration.
+
+.. code-block:: ini
+
+    [openlineage]
+    transport = '{"type": "http", "url": "http://example.com:5000"}'
+    extractors = full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass
+
+``AIRFLOW__OPENLINEAGE__EXTRACTORS`` environment variable is an equivalent.
+
+.. code-block:: ini
+
+  
AIRFLOW__OPENLINEAGE__EXTRACTORS='full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass'
+
+Optionally, you can separate them with whitespace. It's useful if you're 
providing them as part of some YAML file.
+
+.. code-block:: ini
+
+    AIRFLOW__OPENLINEAGE__EXTRACTORS: >-
+      full.path.to.FirstExtractor;
+      full.path.to.SecondExtractor
+
+
+Remember to make sure that the path is importable for scheduler and worker.
+
+Debugging Custom Extractor
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are two common problems associated with custom Extractors.
+
+First is a wrong path provided to the ``extractors`` option in Airflow 
configuration. The path needs to be exactly the same as the one you'd use from 
your code.
+If the path is wrong or non-importable from worker, plugin will fail to load 
the Extractors and proper OpenLineage events for that Operator won't be emitted.
+
+Second one, and maybe more insidious, are imports from Airflow. Due to the 
fact that OpenLineage code gets instantiated when Airflow worker itself starts,
+any import from Airflow can be unnoticeably cyclical. This causes OpenLineage 
extraction to fail.
+
+To avoid this issue, import from Airflow only locally - in 
``_execute_extraction`` or ``extract_on_complete`` methods.
+If you need imports for type checking, guard them behind typing.TYPE_CHECKING.
+
+
+Testing Custom Extractor
+^^^^^^^^^^^^^^^^^^^^^^^^
+As all code, custom Extractors should be tested. This section will provide 
some information about the most important
+data structures to write tests for and some notes on troubleshooting. We 
assume prior knowledge of writing custom Extractors.
+To learn more about how Operators and Extractors work together under the hood, 
check out :ref:`custom_extractors:openlineage`.
+
+When testing an Extractor, we want to firstly verify if ``OperatorLineage`` 
object is being created,
+specifically verifying that the object is being built with the correct input 
and output datasets and relevant facets.
+This is done in OpenLineage via pytest, with appropriate mocking and patching 
for connections and objects.
+Check out `example tests 
<https://github.com/apache/airflow/blob/main/tests/providers/openlineage/extractors/test_base.py>`_.
+
+Testing each facet is also important, as data or graphs in the UI can render 
incorrectly if the facets are wrong.
+For example, if the facet name is created incorrectly in the Extractor, then 
the Operator's task will not show up in the lineage graph,
+creating a gap in pipeline observability.
+
+Even with unit tests, an Extractor may still not be operating as expected.
+The easiest way to tell if data isn't coming through correctly is if the UI 
elements are not showing up correctly in the Lineage tab.
+
+See :ref:`local_troubleshooting:openlineage` for details on how to 
troubleshoot OpenLineage locally.
+
+Example
+^^^^^^^
+
+This is an example of a simple Extractor for an Operator that executes export 
Query in BigQuery and saves the result to S3 file.
+Some information is known before Operator's ``execute`` method is called, and 
we can already extract some lineage in ``_execute_extraction`` method.
+After Operator's ``execute`` method is called, in ``extract_on_complete``, we 
can simply attach some additional Facets
+f.e. with Bigquery Job ID to what we've prepared earlier. This way, we get all 
possible information from the Operator.
+
+Please note that this is just an example. There are some OpenLineage built-in 
features that can facilitate different processes,
+like extracting column level lineage and inputs/outputs from SQL query with 
SQL parser.
+
+.. code-block:: python
+
+    from openlineage.client.facet import BaseFacet, ExternalQueryRunFacet, 
SqlJobFacet
+    from openlineage.client.run import Dataset
+
+    from airflow.models.baseoperator import BaseOperator
+    from airflow.providers.openlineage.extractors.base import BaseExtractor
+
+
+    class ExampleOperator(BaseOperator):
+        def __init__(self, query, bq_table_reference, s3_path) -> None:
+            self.bq_table_reference = bq_table_reference
+            self.s3_path = s3_path
+            self.s3_file_name = s3_file_name
+            self.query = query
+            self._job_id = None
+
+        def execute(self, context) -> Any:
+            self._job_id = run_query(query=self.query)
+
+
+    class ExampleExtractor(BaseExtractor):
+        @classmethod
+        def get_operator_classnames(cls):
+            return ["ExampleOperator"]
+
+        def _execute_extraction(self) -> OperatorLineage:
+            """Define what we know before Operator's extract is called."""
+            return OperatorLineage(
+                inputs=[Dataset(namespace="bigquery", 
name=self.bq_table_reference)],
+                outputs=[Dataset(namespace=self.s3_path, 
name=self.s3_file_name)],
+                job_facets={
+                    "sql": SqlJobFacet(
+                        query="EXPORT INTO ... OPTIONS(FORMAT=csv, SEP=';' 
...) AS SELECT * FROM ... "
+                    )
+                },
+            )
+
+        def extract_on_complete(self) -> OperatorLineage:
+            """Add what we received after Operator's extract call."""
+            lineage_metadata = self.extract()
+            lineage_metadata.run_facets = {
+                "parent": ExternalQueryRunFacet(externalQueryId=self._job_id, 
source="bigquery")
+            }
+            return lineage_metadata
+
+For more examples of OpenLineage Extractors, check out the source code of
+`BashExtractor 
<https://github.com/apache/airflow/blob/main/airflow/providers/openlineage/extractors/bash.py>`_
 or
+`PythonExtractor 
<https://github.com/apache/airflow/blob/main/airflow/providers/openlineage/extractors/python.py>`_.
+
+.. _inlets_outlets:openlineage:
+
+Manually annotated lineage
+==========================
+
+This approach is rarely recommended, only in very specific cases, when it's 
impossible to extract some lineage information from the Operator itself.
+If you want to extract lineage from your own Operators, you may prefer 
directly implementing OpenLineage methods as described in 
:ref:`openlineage_methods:openlineage`.
+When dealing with Operators that you can not modify (f.e. third party 
providers), but still want the lineage to be extracted from them, see 
:ref:`custom_extractors:openlineage`.
+
+Airflow allows Operators to track lineage by specifying the input and outputs 
of the Operators via
+`inlets and outlets 
<https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage>`_.
+OpenLineage will, by default, use inlets and outlets as input/output datasets 
if it cannot find any successful extraction from the OpenLineage methods or the 
Extractors.
+
+.. important::
+
+    Airflow supports inlets and outlets to be either a Table, Column, File or 
User entity. However, currently OpenLineage only extracts lineage via Table 
entity
+
+
+Example
+^^^^^^^
+
+An Operator inside the Airflow DAG can be annotated with inlets and outlets 
like in the below example:
+
+.. code-block:: python
+
+    """Example DAG demonstrating the usage of the extraction via Inlets and 
Outlets."""
+
+    import pendulum
+    import datetime
+
+    from airflow import DAG
+    from airflow.operators.bash import BashOperator
+    from airflow.lineage.entities import Table, File
+
+
+    def create_table(cluster, database, name):
+        return Table(
+            database=database,
+            cluster=cluster,
+            name=name,
+        )
+
+
+    t1 = create_table("c1", "d1", "t1")
+    t2 = create_table("c1", "d1", "t2")
+    t3 = create_table("c1", "d1", "t3")
+    t4 = create_table("c1", "d1", "t4")
+    f1 = File(url="http://randomfile")
+
+    with DAG(
+        dag_id="example_operator",
+        schedule_interval="0 0 * * *",
+        start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+        dagrun_timeout=datetime.timedelta(minutes=60),
+        params={"example_key": "example_value"},
+    ) as dag:
+        task1 = BashOperator(
+            task_id="task_1_with_inlet_outlet",
+            bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
+            inlets=[t1, t2],
+            outlets=[t3],
+        )
+
+        task2 = BashOperator(
+            task_id="task_2_with_inlet_outlet",
+            bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
+            inlets=[t3, f1],
+            outlets=[t4],
+        )
+
+        task1 >> task2
+
+    if __name__ == "__main__":
+        dag.cli()
+
+Note that the ``File`` entity, defined in the example code, is not captured by 
the lineage event currently as described in the ``important`` box above.
+
+Conversion from Airflow Table entity to OpenLineage Dataset is made in the 
following way:
+- ``CLUSTER`` of the table entity becomes the namespace of OpenLineage's 
Dataset
+- The name of the dataset is formed by ``{{DATABASE}}.{{NAME}}`` where 
``DATABASE`` and ``NAME`` are attributes specified by Airflow's Table entity.
+
+.. _custom_facets:openlineage:
+
+Custom facets
+=============
+To learn more about facets in OpenLineage, please refer to `facet 
documentation <https://openlineage.io/docs/spec/facets/>`_.
+Also check out `available Facets 
<https://github.com/OpenLineage/OpenLineage/blob/main/client/python/openlineage/client/facet.py>`_
+
+The OpenLineage spec might not contain all the facets you need to write your 
extractor,
+in which case you will have to make your own `custom facets 
<https://openlineage.io/docs/spec/facets/custom-facets>`_.
+More on creating custom facets can be found `here 
<https://openlineage.io/blog/extending-with-facets/>`_.
+
+.. _job_hierarchy:openlineage:
+
+Job Hierarchy
+=============
+
+Apache Airflow features an inherent job hierarchy: DAGs, large and 
independently schedulable units, comprise smaller, executable tasks.
+
+OpenLineage reflects this structure in its Job Hierarchy model.
+
+- Upon DAG scheduling, a START event is emitted.
+- Subsequently, following Airflow's task order, each task triggers:
+
+  - START events at TaskInstance start.
+  - COMPLETE/FAILED events upon completion.
+
+- Finally, upon DAG termination, a completion event (COMPLETE or FAILED) is 
emitted.
+
+TaskInstance events' ParentRunFacet references the originating DAG run.
+
+.. _local_troubleshooting:openlineage:
+
+Local troubleshooting
+=====================
+
+When testing code locally, `Marquez 
<https://marquezproject.ai/docs/quickstart>`_ can be used to inspect the data 
being emitted—or not being emitted.
+Using Marquez will allow you to figure out if the error is being caused by the 
Extractor or the API.
+If data is being emitted from the Extractor as expected but isn't making it to 
the UI,
+then the Extractor is fine and an issue should be opened up in OpenLineage. 
However, if data is not being emitted properly,
+it is likely that more unit tests are needed to cover Extractor behavior.
+Marquez can help you pinpoint which facets are not being formed properly so 
you know where to add test coverage.
+
+Where can I learn more?
+=======================
+
+- Check out `OpenLineage website <https://openlineage.io>`_.
+- Visit our `GitHub repository <https://github.com/OpenLineage/OpenLineage>`_.
+- Watch multiple `talks <https://openlineage.io/resources#conference-talks>`_ 
about OpenLineage.
+
+Feedback
+========
+
+You can reach out to us on `slack <http://bit.ly/OpenLineageSlack>`_ and leave 
us feedback!
+
+
+How to contribute
+=================
+
+We welcome your contributions! OpenLineage is an Open Source project under 
active development, and we'd love your help!
+
+Sounds fun? Check out our `new contributor guide 
<https://github.com/OpenLineage/OpenLineage/blob/main/CONTRIBUTING.md>`_ to get 
started.
diff --git a/docs/apache-airflow-providers-openlineage/guides/structure.rst 
b/docs/apache-airflow-providers-openlineage/guides/structure.rst
index c91debd940..6d3d9584eb 100644
--- a/docs/apache-airflow-providers-openlineage/guides/structure.rst
+++ b/docs/apache-airflow-providers-openlineage/guides/structure.rst
@@ -17,16 +17,51 @@
     under the License.
 
 
-Structure of OpenLineage Airflow integration
+OpenLineage Airflow integration
 --------------------------------------------
 
-OpenLineage integration implements AirflowPlugin. This allows it to be 
discovered on Airflow start and
-register Airflow Listener.
+OpenLineage is an open framework for data lineage collection and analysis.
+At its core it is an extensible specification that systems can use to 
interoperate with lineage metadata.
+`Check out OpenLineage docs <https://openlineage.io/docs/>`_.
 
-The listener is then called when certain events happen in Airflow - when DAGs 
or TaskInstances start, complete or fail.
-For DAGs, the listener runs in Airflow Scheduler.
-For TaskInstances, the listener runs on Airflow Worker.
+Quickstart
+==========
+
+To instrument your Airflow instance with OpenLineage, see 
:ref:`guides/user:openlineage`.
+
+To implement OpenLineage support for Airflow Operators, see 
:ref:`guides/developer:openlineage`.
+
+What's in it for me?
+====================
+
+The metadata collected can answer questions like:
+
+- Why did specific data transformation fail?
+- What are the upstream sources feeding into certain dataset?
+- What downstream processes rely on this specific dataset?
+- Is my data fresh?
+- Can I identify the bottleneck in my data processing pipeline?
+- How did the latest code change affect data processing times?
+- How can I trace the cause of data inaccuracies in my report?
+- How are data privacy and compliance requirements being managed through the 
data's lifecycle?
+- Are there redundant data processes that can be optimized or removed?
+- What data dependencies exist for this critical report?
+
+Understanding complex inter-DAG dependencies and providing up-to-date runtime 
visibility into DAG execution can be challenging.
+OpenLineage integrates with Airflow to collect DAG lineage metadata so that 
inter-DAG dependencies are easily maintained
+and viewable via a lineage graph, while also keeping a catalog of historical 
runs of DAGs.
+
+For an OpenLineage backend that will receive events, you can use `Marquez 
<https://marquezproject.ai/>`_.
+
+How it works under the hood?
+============================
+
+OpenLineage integration implements `AirflowPlugin 
<https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/plugins.html>`_.
+This allows it to be discovered on Airflow start and register
+`Airflow Listener 
<https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/listeners.html>`_.
+
+The ``OpenLineageListener`` is then called by Airflow when certain events 
happen - when DAGs or TaskInstances start, complete or fail.
+For DAGs, the listener runs in Airflow Scheduler. For TaskInstances, the 
listener runs on Airflow Worker.
 
 When TaskInstance listener method gets called, the ``OpenLineageListener`` 
constructs metadata like event's unique ``run_id`` and event time.
-Then, it tries to find valid Extractor for given operator. The Extractors are 
a framework
-for external extraction of metadata from
+Then, it tries to extract metadata from Airflow Operators as described in 
:ref:`extraction_precedence:openlineage`.
diff --git a/docs/apache-airflow-providers-openlineage/guides/user.rst 
b/docs/apache-airflow-providers-openlineage/guides/user.rst
index c2859f00a7..2e642abc76 100644
--- a/docs/apache-airflow-providers-openlineage/guides/user.rst
+++ b/docs/apache-airflow-providers-openlineage/guides/user.rst
@@ -16,90 +16,282 @@
     specific language governing permissions and limitations
     under the License.
 
+.. _guides/user:openlineage:
 
 Using OpenLineage integration
 -----------------------------
 
+OpenLineage is an open framework for data lineage collection and analysis. At 
its core is an extensible specification that systems can use to interoperate 
with lineage metadata.
+`Check out OpenLineage docs <https://openlineage.io/docs/>`_.
+
+**No change to user DAG files is required to use OpenLineage**. Basic 
configuration is needed so that OpenLineage knows where to send events.
+
+Quickstart
+==========
+
+.. note::
+
+    OpenLineage Provider offers a diverse range of data transport options 
(http, kafka, file etc.),
+    including the flexibility to create a custom solution. Configuration can 
be managed through several approaches
+    and there is an extensive array of settings available for users to 
fine-tune and enhance their use of OpenLineage.
+    For a comprehensive explanation of these features, please refer to the 
subsequent sections of this document.
+
+This example is a basic demonstration of OpenLineage setup.
+
+1. Install provider package or add it to ``requirements.txt`` file.
+
+   .. code-block:: ini
+
+      pip install apache-airflow-providers-openlineage
+
+2. Provide a ``Transport`` configuration so that OpenLineage knows where to 
send the events. Within ``airflow.cfg`` file
+
+   .. code-block:: ini
+
+      [openlineage]
+      transport = '{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
+
+   or with ``AIRFLOW__OPENLINEAGE__TRANSPORT`` environment variable
+
+   .. code-block:: ini
+
+      AIRFLOW__OPENLINEAGE__TRANSPORT='{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
+
+3. **That's it!** OpenLineage events should be sent to the configured backend when DAGs are run.
+
 Usage
 =====
 
-No change to user DAG files is required to use OpenLineage. However, it needs 
to be configured.
-Primary, and recommended method of configuring OpenLineage Airflow Provider is 
Airflow configuration.
+When enabled and configured, the integration requires no further action from 
the user. It will automatically:
+
+- Collect task input / output metadata (source, schema, etc.).
+- Collect task run-level metadata (execution time, state, parameters, etc.)
+- Collect task job-level metadata (owners, type, description, etc.)
+- Collect task-specific metadata (bigquery job id, python source code, etc.) - 
depending on the Operator
+
+All this data will be sent as OpenLineage events to the configured backend as 
described in :ref:`job_hierarchy:openlineage`.
+
+Transport setup
+===============
+
+The primary and recommended method of configuring OpenLineage Airflow Provider is Airflow configuration (``airflow.cfg`` file).
+All possible configuration options, with example values, can be found in 
:ref:`the configuration section <configuration:openlineage>`.
 
 At minimum, one thing that needs to be set up in every case is ``Transport`` - 
where do you wish for
-your events to end up - for example `Marquez <https://marquezproject.ai/>`_. 
The ``transport`` field in configuration is used for that purpose.
+your events to end up - for example `Marquez <https://marquezproject.ai/>`_.
+
+Transport as JSON string
+^^^^^^^^^^^^^^^^^^^^^^^^
+The ``transport`` option in Airflow configuration is used for that purpose.
 
 .. code-block:: ini
 
     [openlineage]
-    transport = '{"type": "http", "url": "http://example.com:5000"}'
+    transport = '{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
+
+``AIRFLOW__OPENLINEAGE__TRANSPORT`` environment variable is an equivalent.
+
+.. code-block:: ini
+
+  AIRFLOW__OPENLINEAGE__TRANSPORT='{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
 
 
-If you want to look at OpenLineage events without sending them anywhere, you 
can set up ConsoleTransport - the events will end up in task logs.
+If you want to look at OpenLineage events without sending them anywhere, you 
can set up ``ConsoleTransport`` - the events will end up in task logs.
 
 .. code-block:: ini
 
     [openlineage]
     transport = '{"type": "console"}'
 
+.. note::
+  For full list of built-in transport types, specific transport's options or 
instructions on how to implement your custom transport, refer to
+  `Python client documentation 
<https://openlineage.io/docs/client/python#built-in-transport-types>`_.
 
-You can also configure OpenLineage transport using  ``openlineage.yml`` file.
-Detailed description of that configuration method is in `OpenLineage python 
docs <https://openlineage.io/docs/client/python#configuration>`_.
-To do that, you also need to set up path to the file in Airflow config, or 
point ``OPENLINEAGE_CONFIG`` variable to it:
+Transport as config file
+^^^^^^^^^^^^^^^^^^^^^^^^
+You can also configure OpenLineage ``Transport`` using a YAML file (e.g. ``openlineage.yml``).
+Provide the path to the YAML file as ``config_path`` option in Airflow 
configuration.
 
 .. code-block:: ini
 
     [openlineage]
     config_path = '/path/to/openlineage.yml'
 
-Lastly, you can set up http transport using ``OPENLINEAGE_URL`` environment 
variable, passing it the URL target of the OpenLineage consumer.
+``AIRFLOW__OPENLINEAGE__CONFIG_PATH`` environment variable is an equivalent.
+
+.. code-block:: ini
+
+  AIRFLOW__OPENLINEAGE__CONFIG_PATH='/path/to/openlineage.yml'
+
+Example content of config YAML file:
+
+.. code-block:: yaml
+
+  transport:
+    type: http
+    url: https://backend:5000
+    endpoint: events/receive
+    auth:
+      type: api_key
+      apiKey: f048521b-dfe8-47cd-9c65-0cb07d57591e
+
+.. note::
+
+    Detailed description of that configuration method, together with example 
config files,
+    can be found `in Python client documentation 
<https://openlineage.io/docs/client/python#built-in-transport-types>`_.
+
+Configuration precedence
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As there are multiple possible ways of configuring OpenLineage, it's important 
to keep in mind the precedence of different configurations.
+OpenLineage Airflow Provider looks for the configuration in the following 
order:
+
+1. Check ``config_path`` in ``airflow.cfg`` under ``openlineage`` section (or ``AIRFLOW__OPENLINEAGE__CONFIG_PATH`` environment variable)
+2. Check ``transport`` in ``airflow.cfg`` under ``openlineage`` section (or ``AIRFLOW__OPENLINEAGE__TRANSPORT`` environment variable)
+3. If all the above options are missing, the OpenLineage Python client used 
underneath looks for configuration in the order described in `this 
<https://openlineage.io/docs/client/python#configuration>`_ documentation. 
Please note that **using Airflow configuration is encouraged** and is the only 
future-proof solution.
+
+Backwards compatibility
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. warning::
+
+  Below variables **should not** be used and can be removed in the future. 
Consider using Airflow configuration (described above) for a future-proof solution.
+
+For backwards compatibility with ``openlineage-airflow`` package, some 
environment variables are still available:
+
+- ``OPENLINEAGE_DISABLED`` is an equivalent of 
``AIRFLOW__OPENLINEAGE__DISABLED``.
+- ``OPENLINEAGE_CONFIG`` is an equivalent of 
``AIRFLOW__OPENLINEAGE__CONFIG_PATH``.
+- ``OPENLINEAGE_NAMESPACE`` is an equivalent of 
``AIRFLOW__OPENLINEAGE__NAMESPACE``.
+- ``OPENLINEAGE_EXTRACTORS`` is an equivalent of ``AIRFLOW__OPENLINEAGE__EXTRACTORS``.
+- ``OPENLINEAGE_AIRFLOW_DISABLE_SOURCE_CODE`` is an equivalent of 
``AIRFLOW__OPENLINEAGE__DISABLE_SOURCE_CODE``.
+- ``OPENLINEAGE_URL`` can be used to set up simple http transport. This method 
has some limitations and may require using other environment variables to 
achieve desired output. See `docs 
<https://openlineage.io/docs/client/python#http-transport-configuration-with-environment-variables>`_.
+
+
+Additional Options
+==================
+
+Namespace
+^^^^^^^^^
 
-It's also very useful to set up OpenLineage namespace for this particular 
instance. If not set, it's using ``default`` namespace.
+It's very useful to set up OpenLineage namespace for this particular instance.
 That way, if you use multiple OpenLineage producers, events coming from them 
will be logically separated.
+If not set, it's using ``default`` namespace. Provide the name of the 
namespace as ``namespace`` option in Airflow configuration.
 
 .. code-block:: ini
 
     [openlineage]
-    transport = '{"type": "http", "url": "http://example.com:5000"}'
+    transport = '{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
     namespace = 'my-team-airflow-instance'
 
+``AIRFLOW__OPENLINEAGE__NAMESPACE`` environment variable is an equivalent.
 
-Additional Options
-==================
+.. code-block:: ini
 
-You can disable sending OpenLineage events without uninstalling OpenLineage 
provider by setting ``disabled`` to true or setting ``OPENLINEAGE_DISABLED``
-environment variable to True.
+  AIRFLOW__OPENLINEAGE__NAMESPACE='my-team-airflow-instance'
+
+
+Disable
+^^^^^^^
+You can disable sending OpenLineage events without uninstalling OpenLineage 
provider by setting
+``disabled`` option to ``true`` in Airflow configuration.
 
 .. code-block:: ini
 
     [openlineage]
-    transport = '{"type": "http", "url": "http://example.com:5000"}'
+    transport = '{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
     disabled = true
 
+``AIRFLOW__OPENLINEAGE__DISABLED`` environment variable is an equivalent.
+
+.. code-block:: ini
+
+  AIRFLOW__OPENLINEAGE__DISABLED=true
 
-Several operators - for example Python, Bash - will by default include their 
source code in their OpenLineage events. To prevent that, set 
``disable_source_code`` to true.
+
+Disable source code
+^^^^^^^^^^^^^^^^^^^
+
+Several Operators (e.g. Python, Bash) will by default include their source code in their OpenLineage events.
+To prevent that, set ``disable_source_code`` option to ``true`` in Airflow 
configuration.
 
 .. code-block:: ini
 
     [openlineage]
-    transport = '{"type": "http", "url": "http://example.com:5000"}'
+    transport = '{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
     disable_source_code = true
 
-If you used OpenLineage previously, and use `Custom Extractors 
<https://openlineage.io/docs/integrations/airflow/extractors/custom-extractors>`_
 feature, you can also use them in OpenLineage provider.
-Register the extractors using ``extractors`` config option.
+``AIRFLOW__OPENLINEAGE__DISABLE_SOURCE_CODE`` environment variable is an 
equivalent.
+
+.. code-block:: ini
+
+  AIRFLOW__OPENLINEAGE__DISABLE_SOURCE_CODE=true
+
+
+Disabled for Operators
+^^^^^^^^^^^^^^^^^^^^^^
+
+You can easily exclude some Operators from emitting OpenLineage events by 
passing a string of semicolon separated
+full import paths of Airflow Operators to disable as 
``disabled_for_operators`` field in Airflow configuration.
+
+.. code-block:: ini
+
+    [openlineage]
+    transport = '{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
+    disabled_for_operators = 
'airflow.operators.bash.BashOperator;airflow.operators.python.PythonOperator'
+
+``AIRFLOW__OPENLINEAGE__DISABLED_FOR_OPERATORS`` environment variable is an 
equivalent.
+
+.. code-block:: ini
+
+  
AIRFLOW__OPENLINEAGE__DISABLED_FOR_OPERATORS='airflow.operators.bash.BashOperator;airflow.operators.python.PythonOperator'
+
+Custom Extractors
+^^^^^^^^^^^^^^^^^
+
+If you use :ref:`custom Extractors <custom_extractors:openlineage>` feature, 
register the extractors by passing
+a string of semicolon separated Airflow Operators full import paths to 
``extractors`` option in Airflow configuration.
 
 .. code-block:: ini
 
     [openlineage]
-    transport = '{"type": "http", "url": "http://example.com:5000"}'
+    transport = '{"type": "http", "url": "http://example.com:5000", "endpoint": "api/v1/lineage"}'
     extractors = full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass
 
+``AIRFLOW__OPENLINEAGE__EXTRACTORS`` environment variable is an equivalent.
+
+.. code-block:: ini
+
+  
AIRFLOW__OPENLINEAGE__EXTRACTORS='full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass'
+
+
+Troubleshooting
+===============
+
+See :ref:`local_troubleshooting:openlineage` for details on how to 
troubleshoot OpenLineage locally.
+
+
+Adding support for custom Operators
+===================================
+
+If you want to add OpenLineage coverage for particular Operator, take a look 
at :ref:`guides/developer:openlineage`
+
+
+Where can I learn more?
+=======================
+
+- Check out `OpenLineage website <https://openlineage.io>`_.
+- Visit our `GitHub repository <https://github.com/OpenLineage/OpenLineage>`_.
+- Watch multiple `talks <https://openlineage.io/resources#conference-talks>`_ 
about OpenLineage.
+
+Feedback
+========
+
+You can reach out to us on `slack <http://bit.ly/OpenLineageSlack>`_ and leave 
us feedback!
 
-Other
-=====
 
-If you want to add OpenLineage coverage for particular operator, take a look at
+How to contribute
+=================
 
-:ref:`guides/developer:openlineage`
+We welcome your contributions! OpenLineage is an Open Source project under 
active development, and we'd love your help!
 
-For more explanation visit `OpenLineage docs <https://openlineage.io/docs>`_
+Sounds fun? Check out our `new contributor guide 
<https://github.com/OpenLineage/OpenLineage/blob/main/CONTRIBUTING.md>`_ to get 
started.
diff --git a/docs/apache-airflow-providers-openlineage/index.rst 
b/docs/apache-airflow-providers-openlineage/index.rst
index 514d34b6b0..419b9850c3 100644
--- a/docs/apache-airflow-providers-openlineage/index.rst
+++ b/docs/apache-airflow-providers-openlineage/index.rst
@@ -32,29 +32,12 @@
 .. toctree::
     :hidden:
     :maxdepth: 1
-    :caption: User guide
-
-    Guides <guides/user>
-
-.. toctree::
-    :hidden:
-    :maxdepth: 1
-    :caption: Developer guide
-
-    Guides <guides/developer>
-
-.. toctree::
-    :hidden:
-    :maxdepth: 1
-    :caption: Structure of provider
-
-    Guides <guides/structure>
-
-.. toctree::
-    :hidden:
-    :maxdepth: 1
-    :caption: Macros
+    :caption: Guides
 
+    Intro <guides/structure>
+    User <guides/user>
+    Developer <guides/developer>
+    Supported operators <supported_classes>
     Macros <macros>
 
 .. toctree::
@@ -72,7 +55,6 @@
 
     PyPI Repository 
<https://pypi.org/project/apache-airflow-providers-openlineage/>
     Installing from sources <installing-providers-from-sources>
-    Supported operators <supported_classes.rst>
 
 .. THE REMAINDER OF THE FILE IS AUTOMATICALLY GENERATED. IT WILL BE 
OVERWRITTEN AT RELEASE TIME!
 
diff --git a/docs/apache-airflow-providers-openlineage/macros.rst 
b/docs/apache-airflow-providers-openlineage/macros.rst
index 3526ee1d37..72c1e3a7a5 100644
--- a/docs/apache-airflow-providers-openlineage/macros.rst
+++ b/docs/apache-airflow-providers-openlineage/macros.rst
@@ -20,7 +20,9 @@
 OpenLineage Macros
 ==================
 
-Invoke as a jinja template, e.g.
+Macros included in OpenLineage plugin get integrated to Airflow's main 
collections and become available for use.
+
+They can be invoked as a Jinja template, e.g.
 
 Lineage run id
 --------------
@@ -45,7 +47,7 @@ Lineage parent id
             python_callable=my_task_function,
             op_args=[
                 "{{ macros.OpenLineageProviderPlugin.lineage_parent_id(run_id, 
task_instance) }}"
-            ],  # macro invoked
+            ],  # lineage_parent_id macro invoked
             provide_context=False,
             dag=dag,
         )
diff --git a/docs/apache-airflow-providers-openlineage/supported_classes.rst 
b/docs/apache-airflow-providers-openlineage/supported_classes.rst
index 81266929cf..c9a9656c7c 100644
--- a/docs/apache-airflow-providers-openlineage/supported_classes.rst
+++ b/docs/apache-airflow-providers-openlineage/supported_classes.rst
@@ -16,13 +16,20 @@
     under the License.
 
 
+.. _supported_classes:openlineage:
+
 Supported operators
 ===================
 
-Below is a list of operators that support OpenLineage extraction,
-along with specific DB types that are compatible with the 
SQLExecuteQueryOperator.
+Below is a list of Operators that support OpenLineage extraction, along with 
specific DB types that are compatible with the SQLExecuteQueryOperator.
+
+.. important::
+
+    While we strive to keep the list of supported operators current,
+    please be aware that our updating process is automated and may not always 
capture everything accurately.
+
+.. tip::
 
-**Disclaimer:** While we strive to keep the list of supported operators 
current,
-please be aware that our updating process is automated and may not always 
capture everything accurately.
+  You can easily implement OpenLineage support for any operator. See 
:ref:`guides/developer:openlineage`.
 
 .. airflow-providers-openlineage-supported-classes::

Reply via email to