vincbeck commented on code in PR #29168: URL: https://github.com/apache/airflow/pull/29168#discussion_r1087100539
########## airflow/providers/amazon/aws/operators/neptune.py: ########## @@ -0,0 +1,152 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +from airflow.models import BaseOperator +from airflow.providers.amazon.aws.hooks.neptune import NeptuneHook +from airflow.providers.amazon.aws.utils.neptune import NeptuneDbType + +if TYPE_CHECKING: + from airflow.utils.context import Context + + +class NeptuneStartDbOperator(BaseOperator): + """ + Starts a Neptune DB cluster + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:NeptuneStartDbOperator` + + :param db_identifier: The AWS identifier of the DB to start + :param db_type: Type of the DB - either "instance" or "cluster" (default: "cluster") + :param aws_conn_id: The Airflow connection used for AWS credentials. (default: "aws_default") + :param wait_for_completion: If True, waits for DB to start. (default: True) + + Note: In boto3 supports starting db operator only for cluster and not for instance db_type. 
+ So, default is maintained as Cluster, however it can be extended once instance db_type is available, + similar to RDS database implementation + """ + + template_fields = ("db_identifier", "db_type") + STATES_FOR_STARTING = ["available", "starting"] + + def __init__( + self, + *, + db_identifier: str, + db_type: NeptuneDbType | str = NeptuneDbType.CLUSTER, + aws_conn_id: str = "aws_default", + region_name: str = "us-east-1", + wait_for_completion: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.db_identifier = db_identifier + self.hook = NeptuneHook(aws_conn_id=aws_conn_id, region_name=region_name) + self.db_identifier = db_identifier + self.db_type = db_type + self.aws_conn_id = aws_conn_id + self.wait_for_completion = wait_for_completion + + def execute(self, context: Context) -> str: + self.db_type = NeptuneDbType(self.db_type) Review Comment: You're overriding the value you already set on line `66`? I am not sure I understand what you are trying to achieve here ########## airflow/providers/amazon/aws/operators/neptune.py: ########## @@ -0,0 +1,152 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +from airflow.models import BaseOperator +from airflow.providers.amazon.aws.hooks.neptune import NeptuneHook +from airflow.providers.amazon.aws.utils.neptune import NeptuneDbType + +if TYPE_CHECKING: + from airflow.utils.context import Context + + +class NeptuneStartDbOperator(BaseOperator): + """ + Starts a Neptune DB cluster + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:NeptuneStartDbOperator` + + :param db_identifier: The AWS identifier of the DB to start + :param db_type: Type of the DB - either "instance" or "cluster" (default: "cluster") + :param aws_conn_id: The Airflow connection used for AWS credentials. (default: "aws_default") + :param wait_for_completion: If True, waits for DB to start. (default: True) + + Note: In boto3 supports starting db operator only for cluster and not for instance db_type. + So, default is maintained as Cluster, however it can be extended once instance db_type is available, + similar to RDS database implementation + """ + + template_fields = ("db_identifier", "db_type") + STATES_FOR_STARTING = ["available", "starting"] + + def __init__( + self, + *, + db_identifier: str, + db_type: NeptuneDbType | str = NeptuneDbType.CLUSTER, + aws_conn_id: str = "aws_default", + region_name: str = "us-east-1", + wait_for_completion: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.db_identifier = db_identifier + self.hook = NeptuneHook(aws_conn_id=aws_conn_id, region_name=region_name) + self.db_identifier = db_identifier + self.db_type = db_type + self.aws_conn_id = aws_conn_id + self.wait_for_completion = wait_for_completion + + def execute(self, context: Context) -> str: + self.db_type = NeptuneDbType(self.db_type) + start_db_response = None + if ( + self.hook.get_db_cluster_state(self.db_identifier) + not in NeptuneStartDbOperator.STATES_FOR_STARTING + ): + 
self._start_db() + + if self.wait_for_completion: + self._wait_until_db_available() + return json.dumps(start_db_response, default=str) Review Comment: `start_db_response` is always `None`? ########## airflow/providers/amazon/aws/operators/neptune.py: ########## @@ -0,0 +1,152 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +from airflow.models import BaseOperator +from airflow.providers.amazon.aws.hooks.neptune import NeptuneHook +from airflow.providers.amazon.aws.utils.neptune import NeptuneDbType + +if TYPE_CHECKING: + from airflow.utils.context import Context + + +class NeptuneStartDbOperator(BaseOperator): + """ + Starts a Neptune DB cluster + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:NeptuneStartDbOperator` + + :param db_identifier: The AWS identifier of the DB to start + :param db_type: Type of the DB - either "instance" or "cluster" (default: "cluster") + :param aws_conn_id: The Airflow connection used for AWS credentials. (default: "aws_default") + :param wait_for_completion: If True, waits for DB to start. 
(default: True) + + Note: In boto3 supports starting db operator only for cluster and not for instance db_type. + So, default is maintained as Cluster, however it can be extended once instance db_type is available, + similar to RDS database implementation + """ + + template_fields = ("db_identifier", "db_type") + STATES_FOR_STARTING = ["available", "starting"] + + def __init__( + self, + *, + db_identifier: str, + db_type: NeptuneDbType | str = NeptuneDbType.CLUSTER, + aws_conn_id: str = "aws_default", + region_name: str = "us-east-1", + wait_for_completion: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.db_identifier = db_identifier + self.hook = NeptuneHook(aws_conn_id=aws_conn_id, region_name=region_name) + self.db_identifier = db_identifier + self.db_type = db_type + self.aws_conn_id = aws_conn_id + self.wait_for_completion = wait_for_completion + + def execute(self, context: Context) -> str: + self.db_type = NeptuneDbType(self.db_type) + start_db_response = None + if ( + self.hook.get_db_cluster_state(self.db_identifier) + not in NeptuneStartDbOperator.STATES_FOR_STARTING + ): + self._start_db() + + if self.wait_for_completion: + self._wait_until_db_available() + return json.dumps(start_db_response, default=str) + + def _start_db(self): + self.log.info("Starting DB %s '%s'", self.db_type.value, self.db_identifier) + self.hook.conn.start_db_cluster(DBClusterIdentifier=self.db_identifier) + + def _wait_until_db_available(self): + self.log.info("Waiting for DB %s to reach 'available' state", self.db_type.value) + self.hook.wait_for_db_cluster_state(self.db_identifier, target_state="available") + + +class NeptuneStopDbOperator(BaseOperator): + """ + Stops a Neptune DB cluster + + .. 
seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:NeptuneStopDbOperator` + + :param db_identifier: The AWS identifier of the DB to start + :param db_type: Type of the DB - either "instance" or "cluster" (default: "cluster") + :param aws_conn_id: The Airflow connection used for AWS credentials. (default: "aws_default") + :param wait_for_completion: If True, waits for DB to start. (default: True) + + Note: In boto3 supports starting db operator only for cluster and not for instance db_type. + So, default is maintained as Cluster, however it can be extended once instance db_type is available, + similar to RDS database implementation + """ + + template_fields = ("db_identifier", "db_type") + STATES_FOR_STOPPING = ["stopped", "stopping"] + + def __init__( + self, + *, + db_identifier: str, + db_type: NeptuneDbType | str = NeptuneDbType.INSTANCE, + aws_conn_id: str = "aws_default", + region_name: str = "us-east-1", + wait_for_completion: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.hook = NeptuneHook(aws_conn_id=aws_conn_id, region_name=region_name) + self.db_identifier = db_identifier + self.db_type = db_type + self.aws_conn_id = aws_conn_id + self.wait_for_completion = wait_for_completion + + def execute(self, context: Context) -> str: + self.db_type = NeptuneDbType(self.db_type) + stop_db_response = None + if ( + self.hook.get_db_cluster_state(self.db_identifier) + not in NeptuneStopDbOperator.STATES_FOR_STOPPING + ): + stop_db_response = self._stop_db() + if self.wait_for_completion: + self._wait_until_db_stopped() + return json.dumps(stop_db_response, default=str) + + def _stop_db(self): + self.log.info("Stopping DB %s '%s'", self.db_type.value, self.db_identifier) + response = self.hook.conn.stop_db_cluster(DBClusterIdentifier=self.db_identifier) + return response Review Comment: ```suggestion return self.hook.conn.stop_db_cluster(DBClusterIdentifier=self.db_identifier) ``` ########## 
airflow/providers/amazon/aws/hooks/neptune.py: ########## @@ -0,0 +1,113 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Interact with AWS Neptune.""" +from __future__ import annotations + +import time +from typing import Callable + +from airflow.exceptions import AirflowException, AirflowNotFoundException +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook + + +class NeptuneHook(AwsBaseHook): + """ + Interact with AWS Neptune using proper client from the boto3 library. + + Hook attribute `conn` has all methods that listed in documentation + + .. seealso:: + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/neptune.html + - https://docs.aws.amazon.com/neptune/index.html + + Additional arguments (such as ``aws_conn_id`` or ``region_name``) may be specified and + are passed down to the underlying AwsBaseHook. + + .. seealso:: + :class:`~airflow.providers.amazon.aws.hooks.base_aws.AwsGenericHook` + + :param aws_conn_id: The Airflow connection used for AWS credentials. 
+ """ + + def __init__(self, *args, **kwargs) -> None: + kwargs["client_type"] = "neptune" + super().__init__(*args, **kwargs) + + def get_db_cluster_state(self, db_cluster_id: str) -> str: + """ + Get the current state of a DB cluster. + + :param db_cluster_id: The ID of the target DB cluster. + :return: Returns the status of the DB cluster as a string (eg. "available") + :rtype: str + :raises AirflowNotFoundException: If the DB cluster does not exist. + """ + try: + response = self.conn.describe_db_clusters(DBClusterIdentifier=db_cluster_id) + except self.conn.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "DBClusterNotFoundFault": + raise AirflowNotFoundException(e) + raise e + return response["DBClusters"][0]["Status"].lower() + + def wait_for_db_cluster_state( + self, db_cluster_id: str, target_state: str, check_interval: int = 30, max_attempts: int = 40 + ) -> None: + """ + Polls until the target state is reached. + An error is raised after a max number of attempts. + + :param db_cluster_id: The ID of the target DB cluster. + :param target_state: Wait until this state is reached + :param check_interval: The amount of time in seconds to wait between attempts + :param max_attempts: The maximum number of attempts to be made + + """ + + def poke(): + return self.get_db_cluster_state(db_cluster_id) + + target_state = target_state.lower() + self._wait_for_state(poke, target_state, check_interval, max_attempts) + self.log.info("DB cluster snapshot '%s' reached the '%s' state", db_cluster_id, target_state) Review Comment: Correct! Please use the function `waiter` defined [here](https://github.com/apache/airflow/blob/main/airflow/providers/amazon/aws/utils/waiter.py) ########## docs/apache-airflow-providers-amazon/operators/neptune.rst: ########## @@ -0,0 +1,70 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. 
See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +====================================================== +Amazon Neptune Documentation +====================================================== + +`Amazon Neptune is a fast, reliable, fully managed graph database service that makes it easy to build and run +applications that work with highly connected datasets. The core of Neptune is a purpose-built, +high-performance graph database engine that is optimized for storing billions of relationships and +querying the graph with milliseconds latency. Neptune supports the popular graph query languages +Apache TinkerPop Gremlin and W3C's SPARQL, allowing you to build queries that efficiently navigate highly connected +datasets. Neptune powers graph use cases such as recommendation engines, fraud detection, knowledge graphs, +drug discovery, and network security.` Review Comment: ```suggestion Amazon Neptune is a fast, reliable, fully managed graph database service that makes it easy to build and run applications that work with highly connected datasets. The core of Neptune is a purpose-built, high-performance graph database engine that is optimized for storing billions of relationships and querying the graph with milliseconds latency. 
Neptune supports the popular graph query languages Apache TinkerPop Gremlin and W3C's SPARQL, allowing you to build queries that efficiently navigate highly connected datasets. Neptune powers graph use cases such as recommendation engines, fraud detection, knowledge graphs, drug discovery, and network security. ``` ########## tests/system/providers/amazon/aws/example_neptune_cluster.py: ########## @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+from __future__ import annotations + +from datetime import datetime + +from airflow import DAG +from airflow.models.baseoperator import chain +from airflow.providers.amazon.aws.operators.neptune import NeptuneStartDbOperator, NeptuneStopDbOperator +from tests.system.providers.amazon.aws.utils import ENV_ID_KEY, SystemTestContextBuilder + +sys_test_context_task = SystemTestContextBuilder().build() + +DAG_ID = "example_neptune_cluster" + +with DAG( + dag_id=DAG_ID, + schedule="@once", + start_date=datetime(2023, 1, 1), + tags=["example"], + catchup=False, +) as dag: + test_context = sys_test_context_task() + + # Assuming Neptune DB is already created, its identifier is provided to test NeptuneStartDbOperator + # and NeptuneStopDbOperator + neptune_db_identifier = f"{test_context[ENV_ID_KEY]}-neptune-database" Review Comment: Ideally we are trying to make system tests as self-contained as possible, which means here it would be great if you could create the different resources you need to start the database. This does not mean creating the operators associated with these actions; you can call these actions by creating custom tasks using the TaskFlow API. A good example is [example_batch.py](https://github.com/apache/airflow/blob/main/tests/system/providers/amazon/aws/example_batch.py) ########## airflow/providers/amazon/aws/operators/neptune.py: ########## @@ -0,0 +1,152 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +from airflow.models import BaseOperator +from airflow.providers.amazon.aws.hooks.neptune import NeptuneHook +from airflow.providers.amazon.aws.utils.neptune import NeptuneDbType + +if TYPE_CHECKING: + from airflow.utils.context import Context + + +class NeptuneStartDbOperator(BaseOperator): + """ + Starts a Neptune DB cluster + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:NeptuneStartDbOperator` + + :param db_identifier: The AWS identifier of the DB to start + :param db_type: Type of the DB - either "instance" or "cluster" (default: "cluster") + :param aws_conn_id: The Airflow connection used for AWS credentials. (default: "aws_default") + :param wait_for_completion: If True, waits for DB to start. (default: True) + + Note: In boto3 supports starting db operator only for cluster and not for instance db_type. 
+ So, default is maintained as Cluster, however it can be extended once instance db_type is available, + similar to RDS database implementation + """ + + template_fields = ("db_identifier", "db_type") + STATES_FOR_STARTING = ["available", "starting"] + + def __init__( + self, + *, + db_identifier: str, + db_type: NeptuneDbType | str = NeptuneDbType.CLUSTER, + aws_conn_id: str = "aws_default", + region_name: str = "us-east-1", + wait_for_completion: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.db_identifier = db_identifier + self.hook = NeptuneHook(aws_conn_id=aws_conn_id, region_name=region_name) + self.db_identifier = db_identifier + self.db_type = db_type + self.aws_conn_id = aws_conn_id + self.wait_for_completion = wait_for_completion + + def execute(self, context: Context) -> str: + self.db_type = NeptuneDbType(self.db_type) + start_db_response = None + if ( + self.hook.get_db_cluster_state(self.db_identifier) + not in NeptuneStartDbOperator.STATES_FOR_STARTING + ): + self._start_db() + + if self.wait_for_completion: + self._wait_until_db_available() + return json.dumps(start_db_response, default=str) + + def _start_db(self): + self.log.info("Starting DB %s '%s'", self.db_type.value, self.db_identifier) + self.hook.conn.start_db_cluster(DBClusterIdentifier=self.db_identifier) + + def _wait_until_db_available(self): + self.log.info("Waiting for DB %s to reach 'available' state", self.db_type.value) + self.hook.wait_for_db_cluster_state(self.db_identifier, target_state="available") + + +class NeptuneStopDbOperator(BaseOperator): + """ + Stops a Neptune DB cluster + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:NeptuneStopDbOperator` + + :param db_identifier: The AWS identifier of the DB to start + :param db_type: Type of the DB - either "instance" or "cluster" (default: "cluster") + :param aws_conn_id: The Airflow connection used for AWS credentials. 
(default: "aws_default") + :param wait_for_completion: If True, waits for DB to start. (default: True) + + Note: In boto3 supports starting db operator only for cluster and not for instance db_type. + So, default is maintained as Cluster, however it can be extended once instance db_type is available, + similar to RDS database implementation + """ + + template_fields = ("db_identifier", "db_type") + STATES_FOR_STOPPING = ["stopped", "stopping"] + + def __init__( + self, + *, + db_identifier: str, + db_type: NeptuneDbType | str = NeptuneDbType.INSTANCE, + aws_conn_id: str = "aws_default", + region_name: str = "us-east-1", + wait_for_completion: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.hook = NeptuneHook(aws_conn_id=aws_conn_id, region_name=region_name) + self.db_identifier = db_identifier + self.db_type = db_type + self.aws_conn_id = aws_conn_id + self.wait_for_completion = wait_for_completion + + def execute(self, context: Context) -> str: + self.db_type = NeptuneDbType(self.db_type) Review Comment: Same as above ########## docs/apache-airflow-providers-amazon/operators/neptune.rst: ########## @@ -0,0 +1,70 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +====================================================== +Amazon Neptune Documentation +====================================================== Review Comment: Please be sure lines of `===` are the same length as the title -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
