shivaam commented on code in PR #61794:
URL: https://github.com/apache/airflow/pull/61794#discussion_r2800563041


##########
providers/common/ai/src/airflow/providers/common/ai/evals/llm_sql.py:
##########
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from pydantic_evals import Case
+from pydantic_evals.evaluators import EvaluationReason, Evaluator, 
EvaluatorContext
+
+
+@dataclass
+class ValidateSQL(Evaluator):
+    """Validate generated sql."""
+
+    BLOCKED_KEYWORDS: list[str] | None = None
+    # TODO Identify and add more validations
+    def blocked_key_word_validation(self, query: str) -> EvaluationReason | 
None:
+        for key_word in self.BLOCKED_KEYWORDS:
+            if key_word in query.upper():

Review Comment:
   the substring check will be false-positive on legitimate table or column 
names
   
   `SELECT drop_date FROM shipments` will also be blocked. Is this expected?



##########
providers/common/ai/src/airflow/providers/common/ai/hooks/pydantic_ai.py:
##########
@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from __future__ import annotations
+
+import json
+from functools import cached_property
+from typing import TYPE_CHECKING, Any
+
+from airflow.providers.common.ai.exceptions import ModelCreationError
+from airflow.providers.common.ai.llm_providers.model_providers import 
ModelProviderFactory
+from airflow.sdk import BaseHook, Connection
+
+if TYPE_CHECKING:
+    from pydantic_ai.models import Model
+
+    from airflow.providers.common.ai.llm_providers.base import ModelProvider
+    from airflow.providers.common.sql.hooks.sql import DbApiHook
+
+
+class PydanticAIHook(BaseHook):
+    """Hook for Pydantic AI."""
+
+    _model_provider_factory: ModelProviderFactory | None = None
+
+    conn_name_attr = "pydantic_ai_conn_id"
+    default_conn_name = "pydantic_ai_default"
+    conn_type = "pydantic_ai"
+    hook_name = "PydanticAI"
+
+    def __init__(
+        self, pydantic_ai_conn_id: str = default_conn_name, provider_model: 
str | None = None, **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.provider_model = provider_model
+        self.pydantic_ai_conn_id = pydantic_ai_conn_id
+        self._api_key: str | None = None
+        self.connection: Connection | None = None
+
+    @classmethod
+    def get_ui_field_behaviour(cls) -> dict[str, Any]:
+        return {
+            "hidden_fields": ["schema"],
+            "relabeling": {
+                "password": "API Key",
+            },
+            "placeholders": {
+                "extra": json.dumps(
+                    {
+                        "provider_model": "",
+                        "model_settings": {},
+                    }
+                )
+            },
+        }
+
+    def get_conn(self) -> Connection:
+        if self.connection is None:
+            self.connection = self.get_connection(self.pydantic_ai_conn_id)
+        return self.connection
+
+    def get_provider_model_name_from_conn(self):
+        return self.get_conn().extra_dejson.get("provider_model")
+
+    @cached_property
+    def get_api_key_from_conn(self):
+        return self.get_conn().password
+
+    @classmethod
+    def get_provider_model_factory(cls):
+        if cls._model_provider_factory is None:
+            cls._model_provider_factory = ModelProviderFactory()
+        return cls._model_provider_factory
+
+    @classmethod
+    def register_model_provider(cls, provider: ModelProvider):
+        cls.get_provider_model_factory().register_model_provider(provider)
+
+    def get_model(self, **kwargs) -> Model:
+        try:
+            provider_model_name = self.provider_model or 
self.get_provider_model_name_from_conn()
+            if not provider_model_name:
+                raise ValueError("No provider model name provided")
+            provider_name, model_name = 
self.get_provider_model_factory().parse_model_provider_name(
+                provider_model_name
+            )
+
+            settings = self.get_conn().extra_dejson.get("model_settings")
+            if settings:
+                kwargs["model_settings"] = settings
+            return 
self._model_provider_factory.get_model_provider(provider_name).build_model(
+                model_name, api_key=self.get_api_key_from_conn, **kwargs
+            )
+        except Exception as e:
+            raise ModelCreationError(f"Error building model: {e}")
+
+    @staticmethod

Review Comment:
   Wondering if this is the correct place for this hook? Keeping it here 
couples the shared AI hook to database but I imagine the AI hook can be used 
for non-database operations like content generation. 



##########
providers/postgres/src/airflow/providers/postgres/hooks/postgres.py:
##########
@@ -691,3 +691,11 @@ def insert_rows(
                     nb_rows += len(chunked_rows)
                     self.log.info("Loaded %s rows into %s so far", nb_rows, 
table)
         self.log.info("Done loading. Loaded a total of %s rows into %s", 
nb_rows, table)
+
+    def get_schema(self, table_name: str):
+        from airflow.providers.common.sql.hooks.handlers import 
fetch_all_handler
+
+        return self.run(
+            sql=f"""SELECT column_name, data_type FROM 
information_schema.columns WHERE table_name = '{table_name}';""",

Review Comment:
   `table_name` comes from user-provided DataSourceConfig and the f-string is 
passed directly into the SQL. This is a SQL injection risk — for example below 
query can be executed to drop the table
   
   `SELECT column_name, data_type FROM information_schema.columns WHERE 
table_name = ''; DROP TABLE users; --';
   `
   
   This happens before the LLM is called, so the ValidateSQL safety layer 
doesn't protect against it. 



##########
providers/common/ai/src/airflow/providers/common/ai/llm_providers/base.py:
##########
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from apache_beam.runners.worker.operations import TYPE_CHECKING

Review Comment:
   is this import correct? I think we might need to import from typing



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to