Nataneljpwd commented on code in PR #61143:
URL: https://github.com/apache/airflow/pull/61143#discussion_r2748172785
##########
providers/common/sql/src/airflow/providers/common/sql/operators/generic_transfer.py:
##########
@@ -80,6 +81,7 @@ def __init__(
source_hook_params: dict | None = None,
destination_conn_id: str,
destination_hook_params: dict | None = None,
+ rows_processor: Callable[[Any, Context], Any] = lambda rows,
**context: rows,
Review Comment:
Maybe it is a better idea to make it nullable? I can see it becoming a
performance overhead for a large number of rows
##########
providers/common/sql/src/airflow/providers/common/sql/operators/generic_transfer.py:
##########
@@ -139,6 +142,16 @@ def render_template_fields(
if isinstance(commit_every, str):
self.insert_args["commit_every"] = int(commit_every)
+ def _process_rows(self, rows: list[Any], context: Context):
+ return self._rows_processor(rows, **context) # type: ignore
+
+ def _insert_rows(self, rows: list[Any], context: Context):
+ rows = self._process_rows(rows=rows, context=context)
Review Comment:
Same thing here
##########
providers/common/sql/src/airflow/providers/common/sql/operators/generic_transfer.py:
##########
@@ -139,6 +142,16 @@ def render_template_fields(
if isinstance(commit_every, str):
self.insert_args["commit_every"] = int(commit_every)
+ def _process_rows(self, rows: list[Any], context: Context):
+ return self._rows_processor(rows, **context) # type: ignore
Review Comment:
An optimization could be to check whether it is None and, if so, just skip
processing.
##########
providers/common/sql/src/airflow/providers/common/sql/operators/generic_transfer.py:
##########
@@ -139,6 +142,16 @@ def render_template_fields(
if isinstance(commit_every, str):
self.insert_args["commit_every"] = int(commit_every)
+ def _process_rows(self, rows: list[Any], context: Context):
+ return self._rows_processor(rows, **context) # type: ignore
+
+ def _insert_rows(self, rows: list[Any], context: Context):
Review Comment:
Why is this method needed? Why not just do it in the execute method, since
the logic seems simple enough?
##########
providers/common/sql/src/airflow/providers/common/sql/triggers/sql.py:
##########
@@ -78,15 +79,23 @@ def get_hook(self) -> DbApiHook:
)
return hook
+ async def get_records(self) -> Any:
Review Comment:
Is this supposed to be a publicly exposed API? It seems more like a
private method; correct me if I'm wrong.
##########
providers/common/sql/src/airflow/providers/common/sql/operators/generic_transfer.py:
##########
@@ -196,15 +205,8 @@ def execute_complete(
self.log.info("Offset increased to %d", offset)
context["ti"].xcom_push(key="offset", value=offset)
- self.log.info("Inserting %d rows into %s", len(results),
self.destination_conn_id)
- self.destination_hook.insert_rows(
- table=self.destination_table, rows=results,
**self.insert_args
- )
- self.log.info(
- "Inserting %d rows into %s done!",
- len(results),
- self.destination_conn_id,
- )
+ rows = self._process_rows(rows=rows, context=context)
+ self._insert_rows(rows=rows, context=context)
Review Comment:
Same thing here: why not just do the 3 steps one after the other?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]