RNHTTR commented on code in PR #40447:
URL: https://github.com/apache/airflow/pull/40447#discussion_r1674061812


##########
airflow/api_connexion/endpoints/log_endpoint.py:
##########
@@ -112,3 +118,58 @@ def get_log(
     logs = task_log_reader.read_log_stream(ti, task_try_number, metadata)
 
     return Response(logs, headers={"Content-Type": return_type})
+
+
[email protected]_access_dag("GET", DagAccessEntity.TASK_LOGS)
+@provide_session
+@unify_bucket_name_and_key
+@provide_bucket_name
+def get_log_pages(
+    *,
+    dag_id: str,
+    dag_run_id: str,
+    task_id: str,
+    bucket_name: str,
+    key: str,
+    session: Session = NEW_SESSION,
+) -> APIResponse:
+    """Get total number of log pages for a specific task instance."""
+    task_log_reader = TaskLogReader()
+
+    if not task_log_reader.supports_read:
+        raise BadRequest("Task log handler does not support read logs.")
+
+    query = (
+        select(TaskInstance)
+        .where(
+            TaskInstance.task_id == task_id,
+            TaskInstance.dag_id == dag_id,
+            TaskInstance.run_id == dag_run_id,
+        )
+        .join(TaskInstance.dag_run)
+        .options(joinedload(TaskInstance.trigger).joinedload(Trigger.triggerer_job))
+    )
+
+    ti = session.scalar(query)
+
+    # Check if the task instance state is terminal
+    if ti.state is None:
+        raise BadRequest("TaskInstance state is None")
+
+    # Maybe add more?
+    if ti.state not in {TaskInstanceState.SUCCESS, TaskInstanceState.FAILED, TaskInstanceState.UP_FOR_RETRY}:
+        return {"total_pages": 1}
+
+    # Fetch s3 log content length
+    s3_hook = S3Hook()

Review Comment:
   I don't think you want to use S3Hook here, because this needs to be generic 
(as it could also be used for Airflow logs stored in GCS or Azure Blob Storage).
   
   Maybe you can use the `task_log_reader` to somehow call 
[get_content_length](https://github.com/apache/airflow/blob/497ca471e7d508e87c1cf687db00065b803525f5/airflow/providers/amazon/aws/hooks/s3.py#L926)



##########
airflow/utils/log/log_reader.py:
##########
@@ -40,7 +40,7 @@ class TaskLogReader:
     """Time to sleep between loops while waiting for more logs"""
 
     def read_log_chunks(
-        self, ti: TaskInstance, try_number: int | None, metadata
+        self, ti: TaskInstance, try_number: int | None, metadata, page_number: int | None = None

Review Comment:
   @andyjianzhou did you get a chance to look into the metadata param?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to