korbit-ai[bot] commented on code in PR #32572: URL: https://github.com/apache/superset/pull/32572#discussion_r1987904570
########## superset/commands/logs/prune.py: ########## @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import logging +import time +from datetime import datetime, timedelta + +import sqlalchemy as sa + +from superset import db +from superset.commands.base import BaseCommand +from superset.models.core import Log + +logger = logging.getLogger(__name__) + + +# pylint: disable=consider-using-transaction +class LogPruneCommand(BaseCommand): + """ + Command to prune the logs table by deleting rows older than the specified retention period. + + This command deletes records from the `Log` table that have not been changed within the + specified number of days. It helps in maintaining the database by removing outdated entries + and freeing up space. + + Attributes: + retention_period_days (int): The number of days for which records should be retained. + Records older than this period will be deleted. + """ # noqa: E501 + + def __init__(self, retention_period_days: int): + """ + :param retention_period_days: Number of days to keep in the logs table + """ + self.retention_period_days = retention_period_days + + def run(self) -> None: + """ + Executes the prune command + """ + batch_size = 999 # SQLite has a IN clause limit of 999 Review Comment: ### Magic Number Should Be Named Constant <sub></sub> <details> <summary>Tell me more</summary> ###### What is the issue? The magic number 999 should be defined as a named constant at the module or class level. ###### Why this matters Magic numbers make code harder to maintain and understand their purpose without the comment. A named constant makes the intent clear and provides a single point of change. ###### Suggested change ∙ *Feature Preview* ```python # At module or class level SQLITE_IN_CLAUSE_LIMIT = 999 def run(self) -> None: batch_size = SQLITE_IN_CLAUSE_LIMIT ``` </details> <sub> [](https://app.korbit.ai/feedback/aa91ff46-6083-4491-9416-b83dd1994b51/bd0e77ac-94eb-455c-87e2-9ab36eb01014?suggestedFixEnabled=true) 💬 Looking for more details? Reply to this comment to chat with Korbit. </sub> <!--- korbi internal id:300308ee-d3c2-4e75-a665-0620fcec6ba5 --> ########## superset/tasks/scheduler.py: ########## @@ -142,3 +143,24 @@ def prune_query(retention_period_days: Optional[int] = None) -> None: QueryPruneCommand(retention_period_days).run() except CommandException as ex: logger.exception("An error occurred while pruning queries: %s", ex) + + +@celery_app.task(name="prune_logs") +def prune_logs(retention_period_days: Optional[int] = None) -> None: Review Comment: ### Missing Celery Task Performance Guards <sub></sub> <details> <summary>Tell me more</summary> ###### What is the issue? The prune_logs task lacks performance-related task options that could help manage resource consumption during log pruning operations. ###### Why this matters Without proper task options like rate limiting or soft/hard time limits, large log pruning operations could consume excessive system resources or run indefinitely, potentially impacting other operations. ###### Suggested change ∙ *Feature Preview* Add appropriate Celery task options to manage resource consumption: ```python @celery_app.task( name="prune_logs", soft_time_limit=3600, # 1 hour soft timeout time_limit=3900, # 1 hour + 5 min hard timeout rate_limit="1/hour" # Limit to one execution per hour ) ``` </details> <sub> [](https://app.korbit.ai/feedback/aa91ff46-6083-4491-9416-b83dd1994b51/dd8e7fd0-e804-46c3-926f-34da320e2fd4?suggestedFixEnabled=true) 💬 Looking for more details? Reply to this comment to chat with Korbit. </sub> <!--- korbi internal id:a6de7c19-08af-480a-b659-ab0dfb737305 --> ########## superset/tasks/scheduler.py: ########## @@ -142,3 +143,24 @@ QueryPruneCommand(retention_period_days).run() except CommandException as ex: logger.exception("An error occurred while pruning queries: %s", ex) + + +@celery_app.task(name="prune_logs") +def prune_logs(retention_period_days: Optional[int] = None) -> None: + stats_logger: BaseStatsLogger = app.config["STATS_LOGGER"] + stats_logger.incr("prune_logs") + + # TODO: Deprecated: Remove support for passing retention period via options in 6.0 + if retention_period_days is None: + retention_period_days = prune_logs.request.properties.get( + "retention_period_days" + ) + logger.warning( + "Your `prune_logs` beat schedule uses `options` to pass the retention " + "period, please use `kwargs` instead." + ) + + try: + LogPruneCommand(retention_period_days).run() + except CommandException as ex: + logger.exception("An error occurred while pruning logs: %s", ex) Review Comment: ### Generic exception log missing retention period context <sub></sub> <details> <summary>Tell me more</summary> ###### What is the issue? The exception log message is too generic and lacks context about the retention period being used. ###### Why this matters During troubleshooting, it would be difficult to determine which retention period was active when the pruning failed, making debugging more time-consuming. ###### Suggested change ∙ *Feature Preview* ```python logger.exception( "An error occurred while pruning logs with retention period of %s days: %s", retention_period_days, ex ) ``` </details> <sub> [](https://app.korbit.ai/feedback/aa91ff46-6083-4491-9416-b83dd1994b51/ef5796f6-7cab-457f-b56b-d01cb517a826?suggestedFixEnabled=true) 💬 Looking for more details? Reply to this comment to chat with Korbit. </sub> <!--- korbi internal id:43a5b4dc-c392-4765-ac7e-9e671cf63f83 --> ########## superset/commands/logs/prune.py: ########## @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import logging +import time +from datetime import datetime, timedelta + +import sqlalchemy as sa + +from superset import db +from superset.commands.base import BaseCommand +from superset.models.core import Log + +logger = logging.getLogger(__name__) + + +# pylint: disable=consider-using-transaction +class LogPruneCommand(BaseCommand): + """ + Command to prune the logs table by deleting rows older than the specified retention period. + + This command deletes records from the `Log` table that have not been changed within the + specified number of days. It helps in maintaining the database by removing outdated entries + and freeing up space. + + Attributes: + retention_period_days (int): The number of days for which records should be retained. + Records older than this period will be deleted. + """ # noqa: E501 + + def __init__(self, retention_period_days: int): + """ + :param retention_period_days: Number of days to keep in the logs table + """ + self.retention_period_days = retention_period_days + + def run(self) -> None: + """ + Executes the prune command + """ + batch_size = 999 # SQLite has a IN clause limit of 999 + total_deleted = 0 + start_time = time.time() + + # Select all IDs that need to be deleted + ids_to_delete = ( + db.session.execute( + sa.select(Log.id).where( + Log.dttm + < datetime.now() - timedelta(days=self.retention_period_days) + ) + ) + .scalars() + .all() + ) Review Comment: ### Memory-intensive ID loading <sub></sub> <details> <summary>Tell me more</summary> ###### What is the issue? Loading all IDs into memory at once could cause memory issues with large log tables. ###### Why this matters For tables with millions of records to delete, this approach could exhaust available memory and crash the application. ###### Suggested change ∙ *Feature Preview* ```python def run(self) -> None: batch_size = 999 total_deleted = 0 start_time = time.time() while True: # Select only the next batch of IDs ids_to_delete = ( db.session.execute( sa.select(Log.id) .where(Log.dttm < datetime.now() - timedelta(days=self.retention_period_days)) .limit(batch_size) ) .scalars() .all() ) if not ids_to_delete: break result = db.session.execute(sa.delete(Log).where(Log.id.in_(ids_to_delete))) total_deleted += result.rowcount db.session.commit() logger.info( "Deleted %s rows from the logs table older than %s days", total_deleted, self.retention_period_days, ) ``` </details> <sub> [](https://app.korbit.ai/feedback/aa91ff46-6083-4491-9416-b83dd1994b51/1313ec5e-a544-4172-9038-fafb8d45483d?suggestedFixEnabled=true) 💬 Looking for more details? Reply to this comment to chat with Korbit. </sub> <!--- korbi internal id:4e0b1e53-5753-4dcc-a73b-1470ebb7a486 --> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
