[GitHub] [superset] ktmud commented on a change in pull request #15279: feat: run extra query on QueryObject and add compare operator for post_processing

GitBox Thu, 08 Jul 2021 15:18:29 -0700


ktmud commented on a change in pull request #15279:
URL: https://github.com/apache/superset/pull/15279#discussion_r666522297




##########
File path: superset/utils/core.py
##########
@@ -115,6 +115,8 @@
 
 DTTM_ALIAS = "__timestamp"
 
+TIME_COMPARISION = "__"

Review comment:
       Since there is no need to reverse the column name construction, maybe we 
can make this a function:
   
   ```python
   def get_time_comparison_column_name(col: str, period: str):
       return f"{col} ({period})"
   ```
   
   (I think parentheses would look nicer than `__`, too)
   
   

##########
File path: superset/common/utils.py
##########
@@ -0,0 +1,162 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+from typing import Any, Dict, Optional
+
+from flask_caching import Cache
+from pandas import DataFrame
+
+from superset import app
+from superset.constants import CacheRegion
+from superset.exceptions import CacheLoadError
+from superset.extensions import cache_manager
+from superset.models.helpers import QueryResult
+from superset.stats_logger import BaseStatsLogger
+from superset.utils.cache import set_and_log_cache
+from superset.utils.core import error_msg_from_exception, get_stacktrace, 
QueryStatus
+
+config = app.config
+stats_logger: BaseStatsLogger = config["STATS_LOGGER"]
+logger = logging.getLogger(__name__)
+
+_cache: Dict[CacheRegion, Cache] = {
+    CacheRegion.DEFAULT: cache_manager.cache,
+    CacheRegion.DATA: cache_manager.data_cache,
+}
+
+
+class QueryCacheManager:
+    # pylint: disable=too-many-instance-attributes
+    def __init__(
+        self,
+        df: DataFrame = DataFrame(),
+        query: str = "",
+        annotation_data: Optional[Dict[str, Any]] = None,
+        status: Optional[str] = None,
+        error_message: Optional[str] = None,
+        is_loaded: bool = False,
+        stacktrace: Optional[str] = None,
+        is_cached: Optional[bool] = None,
+        cache_dttm: Optional[str] = None,
+        cache_value: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self.df = df
+        self.query = query
+        self.annotation_data = {} if annotation_data is None else 
annotation_data
+        self.status = status
+        self.error_message = error_message
+
+        self.is_loaded = is_loaded
+        self.stacktrace = stacktrace
+        self.is_cached = is_cached
+        self.cache_dttm = cache_dttm
+        self.cache_value = cache_value
+
+    def load_query(
+        self,
+        query_result: QueryResult,
+        annotation_data: Optional[Dict[str, Any]] = None,
+        force_query: Optional[bool] = False,
+    ) -> None:
+        try:
+            self.status = query_result.status
+            self.query = query_result.query
+            self.error_message = query_result.error_message
+            self.df = query_result.df
+            self.annotation_data = {} if annotation_data is None else 
annotation_data
+
+            if self.status != QueryStatus.FAILED:
+                stats_logger.incr("loaded_from_source")
+                if not force_query:
+                    stats_logger.incr("loaded_from_source_without_force")
+                self.is_loaded = True
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+            if not self.error_message:
+                self.error_message = str(ex)
+            self.status = QueryStatus.FAILED
+            self.stacktrace = get_stacktrace()
+
+    def set_query(
+        self,
+        key: Optional[str],
+        timeout: Optional[int] = None,
+        datasource_uid: Optional[str] = None,
+        region: CacheRegion = CacheRegion.DEFAULT,
+    ) -> None:
+        value = {
+            "df": self.df,
+            "query": self.query,
+            "annotation_data": self.annotation_data,
+        }
+        if self.is_loaded and key and self.status != QueryStatus.FAILED:
+            self.set(key, value, timeout, datasource_uid, region)

Review comment:
       ```suggestion
               self.set(key, value, timeout=timeout, 
datasource_uid=datasource_uid, region=region)
   ```
   
   Let's use more named arguments to avoid bugs caused by bad argument 
positions.

##########
File path: superset/common/query_context.py
##########
@@ -101,21 +104,143 @@ def __init__(  # pylint: disable=too-many-arguments
             "result_format": self.result_format,
         }
 
-    def get_query_result(self, query_object: QueryObject) -> Dict[str, Any]:
-        """Returns a pandas dataframe based on the query object"""
+    @staticmethod
+    def left_join_on_dttm(
+        left_df: pd.DataFrame, right_df: pd.DataFrame
+    ) -> pd.DataFrame:
+        df = left_df.set_index(DTTM_ALIAS).join(right_df.set_index(DTTM_ALIAS))
+        df.reset_index(level=0, inplace=True)
+        return df
+
+    def processing_time_offsets(
+        self, df: pd.DataFrame, query_object: QueryObject,
+    ) -> Tuple[pd.DataFrame, List[str], List[Optional[str]]]:
+        # ensure query_object is immutable
+        query_object_clone = copy.copy(query_object)
+        rv_sql = []
+        cache_keys = []
+
+        time_offsets = query_object.time_offsets
+        outer_from_dttm = query_object.from_dttm
+        outer_to_dttm = query_object.to_dttm
+        for offset in time_offsets:

Review comment:
       I'm not sure you need to run and cache a completely new query for each 
offset.
   
   Can we somehow compute the final time periods and generate proper `WHERE` 
conditions with `or` filters instead?
   
   ```python
   def get_time_periods_for_offsets(time_range, offsets):
       [start, end] = time_range
       periods = [time_range]
       for offset in offsets:
           periods.append([start + offset, end + offset])
       return periods
   ```
   
   Then change 
https://github.com/apache/superset/blob/bee386e643a202f182a5cffd03c460fc27efe959/superset/connectors/sqla/models.py#L1370-L1375
   
   to something like
   
   ```python
   inner_time_filter = or_(*[dttm_col.between(start, end) for start, end in 
periods])
   subq = subq.where(and_(*(where_clause_and + [inner_time_filter])))
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [superset] ktmud commented on a change in pull request #15279: feat: run extra query on QueryObject and add compare operator for post_processing

Reply via email to