ktmud commented on a change in pull request #15279:
URL: https://github.com/apache/superset/pull/15279#discussion_r666522297
##########
File path: superset/utils/core.py
##########
@@ -115,6 +115,8 @@
DTTM_ALIAS = "__timestamp"
+TIME_COMPARISION = "__"
Review comment:
Since there is no need to reverse the column name construction, maybe we
can make this a function:
```python
def get_time_comparison_column_name(col: str, period: str):
return f"{col} ({period})"
```
(I think parentheses would look nicer than `__`, too)
##########
File path: superset/common/utils.py
##########
@@ -0,0 +1,162 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+from typing import Any, Dict, Optional
+
+from flask_caching import Cache
+from pandas import DataFrame
+
+from superset import app
+from superset.constants import CacheRegion
+from superset.exceptions import CacheLoadError
+from superset.extensions import cache_manager
+from superset.models.helpers import QueryResult
+from superset.stats_logger import BaseStatsLogger
+from superset.utils.cache import set_and_log_cache
+from superset.utils.core import error_msg_from_exception, get_stacktrace,
QueryStatus
+
+config = app.config
+stats_logger: BaseStatsLogger = config["STATS_LOGGER"]
+logger = logging.getLogger(__name__)
+
+_cache: Dict[CacheRegion, Cache] = {
+ CacheRegion.DEFAULT: cache_manager.cache,
+ CacheRegion.DATA: cache_manager.data_cache,
+}
+
+
+class QueryCacheManager:
+ # pylint: disable=too-many-instance-attributes
+ def __init__(
+ self,
+ df: DataFrame = DataFrame(),
+ query: str = "",
+ annotation_data: Optional[Dict[str, Any]] = None,
+ status: Optional[str] = None,
+ error_message: Optional[str] = None,
+ is_loaded: bool = False,
+ stacktrace: Optional[str] = None,
+ is_cached: Optional[bool] = None,
+ cache_dttm: Optional[str] = None,
+ cache_value: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ self.df = df
+ self.query = query
+ self.annotation_data = {} if annotation_data is None else
annotation_data
+ self.status = status
+ self.error_message = error_message
+
+ self.is_loaded = is_loaded
+ self.stacktrace = stacktrace
+ self.is_cached = is_cached
+ self.cache_dttm = cache_dttm
+ self.cache_value = cache_value
+
+ def load_query(
+ self,
+ query_result: QueryResult,
+ annotation_data: Optional[Dict[str, Any]] = None,
+ force_query: Optional[bool] = False,
+ ) -> None:
+ try:
+ self.status = query_result.status
+ self.query = query_result.query
+ self.error_message = query_result.error_message
+ self.df = query_result.df
+ self.annotation_data = {} if annotation_data is None else
annotation_data
+
+ if self.status != QueryStatus.FAILED:
+ stats_logger.incr("loaded_from_source")
+ if not force_query:
+ stats_logger.incr("loaded_from_source_without_force")
+ self.is_loaded = True
+ except Exception as ex: # pylint: disable=broad-except
+ logger.exception(ex)
+ if not self.error_message:
+ self.error_message = str(ex)
+ self.status = QueryStatus.FAILED
+ self.stacktrace = get_stacktrace()
+
+ def set_query(
+ self,
+ key: Optional[str],
+ timeout: Optional[int] = None,
+ datasource_uid: Optional[str] = None,
+ region: CacheRegion = CacheRegion.DEFAULT,
+ ) -> None:
+ value = {
+ "df": self.df,
+ "query": self.query,
+ "annotation_data": self.annotation_data,
+ }
+ if self.is_loaded and key and self.status != QueryStatus.FAILED:
+ self.set(key, value, timeout, datasource_uid, region)
Review comment:
```suggestion
self.set(key, value, timeout=timeout,
datasource_uid=datasource_uid, region=region)
```
Let's use more named arguments to avoid bugs caused by bad argument
positions.
##########
File path: superset/common/query_context.py
##########
@@ -101,21 +104,143 @@ def __init__( # pylint: disable=too-many-arguments
"result_format": self.result_format,
}
- def get_query_result(self, query_object: QueryObject) -> Dict[str, Any]:
- """Returns a pandas dataframe based on the query object"""
+ @staticmethod
+ def left_join_on_dttm(
+ left_df: pd.DataFrame, right_df: pd.DataFrame
+ ) -> pd.DataFrame:
+ df = left_df.set_index(DTTM_ALIAS).join(right_df.set_index(DTTM_ALIAS))
+ df.reset_index(level=0, inplace=True)
+ return df
+
+ def processing_time_offsets(
+ self, df: pd.DataFrame, query_object: QueryObject,
+ ) -> Tuple[pd.DataFrame, List[str], List[Optional[str]]]:
+ # ensure query_object is immutable
+ query_object_clone = copy.copy(query_object)
+ rv_sql = []
+ cache_keys = []
+
+ time_offsets = query_object.time_offsets
+ outer_from_dttm = query_object.from_dttm
+ outer_to_dttm = query_object.to_dttm
+ for offset in time_offsets:
Review comment:
I'm not sure you need to run and cache a completely new query for each
offset.
Can we somehow compute the final time periods and generate proper `WHERE`
conditions with `or` filters instead?
```python
def get_time_periods_for_offsets(time_range, offsets):
[start, end] = time_range
periods = [time_range]
for offset in periods:
periods.append([start += offset, end += offset])
return periods
```
Then change
https://github.com/apache/superset/blob/bee386e643a202f182a5cffd03c460fc27efe959/superset/connectors/sqla/models.py#L1370-L1375
to something like
```python
inner_time_filter = or_([dttm_col.between(start, end) for start, end in
periods])
subq = subq.where(and_(*(where_clause_and + [inner_time_filter]))
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]