villebro commented on a change in pull request #10270:
URL: https://github.com/apache/superset/pull/10270#discussion_r564360106
##########
File path: superset/common/query_object.py
##########
@@ -214,8 +223,34 @@ def __init__(
@property
def metric_names(self) -> List[str]:
+ """Return metrics names (labels), coerce adhoc metrics to strings."""
return get_metric_names(self.metrics)
+ @property
+ def column_names(self) -> List[str]:
+ """Return column names (labels). Reserved for future adhoc calculated
+ columns."""
+ return self.columns
+
+ def validate(
+ self, raise_exceptions: Optional[bool] = True
+ ) -> Optional[QueryObjectValidationError]:
+ """Validate query object"""
+ error: Optional[QueryObjectValidationError] = None
+ all_labels = self.metric_names + self.column_names
+ if len(set(all_labels)) < len(all_labels):
+ dup_labels = find_duplicates(all_labels)
+ error = QueryObjectValidationError(
+ _(
+ "Duplicate column/metric labels: %(labels)s. Please make "
+ "sure all columns and metrics have a unique label.",
+ labels=", ".join(map(lambda x: f'"{x}"', dup_labels)),
Review comment:
nit (might require the `TypeVar` suggestion below):
```suggestion
labels=", ".join(dup_labels),
```
##########
File path: superset/utils/pandas_postprocessing.py
##########
@@ -554,29 +554,53 @@ def _parse_location(location: str) -> Tuple[float, float,
float]:
raise QueryObjectValidationError(_("Invalid geodetic string"))
+@validate_column_args("columns")
def contribution(
- df: DataFrame, orientation: PostProcessingContributionOrientation
+ df: DataFrame,
+ orientation: Optional[
+ PostProcessingContributionOrientation
+ ] = PostProcessingContributionOrientation.COLUMN,
+ columns: Optional[List[str]] = None,
+ rename_columns: Optional[List[str]] = None,
) -> DataFrame:
"""
- Calculate cell contibution to row/column total.
+ Calculate cell contibution to row/column total for numeric columns.
+ Non-numeric columns will be kept untouched.
+
+ If `columns` are specified, only calculate contributions on selected
columns.
:param df: DataFrame containing all-numeric data (temporal column ignored)
+ :param columns: Columns to calculate values from.
+ :param rename_columns: The new labels for the calculated contribution
columns.
+ The original columns will not be removed.
:param orientation: calculate by dividing cell with row/column total
- :return: DataFrame with contributions, with temporal column at beginning
if present
+ :return: DataFrame with contributions.
"""
- temporal_series: Optional[Series] = None
contribution_df = df.copy()
- if DTTM_ALIAS in df.columns:
- temporal_series = cast(Series, contribution_df.pop(DTTM_ALIAS))
-
- if orientation == PostProcessingContributionOrientation.ROW:
- contribution_dft = contribution_df.T
- contribution_df = (contribution_dft / contribution_dft.sum()).T
- else:
- contribution_df = contribution_df / contribution_df.sum()
-
- if temporal_series is not None:
- contribution_df.insert(0, DTTM_ALIAS, temporal_series)
+ numeric_df = contribution_df.select_dtypes(include="number")
Review comment:
nice 👍
##########
File path: superset/common/query_context.py
##########
@@ -56,7 +56,7 @@ class QueryContext:
cache_type: ClassVar[str] = "df"
enforce_numerical_metrics: ClassVar[bool] = True
- datasource: BaseDatasource
+ datasource: SqlaTable
Review comment:
I think this was discussed in a previous review on some other class. I'm
pretty sure we can't assume that a `datasource` is `SqlaTable` here, as
`QueryContext` needs to support all supported `BaseDatasource`s.
##########
File path: superset/utils/core.py
##########
@@ -1481,3 +1498,7 @@ def get_time_filter_status( # pylint:
disable=too-many-branches
def format_list(items: Sequence[str], sep: str = ", ", quote: str = '"') ->
str:
quote_escaped = "\\" + quote
return sep.join(f"{quote}{x.replace(quote, quote_escaped)}{quote}" for x
in items)
+
+
+def find_duplicates(items: Iterable[Any]) -> List[Any]:
+ return [item for item, count in collections.Counter(items).items() if
count > 1]
Review comment:
`Any` can be made
[generic](https://mypy.readthedocs.io/en/latest/generics.html#generic-functions)
here:
```suggestion
from typing import TypeVar
T = TypeVar('T')
def find_duplicates(items: Iterable[T]) -> List[T]:
return [item for item, count in collections.Counter(items).items() if
count > 1]
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]