mistercrunch closed pull request #4771: [druid] fix 'Unorderable types' when
col has nulls
URL: https://github.com/apache/incubator-superset/pull/4771
This pull request was merged from a forked repository (a "foreign" pull
request). Because GitHub hides the original diff of such a pull request
once it is merged, the diff is reproduced below for the sake of
provenance:
diff --git a/superset/connectors/druid/models.py
b/superset/connectors/druid/models.py
index 25c68acae1..b9263129f2 100644
--- a/superset/connectors/druid/models.py
+++ b/superset/connectors/druid/models.py
@@ -1277,6 +1277,21 @@ def run_query( # noqa / druid
client.query_builder.last_query.query_dict, indent=2)
return query_str
+ @staticmethod
+ def homogenize_types(df, groupby_cols):
+ """Converting all GROUPBY columns to strings
+
+ When grouping by a numeric (say FLOAT) column, pydruid returns
+ strings in the dataframe. This creates issues downstream related
+ to having mixed types in the dataframe
+
+ Here we replace None with <NULL> and make the whole series a
+ str instead of an object.
+ """
+ for col in groupby_cols:
+ df[col] = df[col].fillna('<NULL>').astype(str)
+ return df
+
def query(self, query_obj):
qry_start_dttm = datetime.now()
client = self.cluster.get_pydruid_client()
@@ -1284,6 +1299,8 @@ def query(self, query_obj):
client=client, query_obj=query_obj, phase=2)
df = client.export_pandas()
+ df = self.homogenize_types(df, query_obj.get('groupby', []))
+
if df is None or df.size == 0:
raise Exception(_('No data was returned.'))
df.columns = [
diff --git a/tests/druid_tests.py b/tests/druid_tests.py
index b0d9caff76..868cd5e2d5 100644
--- a/tests/druid_tests.py
+++ b/tests/druid_tests.py
@@ -61,6 +61,7 @@ def __reduce__(self):
'timestamp': '2012-01-01T00:00:00.000Z',
'event': {
'dim1': 'Canada',
+ 'dim2': 'boy',
'metric1': 12345678,
},
},
@@ -69,6 +70,7 @@ def __reduce__(self):
'timestamp': '2012-01-01T00:00:00.000Z',
'event': {
'dim1': 'USA',
+ 'dim2': 'girl',
'metric1': 12345678 / 2,
},
},
@@ -165,7 +167,7 @@ def test_client(self, PyDruid):
'row_limit': 5000,
'include_search': 'false',
'metrics': ['count'],
- 'groupby': ['dim1', 'dim2d'],
+ 'groupby': ['dim1', 'dim2'],
'force': 'true',
}
# two groupby
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services