mistercrunch closed pull request #5866: Fix regression around low row limit for CSV exports
URL: https://github.com/apache/incubator-superset/pull/5866
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/superset/config.py b/superset/config.py
index a5e4f2988c..80f6f85e23 100644
--- a/superset/config.py
+++ b/superset/config.py
@@ -275,8 +275,13 @@
 # Set this API key to enable Mapbox visualizations
 MAPBOX_API_KEY = os.environ.get('MAPBOX_API_KEY', '')
 
-# Maximum number of rows returned in the SQL editor
-SQL_MAX_ROW = 1000
+# Maximum number of rows returned from a database
+# in async mode, no more than SQL_MAX_ROW will be returned and stored
+# in the results backend. This also becomes the limit when exporting CSVs
+SQL_MAX_ROW = 100000
+
+# Limit to be returned to the frontend.
+DISPLAY_MAX_ROW = 1000
 
 # Maximum number of tables/views displayed in the dropdown window in SQL Lab.
 MAX_TABLE_NAMES = 3000
@@ -302,8 +307,6 @@ class CeleryConfig(object):
 CELERY_CONFIG = CeleryConfig
 """
 CELERY_CONFIG = None
-SQL_CELERY_DB_FILE_PATH = os.path.join(DATA_DIR, 'celerydb.sqlite')
-SQL_CELERY_RESULTS_DB_FILE_PATH = os.path.join(DATA_DIR, 'celery_results.sqlite')
 
 # static http headers to be served by your Superset server.
 # This header prevents iFrames from other domains and
diff --git a/superset/views/core.py b/superset/views/core.py
index e021709434..7a6a05802d 100755
--- a/superset/views/core.py
+++ b/superset/views/core.py
@@ -2354,7 +2354,18 @@ def results(self, key):
             return json_error_response(security_manager.get_table_access_error_msg(
                 '{}'.format(rejected_tables)), status=403)
 
-        return json_success(utils.zlib_decompress_to_string(blob))
+        payload = utils.zlib_decompress_to_string(blob)
+        display_limit = app.config.get('DISPLAY_MAX_ROW', None)
+        if display_limit:
+            payload_json = json.loads(payload)
+            payload_json['data'] = payload_json['data'][:display_limit]
+        return json_success(
+            json.dumps(
+                payload_json,
+                default=utils.json_iso_dttm_ser,
+                ignore_nan=True,
+            ),
+        )
 
     @has_access_api
     @expose('/stop_query/', methods=['POST'])
@@ -2407,7 +2418,7 @@ def sql_json(self):
                 tmp_table_name,
             )
 
-        client_id = request.form.get('client_id') or utils.shortid()
+        client_id = request.form.get('client_id') or utils.shortid()[:10]
 
         query = Query(
             database_id=int(database_id),
diff --git a/tests/base_tests.py b/tests/base_tests.py
index 051e7c4c47..62d3938a69 100644
--- a/tests/base_tests.py
+++ b/tests/base_tests.py
@@ -172,7 +172,7 @@ def revoke_public_access_to_table(self, table):
                     perm.view_menu and table.perm in perm.view_menu.name):
                 security_manager.del_permission_role(public_role, perm)
 
-    def run_sql(self, sql, client_id, user_name=None, raise_on_error=False):
+    def run_sql(self, sql, client_id=None, user_name=None, raise_on_error=False):
         if user_name:
             self.logout()
             self.login(username=(user_name if user_name else 'admin'))
diff --git a/tests/celery_tests.py b/tests/celery_tests.py
index 58302dbe68..06b10031cd 100644
--- a/tests/celery_tests.py
+++ b/tests/celery_tests.py
@@ -6,7 +6,6 @@
 from __future__ import unicode_literals
 
 import json
-import os
 import subprocess
 import time
 import unittest
@@ -14,7 +13,7 @@
 import pandas as pd
 from past.builtins import basestring
 
-from superset import app, cli, db, security_manager
+from superset import app, db
 from superset.models.helpers import QueryStatus
 from superset.models.sql_lab import Query
 from superset.sql_parse import SupersetQuery
@@ -23,13 +22,12 @@
 
 
 BASE_DIR = app.config.get('BASE_DIR')
+CELERY_SLEEP_TIME = 5
 
 
 class CeleryConfig(object):
-    BROKER_URL = 'sqla+sqlite:///' + app.config.get('SQL_CELERY_DB_FILE_PATH')
+    BROKER_URL = app.config.get('CELERY_RESULT_BACKEND')
     CELERY_IMPORTS = ('superset.sql_lab', )
-    CELERY_RESULT_BACKEND = (
-        'db+sqlite:///' + app.config.get('SQL_CELERY_RESULTS_DB_FILE_PATH'))
     CELERY_ANNOTATIONS = {'sql_lab.add': {'rate_limit': '10/s'}}
     CONCURRENCY = 1
 
@@ -89,29 +87,13 @@ def get_query_by_id(self, id):
 
     @classmethod
     def setUpClass(cls):
-        try:
-            os.remove(app.config.get('SQL_CELERY_DB_FILE_PATH'))
-        except OSError as e:
-            app.logger.warn(str(e))
-        try:
-            os.remove(app.config.get('SQL_CELERY_RESULTS_DB_FILE_PATH'))
-        except OSError as e:
-            app.logger.warn(str(e))
-
-        security_manager.sync_role_definitions()
-
-        worker_command = BASE_DIR + '/bin/superset worker'
+        db.session.query(Query).delete()
+        db.session.commit()
+
+        worker_command = BASE_DIR + '/bin/superset worker -w 2'
         subprocess.Popen(
             worker_command, shell=True, stdout=subprocess.PIPE)
 
-        admin = security_manager.find_user('admin')
-        if not admin:
-            security_manager.add_user(
-                'admin', 'admin', ' user', '[email protected]',
-                security_manager.find_role('Admin'),
-                password='general')
-        cli.load_examples_run(load_test_data=True)
-
     @classmethod
     def tearDownClass(cls):
         subprocess.call(
@@ -123,7 +105,7 @@ def tearDownClass(cls):
             shell=True,
         )
 
-    def run_sql(self, db_id, sql, client_id, cta='false', tmp_table='tmp',
+    def run_sql(self, db_id, sql, client_id=None, cta='false', tmp_table='tmp',
                 async_='false'):
         self.login()
         resp = self.client.post(
@@ -151,11 +133,13 @@ def test_run_sync_query_cta(self):
         main_db = get_main_database(db.session)
         db_id = main_db.id
         eng = main_db.get_sqla_engine()
+        tmp_table_name = 'tmp_async_22'
+        self.drop_table_if_exists(tmp_table_name, main_db)
         perm_name = 'can_sql_json'
         sql_where = (
             "SELECT name FROM ab_permission WHERE name='{}'".format(perm_name))
         result2 = self.run_sql(
-            db_id, sql_where, '2', tmp_table='tmp_table_2', cta='true')
+            db_id, sql_where, '2', tmp_table=tmp_table_name, cta='true')
         self.assertEqual(QueryStatus.SUCCESS, result2['query']['state'])
         self.assertEqual([], result2['data'])
         self.assertEqual([], result2['columns'])
@@ -170,8 +154,7 @@ def test_run_sync_query_cta_no_data(self):
         main_db = get_main_database(db.session)
         db_id = main_db.id
         sql_empty_result = 'SELECT * FROM ab_user WHERE id=666'
-        result3 = self.run_sql(
-            db_id, sql_empty_result, '3', tmp_table='tmp_table_3', cta='true')
+        result3 = self.run_sql(db_id, sql_empty_result, '3')
         self.assertEqual(QueryStatus.SUCCESS, result3['query']['state'])
         self.assertEqual([], result3['data'])
         self.assertEqual([], result3['columns'])
@@ -179,22 +162,31 @@ def test_run_sync_query_cta_no_data(self):
         query3 = self.get_query_by_id(result3['query']['serverId'])
         self.assertEqual(QueryStatus.SUCCESS, query3.status)
 
+    def drop_table_if_exists(self, table_name, database=None):
+        """Drop table if it exists, works on any DB"""
+        sql = 'DROP TABLE {}'.format(table_name)
+        db_id = database.id
+        if database:
+            database.allow_dml = True
+            db.session.flush()
+        return self.run_sql(db_id, sql)
+
     def test_run_async_query(self):
         main_db = get_main_database(db.session)
-        eng = main_db.get_sqla_engine()
+        db_id = main_db.id
+
+        self.drop_table_if_exists('tmp_async_1', main_db)
+
         sql_where = "SELECT name FROM ab_role WHERE name='Admin'"
         result = self.run_sql(
-            main_db.id, sql_where, '4', async_='true', tmp_table='tmp_async_1',
+            db_id, sql_where, '4', async_='true', tmp_table='tmp_async_1',
             cta='true')
         assert result['query']['state'] in (
             QueryStatus.PENDING, QueryStatus.RUNNING, QueryStatus.SUCCESS)
 
-        time.sleep(1)
+        time.sleep(CELERY_SLEEP_TIME)
 
         query = self.get_query_by_id(result['query']['serverId'])
-        df = pd.read_sql_query(query.select_sql, con=eng)
-        self.assertEqual(QueryStatus.SUCCESS, query.status)
-        self.assertEqual([{'name': 'Admin'}], df.to_dict(orient='records'))
         self.assertEqual(QueryStatus.SUCCESS, query.status)
         self.assertTrue('FROM tmp_async_1' in query.select_sql)
         self.assertEqual(
@@ -202,27 +194,25 @@ def test_run_async_query(self):
             "WHERE name='Admin' LIMIT 666", query.executed_sql)
         self.assertEqual(sql_where, query.sql)
         self.assertEqual(0, query.rows)
-        self.assertEqual(666, query.limit)
         self.assertEqual(False, query.limit_used)
         self.assertEqual(True, query.select_as_cta)
         self.assertEqual(True, query.select_as_cta_used)
 
     def test_run_async_query_with_lower_limit(self):
         main_db = get_main_database(db.session)
-        eng = main_db.get_sqla_engine()
+        db_id = main_db.id
+        self.drop_table_if_exists('tmp_async_2', main_db)
+
         sql_where = "SELECT name FROM ab_role WHERE name='Alpha' LIMIT 1"
         result = self.run_sql(
-            main_db.id, sql_where, '5', async_='true', tmp_table='tmp_async_2',
+            db_id, sql_where, '5', async_='true', tmp_table='tmp_async_2',
             cta='true')
         assert result['query']['state'] in (
             QueryStatus.PENDING, QueryStatus.RUNNING, QueryStatus.SUCCESS)
 
-        time.sleep(1)
+        time.sleep(CELERY_SLEEP_TIME)
 
         query = self.get_query_by_id(result['query']['serverId'])
-        df = pd.read_sql_query(query.select_sql, con=eng)
-        self.assertEqual(QueryStatus.SUCCESS, query.status)
-        self.assertEqual([{'name': 'Alpha'}], df.to_dict(orient='records'))
         self.assertEqual(QueryStatus.SUCCESS, query.status)
         self.assertTrue('FROM tmp_async_2' in query.select_sql)
         self.assertEqual(
diff --git a/tests/superset_test_config.py b/tests/superset_test_config.py
index 3076a0556c..aacbd6a0ad 100644
--- a/tests/superset_test_config.py
+++ b/tests/superset_test_config.py
@@ -12,7 +12,6 @@
 if 'SUPERSET__SQLALCHEMY_DATABASE_URI' in os.environ:
     SQLALCHEMY_DATABASE_URI = os.environ.get('SUPERSET__SQLALCHEMY_DATABASE_URI')
 
-SQL_CELERY_RESULTS_DB_FILE_PATH = os.path.join(DATA_DIR, 'celery_results.sqlite')
 SQL_SELECT_AS_CTA = True
 SQL_MAX_ROW = 666
 
@@ -28,7 +27,6 @@
 class CeleryConfig(object):
     BROKER_URL = 'redis://localhost'
     CELERY_IMPORTS = ('superset.sql_lab', )
-    CELERY_RESULT_BACKEND = 'db+sqlite:///' + SQL_CELERY_RESULTS_DB_FILE_PATH
     CELERY_ANNOTATIONS = {'sql_lab.add': {'rate_limit': '10/s'}}
     CONCURRENCY = 1
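
For context on the config.py change above: SQL_MAX_ROW now caps how many rows
are fetched from the database and stored in the results backend (and therefore
how many rows a CSV export can contain), while the new DISPLAY_MAX_ROW caps
what is sent back to the SQL Lab frontend. A minimal sketch of overriding both
in a deployment's superset_config.py (the values here are illustrative, taken
from the new defaults in the diff, not recommendations):

# superset_config.py (illustrative values)
# Hard cap on rows fetched from the database and stored in the results
# backend; this is also the ceiling for CSV exports.
SQL_MAX_ROW = 100000
# Rows actually returned to the browser for display.
DISPLAY_MAX_ROW = 1000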
 

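And a minimal, self-contained sketch of the /results behavior introduced in
views/core.py above: the stored payload is decoded, only its 'data' rows are
trimmed to the display limit, and the trimmed JSON is returned to the browser,
while the full result set stays in the results backend for CSV export. The
helper name below is illustrative (not part of Superset), and it uses the
standard json module rather than Superset's own serializer:

import json


def truncate_for_display(payload, display_limit):
    """Trim stored query results to the frontend display limit.

    `payload` is the JSON string pulled from the results backend. Only the
    'data' rows are truncated, so the full result set remains available
    for CSV export.
    """
    payload_json = json.loads(payload)
    if display_limit:
        payload_json['data'] = payload_json['data'][:display_limit]
    return json.dumps(payload_json)


# Example: five rows stored, only two returned to the browser.
stored = json.dumps({'columns': ['n'], 'data': [{'n': i} for i in range(5)]})
print(truncate_for_display(stored, 2))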

 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services
