This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit b0b1c36713845d82c1c3ad1e84d9595d3a66640b
Author: Chris Hajas <[email protected]>
AuthorDate: Thu Oct 6 10:54:00 2022 -0700

    Dump number of segments during minirepro and gpsd (#14225)
    
    This commit includes a couple improvements when dumping minirepros and gpsd:
    
    1. Dump the number of segments. This is needed to replicate the plan,
    and often we have to ask users/customers this separately. Instead just
    retrieve it during the minirepro
    2. Output a `set optimizer=off` for when the minirepro is being
    restored. The short insert statements are much faster (~10X) using
planner than Orca, especially if done in debug build. Output this by
    default so I don't have to modify this manually.
---
 gpMgmt/bin/gpsd                         | 19 ++++++++++++++++++-
 gpMgmt/bin/minirepro                    | 23 +++++++++++++++++++----
 src/test/regress/expected/minirepro.out | 12 ++++++++----
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/gpMgmt/bin/gpsd b/gpMgmt/bin/gpsd
index ad61a1e3de..fe877a38de 100755
--- a/gpMgmt/bin/gpsd
+++ b/gpMgmt/bin/gpsd
@@ -35,10 +35,19 @@ def ResultIter(cursor, arraysize=1000):
 def getVersion(envOpts):
     cmd = subprocess.Popen('psql --pset footer -Atqc "select version()" 
template1', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, 
env=envOpts)
     if cmd.wait() != 0:
-        sys.stderr.write('\nError while trying to find HAWQ/GPDB version.\n\n' 
+ cmd.communicate()[1].decode('ascii') + '\n\n')
+        sys.stderr.write('\nError while trying to find GPDB version.\n\n' + 
cmd.communicate()[1].decode('ascii') + '\n\n')
         sys.exit(1)
     return cmd.communicate()[0].decode('ascii')
 
+def get_num_segments(cursor):
+    query = "select count(*) from gp_segment_configuration where role='p' and 
content >=0;"
+    try:
+        cursor.execute(query)
+    except pgdb.DatabaseError as e:
+        sys.stderr.write('\nError while trying to retrieve number of 
segments.\n\n' + str(e) + '\n\n')
+        sys.exit(1)
+    vals = cursor.fetchone()
+    return vals[0]
 
 def dumpSchema(connectionInfo, envOpts):
     dump_cmd = 'pg_dump -h {host} -p {port} -U {user} -s -x -O 
{database}'.format(**connectionInfo)
@@ -133,6 +142,7 @@ def main():
 
     version = getVersion(envOpts)
 
+
     timestamp = datetime.datetime.today()
 
     connectionInfo = {
@@ -142,9 +152,14 @@ def main():
     'database': db,
     'options': pgoptions
     }
+    num_segments = 0
+    with closing(pgdb.connect(**connectionInfo)) as connection:
+        with closing(connection.cursor()) as cursor:
+            num_segments = get_num_segments(cursor)
     sys.stdout.writelines(['\n-- Greenplum database Statistics Dump',
                            '\n-- Copyright (C) 2007 - 2014 Pivotal'
                            '\n-- Database: ' + db,
+                           '\n-- Num Segments: ' + str(num_segments),
                            '\n-- Date:     ' + timestamp.date().isoformat(),
                            '\n-- Time:     ' + timestamp.time().isoformat(),
                            '\n-- CmdLine:  ' + ' '.join(sys.argv),
@@ -168,6 +183,8 @@ def main():
                            '\n-- Allow system table modifications',
                            '\n-- ',
                            '\nset allow_system_table_mods=true;\n\n'])
+    # turn off optimizer when loading stats. Orca adds a bit of overhead, but 
it's significant when small insert queries take 1 vs .1ms
+    sys.stdout.writelines('set optimizer to off;\n\n')
     sys.stdout.flush()
 
     try:
diff --git a/gpMgmt/bin/minirepro b/gpMgmt/bin/minirepro
index 5342dcf175..cf3c28115e 100755
--- a/gpMgmt/bin/minirepro
+++ b/gpMgmt/bin/minirepro
@@ -97,11 +97,21 @@ def get_server_version(cursor):
     query = "select version()"
     try:
         cursor.execute(query)
-        vals = cursor.fetchone()
-        return vals[0]
     except pgdb.DatabaseError as e:
-        sys.stderr.write('\nError while trying to find HAWQ/GPDB version.\n\n' 
+ str(e) + '\n\n')
+        sys.stderr.write('\nError while trying to find GPDB version.\n\n' + 
str(e) + '\n\n')
         sys.exit(1)
+    vals = cursor.fetchone()
+    return vals[0]
+
+def get_num_segments(cursor):
+    query = "select count(*) from gp_segment_configuration where role='p' and 
content >=0;"
+    try:
+        cursor.execute(query)
+    except pgdb.DatabaseError as e:
+        sys.stderr.write('\nError while trying to retrieve number of 
segments.\n\n' + str(e) + '\n\n')
+        sys.exit(1)
+    vals = cursor.fetchone()
+    return vals[0]
 
 def parse_cmd_line():
     p = OptionParser(usage='Usage: %prog <database> [options]', version='%prog 
'+version, conflict_handler="resolve", epilog="WARNING: This tool collects 
statistics about your data, including most common values, which requires some 
data elements to be included in the output file. Please review output file to 
ensure it is within corporate policy to transport the output file.")
@@ -295,6 +305,8 @@ def main():
     # get server version, which is dumped to minirepro output file
     server_ver = get_server_version(cursor)
 
+    num_segments = get_num_segments(cursor)
+
     """
     invoke gp_toolkit UDF, dump object oids as json text
     input: query file name
@@ -329,6 +341,8 @@ def main():
 
     # make sure we connect with the right database
     f_out.writelines('\\connect ' + db + '\n\n')
+    # turn off optimizer when loading stats. Orca adds a bit of overhead, but 
it's significant when small insert queries take 1 vs .1ms
+    f_out.writelines('set optimizer to off;\n\n')
 
     # first create schema DDLs
     print("Writing schema DDLs ...")
@@ -362,7 +376,8 @@ def main():
     f_out.writelines(['\n-- ',
                        '\n-- Query text',
                        '\n-- \n\n'])
-
+    line = 'set optimizer_segments = ' + str(num_segments) + ';'
+    f_out.writelines('\n-- ' + line + '\n')
     with open(query_file, 'r') as query_f:
         for line in query_f:
             f_out.writelines('-- ' + line)
diff --git a/src/test/regress/expected/minirepro.out 
b/src/test/regress/expected/minirepro.out
index 79d67481f9..0b4a8e5a06 100644
--- a/src/test/regress/expected/minirepro.out
+++ b/src/test/regress/expected/minirepro.out
@@ -64,6 +64,7 @@ SET
 SET
 SET
 SET
+SET
 SET
  set_config 
 ------------
@@ -146,6 +147,7 @@ SET
 SET
 SET
 SET
+SET
 SET
  set_config 
 ------------
@@ -230,6 +232,7 @@ SET
 SET
 SET
 SET
+SET
 SET
  set_config 
 ------------
@@ -271,6 +274,7 @@ SET
 SET
 SET
 SET
+SET
 SET
  set_config 
 ------------
@@ -283,10 +287,10 @@ SET
 SET
 SET
 SET
-psql:data/minirepro.sql:46: ERROR:  only shared relations can be placed in 
pg_global tablespace
-psql:data/minirepro.sql:48: ERROR:  permission denied: "pg_tablespace" is a 
system catalog
-psql:data/minirepro.sql:56: ERROR:  permission denied: "pg_tablespace" is a 
system catalog
-psql:data/minirepro.sql:64: ERROR:  permission denied: "pg_tablespace" is a 
system catalog
+psql:data/minirepro.sql:48: ERROR:  only shared relations can be placed in 
pg_global tablespace
+psql:data/minirepro.sql:50: ERROR:  permission denied: "pg_tablespace" is a 
system catalog
+psql:data/minirepro.sql:58: ERROR:  permission denied: "pg_tablespace" is a 
system catalog
+psql:data/minirepro.sql:66: ERROR:  permission denied: "pg_tablespace" is a 
system catalog
 SET
 UPDATE 1
 DELETE 1


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to