FlatBuffers changes

michaelsmith Thu, 15 Aug 2024 17:16:59 -0700

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit db0f0dadf19e8a18bdf8e7c1788e1fe2af9cb675
Author: stiga-huang <[email protected]>
AuthorDate: Tue Aug 6 12:42:19 2024 +0800

    IMPALA-13240: Add gerrit comments for Thrift/FlatBuffers changes
    
    Adds gerrit comments for changes in Thrift/FlatBuffers files that could
    break the communication between impalad and catalogd/statestore during
    upgrade.
    
    Basically, only new optional fields can be added in Thrift protocol. For
    Flatbuffers schemas, we should only add new fields at the end of a table
    definition.
    
    Adds a new option (--revision) for critique-gerrit-review.py to specify
    the revision (HEAD or a commit, branch, etc). Also adds an option
    (--base-revision) to specify the base revision for comparison.
    
    To test the script locally, prepare a virtual env with the virtualenv
    package installed:
      virtualenv venv
      source venv/bin/activate
      pip install virtualenv
    Then run the script with --dryrun:
      python bin/jenkins/critique-gerrit-review.py --dryrun --revision effc9df93
    
    Limitations
     - False positive in cases that add new Thrift structs with required
       fields and only use those new structs in new optional fields, e.g.
       effc9df93 and 72732da9d.
     - Might have false positive results on reformat changes due to simple
       string checks, e.g. 91d8a8f62.
     - Can't check incompatible changes in FlatBuffers files. Just add
       general file level comments.
    
    We can integrate DUPCheck in the future to parse the Thrift/FlatBuffers
    files to AST and compare the AST instead.
    https://github.com/jwjwyoung/DUPChecker
    
    Tests:
     - Verified incompatible commits like 012996a06 and 65094a74f.
     - Verified posting Gerrit comments from local env using my username.
    
    Change-Id: Ib35fafa50bfd38631312d22464df14d426f55346
    Reviewed-on: http://gerrit.cloudera.org:8080/21646
    Reviewed-by: Riza Suminto <[email protected]>
    Tested-by: Quanlong Huang <[email protected]>
---
 bin/jenkins/critique-gerrit-review.py | 235 ++++++++++++++++++++++++++++++----
 common/thrift/Data.thrift             |   2 +
 2 files changed, 211 insertions(+), 26 deletions(-)

diff --git a/bin/jenkins/critique-gerrit-review.py 
b/bin/jenkins/critique-gerrit-review.py
index c165017c4..d3ce54ec3 100755
--- a/bin/jenkins/critique-gerrit-review.py
+++ b/bin/jenkins/critique-gerrit-review.py
@@ -45,7 +45,6 @@ from os import environ
 import os.path
 import re
 from subprocess import check_call, check_output, Popen, PIPE
-import sys
 import virtualenv
 
 FLAKE8_VERSION = "3.9.2"
@@ -60,7 +59,7 @@ FLAKE8_DIFF_PATH = os.path.join(VENV_BIN, "flake8-diff")
 LINE_LIMIT = 90
 
 # Source file extensions that we should apply our line limit and whitespace 
rules to.
-SOURCE_EXTENSIONS = set([".cc", ".h", ".java", ".py", ".sh", ".thrift"])
+SOURCE_EXTENSIONS = {".cc", ".h", ".java", ".py", ".sh", ".thrift"}
 
 # Source file patterns that we exclude from our checks.
 EXCLUDE_FILE_PATTERNS = [
@@ -74,6 +73,49 @@ EXCLUDE_FILE_PATTERNS = [
     re.compile(r".*/.*\.xml\.py")  # Long lines in config template files.
 ]
 
+# Thrift files that are not used in communication between impalad and 
catalogd/statestore
+EXCLUDE_THRIFT_FILES = {
+  "BackendGflags.thrift",    # Only used between FE and BE
+  "beeswax.thrift",          # Only used between client and impalad
+  "DataSinks.thrift",        # Only used in impalads
+  "Descriptors.thrift",      # Only used in impalads
+  "ExecStats.thrift",        # Only used in impalads
+  "LineageGraph.thrift",     # Only used in impalads
+  "NetworkTest.thrift",      # Unused in production
+  "Planner.thrift",          # Only used in impalads
+  "PlanNodes.thrift",        # Only used in impalads
+  "parquet.thrift",          # Only used in impalads
+  "ResourceProfile.thrift",  # Only used in impalads
+  "SystemTables.thrift",     # Only used in impalads
+  "Zip.thrift",              # Unused
+}
+
+THRIFT_FILE_COMMENT = (
+    "This file is used in communication between impalad and 
catalogd/statestore. "
+    "Please make sure impalads can still work with new/old versions of 
catalogd and "
+    "statestore. Basically only new optional fields can be added.")
+FBS_FILE_COMMENT = (
+    "This file is used in communication between impalad and 
catalogd/statestore. "
+    "Please make sure impalads can still work with new/old versions of 
catalogd and "
+    "statestore. Basically only new fields can be added and should be added at 
the end "
+    "of a table definition.\n"
+    "https://flatbuffers.dev/flatbuffers_guide_writing_schema.html";)
+COMMENT_REVISION_SIDE = "REVISION"
+COMMENT_PARENT_SIDE = "PARENT"
+
+# Matches range information like:
+#  @@ -229,0 +230,2 @@ struct TColumnStats {
+RANGE_RE = re.compile(r"^@@ -([0-9]*).* \+([0-9]*).*@@\s*(.*)$")
+# Matches required/optional fields like:
+#  7: required i64 num_trues
+REQUIRED_FIELD_RE = re.compile(r"[0-9]*: required \w* \w*")
+OPTIONAL_FIELD_RE = re.compile(r"[0-9]*: optional \w* \w*")
+# Matches include statements like:
+#  include "Exprs.thrift"
+INCLUDE_FILE_RE = re.compile(r"include \"(\w+\.thrift)\"")
+THRIFT_WARNING_SUFFIX = (" might break the compatibility between impalad and "
+                         "catalogd/statestore during upgrade")
+
 
 def setup_virtualenv():
   """Set up virtualenv with flake8-diff."""
@@ -83,7 +125,15 @@ def setup_virtualenv():
               "flake8-diff=={0}".format(FLAKE8_DIFF_VERSION)])
 
 
-def get_flake8_comments(revision):
+def get_comment_input(message, line_number=0, side=COMMENT_REVISION_SIDE,
+                      context_line="", dryrun=False):
+  comment = {"message": message, "line": line_number, "side": side}
+  if dryrun:
+    comment["context_line"] = context_line
+  return comment
+
+
+def get_flake8_comments(base_revision, revision):
   """Get flake8 warnings for code changes made in the git commit 'revision'.
   Returns a dict with file path as keys and a list of CommentInput objects. See
   
https://gerrit-review.googlesource.com/Documentation/rest-api-changes.html#review-input
@@ -93,7 +143,6 @@ def get_flake8_comments(revision):
   flake8_env = os.environ.copy()
   flake8_env["PATH"] = "{0}:{1}".format(VENV_BIN, flake8_env["PATH"])
 
-  base_revision = "{0}^".format(revision)
   flake8_diff_proc = Popen(
       [FLAKE8_DIFF_PATH, "--standard-flake8-output", "--color", "off", 
base_revision,
        revision],
@@ -134,7 +183,7 @@ def get_flake8_comments(revision):
   return comments
 
 
-def get_misc_comments(revision):
+def get_misc_comments(base_revision, revision, dryrun=False):
   """Get miscellaneous warnings for code changes made in the git commit 
'revision', e.g.
   long lines and trailing whitespace. These warnings are produced by directly 
parsing the
   diff output."""
@@ -143,7 +192,7 @@ def get_misc_comments(revision):
   #  @@ -128 +133,2 @@ if __name__ == "__main__":
   RANGE_RE = re.compile(r"^@@ -[0-9,]* \+([0-9]*).*$")
 
-  diff = check_output(["git", "diff", "-U0", "{0}^..{0}".format(revision)],
+  diff = check_output(["git", "diff", "-U0", "{0}..{1}".format(base_revision, 
revision)],
       universal_newlines=True)
   curr_file = None
   check_source_file = False
@@ -168,35 +217,154 @@ def get_misc_comments(revision):
       curr_line_num = int(match.group(1))
     elif diff_line.startswith("+") and check_source_file:
       # An added or modified line - check it to see if we should generate 
warnings.
-      add_misc_comments_for_line(comments, diff_line[1:], curr_file, 
curr_line_num)
+      add_misc_comments_for_line(comments, diff_line[1:], curr_file, 
curr_line_num,
+                                 dryrun)
       curr_line_num += 1
   return comments
 
 
-def add_misc_comments_for_line(comments, line, curr_file, curr_line_num):
+def add_misc_comments_for_line(comments, line, curr_file, curr_line_num, 
dryrun=False):
   """Helper for get_misc_comments to generate comments for 'line' at 
'curr_line_num' in
   'curr_file' and append them to 'comments'."""
   # Check for trailing whitespace.
   if line.rstrip() != line:
-    comments[curr_file].append(
-        {"message": "line has trailing whitespace", "line": curr_line_num})
+    comments[curr_file].append(get_comment_input(
+        "line has trailing whitespace", curr_line_num, context_line=line, 
dryrun=dryrun))
 
   # Check for long lines. Skip .py files since flake8 already flags long lines.
   if len(line) > LINE_LIMIT and os.path.splitext(curr_file)[1] != ".py":
     msg = "line too long ({0} > {1})".format(len(line), LINE_LIMIT)
-    comments[curr_file].append(
-        {"message": msg, "line": curr_line_num})
+    comments[curr_file].append(get_comment_input(
+        msg, curr_line_num, context_line=line, dryrun=dryrun))
 
   if '\t' in line:
-    comments[curr_file].append(
-        {"message": "tab used for whitespace", "line": curr_line_num})
+    comments[curr_file].append(get_comment_input(
+        "tab used for whitespace", curr_line_num, context_line=line, 
dryrun=dryrun))
+
+  if 'ThriftDebugString' in line and curr_file.startswith("be/src/"):
+    comments[curr_file].append(get_comment_input(
+        "Please make sure you don't output sensitive data with 
ThriftDebugString(). "
+        "If so, use impala::RedactedDebugString() instead.",
+        curr_line_num, context_line=line, dryrun=dryrun))
+
+
+def get_catalog_compatibility_comments(base_revision, revision, dryrun=False):
+  """Get comments on Thrift/FlatBuffers changes that might break the 
communication
+  between impalad and catalogd/statestore"""
+  comments = defaultdict(lambda: [])
+
+  diff = check_output(
+      ["git", "diff", "-U0", "{0}..{1}".format(base_revision, revision),
+       "--", "common/thrift"],
+      universal_newlines=True)
+  curr_file = None
+  check_source_file = False
+  has_concerns = False
+  in_enum_clause = False
+  is_thrift_file = False
+  # Line numbers in the old file and in the new file
+  old_line_num = 0
+  new_line_num = 0
+  for diff_line in diff.splitlines():
+    if diff_line.startswith("--- "):
+      continue
+    elif diff_line.startswith("+++ "):
+      # Start of diff for a file. Add a comment for the previous file if has 
concerns.
+      if curr_file and check_source_file and has_concerns:
+        comments[curr_file].append(get_comment_input(THRIFT_FILE_COMMENT))
+      # Strip off "+++ b/" to get the file path.
+      curr_file = diff_line[6:]
+      check_source_file = False
+      has_concerns = False
+      in_enum_clause = False
+      is_thrift_file = os.path.splitext(curr_file)[1] == ".thrift"
+      if is_thrift_file and os.path.basename(curr_file) not in 
EXCLUDE_THRIFT_FILES:
+        check_source_file = True
+    elif check_source_file and diff_line.startswith("@@ "):
+      # Figure out the starting line of the hunk. Examples:
+      #  @@ -932,0 +933,5 @@ enum TImpalaQueryOptions {
+      #  @@ -55,0 +56 @@ enum TPlanNodeType {
+      #  @@ -109 +109 @@ struct THdfsTableSink {
+      # We want to extract the start line for the added lines
+      match = RANGE_RE.match(diff_line)
+      if not match:
+        raise Exception("Pattern did not match diff 
line:\n{0}".format(diff_line))
+      old_line_num = int(match.group(1))
+      new_line_num = int(match.group(2))
+      in_enum_clause = match.group(3).startswith("enum ")
+    elif check_source_file and diff_line.startswith("-"):
+      # Check if deleting/modifying a required field
+      change = diff_line[1:].strip()
+      if in_enum_clause and not change.startswith("//"):
+        has_concerns = True
+        comments[curr_file].append(get_comment_input(
+          "Modifying/deleting this enum item" + THRIFT_WARNING_SUFFIX,
+          old_line_num, COMMENT_PARENT_SIDE, diff_line, dryrun))
+      elif REQUIRED_FIELD_RE.match(change):
+        has_concerns = True
+        comments[curr_file].append(get_comment_input(
+          "Modifying/deleting this required field" + THRIFT_WARNING_SUFFIX,
+          old_line_num, COMMENT_PARENT_SIDE, diff_line, dryrun))
+      elif OPTIONAL_FIELD_RE.match(change):
+        # Removing an optional field should be OK unless the field number is 
reused by
+        # a new optional field. Add a general comment on the file.
+        has_concerns = True
+      old_line_num += 1
+    elif is_thrift_file and diff_line.startswith("+"):
+      change = diff_line[1:].strip()
+      # Check includes in Thrift files
+      match = INCLUDE_FILE_RE.match(change)
+      # Case 1: a target file includes a whitelist file, E.g. Frontend.thrift 
includes
+      # LineageGraph.thrift. Future changes in LineageGraph.thrift might impact
+      # Frontend.thrift so
+      #  - LineageGraph.thrift should be removed from the whitelist (i.e.
+      #    EXCLUDE_THRIFT_FILES) if it will be used in impalad and catalogd.
+      #  - Or developers should make sure the included new fields are 
read/write only
+      #    in impalads or only in catalogd.
+      if match and check_source_file and match.group(1) in 
EXCLUDE_THRIFT_FILES:
+        comments[curr_file].append(get_comment_input(
+            "Future changes in {0} might break the compatibility between 
impalad and "
+            "catalogd/statestore. Please remove {0} from EXCLUDE_THRIFT_FILES 
in "
+            "bin/jenkins/critique-gerrit-review.py or make sure the new fields 
are "
+            "read/write only in impalads or only in 
catalogd".format(match.group(1)),
+            new_line_num, context_line=diff_line, dryrun=dryrun))
+      # Case 2: A whitelist file includes a target file, e.g. PlanNodes.thrift 
includes
+      # Data.thrift. Note that PlanNodes.thrift is supposed to be used inside 
impalad.
+      # Data.thrift is used in both impalad and catalogd. We should ensure new 
fields
+      # included from Data.thrift are not set from catalogd.
+      elif (match and not check_source_file
+          and match.group(1) not in EXCLUDE_THRIFT_FILES):
+        comments[curr_file].append(get_comment_input(
+            "Thrift objects in the current file are supposed to be used inside 
"
+            "impalads. Please make sure new fields includes from {} are not 
set by "
+            "catalogd.".format(match.group(1)),
+            new_line_num, context_line=diff_line, dryrun=dryrun))
+      elif check_source_file:
+        if REQUIRED_FIELD_RE.match(change):
+          has_concerns = True
+          comments[curr_file].append(get_comment_input(
+              "Modifying/adding this required field" + THRIFT_WARNING_SUFFIX,
+              new_line_num, context_line=diff_line, dryrun=dryrun))
+      new_line_num += 1
+  if curr_file and check_source_file and has_concerns:
+    comments[curr_file].append(get_comment_input(THRIFT_FILE_COMMENT))
+  merge_comments(
+      comments, get_flatbuffers_compatibility_comments(base_revision, 
revision))
+  return comments
+
+
+def get_flatbuffers_compatibility_comments(base_revision, revision):
+  comments = defaultdict(lambda: [])
+  diff = check_output(
+      ["git", "diff", "--numstat", "{0}..{1}".format(base_revision, revision),
+       "--", "common/fbs"],
+      universal_newlines=True)
+  for diff_line in diff.splitlines():
+    _, _, path = diff_line.split()
+    if os.path.splitext(path)[1] == ".fbs":
+      comments[path].append(get_comment_input(FBS_FILE_COMMENT))
+  return comments
 
-  if 'ThriftDebugString' in line:
-    comments[curr_file].append(
-        {"message": ("Please make sure you don't output sensitive data with "
-                     "ThriftDebugString(). If so, use 
impala::RedactedDebugString() "
-                     "instead."),
-         "line": curr_line_num })
 
 def post_review_to_gerrit(review_input):
   """Post a review to the gerrit patchset. 'review_input' is a ReviewInput 
JSON object
@@ -221,16 +389,31 @@ def merge_comments(a, b):
 if __name__ == "__main__":
   parser = ArgumentParser(description="Generate and post gerrit comments")
   parser.add_argument("--dryrun", action='store_true',
-                      help="Don't post comments back to gerrit")
+                      help="Don't post comments back to gerrit. Also shows the 
context "
+                           "lines if possible.")
+  parser.add_argument("--revision", default="HEAD",
+                      help="The revision to check. Defaults to HEAD. Note that 
"
+                           "flake8-diff only actually works correctly on HEAD. 
So "
+                           "specifying other commits might miss the results of 
"
+                           "flake8-diff.")
+  parser.add_argument("--base-revision",
+                      help="The base revision to check. Defaults to the parent 
commit of"
+                           " revision")
   args = parser.parse_args()
 
   setup_virtualenv()
-  # flake8-diff only actually works correctly on HEAD, so this is the only 
revision
-  # we can correctly handle.
-  revision = 'HEAD'
-  comments = get_flake8_comments(revision)
-  merge_comments(comments, get_misc_comments(revision))
+  revision = args.revision
+  base_revision = args.base_revision if args.base_revision else 
"{0}^".format(revision)
+  comments = get_flake8_comments(base_revision, revision)
+  merge_comments(comments, get_misc_comments(base_revision, revision, 
args.dryrun))
+  merge_comments(
+    comments, get_catalog_compatibility_comments(base_revision, revision, 
args.dryrun))
   review_input = {"comments": comments}
+  if len(comments) > 0:
+    review_input["message"] = (
+        "gerrit-auto-critic failed. You can reproduce it locally using 
command:\n\n"
+        "  python2 bin/jenkins/critique-gerrit-review.py --dryrun\n\n"
+        "To run it, you might need a virtual env with virtualenv installed.")
   print(json.dumps(review_input, indent=True))
   if not args.dryrun:
     post_review_to_gerrit(review_input)
diff --git a/common/thrift/Data.thrift b/common/thrift/Data.thrift
index f4a5450a1..b68657f3e 100644
--- a/common/thrift/Data.thrift
+++ b/common/thrift/Data.thrift
@@ -33,6 +33,8 @@ struct TColumnValue {
  11: optional i32 date_val
 }
 
+// A result row returned to the client. Used in both impalad and catalogd.
+// Note that catalogd sets the summary of DDL/DMLs.
 struct TResultRow {
   1: list<TColumnValue> colVals
 }

(impala) 01/02: IMPALA-13240: Add gerrit comments for Thrift/FlatBuffers changes

Reply via email to