steveloughran commented on code in PR #15384:
URL: https://github.com/apache/iceberg/pull/15384#discussion_r3082149225


##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";
+
   private static final Pattern RFC9535_REQUIRES_ESCAPE =
       Pattern.compile(
           
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");
 
+  /**
+   * Matches one bracket segment {@code ['...']} where inner text may contain 
RFC 9535 escapes
+   * (quote, backslash, control characters, and four-digit hex escapes).
+   */
+  private static final Pattern BRACKET_SEGMENT = 
Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");
+
   private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = 
buildReplacementMap();
 
-  private static final Splitter DOT = Splitter.on(".");
   private static final String ROOT = "$";
 
-  static List<String> parse(String path) {
+  /**
+   * Parses a path into segments. After the root {@code $}, each segment is 
either dot shorthand
+   * ({@code .name} per RFC 9535), a single-quoted bracket name ({@code 
['...']}) with RFC 9535
+   * escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. 
{@code $.a['b.c']},
+   * {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive 
descent are not
+   * supported.
+   *
+   * <p>The root path {@code $} yields an empty segment list.
+   */
+  static List<PathSegment> parse(String path) {
     Preconditions.checkArgument(path != null, "Invalid path: null");
+    Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
+    Preconditions.checkArgument(
+        path.startsWith(ROOT), "Invalid path, does not start with %s: %s", 
ROOT, path);
+
+    if (path.equals(ROOT)) {
+      return Lists.newArrayList();
+    }
+
+    return parseAfterRoot(path);
+  }
+
+  /** Normalizes object field names only (no array indices). */
+  public static String toNormalizedPath(Iterable<String> fields) {
+    return toNormalizedPath(
+        
Streams.stream(fields).map(PathSegment.Name::new).collect(Collectors.toList()));
+  }
+
+  static String toNormalizedPath(List<PathSegment> segments) {
+    StringBuilder builder = new StringBuilder(ROOT);
+    for (PathSegment segment : segments) {
+      if (segment instanceof PathSegment.Name) {
+        String name = ((PathSegment.Name) segment).name();
+        builder.append("['").append(rfc9535escape(name)).append("']");
+      } else if (segment instanceof PathSegment.Index) {
+        int index = ((PathSegment.Index) segment).index();
+        Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index: %s", index);
+        builder.append('[').append(index).append(']');
+      } else {
+        throw new IllegalStateException("Unknown segment: " + segment);
+      }
+    }
+    return builder.toString();
+  }
+
+  @SuppressWarnings("StatementSwitchToExpressionSwitch")
+  private static List<PathSegment> parseAfterRoot(String path) {
+    List<PathSegment> segments = Lists.newArrayList();
+    Matcher bracketMatcher = BRACKET_SEGMENT.matcher(path);
+    int len = path.length();
+    int pos = ROOT.length();
+
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      switch (ch) {
+        case '.':
+          pos = appendDotSegment(path, pos, segments);
+          break;
+
+        case '[':
+          pos = appendBracketOrIndexSegment(path, pos, segments, 
bracketMatcher);
+          break;
+
+        default:
+          throw new IllegalArgumentException(
+              String.format("Invalid path, expected '.' or '[' at position %s: 
%s", pos, path));
+      }
+    }
+
+    return segments;
+  }
+
+  /** Consumes from {@code path[dotPos]} a leading {@code .} and RFC 9535 
shorthand member name. */
+  private static int appendDotSegment(String path, int dotPos, 
List<PathSegment> segments) {
+    int pos = dotPos + 1;
+    int len = path.length();
+    Preconditions.checkArgument(pos < len, "Invalid path, trailing dot: %s", 
path);
+    int start = pos;
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      if (ch == '.' || ch == '[') {
+        break;
+      }
+      pos += 1;
+    }
+
+    Preconditions.checkArgument(pos > start, "Invalid path, empty segment 
after '.': %s", path);
+    String name = path.substring(start, pos);
     Preconditions.checkArgument(
-        !path.contains("[") && !path.contains("]"), "Unsupported path, 
contains bracket: %s", path);
+        RFC9535_MEMBER_NAME_SHORTHAND.test(name),
+        "Invalid path: %s (%s has invalid characters)",
+        path,
+        name);
+    segments.add(new PathSegment.Name(name));
+    return pos;
+  }
+
+  /**
+   * Consumes either a numeric array index {@code [n]} or a quoted name {@code 
['...']} starting at
+   * {@code path[bracketPos]}.
+   */
+  private static int appendBracketOrIndexSegment(
+      String path, int bracketPos, List<PathSegment> segments, Matcher 
bracketMatcher) {
     Preconditions.checkArgument(
-        !path.contains("*"), "Unsupported path, contains wildcard: %s", path);
+        bracketPos < path.length() && path.charAt(bracketPos) == '[', "Invalid 
path: %s", path);
+    if (bracketPos + 1 < path.length() && isAsciiDigit(path.charAt(bracketPos 
+ 1))) {
+      return appendArrayIndexSegment(path, bracketPos, segments);
+    }
+    return appendQuotedBracketSegment(path, bracketPos, segments, 
bracketMatcher);
+  }
+
+  private static boolean isAsciiDigit(char ch) {
+    return ch >= '0' && ch <= '9';
+  }
+
+  private static int appendArrayIndexSegment(
+      String path, int bracketPos, List<PathSegment> segments) {
+    int pos = bracketPos + 1;
+    int len = path.length();
+    int start = pos;
+    while (pos < len && isAsciiDigit(path.charAt(pos))) {
+      pos += 1;
+    }
+    Preconditions.checkArgument(pos > start, "Invalid path, empty array index 
in: %s", path);
     Preconditions.checkArgument(
-        !path.contains(".."), "Unsupported path, contains recursive descent: 
%s", path);
+        pos < len && path.charAt(pos) == ']', "Invalid path, unclosed array 
index in: %s", path);
+    int index = Integer.parseInt(path.substring(start, pos));
+    Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index in: %s", path);
+    segments.add(new PathSegment.Index(index));
+    return pos + 1;
+  }
 
-    List<String> parts = DOT.splitToList(path);
+  private static int appendQuotedBracketSegment(

Review Comment:
   for these and the others I found it a bit hard to follow. Could the javadocs 
explain what segments they expect. And maye move that List<PathSegment> 
segments to be at the start of every method as it's what gets updated



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to