qlong commented on code in PR #16714:
URL: https://github.com/apache/iceberg/pull/16714#discussion_r3405788612


##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,332 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   *
+   * <p>This is a copy of the canonical implementation from <a
+   * href="https://github.com/apache/iceberg/pull/15384";>PR #15384</a>. Once 
that PR merges, this
+   * class will be the single definition and this note can be removed.
+   */
+  public sealed interface PathSegment permits PathSegment.Name, 
PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";
+
   private static final Pattern RFC9535_REQUIRES_ESCAPE =
       Pattern.compile(
           
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");
 
+  /**
+   * Matches one bracket segment {@code ['...']} where inner text may contain 
RFC 9535 escapes
+   * (quote, backslash, control characters, and four-digit hex escapes).
+   */
+  private static final Pattern BRACKET_SEGMENT = 
Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");
+
   private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = 
buildReplacementMap();
 
-  private static final Splitter DOT = Splitter.on(".");
   private static final String ROOT = "$";
 
-  static List<String> parse(String path) {
+  /**
+   * Parses a path into segments. After the root {@code $}, each segment is 
either dot shorthand
+   * ({@code .name} per RFC 9535), a single-quoted bracket name ({@code 
['...']}) with RFC 9535
+   * escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. 
{@code $.a['b.c']},
+   * {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive 
descent are not
+   * supported.
+   *
+   * <p>The root path {@code $} yields an empty segment list.
+   */
+  public static List<PathSegment> parse(String path) {
     Preconditions.checkArgument(path != null, "Invalid path: null");
+    Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
+    Preconditions.checkArgument(
+        path.startsWith(ROOT), "Invalid path, does not start with %s: %s", 
ROOT, path);
+
+    if (path.equals(ROOT)) {
+      return Lists.newArrayList();
+    }
+
+    return parseAfterRoot(path);
+  }
+
+  /**
+   * Parses a JSON path into navigation steps for shredded variant extraction. 
Object field steps
+   * are plain strings; array index steps use {@code "[n]"} encoding (see 
{@link
+   * #isArrayIndexPart}).
+   */
+  public static List<String> parseObjectPath(String path) {
+    List<String> parts = Lists.newArrayList();
+    for (PathSegment segment : parse(path)) {
+      if (segment instanceof PathSegment.Name) {
+        parts.add(((PathSegment.Name) segment).name());
+      } else if (segment instanceof PathSegment.Index) {
+        parts.add("[" + ((PathSegment.Index) segment).index() + "]");

Review Comment:
   Good call out. I removed parseObjectPath, and pass List<PathSegment> through 
the Parquet extraction readers. Added tests



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to