Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,13 @@ public class BoundExtract<T> implements BoundTerm<T> {
private final String path;
private final Type type;

/**
* @param path normalized extract path, as from {@link UnboundExtract#path()}; already validated
* and canonical in {@link UnboundExtract#bind}.
*/
BoundExtract(BoundReference<?> ref, String path, Type type) {
this.ref = ref;
this.path = PathUtil.toNormalizedPath(PathUtil.parse(path));
this.path = path;
this.type = type;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,24 @@ public static String describe(Term term) {
return ((NamedReference<?>) term).name();
} else if (term instanceof BoundReference) {
return ((BoundReference<?>) term).name();
} else if (term instanceof UnboundExtract) {
UnboundExtract<?> unboundExtract = (UnboundExtract<?>) term;
return "extract("
+ describe(unboundExtract.ref())
+ ", "
+ unboundExtract.path()
+ ", "
+ unboundExtract.type()
+ ")";
} else if (term instanceof BoundExtract) {
BoundExtract<?> boundExtract = (BoundExtract<?>) term;
return "extract("
+ describe(boundExtract.ref())
+ ", "
+ boundExtract.path()
+ ", "
+ boundExtract.type()
+ ")";
} else {
throw new UnsupportedOperationException("Unsupported term: " + term);
}
Expand All @@ -254,6 +272,9 @@ public static <T> UnboundTerm<T> unbind(BoundTerm<T> term) {
return Expressions.transform(bound.ref().name(), bound.transform());
} else if (term instanceof BoundReference) {
return Expressions.ref(((BoundReference<T>) term).name());
} else if (term instanceof BoundExtract) {
BoundExtract<T> bound = (BoundExtract<T>) term;
return Expressions.extract(bound.ref().name(), bound.path(), bound.type().toString());
}

throw new UnsupportedOperationException("Cannot unbind unsupported term: " + term);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ public static <T> UnboundTerm<T> truncate(String name, int width) {
return new UnboundTransform<>(ref(name), Transforms.truncate(width));
}

/**
* Extract a field from a variant column. {@code path} is a small RFC 9535-style JSONPath: root
* {@code $}, then steps are {@code .name}, {@code ['name']} (RFC 9535 escapes inside quotes), or
* {@code [n]} for a zero-based array index. You can mix these ({@code $.a['b.c']}, {@code
* $.items[0].tags[1]}). {@link UnboundExtract#path()} returns the normalized string.
*/
public static <T> UnboundTerm<T> extract(String name, String path, String type) {
return new UnboundExtract<>(ref(name), path, type);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import static org.apache.iceberg.expressions.Expressions.rewriteNot;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Collection;
import java.util.Comparator;
import java.util.Map;
Expand Down Expand Up @@ -631,7 +632,15 @@ private boolean isNonNullPreserving(Bound<?> term) {
}
}

/**
* Build a variant from the buffer, regardless of the ordering of the incoming buffer.
*
* @param buffer source data
* @return variant instance
*/
private static VariantObject parseBounds(ByteBuffer buffer) {
return Variant.from(buffer).value().asObject();
// Explicitly use little-endian encoding for reading buffer
ByteBuffer littleEndian = buffer.duplicate().order(ByteOrder.LITTLE_ENDIAN);
return Variant.from(littleEndian).value().asObject();
}
}
273 changes: 250 additions & 23 deletions api/src/main/java/org/apache/iceberg/expressions/PathUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,60 +27,287 @@
import java.util.stream.Collectors;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Streams;

public class PathUtil {
private PathUtil() {}

/**
* One step in a variant JSONPath: an object member name or a zero-based array index (RFC 9535
* {@code [n]} selector).
*/
sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
record Name(String name) implements PathSegment {}

record Index(int index) implements PathSegment {}
}

private static final String RFC9535_NAME_FIRST =
"[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
private static final String RFC9535_NAME_CHARS =
"[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
Pattern.compile(RFC9535_NAME_FIRST + RFC9535_NAME_CHARS).asMatchPredicate();

/** Letters that follow {@code \} for control-character escapes in RFC 9535 quoted segments. */
private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";

private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";

private static final Pattern RFC9535_REQUIRES_ESCAPE =
Pattern.compile(
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");

/**
* Matches one bracket segment {@code ['...']} where inner text may contain RFC 9535 escapes
* (quote, backslash, control characters, and four-digit hex escapes).
*/
private static final Pattern BRACKET_SEGMENT = Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");

private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = buildReplacementMap();

private static final Splitter DOT = Splitter.on(".");
private static final String ROOT = "$";

static List<String> parse(String path) {
/**
* Parses a path into segments. After the root {@code $}, each segment is either dot shorthand
* ({@code .name} per RFC 9535), a single-quoted bracket name ({@code ['...']}) with RFC 9535
* escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. {@code $.a['b.c']},
* {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive descent are not
* supported.
*
* <p>The root path {@code $} yields an empty segment list.
*/
static List<PathSegment> parse(String path) {
Preconditions.checkArgument(path != null, "Invalid path: null");
Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
Preconditions.checkArgument(
path.startsWith(ROOT), "Invalid path, does not start with %s: %s", ROOT, path);

if (path.equals(ROOT)) {
return Lists.newArrayList();
}

return parseAfterRoot(path);
}

/** Normalizes object field names only (no array indices). */
public static String toNormalizedPath(Iterable<String> fields) {
return toNormalizedPath(
Streams.stream(fields).map(PathSegment.Name::new).collect(Collectors.toList()));
}

static String toNormalizedPath(List<PathSegment> segments) {
StringBuilder builder = new StringBuilder(ROOT);
for (PathSegment segment : segments) {
if (segment instanceof PathSegment.Name) {
String name = ((PathSegment.Name) segment).name();
builder.append("['").append(rfc9535escape(name)).append("']");
} else if (segment instanceof PathSegment.Index) {
int index = ((PathSegment.Index) segment).index();
Preconditions.checkArgument(index >= 0, "Invalid path, negative array index: %s", index);
builder.append('[').append(index).append(']');
} else {
throw new IllegalStateException("Unknown segment: " + segment);
}
}
return builder.toString();
}

private static List<PathSegment> parseAfterRoot(String path) {
List<PathSegment> segments = Lists.newArrayList();
Matcher bracketMatcher = BRACKET_SEGMENT.matcher(path);
int len = path.length();
int pos = ROOT.length();

while (pos < len) {
char ch = path.charAt(pos);
pos =
switch (ch) {
case '.' -> appendDotSegment(segments, path, pos);
case '[' -> appendBracketOrIndexSegment(segments, path, pos, bracketMatcher);
default ->
throw new IllegalArgumentException(
String.format(
"Invalid path, expected '.' or '[' at position %s: %s", pos, path));
};
}

return segments;
}

/**
* Appends a dot-style segment to {@code segments} by reading from {@code path[dotPos]}: a single
* leading {@code .} then an RFC 9535 shorthand name until the next {@code .} or {@code [}.
*
* @param segments output; segments parsed so far, updated in place
* @param path full path
* @param dotPos index of the {@code .} starting the segment
*/
private static int appendDotSegment(List<PathSegment> segments, String path, int dotPos) {
int pos = dotPos + 1;
int pathLen = path.length();
Preconditions.checkArgument(pos < pathLen, "Invalid path, trailing dot: %s", path);
int start = pos;
while (pos < pathLen) {
char ch = path.charAt(pos);
if (ch == '.' || ch == '[') {
break;
}
pos++;
}

Preconditions.checkArgument(pos > start, "Invalid path, empty segment after '.': %s", path);
String name = path.substring(start, pos);
Preconditions.checkArgument(
!path.contains("[") && !path.contains("]"), "Unsupported path, contains bracket: %s", path);
RFC9535_MEMBER_NAME_SHORTHAND.test(name),
"Invalid path: %s (%s has invalid characters)",
path,
name);
segments.add(new PathSegment.Name(name));
return pos;
}

/**
* Appends a bracket segment to {@code segments} starting at {@code path[bracketPos]}. If the next
* character is a digit, consumes a numeric array index {@code [n]}; otherwise consumes a quoted
* name {@code ['...']}. A lone {@code [} with no following quoted form (e.g. the path ends at
* {@code $[}) is rejected in {@link #appendQuotedBracketSegment} when the pattern does not match.
*
* @param segments output; segments parsed so far, updated in place
* @param path full path
* @param bracketPos index of the opening {@code [}
*/
private static int appendBracketOrIndexSegment(
List<PathSegment> segments, String path, int bracketPos, Matcher bracketMatcher) {
Preconditions.checkArgument(
!path.contains("*"), "Unsupported path, contains wildcard: %s", path);
bracketPos < path.length() && path.charAt(bracketPos) == '[', "Invalid path: %s", path);
if (bracketPos + 1 < path.length() && isAsciiDigit(path.charAt(bracketPos + 1))) {
return appendArrayIndexSegment(segments, path, bracketPos);
}
return appendQuotedBracketSegment(segments, path, bracketPos, bracketMatcher);
}

private static boolean isAsciiDigit(char ch) {
return ch >= '0' && ch <= '9';
}

/**
* Appends a non-negative array index from {@code [n]} to {@code segments}, starting with {@code
* [} at {@code path[bracketPos]}.
*
* @param segments output; segments parsed so far, updated in place
* @param path full path
* @param bracketPos index of the opening {@code [} before the digits
*/
private static int appendArrayIndexSegment(
List<PathSegment> segments, String path, int bracketPos) {
int pos = bracketPos + 1;
int len = path.length();
int start = pos;
while (pos < len && isAsciiDigit(path.charAt(pos))) {
pos++;
}
Preconditions.checkArgument(pos > start, "Invalid path, empty array index in: %s", path);
Preconditions.checkArgument(
!path.contains(".."), "Unsupported path, contains recursive descent: %s", path);
pos < len && path.charAt(pos) == ']', "Invalid path, unclosed array index in: %s", path);
int index;
String digits = path.substring(start, pos);
try {
index = Integer.parseInt(digits);
} catch (NumberFormatException e) {
throw new IllegalArgumentException(
String.format("Invalid path, array index out of int range: %s", path), e);
}
Preconditions.checkArgument(index >= 0, "Invalid path, negative array index in: %s", path);
segments.add(new PathSegment.Index(index));
return pos + 1;
}

List<String> parts = DOT.splitToList(path);
/**
* Appends a name from a {@code ['...']} segment to {@code segments} using the bracket matcher
* (inner text may use RFC 9535 escapes). Expects a full quoted bracket token at {@code
* path[bracketPos]}; otherwise the matcher or alignment checks throw.
*
* @param segments output; segments parsed so far, updated in place
* @param path full path
* @param bracketPos index of the opening {@code [} that must begin {@code ['}
*/
private static int appendQuotedBracketSegment(
List<PathSegment> segments, String path, int bracketPos, Matcher bracketMatcher) {
Preconditions.checkArgument(
ROOT.equals(parts.get(0)), "Invalid path, does not start with %s: %s", ROOT, path);

List<String> names = parts.subList(1, parts.size());
for (String name : names) {
Preconditions.checkArgument(
RFC9535_MEMBER_NAME_SHORTHAND.test(name),
"Invalid path: %s (%s has invalid characters)",
path,
name);
bracketMatcher.find(bracketPos), "Invalid path, malformed bracket segment: %s", path);
Preconditions.checkArgument(
bracketMatcher.start() == bracketPos,
"Invalid path, unexpected characters at position %s: %s",
bracketPos,
path);
segments.add(new PathSegment.Name(rfc9535unescape(bracketMatcher.group(1))));
return bracketMatcher.end();
}

/** Unescapes the inner text of a {@code ['...']} segment (inverse of {@link #rfc9535escape}). */
@VisibleForTesting
@SuppressWarnings("StatementSwitchToExpressionSwitch")
static String rfc9535unescape(String escaped) {
if (!escaped.contains("\\")) {
return escaped;
}

StringBuilder builder = new StringBuilder(escaped.length());
int cursor = 0;
while (cursor < escaped.length()) {
char ch = escaped.charAt(cursor);
if (ch != '\\') {
builder.append(ch);
cursor += 1;
} else {
Preconditions.checkArgument(
cursor + 1 < escaped.length(), "Invalid escape sequence at end of: %s", escaped);
char next = escaped.charAt(cursor + 1);
switch (next) {
case 'u':
Preconditions.checkArgument(
cursor + 5 < escaped.length(),
"Invalid \\uXXXX escape at position %s in: %s",
cursor,
escaped);
builder.append((char) Integer.parseInt(escaped.substring(cursor + 2, cursor + 6), 16));
cursor += 6;
break;
case 'b':
case 't':
case 'f':
case 'n':
case 'r':
case '\'':
case '\\':
builder.append(rfc9535SimpleEscapedChar(next));
cursor += 2;
break;
default:
throw new IllegalArgumentException(
"Invalid escape sequence \\" + next + " in: " + escaped);
}
}
}

return names;
return builder.toString();
}

public static String toNormalizedPath(Iterable<String> fields) {
return ROOT
+ Streams.stream(fields)
.map(PathUtil::rfc9535escape)
.map(name -> "['" + name + "']")
.collect(Collectors.joining(""));
private static char rfc9535SimpleEscapedChar(char next) {
int idx = RFC9535_SIMPLE_ESCAPE_LETTERS.indexOf(next);
if (idx >= 0) {
return RFC9535_SIMPLE_ESCAPE_CHARS.charAt(idx);
}
if (next == '\'') {
return '\'';
}
if (next == '\\') {
return '\\';
}
throw new IllegalArgumentException("Invalid simple escape: \\" + next);
}

@VisibleForTesting
Expand Down
Loading
Loading