source) {
+ this.source = source;
+ }
+
+ public TimeFilter getTimeFilter() {
+ return timeFilter;
+ }
+
+ public void setTimeFilter(TimeFilter timeFilter) {
+ this.timeFilter = timeFilter;
+ }
+
+ public DocumentMatchContext.Limits getLimits() {
+ return limits;
+ }
+
+ public void setLimits(DocumentMatchContext.Limits limits) {
+ this.limits = limits;
+ }
+
+ public boolean isTld() {
+ return tld;
+ }
+
+ public void setTld(boolean tld) {
+ this.tld = tld;
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java
new file mode 100644
index 00000000000..2febb7c4cd9
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java
@@ -0,0 +1,158 @@
+package datawave.query.function;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Value;
+
+import datawave.query.predicate.TimeFilter;
+
+/**
+ * Per-document runtime state used by {@code document:match(...)} evaluation.
+ *
+ * This context carries the raw {@code d}-column entries retained for a candidate document, the configured size limits used while decoding those payloads, and
+ * the per-{@code d}-column {@link DocumentMatchResults} accumulated across one or more {@code document:match(...)} calls for a single evaluation. Context
+ * instances are expected to be created fresh for each evaluation pass rather than reused across multiple documents or repeated evaluations of the same
+ * document.
+ */
+public class DocumentMatchContext {
+ public static final int DEFAULT_MAX_ENCODED_SIZE = 256 * 1024 * 1024;
+ public static final int DEFAULT_MAX_DECODED_SIZE = 384 * 1024 * 1024;
+ public static final int DEFAULT_MAX_ENCODED_CONTEXT_SIZE = 256 * 1024 * 1024;
+
+ /**
+ * Immutable runtime limits for {@code document:match(...)} payload processing.
+ */
+ public static class Limits {
+ private final int maxEncodedValueSize;
+ private final int maxDecodedValueSize;
+ private final int maxEncodedContextSize;
+
+ /**
+ * @param maxEncodedValueSize
+ * maximum allowed encoded payload size in bytes
+ * @param maxDecodedValueSize
+ * maximum allowed decoded payload size in bytes
+ * @param maxEncodedContextSize
+ * maximum allowed aggregate encoded payload size retained for a document, in bytes
+ */
+ public Limits(int maxEncodedValueSize, int maxDecodedValueSize, int maxEncodedContextSize) {
+ this.maxEncodedValueSize = maxEncodedValueSize;
+ this.maxDecodedValueSize = maxDecodedValueSize;
+ this.maxEncodedContextSize = maxEncodedContextSize;
+ }
+
+ /**
+ * @return the maximum encoded payload size, in bytes
+ */
+ public int getMaxEncodedValueSize() {
+ return maxEncodedValueSize;
+ }
+
+ /**
+ * @return the maximum decoded payload size, in bytes
+ */
+ public int getMaxDecodedValueSize() {
+ return maxDecodedValueSize;
+ }
+
+ /**
+ * @return the maximum aggregate encoded payload size retained for a document, in bytes
+ */
+ public int getMaxEncodedContextSize() {
+ return maxEncodedContextSize;
+ }
+ }
+
+ private final List> documentEntries;
+ private final Limits limits;
+ private final Map matches = new LinkedHashMap<>();
+
+ /**
+ * Creates a per-evaluation match context for the retained {@code d}-column entries of a single candidate document.
+ *
+ * @param documentEntries
+ * retained {@code d}-column entries for the document being evaluated
+ * @param limits
+ * payload-processing limits applied during decode and match extraction
+ */
+ public DocumentMatchContext(List> documentEntries, Limits limits) {
+ this.documentEntries = documentEntries;
+ this.limits = limits;
+ }
+
+ /**
+ * Builds a context from already-aggregated document entries using explicit runtime limits.
+ *
+ * @param entries
+ * aggregated document entries
+ * @param timeFilter
+ * optional time filter to apply while selecting {@code d}-column entries
+ * @param limits
+ * payload-processing limits
+ * @return a fresh context containing only eligible {@code d}-column entries for a single evaluation pass
+ */
+ public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, Limits limits) {
+ List> documentEntries = new ArrayList<>();
+ for (Entry entry : entries) {
+ if (entry.getKey().getColumnFamily().toString().equals("d") && (timeFilter == null || timeFilter.apply(entry))) {
+ documentEntries.add(entry);
+ }
+ }
+ return new DocumentMatchContext(documentEntries, limits);
+ }
+
+ /**
+ * @return the retained {@code d}-column entries available to {@code document:match(...)} during the current evaluation
+ */
+ public List> getDocumentEntries() {
+ return Collections.unmodifiableList(documentEntries);
+ }
+
+ public int getMaxEncodedValueSize() {
+ return limits.getMaxEncodedValueSize();
+ }
+
+ public int getMaxDecodedValueSize() {
+ return limits.getMaxDecodedValueSize();
+ }
+
+ public int getMaxEncodedContextSize() {
+ return limits.getMaxEncodedContextSize();
+ }
+
+ /**
+ * @return the payload-processing limits associated with this evaluation context
+ */
+ public Limits getLimits() {
+ return limits;
+ }
+
+ /**
+ * Records per-call matches in the per-{@code d}-column document-wide result set.
+ *
+ * @param key
+ * the matched {@code d}-column key
+ * @param search
+ * the literal string matched by the invocation
+ * @param offsets
+ * starting offsets found in the matched view
+ */
+ public void addMatches(Key key, String search, List offsets) {
+ matches.computeIfAbsent(key, DocumentMatchResults::new).addMatches(search, offsets);
+ }
+
+ /**
+ * Returns the per-entry match results accumulated during this evaluation.
+ *
+ * @return an immutable snapshot view of the accumulated per-{@code d}-column match results
+ */
+ public List getMatches() {
+ return List.copyOf(matches.values());
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java
new file mode 100644
index 00000000000..fe024a77b16
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java
@@ -0,0 +1,201 @@
+package datawave.query.function;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.apache.log4j.Logger;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Maps;
+
+import datawave.query.attributes.Attribute;
+import datawave.query.attributes.Attributes;
+import datawave.query.attributes.Document;
+import datawave.query.attributes.DocumentKey;
+import datawave.query.jexl.functions.DocumentFunctions;
+import datawave.query.predicate.TimeFilter;
+import datawave.query.util.Tuple3;
+import datawave.query.util.Tuples;
+
+/**
+ * Builds a {@link DocumentMatchContext} close to evaluation time and attaches it to the side-channel map used for JEXL context population.
+ */
+public class DocumentMatchContextFunction implements Function>,Tuple3>> {
+ private static final Logger log = Logger.getLogger(DocumentMatchContextFunction.class);
+ private static final String DOCUMENT_COLUMN_FAMILY_STRING = "d";
+ private static final char DOCUMENT_COLUMN_FAMILY_CHAR = 'd';
+ private static final char NULL = '\0';
+ private final DocumentMatchConfig config;
+ private final SortedKeyValueIterator source;
+
+ /**
+ * Creates a context-populating function from the supplied document-match configuration.
+ *
+ * @param config
+ * document-match configuration
+ */
+ public DocumentMatchContextFunction(DocumentMatchConfig config) {
+ this.config = config;
+ this.source = config.getSource(); // let's keep things clean.
+ }
+
+ @Override
+ public Tuple3> apply(Tuple3> from) {
+ try {
+ Set documentKeys = getDocumentKeys(from.first(), from.second());
+ if (log.isDebugEnabled()) {
+ log.debug("Collecting document-match context for tuple key " + from.first() + " using document keys " + documentKeys);
+ }
+
+ List> dEntries = collectDocumentColumnAttributes(documentKeys);
+ DocumentMatchContext context = DocumentMatchContext.from(dEntries, config.getTimeFilter(), config.getLimits());
+ if (log.isDebugEnabled()) {
+ log.debug("Collected " + dEntries.size() + " d-column entries for tuple key " + from.first());
+ }
+
+ Map map = from.third().isEmpty() ? new HashMap<>() : new HashMap<>(from.third());
+ map.put(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, context);
+ return Tuples.tuple(from.first(), from.second(), map);
+ } catch (IOException e) {
+ throw new IllegalStateException("Unable to collect document-match context for " + from.first(), e);
+ }
+ }
+
+ private List> collectDocumentColumnAttributes(Set documentKeys) throws IOException {
+ List> documentColumns = new ArrayList<>();
+ long retainedBytes = 0L;
+ for (Key documentKey : documentKeys) {
+ retainedBytes = collectDocumentColumnAttributes(documentKey, documentColumns, retainedBytes);
+ if (retainedBytes >= config.getLimits().getMaxEncodedContextSize()) {
+ break;
+ }
+ }
+ return documentColumns;
+ }
+
+ private long collectDocumentColumnAttributes(Key documentKey, List> documentColumns, long retainedBytes) throws IOException {
+ String row = documentKey.getRow().toString();
+ String datatypeAndUid = documentKey.getColumnFamily().toString();
+ Key startKey = new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + NULL);
+ Key endKey = config.isTld() ? new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + '\uffff')
+ : new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + '.');
+ Range documentColumnRange = new Range(startKey, true, endKey, false);
+ if (log.isDebugEnabled()) {
+ log.debug("Seeking d-column range " + documentColumnRange + " for document key " + documentKey);
+ }
+
+ source.seek(documentColumnRange, Collections.emptyList(), false);
+
+ while (source.hasTop() && isDocumentColumn(source.getTopKey(), documentKey)) {
+ Entry entry = Maps.immutableEntry(source.getTopKey(), source.getTopValue());
+ if (!shouldRetainDocumentColumn(entry, documentKey)) {
+ source.next();
+ continue;
+ }
+
+ int encodedLength = entry.getValue().get().length;
+ long retainedBytesWithEntry = retainedBytes + encodedLength;
+ if (retainedBytesWithEntry > config.getLimits().getMaxEncodedContextSize()) {
+ if (log.isDebugEnabled()) {
+ log.debug("Reached aggregate encoded document-match context limit of " + config.getLimits().getMaxEncodedContextSize()
+ + " bytes while collecting d-column entry " + entry.getKey() + " for document key " + documentKey
+ + "; skipping this and remaining d-column entries");
+ }
+ return config.getLimits().getMaxEncodedContextSize();
+ }
+
+ if (log.isDebugEnabled()) {
+ log.debug("Collected d-column entry " + entry.getKey() + " for document key " + documentKey);
+ }
+ documentColumns.add(entry);
+ retainedBytes = retainedBytesWithEntry;
+ source.next();
+ }
+
+ if (log.isDebugEnabled()) {
+ log.debug("Finished d-column scan for document key " + documentKey + "; next top key is " + (source.hasTop() ? source.getTopKey() : ""));
+ }
+ return retainedBytes;
+ }
+
+ private boolean shouldRetainDocumentColumn(Entry entry, Key documentKey) {
+ TimeFilter timeFilter = config.getTimeFilter();
+ if (timeFilter != null && !timeFilter.apply(entry)) {
+ if (log.isDebugEnabled()) {
+ log.debug("Skipping d-column entry " + entry.getKey() + " for document key " + documentKey + " because it did not match the time filter");
+ }
+ return false;
+ }
+
+ int encodedLength = entry.getValue().get().length;
+ if (encodedLength > config.getLimits().getMaxEncodedValueSize()) {
+ if (log.isDebugEnabled()) {
+ log.debug("Skipping oversized d-column entry " + entry.getKey() + " for document key " + documentKey + " because encoded payload size "
+ + encodedLength + " exceeds configured limit of " + config.getLimits().getMaxEncodedValueSize() + " bytes");
+ }
+ return false;
+ }
+
+ return true;
+ }
+
+ private Set getDocumentKeys(Key tupleKey, Document document) {
+ Set docKeys = new HashSet<>((config.isTld()) ? 4 : 1);
+ Attribute> docKeyAttr = document.get(Document.DOCKEY_FIELD_NAME);
+ if (docKeyAttr == null) {
+ docKeys.add(tupleKey);
+ return docKeys;
+ }
+
+ if (docKeyAttr instanceof DocumentKey) {
+ docKeys.add(((DocumentKey) docKeyAttr).getDocKey());
+ } else if (docKeyAttr instanceof Attributes) {
+ for (Attribute> docKey : ((Attributes) docKeyAttr).getAttributes()) {
+ if (docKey instanceof DocumentKey) {
+ docKeys.add(((DocumentKey) docKey).getDocKey());
+ } else {
+ throw new IllegalStateException("Unexpected sub-Attribute type for " + Document.DOCKEY_FIELD_NAME + ": " + docKey.getClass());
+ }
+ }
+ } else {
+ throw new IllegalStateException("Unexpected Attribute type for " + Document.DOCKEY_FIELD_NAME + ": " + docKeyAttr.getClass());
+ }
+
+ if (docKeys.isEmpty()) {
+ docKeys.add(tupleKey);
+ }
+ return docKeys;
+ }
+
+ /**
+ * Determines whether a scanned key is a {@code d}-column for the supplied document key.
+ *
+ * The comparison intentionally checks the scanned key's column qualifier against the document key's column family. For event keys, the column family is
+ * {@code datatype\0uid}, while {@code d}-column qualifiers are laid out as {@code datatype\0uid\0view}. Matching on this prefix ensures that the collected
+ * {@code d}-column belongs to the same document identity as the event key.
+ *
+ * @param documentContentKey
+ * scanned 'd' column shard-table key
+ * @param documentKey
+ * event or document key whose {@code datatype\0uid} identifies the document
+ * @return {@code true} if the scanned key is a matching {@code d}-column entry for the document
+ */
+ private boolean isDocumentColumn(Key documentContentKey, Key documentKey) {
+ // A document key's column family is datatype\0uid, and a d-column qualifier begins with that same datatype\0uid
+ // followed by \0view. This prefix comparison ties the d-column back to the document represented by the event key.
+ return documentContentKey.getColumnFamilyData().length() == 1 && documentContentKey.getColumnFamilyData().byteAt(0) == DOCUMENT_COLUMN_FAMILY_CHAR
+ && documentContentKey.getRow().equals(documentKey.getRow())
+ && documentContentKey.getColumnQualifier().toString().startsWith(documentKey.getColumnFamily().toString() + NULL);
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java
new file mode 100644
index 00000000000..0879a9480fc
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java
@@ -0,0 +1,98 @@
+package datawave.query.function;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.accumulo.core.data.Key;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.gson.Gson;
+
+import datawave.query.data.parsers.DatawaveKey;
+
+/**
+ * Match results for a single matched {@code d}-column entry.
+ *
+ * A {@code d}-column entry has a single view name, so matches are grouped only by search string within that view.
+ */
+public class DocumentMatchResults {
+ private static final Gson GSON = new Gson();
+
+ public static final String VIEW_FIELD = "view";
+ public static final String MATCHES_FIELD = "matches";
+
+ private final Key key;
+ private final Map> matches = new LinkedHashMap<>();
+
+ /**
+ * Creates an empty result container for a single matched {@code d}-column entry.
+ *
+ * @param key
+ * the matched {@code d}-column key
+ */
+ public DocumentMatchResults(Key key) {
+ this.key = key;
+ }
+
+ public Key getKey() {
+ return key;
+ }
+
+ /**
+ * @return the single view name associated with this matched {@code d}-column entry
+ */
+ @VisibleForTesting
+ public String getView() {
+ return Objects.toString(new DatawaveKey(key).getFieldName(), "");
+ }
+
+ /**
+ * Records offsets for a literal search string within this entry's view.
+ *
+ * @param search
+ * the matched literal string
+ * @param offsets
+ * ordered starting offsets where the string was found
+ */
+ public void addMatches(String search, List offsets) {
+ matches.computeIfAbsent(search, ignored -> new TreeSet<>()).addAll(offsets);
+ }
+
+ /**
+ * Builds the JSON-ready payload for this entry in the form {@code {"view":"...","matches":{search:[offsets]}}}.
+ *
+ * @return a payload map suitable for serialization into the {@code DOCUMENT_MATCHES} attribute, or an empty map if no matches are present
+ */
+ private Map getPayload() {
+ Map payload = new LinkedHashMap<>();
+ String view = getView();
+ if (view == null || matches.isEmpty()) {
+ return payload;
+ }
+ payload.put(VIEW_FIELD, view);
+ Map> jsonMatches = new LinkedHashMap<>();
+ for (Map.Entry> matchEntry : matches.entrySet()) {
+ jsonMatches.put(matchEntry.getKey(), new ArrayList<>(matchEntry.getValue()));
+ }
+ payload.put(MATCHES_FIELD, jsonMatches);
+ return payload;
+ }
+
+ /**
+ * Serializes this entry's payload into the {@code DOCUMENT_MATCHES} JSON representation.
+ *
+ * @return JSON string representation, or an empty string if no matches were recorded for the entry
+ */
+ public String toJson() {
+ Map payload = getPayload();
+ if (payload.isEmpty()) {
+ return "";
+ }
+ return GSON.toJson(payload);
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java
index 8e58173121f..682df0febdb 100644
--- a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java
+++ b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java
@@ -24,6 +24,7 @@
import datawave.query.jexl.DefaultArithmetic;
import datawave.query.jexl.DelayedNonEventIndexContext;
import datawave.query.jexl.HitListArithmetic;
+import datawave.query.jexl.functions.DocumentFunctions;
import datawave.query.postprocessing.tf.PhraseIndexes;
import datawave.query.postprocessing.tf.TermOffsetMap;
import datawave.query.transformer.ExcerptTransform;
@@ -97,6 +98,7 @@ public boolean apply(Tuple3 input) {
log.trace("Evaluating " + query + " against document " + input.second().getMetadata() + " with context " + input.third());
}
+ DocumentMatchContext documentMatchContext = (DocumentMatchContext) input.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME);
Object o = script.execute(input.third());
if (log.isTraceEnabled()) {
@@ -110,6 +112,20 @@ public boolean apply(Tuple3 input) {
((DelayedNonEventIndexContext) input.third()).populateDocument(input.second());
}
+ if (matched && documentMatchContext != null) {
+ Document document = input.second();
+ for (DocumentMatchResults entry : documentMatchContext.getMatches()) {
+ String documentMatches = entry.toJson();
+ if (documentMatches.isEmpty()) {
+ continue;
+ }
+
+ Content matchesAttribute = new Content(documentMatches, entry.getKey(), document.isToKeep());
+ matchesAttribute.setColumnVisibility(entry.getKey().getColumnVisibilityParsed());
+ document.put(DocumentFunctions.DOCUMENT_MATCHES, matchesAttribute);
+ }
+ }
+
if (arithmetic instanceof HitListArithmetic) {
HitListArithmetic hitListArithmetic = (HitListArithmetic) arithmetic;
if (matched) {
diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java
index e9f9ea037fa..f561d81a5a2 100644
--- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java
+++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java
@@ -68,6 +68,8 @@
import datawave.query.composite.CompositeMetadata;
import datawave.query.function.Aggregation;
import datawave.query.function.DataTypeAsField;
+import datawave.query.function.DocumentMatchConfig;
+import datawave.query.function.DocumentMatchContextFunction;
import datawave.query.function.DocumentMetadata;
import datawave.query.function.DocumentPermutation;
import datawave.query.function.DocumentProjection;
@@ -98,12 +100,14 @@
import datawave.query.iterator.profile.SourceTrackingIterator;
import datawave.query.iterator.waitwindow.WaitWindowObserver;
import datawave.query.iterator.waitwindow.WaitWindowOverseerIterator;
+import datawave.query.jexl.ArithmeticJexlEngines;
import datawave.query.jexl.DatawaveJexlContext;
import datawave.query.jexl.StatefulArithmetic;
import datawave.query.jexl.functions.FieldIndexAggregator;
import datawave.query.jexl.functions.IdentityAggregator;
import datawave.query.jexl.functions.KeyAdjudicator;
import datawave.query.jexl.visitors.DelayedNonEventSubTreeVisitor;
+import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor;
import datawave.query.jexl.visitors.IteratorBuildingVisitor;
import datawave.query.jexl.visitors.SatisfactionVisitor;
import datawave.query.jexl.visitors.VariableNameVisitor;
@@ -975,7 +979,7 @@ protected Iterator> getEvaluation(NestedQueryIterator d
// get the function we use for the tf functionality. Note we are
// getting an additional source deep copy for this function
- final Iterator>> itrWithContext;
+ Iterator>> itrWithContext;
// TODO: this should be dynamic based on the query fields, not a flag passed to the iterator
if (this.isTermFrequenciesRequired()) {
@@ -997,6 +1001,17 @@ protected Iterator> getEvaluation(NestedQueryIterator d
itrWithContext = Iterators.transform(tupleItr, new EmptyContext<>());
}
+ if (shouldCollectDocumentMatchContext(documentSource)) {
+ SortedKeyValueIterator documentMatchSource = getSourceDeepCopy("document-match context");
+ DocumentMatchConfig documentMatchConfig = new DocumentMatchConfig();
+ documentMatchConfig.setSource(documentMatchSource);
+ documentMatchConfig.setTimeFilter(getTimeFilter());
+ documentMatchConfig.setLimits(getDocumentMatchLimits());
+ Function>,Tuple3>> documentMatchFunction = buildDocumentMatchFunction(
+ documentMatchConfig);
+ itrWithContext = TraceIterators.transform(itrWithContext, documentMatchFunction, "Document Match Context Lookup");
+ }
+
try {
IteratorBuildingVisitor iteratorBuildingVisitor = createIteratorBuildingVisitor(getDocumentRange(documentSource), false, this.sortedUIDs);
Multimap delayedNonEventFieldMap = DelayedNonEventSubTreeVisitor.getDelayedNonEventFieldMap(iteratorBuildingVisitor,
@@ -1041,6 +1056,19 @@ protected Function,Tuple3>>
return TFFactory.getFunction(tfConfig);
}
+ /**
+ * This method exists so that extending classes can implement specific versions of the document-match context function. Specifically, so the
+ * {@link datawave.query.tld.TLDQueryIterator} can mark document-match collection as TLD-aware.
+ *
+ * @param documentMatchConfig
+ * a document-match configuration
+ * @return a document-match context function
+ */
+ protected Function>,Tuple3>> buildDocumentMatchFunction(
+ DocumentMatchConfig documentMatchConfig) {
+ return new DocumentMatchContextFunction(documentMatchConfig);
+ }
+
private Range getDocumentRange(NestedQueryIterator documentSource) {
if (null == documentSource) {
return range;
@@ -1085,7 +1113,7 @@ protected JexlEvaluation getJexlEvaluation(String query, NestedQueryIterator
+ * The top-level iterator option tells us whether the planned query requires document-match context anywhere. When a nested query is being evaluated, this
+ * method narrows that decision to the nested query so we only collect document-match context when the query actually being evaluated still contains
+ * {@code document:match(...)}.
+ *
+ * @param documentSource
+ * the nested query source for the current evaluation pass, if any
+ * @return true if document-match context should be collected for the current evaluation
+ */
+ protected boolean shouldCollectDocumentMatchContext(NestedQueryIterator documentSource) {
+ if (!isDocumentMatchContextRequired()) {
+ return false;
+ }
+
+ // At this point the planned query requires document-match context. If there is no nested source, or no nested
+ // query payload to inspect, we cannot narrow that requirement to a smaller subquery, so we conservatively keep
+ // document-match context collection enabled and return true in these cases.
+ if (documentSource == null) {
+ return true;
+ }
+ NestedQuery nestedQuery = documentSource.getNestedQuery();
+ if (nestedQuery == null || nestedQuery.getQuery() == null) {
+ return true;
+ }
+
+ ASTJexlScript nestedScript = nestedQuery.getScript();
+ if (nestedScript == null) {
+ nestedScript = ArithmeticJexlEngines.getEngine(getArithmetic()).parse(nestedQuery.getQuery());
+ }
+ return DocumentMatchFunctionVisitor.requiresDocumentMatchContext(nestedScript);
+ }
+
protected LimitFields getLimitFields() {
return new LimitFields(this.getLimitFieldsMap(), this.getMatchingFieldSets());
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java
index eb3e1485f99..595b9f5c0da 100644
--- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java
+++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java
@@ -67,6 +67,7 @@
import datawave.query.composite.CompositeMetadata;
import datawave.query.exceptions.DatawaveFatalQueryException;
import datawave.query.function.ConfiguredFunction;
+import datawave.query.function.DocumentMatchContext;
import datawave.query.function.DocumentPermutation;
import datawave.query.function.Equality;
import datawave.query.function.GetStartKey;
@@ -284,6 +285,11 @@ public class QueryOptions implements OptionDescriber {
public static final String TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS = "tf.agg.threshold";
+ public static final String DOCUMENT_MATCH_CONTEXT_REQUIRED = "document.match.context.required";
+ public static final String DOCUMENT_MATCH_MAX_ENCODED_SIZE = "document.match.max.encoded.size";
+ public static final String DOCUMENT_MATCH_MAX_DECODED_SIZE = "document.match.max.decoded.size";
+ public static final String DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE = "document.match.max.encoded.context.size";
+
public static final String FIELD_COUNTS = "field.counts";
public static final String TERM_COUNTS = "term.counts";
public static final String CARDINALITY_THRESHOLD = "cardinality.threshold";
@@ -405,6 +411,7 @@ public class QueryOptions implements OptionDescriber {
protected Map> nonIndexedDataTypeMap = Maps.newHashMap();
protected boolean termFrequenciesRequired = false;
+ protected boolean documentMatchContextRequired = false;
protected Set termFrequencyFields = Collections.emptySet();
protected Set contentExpansionFields;
@@ -467,6 +474,9 @@ public class QueryOptions implements OptionDescriber {
// aggregation thresholds
private int docAggregationThresholdMs = -1;
private int tfAggregationThresholdMs = -1;
+ private int documentMatchMaxEncodedSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE;
+ private int documentMatchMaxDecodedSize = DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE;
+ private int documentMatchMaxEncodedContextSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE;
private CountMap fieldCounts;
private CountMap termCounts;
@@ -533,6 +543,7 @@ public void deepCopy(QueryOptions other) {
this.evaluationFilter = other.evaluationFilter;
this.fiEvaluationFilter = other.fiEvaluationFilter;
this.eventEvaluationFilter = other.eventEvaluationFilter;
+ this.eventFilter = other.eventFilter;
this.ivaratorCacheDirConfigs = (other.ivaratorCacheDirConfigs == null) ? null : new ArrayList<>(other.ivaratorCacheDirConfigs);
this.hdfsSiteConfigURLs = other.hdfsSiteConfigURLs;
@@ -563,6 +574,7 @@ public void deepCopy(QueryOptions other) {
this.sortedUIDs = other.sortedUIDs;
this.termFrequenciesRequired = other.termFrequenciesRequired;
+ this.documentMatchContextRequired = other.documentMatchContextRequired;
this.termFrequencyFields = other.termFrequencyFields;
this.contentExpansionFields = other.contentExpansionFields;
@@ -592,6 +604,9 @@ public void deepCopy(QueryOptions other) {
this.docAggregationThresholdMs = other.docAggregationThresholdMs;
this.tfAggregationThresholdMs = other.tfAggregationThresholdMs;
+ this.documentMatchMaxEncodedSize = other.documentMatchMaxEncodedSize;
+ this.documentMatchMaxDecodedSize = other.documentMatchMaxDecodedSize;
+ this.documentMatchMaxEncodedContextSize = other.documentMatchMaxEncodedContextSize;
this.fieldCounts = other.fieldCounts;
this.termCounts = other.termCounts;
@@ -1435,6 +1450,11 @@ public IteratorOptions describeOptions() {
options.put(TF_NEXT_SEEK, "The number of next calls made by a Term Frequency data filter or aggregator before a seek is issued");
options.put(DOC_AGGREGATION_THRESHOLD_MS, "Document aggregations that exceed this threshold are logged as a warning");
options.put(TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS, "TermFrequency aggregations that exceed this threshold are logged as a warning");
+ options.put(DOCUMENT_MATCH_CONTEXT_REQUIRED, "Whether the query requires gathering document-match context");
+ options.put(DOCUMENT_MATCH_MAX_ENCODED_SIZE, "Maximum encoded d-column payload size, in bytes, to inspect for document:match evaluation");
+ options.put(DOCUMENT_MATCH_MAX_DECODED_SIZE, "Maximum decoded d-column payload size, in bytes, to inspect for document:match evaluation");
+ options.put(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE,
+ "Maximum aggregate encoded d-column payload size, in bytes, to retain in memory for document:match evaluation");
options.put(FIELD_COUNTS, "Map of field counts from the global index");
options.put(TERM_COUNTS, "Map of term counts from the global index");
return new IteratorOptions(getClass().getSimpleName(), "Runs a query against the DATAWAVE tables", options, null);
@@ -1655,6 +1675,22 @@ public boolean validateOptions(Map options) {
this.tfAggregationThresholdMs = Integer.parseInt(options.get(TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS));
}
+ if (options.containsKey(DOCUMENT_MATCH_CONTEXT_REQUIRED)) {
+ this.documentMatchContextRequired = Boolean.parseBoolean(options.get(DOCUMENT_MATCH_CONTEXT_REQUIRED));
+ }
+
+ if (options.containsKey(DOCUMENT_MATCH_MAX_ENCODED_SIZE)) {
+ this.documentMatchMaxEncodedSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_ENCODED_SIZE));
+ }
+
+ if (options.containsKey(DOCUMENT_MATCH_MAX_DECODED_SIZE)) {
+ this.documentMatchMaxDecodedSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_DECODED_SIZE));
+ }
+
+ if (options.containsKey(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)) {
+ this.documentMatchMaxEncodedContextSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE));
+ }
+
if (options.containsKey(DATATYPE_FILTER)) {
String option = options.get(DATATYPE_FILTER);
if (option != null && !option.isEmpty()) {
@@ -2462,6 +2498,42 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) {
this.tfAggregationThresholdMs = tfAggregationThresholdMs;
}
+ public int getDocumentMatchMaxEncodedSize() {
+ return documentMatchMaxEncodedSize;
+ }
+
+ public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) {
+ this.documentMatchMaxEncodedSize = documentMatchMaxEncodedSize;
+ }
+
+ public int getDocumentMatchMaxDecodedSize() {
+ return documentMatchMaxDecodedSize;
+ }
+
+ public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) {
+ this.documentMatchMaxDecodedSize = documentMatchMaxDecodedSize;
+ }
+
+ public int getDocumentMatchMaxEncodedContextSize() {
+ return documentMatchMaxEncodedContextSize;
+ }
+
+ public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) {
+ this.documentMatchMaxEncodedContextSize = documentMatchMaxEncodedContextSize;
+ }
+
+ public boolean isDocumentMatchContextRequired() {
+ return documentMatchContextRequired;
+ }
+
+ public void setDocumentMatchContextRequired(boolean documentMatchContextRequired) {
+ this.documentMatchContextRequired = documentMatchContextRequired;
+ }
+
+ protected DocumentMatchContext.Limits getDocumentMatchLimits() {
+ return new DocumentMatchContext.Limits(documentMatchMaxEncodedSize, documentMatchMaxDecodedSize, documentMatchMaxEncodedContextSize);
+ }
+
/**
* Get an {@link Equality}
*
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java
index 7cba3801a7f..e904bc81047 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java
@@ -51,6 +51,7 @@
import datawave.query.attributes.ValueTuple;
import datawave.query.collections.FunctionalSet;
import datawave.query.jexl.functions.ContentFunctionsDescriptor;
+import datawave.query.jexl.functions.DocumentFunctions;
import datawave.query.jexl.functions.QueryFunctions;
import datawave.query.jexl.nodes.ExceededOr;
import datawave.query.jexl.nodes.QueryPropertyMarker;
@@ -128,6 +129,12 @@ public Object visit(ASTFunctionNode node, Object data) {
addHits(result);
+ if (isDocumentMatchFunction(nodeString) && result instanceof String && !hasSiblings(node)) {
+ boolean matched = !((String) result).isEmpty();
+ resultMap.put(nodeString, matched);
+ return matched;
+ }
+
// if the function stands alone, then it needs to return ag boolean
// if the function is paired with a method that is called on its results (like 'size') then the
// actual results must be returned.
@@ -139,6 +146,10 @@ public Object visit(ASTFunctionNode node, Object data) {
return result instanceof Collection ? !((Collection) result).isEmpty() : result;
}
+ private boolean isDocumentMatchFunction(String nodeString) {
+ return nodeString.startsWith(DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE + ":" + DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME);
+ }
+
/**
* Triggered when variable can not be resolved.
*
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java
new file mode 100644
index 00000000000..8b0984193fc
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java
@@ -0,0 +1,169 @@
+package datawave.query.jexl.functions;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Objects;
+
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Value;
+import org.apache.log4j.Logger;
+
+import datawave.query.data.parsers.DatawaveKey;
+import datawave.query.function.DocumentMatchContext;
+import datawave.query.table.parser.ContentKeyValueFactory;
+
+/**
+ * Evaluation-phase JEXL functions for inspecting decoded shard-table {@code d}-column content.
+ *
+ * The current namespace exposes {@code document:match(...)} which decodes base64-encoded, gzip-compressed document payloads, performs case-sensitive literal
+ * substring matching, and returns the matched search string when any eligible {@code d}-column matches. Detailed per-entry offsets are accumulated in the
+ * supplied {@link DocumentMatchContext} and later serialized into {@code DOCUMENT_MATCHES} attributes by the surrounding evaluation flow.
+ */
+@JexlFunctions(descriptorFactory = "datawave.query.jexl.functions.DocumentFunctionsDescriptor")
+public class DocumentFunctions {
+ private static final Logger log = Logger.getLogger(DocumentFunctions.class);
+
+ public static final String DOCUMENT_FUNCTION_NAMESPACE = "document";
+ public static final String DOCUMENT_MATCH_FUNCTION_NAME = "match";
+ public static final String DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME = "documentMatchContext";
+ public static final String DOCUMENT_MATCHES = "DOCUMENT_MATCHES";
+
+ /**
+ * Evaluates the internal form of {@code document:match(STRING)} across all eligible views for the current document.
+ *
+ * @param context
+ * per-document context supplied by the evaluation pipeline
+ * @param search
+ * literal substring to search for
+ * @return the matched search string if any eligible {@code d}-column matches, or an empty string if no match is found
+ */
+ public static String match(DocumentMatchContext context, String search) {
+ return match(null, context, search);
+ }
+
+ /**
+ * Evaluates the internal form of {@code document:match(VIEWNAME, STRING)} against the current document.
+ *
+ * Matching is case-sensitive and literal. If {@code viewName} ends with {@code *}, it is treated as a prefix match against the view portion of the
+ * {@code d}-column qualifier. Oversized or undecodable payloads are skipped as non-matching. Matches from this invocation are accumulated in the supplied
+ * {@link DocumentMatchContext} on a per-{@code d}-column basis so the resulting {@code DOCUMENT_MATCHES} attributes can preserve each source visibility.
+ *
+ * @param viewName
+ * optional exact or prefix-matched view selector; {@code null} means evaluate all views
+ * @param context
+ * per-document context supplied by the evaluation pipeline
+ * @param search
+ * literal substring to search for
+ * @return the matched search string if any eligible {@code d}-column matches, or an empty string if no match is found
+ */
+ public static String match(String viewName, DocumentMatchContext context, String search) {
+ if (context == null || search == null) {
+ if (log.isDebugEnabled()) {
+ log.debug("Skipping document:match evaluation because context or search term was null");
+ }
+ return "";
+ }
+
+ if (log.isDebugEnabled()) {
+ log.debug("Evaluating document:match for search [" + search + "] view filter [" + viewName + "] across " + context.getDocumentEntries().size()
+ + " d-column entries");
+ }
+
+ boolean matched = false;
+ for (Entry entry : context.getDocumentEntries()) {
+ String candidateView = Objects.toString(new DatawaveKey(entry.getKey()).getFieldName(), "");
+ if (!matchesView(viewName, candidateView)) {
+ if (log.isDebugEnabled()) {
+ log.debug("Skipping d-column entry " + entry.getKey() + " because view [" + candidateView + "] does not match filter [" + viewName + "]");
+ }
+ continue;
+ }
+ byte[] encoded = entry.getValue().get();
+ if (encoded.length > context.getMaxEncodedValueSize()) {
+ log.debug("Skipping oversized d-column payload of " + encoded.length + " bytes for view " + candidateView);
+ continue;
+ }
+
+ try {
+ String decoded = decode(encoded, context.getMaxDecodedValueSize());
+ List offsets = findOffsets(decoded, search);
+ if (!offsets.isEmpty()) {
+ if (log.isDebugEnabled()) {
+ log.debug("document:match found offsets " + offsets + " for search [" + search + "] in view [" + candidateView + "] using key "
+ + entry.getKey());
+ }
+ context.addMatches(entry.getKey(), search, offsets);
+ matched = true;
+ } else if (log.isDebugEnabled()) {
+ log.debug("document:match found no offsets for search [" + search + "] in view [" + candidateView + "] using key " + entry.getKey());
+ }
+ } catch (IOException | IllegalArgumentException e) {
+ log.debug("Unable to decode d-column payload for view " + candidateView, e);
+ }
+ }
+ if (log.isDebugEnabled()) {
+ log.debug("document:match produced matched=" + matched + " for search [" + search + "]");
+ }
+ return matched ? search : "";
+ }
+
+ /**
+ * Determines whether a candidate view satisfies the requested selector.
+ *
+ * @param expectedView
+ * requested view selector; {@code null} matches all views and a trailing {@code *} indicates prefix matching
+ * @param candidateView
+ * extracted view name for the current {@code d}-column
+ * @return {@code true} if the candidate view should be evaluated
+ */
+ static boolean matchesView(String expectedView, String candidateView) {
+ if (expectedView == null) {
+ return true;
+ }
+ if (expectedView.endsWith("*")) {
+ String prefix = expectedView.substring(0, expectedView.length() - 1);
+ return candidateView.startsWith(prefix);
+ }
+ return expectedView.equals(candidateView);
+ }
+
+ /**
+ * Decodes a base64-encoded, gzip-compressed {@code d}-column payload while enforcing a maximum decoded size.
+ *
+ * @param encoded
+ * encoded payload bytes from the shard table
+ * @param maxDecodedValueSize
+ * maximum allowed decoded payload size in bytes
+ * @return the decoded UTF-8 content
+ * @throws IOException
+ * if the payload cannot be decoded or if the decoded size exceeds the configured limit
+ */
+ static String decode(byte[] encoded, int maxDecodedValueSize) throws IOException {
+ return ContentKeyValueFactory.decodeAndDecompressContentAsString(encoded, maxDecodedValueSize);
+ }
+
+ /**
+ * Finds all starting character offsets for a literal substring, including overlapping matches.
+ *
+ * @param decoded
+ * decoded document content
+ * @param search
+ * literal substring to search for
+ * @return ordered starting offsets for each match
+ */
+ static List findOffsets(String decoded, String search) {
+ List offsets = new ArrayList<>();
+ if (search.isEmpty()) {
+ return offsets;
+ }
+ int index = decoded.indexOf(search);
+ while (index >= 0) {
+ offsets.add(index);
+ index = decoded.indexOf(search, index + 1);
+ }
+ return offsets;
+ }
+
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java
new file mode 100644
index 00000000000..75124f58fc3
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java
@@ -0,0 +1,107 @@
+package datawave.query.jexl.functions;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.jexl3.parser.ASTFunctionNode;
+
+import datawave.query.attributes.AttributeFactory;
+import datawave.query.config.ShardQueryConfiguration;
+import datawave.query.jexl.ArithmeticJexlEngines;
+import datawave.query.jexl.functions.arguments.JexlArgumentDescriptor;
+import datawave.query.jexl.visitors.EventDataQueryExpressionVisitor;
+import datawave.query.util.DateIndexHelper;
+import datawave.query.util.MetadataHelper;
+import datawave.webservice.query.exception.BadRequestQueryException;
+import datawave.webservice.query.exception.DatawaveErrorCode;
+
+/**
+ * Argument-descriptor factory for the {@code document:*} JEXL namespace.
+ *
+ * {@code document:match(...)} is an evaluation-only function. It does not contribute field normalization rules, event-data filters, index expansion, or
+ * ivarator pushdown. This descriptor exists primarily to validate the namespace/function pairing and to return a descriptor that tells the planner to leave the
+ * function in the evaluation phase.
+ */
+@SuppressWarnings("unused")
+public class DocumentFunctionsDescriptor implements JexlFunctionArgumentDescriptorFactory {
+
+ /**
+ * Descriptor for {@code document:match(...)}.
+ *
+ * The function is evaluated only after a candidate document has been materialized, so all index-planning hooks intentionally report no fields and no index
+ * query contribution.
+ */
+ public static class DocumentJexlArgumentDescriptor implements JexlArgumentDescriptor {
+ @Override
+ public org.apache.commons.jexl3.parser.JexlNode getIndexQuery(ShardQueryConfiguration config, MetadataHelper helper, DateIndexHelper dateIndexHelper,
+ Set datatypeFilter) {
+ return TRUE_NODE;
+ }
+
+ @Override
+ public void addFilters(AttributeFactory attributeFactory, Map filterMap) {}
+
+ @Override
+ public Set fieldsForNormalization(MetadataHelper helper, Set datatypeFilter, int arg) {
+ return Collections.emptySet();
+ }
+
+ @Override
+ public Set fields(MetadataHelper helper, Set datatypeFilter) {
+ return Collections.emptySet();
+ }
+
+ @Override
+ public Set> fieldSets(MetadataHelper helper, Set datatypeFilter) {
+ return Collections.emptySet();
+ }
+
+ @Override
+ public boolean useOrForExpansion() {
+ return false;
+ }
+
+ @Override
+ public boolean regexArguments() {
+ return false;
+ }
+
+ @Override
+ public boolean allowIvaratorFiltering() {
+ return false;
+ }
+ }
+
+ /**
+ * Validates that the supplied function node represents {@code document:match(...)} and returns the evaluation-only descriptor for it.
+ *
+ * @param node
+ * function node from the parsed JEXL tree
+ * @return descriptor describing the planning behavior for {@code document:match(...)}
+ * @throws IllegalArgumentException
+ * if the namespace, function class, or argument count is invalid
+ */
+ @Override
+ public JexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) {
+ FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node);
+ Class> functionClass = (Class>) ArithmeticJexlEngines.functions().get(visitor.namespace());
+
+ if (!DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.JEXLNODEDESCRIPTOR_NAMESPACE_UNEXPECTED,
+ "Unexpected namespace " + visitor.namespace());
+ throw new IllegalArgumentException(qe);
+ }
+ if (!functionClass.equals(DocumentFunctions.class)) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.JEXLNODEDESCRIPTOR_NODE_FOR_FUNCTION,
+ "Unexpected function class " + functionClass);
+ throw new IllegalArgumentException(qe);
+ }
+ if (!DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(visitor.name()) || visitor.args().isEmpty() || visitor.args().size() > 3) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.WRONG_NUMBER_OF_ARGUMENTS,
+ "Wrong number of arguments to document:match");
+ throw new IllegalArgumentException(qe);
+ }
+ return new DocumentJexlArgumentDescriptor();
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java
index 9c456a36001..ac39b284c24 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java
@@ -9,6 +9,8 @@
import datawave.query.jexl.ArithmeticJexlEngines;
import datawave.query.jexl.functions.arguments.JexlArgumentDescriptor;
+import datawave.query.util.DateIndexHelper;
+import datawave.query.util.MetadataHelper;
import datawave.webservice.query.exception.BadRequestQueryException;
import datawave.webservice.query.exception.DatawaveErrorCode;
import datawave.webservice.query.exception.QueryException;
@@ -27,6 +29,10 @@ public interface JexlFunctionArgumentDescriptorFactory {
*/
JexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node);
+ /**
+ * Returned by {@link JexlArgumentDescriptor#getIndexQuery(datawave.query.config.ShardQueryConfiguration, MetadataHelper, DateIndexHelper, java.util.Set)}
+ * when index searching should be skipped for a function.
+ */
JexlNode TRUE_NODE = new ASTTrueNode(ParserTreeConstants.JJTTRUENODE);
/** An encapsulation of methods that can be used with this interface */
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java
index f1a15d5a8b4..f3d03615de7 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java
@@ -25,6 +25,7 @@ public class JexlFunctionNamespaceRegistry {
static {
registeredFunctions.put(ContentFunctions.CONTENT_FUNCTION_NAMESPACE, ContentFunctions.class);
+ registeredFunctions.put(DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE, DocumentFunctions.class);
registeredFunctions.put(NormalizationFunctions.NORMALIZATION_FUNCTION_NAMESPACE, NormalizationFunctions.class);
registeredFunctions.put(EvaluationPhaseFilterFunctions.EVAL_PHASE_FUNCTION_NAMESPACE, EvaluationPhaseFilterFunctions.class);
registeredFunctions.put(GroupingRequiredFilterFunctions.GROUPING_REQUIRED_FUNCTION_NAMESPACE, GroupingRequiredFilterFunctions.class);
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java
index f586d46c951..864051631e9 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java
@@ -35,7 +35,8 @@ public interface JexlArgumentDescriptor {
* the datatype filter
* @param settings
* the config settings
- * @return The query which will be used against the global index
+ * @return the query which will be used against the global index, or {@link datawave.query.jexl.functions.JexlFunctionArgumentDescriptorFactory#TRUE_NODE}
+ * if index searching should be skipped for this function
*/
JexlNode getIndexQuery(ShardQueryConfiguration settings, MetadataHelper metadataHelper, DateIndexHelper dateIndexHelper, Set datatypeFilter);
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java
new file mode 100644
index 00000000000..0dbfbc17f5d
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java
@@ -0,0 +1,107 @@
+package datawave.query.jexl.visitors;
+
+import org.apache.commons.jexl3.parser.ASTArguments;
+import org.apache.commons.jexl3.parser.ASTFunctionNode;
+import org.apache.commons.jexl3.parser.ASTJexlScript;
+import org.apache.commons.jexl3.parser.ASTNamespaceIdentifier;
+import org.apache.commons.jexl3.parser.JexlNode;
+import org.apache.commons.jexl3.parser.ParserTreeConstants;
+import org.apache.log4j.Logger;
+
+import datawave.query.jexl.JexlASTHelper;
+import datawave.query.jexl.JexlNodeFactory;
+import datawave.query.jexl.functions.DocumentFunctions;
+import datawave.query.jexl.functions.FunctionJexlNodeVisitor;
+
+/**
+ * Rewrites user-facing {@code document:match(...)} calls into the internal evaluation form that carries the reserved {@code documentMatchContext} argument
+ * explicitly.
+ *
+ * This mirrors the way {@code content:*} functions are evaluated with an explicit {@code termOffsetMap} argument, but preserves the external user syntax for
+ * document matching.
+ */
+public class DocumentMatchFunctionVisitor extends BaseVisitor {
+ protected static final Logger log = Logger.getLogger(DocumentMatchFunctionVisitor.class);
+ private boolean documentMatchContextRequired = false;
+
+ private DocumentMatchFunctionVisitor() {
+ // no-op, local construction only.
+ }
+
+ /**
+ * Determines whether the supplied script contains any {@code document:match(...)} calls.
+ *
+ * @param script
+ * script to inspect
+ * @return {@code true} if any document-match functions are present
+ */
+ public static boolean requiresDocumentMatchContext(ASTJexlScript script) {
+ return JexlASTHelper.getFunctionNodes(script).stream().map(FunctionJexlNodeVisitor::eval)
+ .anyMatch(function -> DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(function.namespace())
+ && DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(function.name()));
+ }
+
+ /**
+ * Rewrites all {@code document:match(...)} calls in the supplied script to include the reserved context identifier.
+ *
+ * @param script
+ * script to rewrite
+ * @return {@code true} if any document-match functions were found
+ */
+ public static boolean rewrite(ASTJexlScript script) {
+ DocumentMatchFunctionVisitor visitor = new DocumentMatchFunctionVisitor();
+ script.jjtAccept(visitor, null);
+ return visitor.documentMatchContextRequired;
+ }
+
+ @Override
+ public Object visit(ASTFunctionNode node, Object data) {
+ FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node);
+ if (DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) {
+ rewriteDocumentFunction(node, visitor);
+ return data;
+ }
+ return super.visit(node, data);
+ }
+
+ protected void rewriteDocumentFunction(ASTFunctionNode node, FunctionJexlNodeVisitor visitor) {
+ switch (visitor.name()) {
+ case DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME:
+ documentMatchContextRequired = true;
+ if (visitor.args().size() == 1) {
+ JexlASTHelper.replaceNodeSafely(node,
+ buildFunction(visitor.namespace(), visitor.name(),
+ JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME),
+ RebuildingVisitor.copy(visitor.args().get(0))));
+ } else if (visitor.args().size() == 2) {
+ JexlASTHelper.replaceNodeSafely(node,
+ buildFunction(visitor.namespace(), visitor.name(), RebuildingVisitor.copy(visitor.args().get(0)),
+ JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME),
+ RebuildingVisitor.copy(visitor.args().get(1))));
+ }
+ return;
+ default:
+ log.warn("unknown document function:" + visitor.name());
+ }
+ }
+
+ private static ASTFunctionNode buildFunction(String namespace, String functionName, JexlNode... arguments) {
+ ASTFunctionNode functionNode = new ASTFunctionNode(ParserTreeConstants.JJTFUNCTIONNODE);
+
+ ASTNamespaceIdentifier namespaceNode = new ASTNamespaceIdentifier(ParserTreeConstants.JJTNAMESPACEIDENTIFIER);
+ namespaceNode.setNamespace(namespace, functionName);
+ functionNode.jjtAddChild(namespaceNode, 0);
+ namespaceNode.jjtSetParent(functionNode);
+
+ ASTArguments argsNode = new ASTArguments(ParserTreeConstants.JJTARGUMENTS);
+ functionNode.jjtAddChild(argsNode, 1);
+ argsNode.jjtSetParent(functionNode);
+
+ for (int i = 0; i < arguments.length; i++) {
+ argsNode.jjtAddChild(arguments[i], i);
+ arguments[i].jjtSetParent(argsNode);
+ }
+
+ return functionNode;
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java b/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java
index 7b9b3fea3a8..e343b531f06 100644
--- a/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java
+++ b/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java
@@ -30,6 +30,7 @@
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.search.TermQuery;
+import datawave.query.language.functions.lucene.DocumentMatch;
import datawave.query.language.functions.lucene.EvaluationOnly;
import datawave.query.language.functions.lucene.Exclude;
import datawave.query.language.functions.lucene.Include;
@@ -48,7 +49,7 @@
@Deprecated
public class FunctionQueryNodeBuilder implements QueryBuilder {
- private Map allowedFunctionMap = Collections.synchronizedMap(new HashMap<>());
+ private final Map allowedFunctionMap = Collections.synchronizedMap(new HashMap<>());
public FunctionQueryNodeBuilder() {
addFunction(new IsNull());
@@ -57,6 +58,7 @@ public FunctionQueryNodeBuilder() {
addFunction(new Exclude());
addFunction(new Text());
addFunction(new Occurrence());
+ addFunction(new DocumentMatch());
addFunction(new EvaluationOnly());
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java
new file mode 100644
index 00000000000..c7dcd136844
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java
@@ -0,0 +1,67 @@
+package datawave.query.language.functions.jexl;
+
+import java.text.MessageFormat;
+import java.util.ArrayList;
+
+import datawave.query.language.functions.QueryFunction;
+import datawave.webservice.query.exception.BadRequestQueryException;
+import datawave.webservice.query.exception.DatawaveErrorCode;
+
+/**
+ * JEXL-language representation of {@code document:match(...)}.
+ *
+ * This function is produced by the query-language layer after parsing or after Lucene-to-JEXL translation. It validates the supported one-argument and
+ * two-argument forms and renders the canonical JEXL syntax consumed by the runtime query planner.
+ */
+public class DocumentMatch extends JexlQueryFunction {
+
+ public static final String DOCUMENT_MATCH_FUNCTION = "DOCUMENT_MATCH";
+ public static final String DOCUMENT_FIELD = "document";
+ public static final String DOCUMENT_NAMESPACE = DOCUMENT_FIELD + ":";
+ public static final String MATCH_FUNCTION = "match";
+
+ public DocumentMatch() {
+ super(DOCUMENT_MATCH_FUNCTION, new ArrayList<>());
+ }
+
+ /**
+ * Validates that {@code document:match(...)} received either one argument ({@code STRING}) or two arguments ({@code VIEWNAME, STRING}).
+ *
+ * @throws IllegalArgumentException
+ * if the function has no arguments or more than two arguments
+ */
+ @Override
+ public void validate() throws IllegalArgumentException {
+ if (this.parameterList == null || this.parameterList.isEmpty() || this.parameterList.size() > 2) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name));
+ throw new IllegalArgumentException(qe);
+ }
+ }
+
+ /**
+ * Renders the canonical JEXL form {@code document:match(...)} with escaped arguments.
+ *
+ * @return JEXL representation of this function
+ */
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(DOCUMENT_NAMESPACE + MATCH_FUNCTION + "(");
+ for (int i = 0; i < parameterList.size(); i++) {
+ if (i > 0) {
+ sb.append(", ");
+ }
+ sb.append(escapeString(parameterList.get(i)));
+ }
+ sb.append(")");
+ return sb.toString();
+ }
+
+ /**
+ * @return a fresh function instance for parser duplication
+ */
+ @Override
+ public QueryFunction duplicate() {
+ return new DocumentMatch();
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java
new file mode 100644
index 00000000000..96c99fcd687
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java
@@ -0,0 +1,107 @@
+package datawave.query.language.functions.lucene;
+
+import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_FIELD;
+import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_MATCH_FUNCTION;
+import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_NAMESPACE;
+import static datawave.query.language.functions.jexl.DocumentMatch.MATCH_FUNCTION;
+
+import java.text.MessageFormat;
+import java.util.ArrayList;
+
+import datawave.query.language.functions.QueryFunction;
+import datawave.query.search.WildcardFieldedFilter;
+import datawave.webservice.query.exception.BadRequestQueryException;
+import datawave.webservice.query.exception.DatawaveErrorCode;
+
+/**
+ * Lucene-language representation of {@code #DOCUMENT_MATCH(...)}.
+ *
+ * This class exists in the Lucene query-language layer so the parser can recognize the function and carry it through the same fielded-filter machinery used by
+ * other Lucene functions before the query is rendered into JEXL. The runtime semantics are still provided by {@code document:match(...)} in the evaluation
+ * phase.
+ */
+@Deprecated
+public class DocumentMatch extends LuceneQueryFunction {
+ private static class DocumentMatchFilter extends WildcardFieldedFilter {
+ private final String renderedQuery;
+
+ DocumentMatchFilter(String selector) {
+ super(true, WildcardFieldedFilter.BooleanType.AND);
+ setField(DOCUMENT_FIELD);
+ setSelector(selector);
+ this.renderedQuery = DOCUMENT_NAMESPACE + selector;
+ this.query = renderedQuery;
+ }
+
+ @Override
+ public String toString() {
+ return renderedQuery;
+ }
+ }
+
+ public DocumentMatch() {
+ super(DOCUMENT_MATCH_FUNCTION, new ArrayList<>());
+ }
+
+ /**
+ * Validates that {@code #DOCUMENT_MATCH(...)} received either one argument ({@code STRING}) or two arguments ({@code VIEWNAME, STRING}).
+ *
+ * @throws IllegalArgumentException
+ * if the function has no arguments or more than two arguments
+ */
+ @Override
+ public void validate() throws IllegalArgumentException {
+ if (this.parameterList == null || this.parameterList.isEmpty() || this.parameterList.size() > 2) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name));
+ throw new IllegalArgumentException(qe);
+ }
+ }
+
+ /**
+ * Initializes the Lucene-layer fielded-filter representation used during parsing.
+ *
+ * The synthetic {@code document:match(...)} selector created here is a parser-level representation only; actual evaluation is deferred until the translated
+ * JEXL query runs against candidate documents.
+ *
+ * @param parameterList
+ * parsed function arguments
+ * @param depth
+ * function-node depth in the Lucene parse tree
+ * @param parent
+ * parent query node
+ * @throws IllegalArgumentException
+ * if initialization fails
+ */
+ @Override
+ public void initialize(java.util.List parameterList, int depth, org.apache.lucene.queryparser.flexible.core.nodes.QueryNode parent)
+ throws IllegalArgumentException {
+ super.initialize(parameterList, depth, parent);
+ this.fieldedFilter = new DocumentMatchFilter(buildSelector());
+ }
+
+ /**
+ * Builds the parser-layer selector text {@code match(...)} from the raw Lucene arguments.
+ *
+ * @return selector text used by the synthetic fielded filter
+ */
+ private String buildSelector() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(MATCH_FUNCTION + "(");
+ for (int i = 0; i < parameterList.size(); i++) {
+ if (i > 0) {
+ sb.append(", ");
+ }
+ sb.append(parameterList.get(i));
+ }
+ sb.append(")");
+ return sb.toString();
+ }
+
+ /**
+ * @return a fresh function instance for parser duplication
+ */
+ @Override
+ public QueryFunction duplicate() {
+ return new DocumentMatch();
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java
index 3c20abd2401..bcdd386dfa6 100644
--- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java
+++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java
@@ -119,6 +119,7 @@
import datawave.query.jexl.visitors.ConjunctionEliminationVisitor;
import datawave.query.jexl.visitors.DepthVisitor;
import datawave.query.jexl.visitors.DisjunctionEliminationVisitor;
+import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor;
import datawave.query.jexl.visitors.ExecutableDeterminationVisitor;
import datawave.query.jexl.visitors.ExecutableDeterminationVisitor.STATE;
import datawave.query.jexl.visitors.ExecutableExpansionVisitor;
@@ -726,6 +727,9 @@ private void configureIterator(ShardQueryConfiguration config, IteratorSetting c
addOption(cfg, QueryOptions.TRACK_SIZES, Boolean.toString(config.isTrackSizes()), false);
addOption(cfg, QueryOptions.ACTIVE_QUERY_LOG_NAME, config.getActiveQueryLogName(), false);
+ // Add the thresholds for document matching if required
+ configureDocumentMatchOptions(config, cfg);
+
// Set the start and end dates
configureTypeMappings(config, cfg, metadataHelper, getCompressOptionMappings(), false);
}
@@ -1184,6 +1188,9 @@ protected ASTJexlScript processTree(final ASTJexlScript originalQueryTree, Shard
expandPushdownPullup(config, metadataHelper, timers, scannerFactory);
}
+ // rewrite document:match() functions to include the documentMatchContext variable.
+ config.setQueryTree(timedRewriteDocumentMatchFunctions(timers, config));
+
return config.getQueryTree();
}
@@ -1639,6 +1646,19 @@ protected ASTJexlScript timedRewriteNullFunctions(QueryStopwatch timers, ASTJexl
return visitorManager.timedVisit(timers, "Rewrite Null Functions", () -> RewriteNullFunctionsVisitor.rewriteNullFunctions(queryTree));
}
+ protected ASTJexlScript timedRewriteDocumentMatchFunctions(QueryStopwatch timers, ShardQueryConfiguration config) throws DatawaveQueryException {
+ return visitorManager.timedVisit(timers, "Rewrite Document Match Functions", () -> {
+ ASTJexlScript queryTree = config.getQueryTree();
+ DocumentMatchFunctionVisitor.rewrite(queryTree);
+ config.setDocumentMatchContextRequired(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(queryTree));
+ if (log.isDebugEnabled()) {
+ logQuery(queryTree, "Computed that the query " + (config.isDocumentMatchContextRequired() ? "requires" : "does not require")
+ + " document-match context lookup");
+ }
+ return queryTree;
+ });
+ }
+
protected ASTJexlScript timedEnforceUniqueTermsWithinExpressions(QueryStopwatch timers, final ASTJexlScript script) throws DatawaveQueryException {
return visitorManager.timedVisit(timers, "Enforce Unique Terms within AND and OR expressions", () -> (UniqueExpressionTermsVisitor.enforce(script)));
}
@@ -2385,6 +2405,16 @@ protected void configureAdditionalOptions(ShardQueryConfiguration config, Iterat
// no-op
}
+ protected void configureDocumentMatchOptions(ShardQueryConfiguration config, IteratorSetting cfg) {
+ boolean documentMatchContextRequired = config.isDocumentMatchContextRequired();
+ addOption(cfg, QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED, Boolean.toString(documentMatchContextRequired), false);
+ if (documentMatchContextRequired) {
+ addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedSize()), false);
+ addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE, Integer.toString(config.getDocumentMatchMaxDecodedSize()), false);
+ addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedContextSize()), false);
+ }
+ }
+
protected Future loadQueryIterator(final MetadataHelper metadataHelper, final ShardQueryConfiguration config, final Boolean isFullTable,
boolean isPreload) {
diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java
index 34e61842ada..be37b72c584 100644
--- a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java
+++ b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java
@@ -21,7 +21,6 @@
* This filter only operates on event keys.
*/
public class EventDataQueryFieldFilter implements EventDataQueryFilter {
-
private Key document = null;
// the number of times next is called before issuing a seek
private int maxNextCount = -1;
diff --git a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java
index 8ed69bf246d..826b365f4e3 100644
--- a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java
+++ b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java
@@ -1,7 +1,9 @@
package datawave.query.table.parser;
import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.zip.GZIPInputStream;
@@ -16,6 +18,7 @@
import datawave.query.table.parser.EventKeyValueFactory.EventKeyValue;
public class ContentKeyValueFactory {
+ private static final int DECODE_BUFFER_SIZE = 4096;
private static final Logger log = Logger.getLogger(ContentKeyValueFactory.class);
@@ -55,7 +58,7 @@ public static ContentKeyValue parse(Key key, Value value, Authorizations auths,
public static byte[] decodeAndDecompressContent(byte[] contents) {
try {
- contents = decompress(Base64.getMimeDecoder().decode(contents));
+ contents = decodeAndDecompressContent(contents, Integer.MAX_VALUE);
} catch (IOException e) {
log.error("Error decompressing Base64 encoded GZIPInputStream", e);
} catch (Exception e) {
@@ -69,6 +72,22 @@ public static byte[] decodeAndDecompressContent(byte[] contents) {
return contents;
}
+ public static String decodeAndDecompressContentAsString(byte[] contents, int maxDecodedSize) throws IOException {
+ return new String(decodeAndDecompressContent(contents, maxDecodedSize), StandardCharsets.UTF_8);
+ }
+
+ public static byte[] decodeAndDecompressContent(byte[] contents, int maxDecodedSize) throws IOException {
+ byte[] decoded = Base64.getMimeDecoder().decode(contents);
+ try {
+ return decompress(decoded, maxDecodedSize);
+ } catch (IOException e) {
+ if (decoded.length > maxDecodedSize) {
+ throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecodedSize + " bytes", e);
+ }
+ return decoded;
+ }
+ }
+
private static boolean isCompressed(byte[] compressed) {
return (compressed[0] == (byte) (GZIPInputStream.GZIP_MAGIC)) && (compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8));
}
@@ -83,6 +102,26 @@ private static byte[] decompress(byte[] compressed) throws IOException {
return decompressed;
}
+ private static byte[] decompress(byte[] compressed, int maxDecompressedSize) throws IOException {
+ if (!isCompressed(compressed)) {
+ return compressed;
+ }
+
+ try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressed)); ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+ byte[] buffer = new byte[DECODE_BUFFER_SIZE];
+ int read;
+ int totalRead = 0;
+ while ((read = gzip.read(buffer)) >= 0) {
+ totalRead += read;
+ if (totalRead > maxDecompressedSize) {
+ throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecompressedSize + " bytes");
+ }
+ baos.write(buffer, 0, read);
+ }
+ return baos.toByteArray();
+ }
+ }
+
public static class ContentKeyValue extends EventKeyValue {
protected String viewName = null;
diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java
index 2a060393b8f..c2ef06fad4c 100644
--- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java
+++ b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java
@@ -3447,6 +3447,30 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) {
getConfig().setTfAggregationThresholdMs(tfAggregationThresholdMs);
}
+ public int getDocumentMatchMaxEncodedSize() {
+ return getConfig().getDocumentMatchMaxEncodedSize();
+ }
+
+ public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) {
+ getConfig().setDocumentMatchMaxEncodedSize(documentMatchMaxEncodedSize);
+ }
+
+ public int getDocumentMatchMaxDecodedSize() {
+ return getConfig().getDocumentMatchMaxDecodedSize();
+ }
+
+ public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) {
+ getConfig().setDocumentMatchMaxDecodedSize(documentMatchMaxDecodedSize);
+ }
+
+ public int getDocumentMatchMaxEncodedContextSize() {
+ return getConfig().getDocumentMatchMaxEncodedContextSize();
+ }
+
+ public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) {
+ getConfig().setDocumentMatchMaxEncodedContextSize(documentMatchMaxEncodedContextSize);
+ }
+
public boolean getPruneQueryOptions() {
return getConfig().getPruneQueryOptions();
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java
index a6ecc78aebf..645d5ad60f6 100644
--- a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java
+++ b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java
@@ -27,6 +27,8 @@
import datawave.query.attributes.AttributeFactory;
import datawave.query.attributes.Document;
+import datawave.query.function.DocumentMatchConfig;
+import datawave.query.function.DocumentMatchContextFunction;
import datawave.query.function.Equality;
import datawave.query.function.RangeProvider;
import datawave.query.function.TLDEquality;
@@ -266,6 +268,13 @@ protected Function,Tuple3>>
return TFFactory.getFunction(tfConfig);
}
+ @Override
+ protected Function>,Tuple3>> buildDocumentMatchFunction(
+ DocumentMatchConfig documentMatchConfig) {
+ documentMatchConfig.setTld(true);
+ return new DocumentMatchContextFunction(documentMatchConfig);
+ }
+
/**
* Get a {@link TLDRangeProvider}
*
diff --git a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java
new file mode 100644
index 00000000000..e6c32f1c26a
--- /dev/null
+++ b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java
@@ -0,0 +1,374 @@
+package datawave.query;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.net.URL;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.accumulo.core.client.AccumuloClient;
+import org.apache.accumulo.core.client.security.tokens.PasswordToken;
+import org.apache.accumulo.core.security.Authorizations;
+import org.apache.accumulo.core.security.ColumnVisibility;
+import org.apache.accumulo.minicluster.MiniAccumuloCluster;
+import org.apache.accumulo.minicluster.MiniAccumuloConfig;
+import org.apache.log4j.Logger;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.io.TempDir;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.annotation.ComponentScan;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit.jupiter.SpringExtension;
+
+import com.google.common.base.Preconditions;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+
+import datawave.ingest.data.TypeRegistry;
+import datawave.query.attributes.Attribute;
+import datawave.query.attributes.Attributes;
+import datawave.query.attributes.Content;
+import datawave.query.attributes.Document;
+import datawave.query.function.DocumentMatchContext;
+import datawave.query.function.DocumentMatchResults;
+import datawave.query.iterator.ivarator.IvaratorCacheDirConfig;
+import datawave.query.jexl.functions.DocumentFunctions;
+import datawave.query.tables.ShardQueryLogic;
+import datawave.query.util.AbstractQueryTest;
+import datawave.query.util.WiseGuysIngest;
+
+/**
+ * MiniAccumulo-backed integration tests for {@code document:match(...)}.
+ *
+ * These tests exercise the full query path, including query parsing, planner wiring, shard-table document materialization, evaluation-phase document matching,
+ * and publication of the {@code DOCUMENT_MATCHES} attribute on returned documents.
+ */
+@ExtendWith(SpringExtension.class)
+@ComponentScan(basePackages = "datawave.query")
+// @formatter:off
+@ContextConfiguration(locations = {
+ "classpath:datawave/query/QueryLogicFactory.xml",
+ "classpath:beanRefContext.xml",
+ "classpath:MarkingFunctionsContext.xml",
+ "classpath:MetadataHelperContext.xml",
+ "classpath:CacheContext.xml"})
+// @formatter:on
+public class DocumentMatchQueryTest extends AbstractQueryTest {
+
+ private static final Logger log = Logger.getLogger(DocumentMatchQueryTest.class);
+ private static final Authorizations auths = new Authorizations("ALL");
+ private static final String PASSWORD = "password";
+
+ @TempDir
+ public static Path folder;
+
+ protected static MiniAccumuloCluster mac;
+ protected static AccumuloClient client;
+
+ @Autowired
+ @Qualifier("EventQuery")
+ protected ShardQueryLogic logic;
+
+ private final Map>>> expectedDocumentMatches = new HashMap<>();
+ private final Map> expectedDocumentMatchVisibilities = new HashMap<>();
+ private Boolean expectedDocumentMatchContextRequired;
+
+ @Override
+ public ShardQueryLogic getLogic() {
+ return logic;
+ }
+
+ @Override
+ public Authorizations getAuths() {
+ return auths;
+ }
+
+ @BeforeAll
+ public static void beforeAll() throws Exception {
+ System.setProperty("type.metadata.dir", folder.toFile().getAbsolutePath());
+
+ MiniAccumuloConfig cfg = new MiniAccumuloConfig(folder.toFile(), PASSWORD);
+ cfg.setNumTservers(1);
+ mac = new MiniAccumuloCluster(cfg);
+ mac.start();
+
+ client = mac.createAccumuloClient("root", new PasswordToken(PASSWORD));
+ client.securityOperations().changeUserAuthorizations("root", auths);
+ new QueryTestTableHelper(client, log);
+ WiseGuysIngest.writeItAll(client, WiseGuysIngest.WhatKindaRange.DOCUMENT);
+ }
+
+ @BeforeEach
+ public void beforeEach() {
+ TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
+ setClientForTest(client);
+
+ URL hadoopConfig = this.getClass().getResource("/testhadoop.config");
+ Preconditions.checkNotNull(hadoopConfig);
+ logic.setHdfsSiteConfigURLs(hadoopConfig.toExternalForm());
+ logic.setIvaratorCacheDirConfigs(Collections.singletonList(new IvaratorCacheDirConfig(folder.toUri().toString())));
+ logic.setMaxFieldIndexRangeSplit(1);
+ logic.setCollapseUids(false);
+ logic.setFullTableScanEnabled(false);
+ logic.setDocumentMatchMaxDecodedSize(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE);
+
+ givenParameter(QueryParameters.HIT_LIST, "true");
+ logic.setHitList(true);
+ givenDate("20091231", "20150101");
+ }
+
+ @AfterEach
+ public void afterEach() {
+ super.afterEach();
+ expectedDocumentMatches.clear();
+ expectedDocumentMatchVisibilities.clear();
+ expectedDocumentMatchContextRequired = null;
+ }
+
+ @AfterAll
+ public static void afterAll() throws Exception {
+ if (mac != null) {
+ mac.stop();
+ }
+ TypeRegistry.reset();
+ }
+
+ @Override
+ protected void extraConfigurations() {
+ // no-op
+ }
+
+ /**
+ * Verifies that returned documents expose the expected {@code DOCUMENT_MATCHES} payload when the current test configured one.
+ */
+ @Override
+ protected void extraAssertions() {
+ if (expectedDocumentMatchContextRequired != null) {
+ if (expectedDocumentMatchContextRequired) {
+ assertTrue(logic.getConfig().isDocumentMatchContextRequired(), "planned query did not require document-match context lookup");
+ } else {
+ assertFalse(logic.getConfig().isDocumentMatchContextRequired(), "planned query unexpectedly required document-match context lookup");
+ }
+ }
+
+ for (Document result : results) {
+ Attribute> uuid = result.get("UUID");
+ assertNotNull(uuid, "result did not contain UUID");
+
+ String uuidValue = getUUID(uuid);
+ Map>> expected = expectedDocumentMatches.get(uuidValue);
+ if (expected != null) {
+ Attribute> matches = result.get(DocumentFunctions.DOCUMENT_MATCHES);
+ assertNotNull(matches, "result did not contain DOCUMENT_MATCHES");
+ assertEquals(expected, getDocumentMatchesByView(matches));
+ }
+
+ Map expectedVisibilities = expectedDocumentMatchVisibilities.get(uuidValue);
+ if (expectedVisibilities != null) {
+ Attribute> matches = result.get(DocumentFunctions.DOCUMENT_MATCHES);
+ assertNotNull(matches, "result did not contain DOCUMENT_MATCHES");
+ assertEquals(expectedVisibilities, getDocumentMatchVisibilities(matches));
+ }
+ }
+ }
+
+ /**
+ * Verifies that JEXL {@code document:match(STRING)} evaluates across all views and returns the expected offsets.
+ */
+ @Test
+ public void testDocumentMatchJexlAllViews() throws Exception {
+ givenQuery("UUID == 'CAPONE' && document:match('can')");
+ expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(1);
+ expectUUIDs(java.util.Set.of("CAPONE"));
+ expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27))));
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies that JEXL {@code document:match(VIEWNAME, STRING)} restricts evaluation to the named view.
+ */
+ @Test
+ public void testDocumentMatchJexlSpecificView() throws Exception {
+ givenQuery("UUID == 'CAPONE' && document:match('CONTENT2', 'lawyer')");
+ expectPlan("UUID == 'capone' && document:match('CONTENT2', documentMatchContext, 'lawyer')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(1);
+ expectUUIDs(java.util.Set.of("CAPONE"));
+ expectedDocumentMatches.put("CAPONE", Map.of("CONTENT2", Map.of("lawyer", List.of(2))));
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies that multiple JEXL {@code document:match(...)} calls contribute one {@code DOCUMENT_MATCHES} value per matched {@code d}-column.
+ */
+ @Test
+ public void testDocumentMatchJexlAddsPerEntryMatchesAcrossCalls() throws Exception {
+ givenQuery("UUID == 'CAPONE' && document:match('CONTENT', 'can') && document:match('CONTENT2', 'lawyer')");
+ expectPlan("UUID == 'capone' && document:match('CONTENT', documentMatchContext, 'can') && document:match('CONTENT2', documentMatchContext, 'lawyer')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(1);
+ expectUUIDs(java.util.Set.of("CAPONE"));
+ expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("lawyer", List.of(2))));
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies that end-to-end {@code DOCUMENT_MATCHES} values preserve the visibilities carried by their source {@code d}-column entries.
+ */
+ @Test
+ public void testDocumentMatchJexlPreservesPerEntryVisibilities() throws Exception {
+ givenQuery("UUID == 'CAPONE' && document:match('can')");
+ expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(1);
+ expectUUIDs(java.util.Set.of("CAPONE"));
+ expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27))));
+ expectedDocumentMatchVisibilities.put("CAPONE", Map.of("CONTENT", new ColumnVisibility("ALL"), "CONTENT2", new ColumnVisibility("ALL")));
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies that a wildcard view match combined with a second targeted call accumulates per-entry matches without cross-entry merging.
+ */
+ @Test
+ public void testDocumentMatchJexlWildcardThenSpecificViewAccumulatesPerEntry() throws Exception {
+ givenQuery("UUID == 'CAPONE' && document:match('CONTENT*', 'can') && document:match('CONTENT2', 'lawyer')");
+ expectPlan("UUID == 'capone' && document:match('CONTENT*', documentMatchContext, 'can') && document:match('CONTENT2', documentMatchContext, 'lawyer')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(1);
+ expectUUIDs(java.util.Set.of("CAPONE"));
+ expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27), "lawyer", List.of(2))));
+ expectedDocumentMatchVisibilities.put("CAPONE", Map.of("CONTENT", new ColumnVisibility("ALL"), "CONTENT2", new ColumnVisibility("ALL")));
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies Lucene {@code #DOCUMENT_MATCH(...)} translation and wildcard view-prefix behavior in the full query path.
+ */
+ @Test
+ public void testDocumentMatchLuceneWildcardView() throws Exception {
+ givenParameter(QueryParameters.QUERY_SYNTAX, "LUCENE");
+ givenQuery("UUID:CAPONE AND #DOCUMENT_MATCH(CONTENT*,can)");
+ expectPlan("UUID == 'capone' && document:match('CONTENT*', documentMatchContext, 'can')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(1);
+ expectUUIDs(java.util.Set.of("CAPONE"));
+ expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27))));
+ planAndExecuteQuery();
+ }
+
+ private Map>> getDocumentMatchesByView(Attribute> attribute) {
+ Map>> values = new HashMap<>();
+ if (attribute instanceof Attributes) {
+ for (Attribute extends Comparable>> child : ((Attributes) attribute).getAttributes()) {
+ addDocumentMatch(values, ((Content) child).getContent());
+ }
+ } else {
+ addDocumentMatch(values, ((Content) attribute).getContent());
+ }
+ return values;
+ }
+
+ private Map getDocumentMatchVisibilities(Attribute> attribute) {
+ Map visibilities = new HashMap<>();
+ if (attribute instanceof Attributes) {
+ for (Attribute extends Comparable>> child : ((Attributes) attribute).getAttributes()) {
+ Content content = (Content) child;
+ visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility());
+ }
+ } else {
+ Content content = (Content) attribute;
+ visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility());
+ }
+ return visibilities;
+ }
+
+ private void addDocumentMatch(Map>> values, String json) {
+ JsonObject payload = JsonParser.parseString(json).getAsJsonObject();
+ String view = payload.get(DocumentMatchResults.VIEW_FIELD).getAsString();
+ JsonObject matches = payload.getAsJsonObject(DocumentMatchResults.MATCHES_FIELD);
+ Map> offsetsBySearch = new HashMap<>();
+ for (Map.Entry matchEntry : matches.entrySet()) {
+ List offsets = new java.util.ArrayList<>();
+ for (JsonElement offset : matchEntry.getValue().getAsJsonArray()) {
+ offsets.add(offset.getAsInt());
+ }
+ offsetsBySearch.put(matchEntry.getKey(), offsets);
+ }
+ values.put(view, offsetsBySearch);
+ }
+
+ private String getDocumentMatchView(String json) {
+ return JsonParser.parseString(json).getAsJsonObject().get(DocumentMatchResults.VIEW_FIELD).getAsString();
+ }
+
+ /**
+ * Verifies that a non-matching document-match term filters the document out of the result set.
+ */
+ @Test
+ public void testDocumentMatchNoMatchFiltersDocument() throws Exception {
+ givenQuery("UUID == 'CAPONE' && document:match('missing')");
+ expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'missing')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(0);
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies that document-match is case-sensitive in the full query path.
+ */
+ @Test
+ public void testDocumentMatchIsCaseSensitive() throws Exception {
+ givenQuery("UUID == 'CAPONE' && document:match('Can')");
+ expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'Can')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(0);
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies that decoded payloads larger than the configured limit are skipped as non-matching during end-to-end query execution.
+ */
+ @Test
+ public void testDocumentMatchOversizedDecodedPayloadIsSkipped() throws Exception {
+ logic.setDocumentMatchMaxDecodedSize(8);
+ givenQuery("UUID == 'CAPONE' && document:match('can')");
+ expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')");
+ expectedDocumentMatchContextRequired = true;
+ expectResultCount(0);
+ planAndExecuteQuery();
+ }
+
+ /**
+ * Verifies that queries without {@code document:match(...)} do not request document-match context lookup in the integration harness.
+ */
+ @Test
+ public void testQueryWithoutDocumentMatchDoesNotRequireContext() throws Exception {
+ givenQuery("UUID == 'CAPONE'");
+ expectPlan("UUID == 'capone'");
+ expectedDocumentMatchContextRequired = false;
+ expectResultCount(1);
+ expectUUIDs(java.util.Set.of("CAPONE"));
+ planAndExecuteQuery();
+ assertEquals(1, results.size());
+ Document result = results.iterator().next();
+ assertNull(result.get(DocumentFunctions.DOCUMENT_MATCHES), "query without document:match unexpectedly emitted DOCUMENT_MATCHES");
+ }
+}
diff --git a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java
index 1499ce689c1..51028b27764 100644
--- a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java
@@ -46,6 +46,7 @@
import datawave.query.attributes.UniqueFields;
import datawave.query.common.grouping.GroupFields;
import datawave.query.config.annotation.AllHitsQueryConfig;
+import datawave.query.function.DocumentMatchContext;
import datawave.query.iterator.ivarator.IvaratorCacheDirConfig;
import datawave.query.iterator.logic.ContentSummaryIterator;
import datawave.query.iterator.logic.TermFrequencyExcerptIterator;
@@ -665,6 +666,14 @@ public void setUp() throws Exception {
updatedValues.put("allHitsQueryConfig", new AllHitsQueryConfig());
defaultValues.put("originalJexlQuery", null);
updatedValues.put("originalJexlQuery", "FIELD == 'VALUE'");
+ defaultValues.put("documentMatchMaxEncodedSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE);
+ updatedValues.put("documentMatchMaxEncodedSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE + 1);
+ defaultValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE);
+ updatedValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE + 1);
+ defaultValues.put("documentMatchMaxEncodedContextSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE);
+ updatedValues.put("documentMatchMaxEncodedContextSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE + 1);
+ defaultValues.put("documentMatchContextRequired", false);
+ updatedValues.put("documentMatchContextRequired", true);
}
private Query createQuery(String query) {
diff --git a/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java
new file mode 100644
index 00000000000..1d98c6bbdcb
--- /dev/null
+++ b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java
@@ -0,0 +1,192 @@
+package datawave.query.function;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotSame;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.IteratorEnvironment;
+import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.junit.jupiter.api.Test;
+
+import com.google.common.collect.Lists;
+
+import datawave.query.attributes.Document;
+import datawave.query.attributes.DocumentKey;
+import datawave.query.jexl.functions.DocumentFunctions;
+import datawave.query.util.Tuple3;
+import datawave.query.util.Tuples;
+
+/**
+ * Focused tests for {@link DocumentMatchContextFunction}.
+ */
+public class DocumentMatchContextFunctionTest {
+
+ /**
+ * Verifies that only matching {@code d}-column entries for the current document key are added to the evaluation side-channel.
+ */
+ @Test
+ public void testCollectsOnlyCurrentDocumentColumns() {
+ List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes())),
+ Map.entry(new Key("20240101_0", "d", "datatype\0uid\0META"), new Value("two".getBytes())),
+ Map.entry(new Key("20240101_0", "d", "datatype\0other\0BODY"), new Value("skip".getBytes())),
+ Map.entry(new Key("20240101_0", "tf", "datatype\0uid\0BODY"), new Value("skip".getBytes())));
+
+ DocumentMatchConfig config = new DocumentMatchConfig();
+ config.setSource(new ListBackedIterator(entries));
+ config.setLimits(new DocumentMatchContext.Limits(1234, 5678, 9012));
+ DocumentMatchContextFunction function = new DocumentMatchContextFunction(config);
+
+ Tuple3> result = function
+ .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap()));
+ DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME);
+
+ assertEquals(2, context.getDocumentEntries().size());
+ assertEquals(1234, context.getMaxEncodedValueSize());
+ assertEquals(5678, context.getMaxDecodedValueSize());
+ assertEquals(9012, context.getMaxEncodedContextSize());
+ }
+
+ /**
+ * Verifies that the function produces an empty context entry when a document has no retained {@code d}-column values.
+ */
+ @Test
+ public void testCollectsEmptyContextWhenNoDocumentColumnsExist() {
+ DocumentMatchConfig config = new DocumentMatchConfig();
+ config.setSource(new ListBackedIterator(Collections.emptyList()));
+ config.setLimits(new DocumentMatchContext.Limits(10, 20, 30));
+ DocumentMatchContextFunction function = new DocumentMatchContextFunction(config);
+
+ Tuple3> result = function
+ .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap()));
+ DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME);
+
+ assertTrue(context.getDocumentEntries().isEmpty());
+ }
+
+ /**
+ * Verifies that document-match context collection honors explicit {@code DOCKEY} attributes instead of assuming that the tuple key is the only event key.
+ */
+ @Test
+ public void testCollectsColumnsForDocumentKeysFromDocument() {
+ List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes())),
+ Map.entry(new Key("20240101_0", "d", "datatype\0child\0BODY"), new Value("two".getBytes())),
+ Map.entry(new Key("20240101_0", "d", "datatype\0other\0BODY"), new Value("skip".getBytes())));
+
+ DocumentMatchConfig config = new DocumentMatchConfig();
+ config.setSource(new ListBackedIterator(entries));
+ config.setLimits(new DocumentMatchContext.Limits(10, 20, 30));
+ config.setTld(true);
+ DocumentMatchContextFunction function = new DocumentMatchContextFunction(config);
+
+ Document document = new Document();
+ document.put(Document.DOCKEY_FIELD_NAME, new DocumentKey(new Key("20240101_0", "datatype\0uid"), false));
+ document.put(Document.DOCKEY_FIELD_NAME, new DocumentKey(new Key("20240101_0", "datatype\0child"), false));
+
+ Tuple3> result = function
+ .apply(Tuples.tuple(new Key("20240101_0", "datatype\0root"), document, Collections.emptyMap()));
+ DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME);
+
+ assertEquals(2, context.getDocumentEntries().size());
+ }
+
+ /**
+ * Verifies that collection skips individually oversized payloads and stops once the retained encoded bytes would exceed the configured aggregate limit.
+ */
+ @Test
+ public void testCollectsOnlyEntriesWithinAggregateEncodedContextLimit() {
+ List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("1234".getBytes())),
+ Map.entry(new Key("20240101_0", "d", "datatype\0uid\0META"), new Value("12345".getBytes())),
+ Map.entry(new Key("20240101_0", "d", "datatype\0uid\0TAIL"), new Value("12".getBytes())));
+
+ DocumentMatchConfig config = new DocumentMatchConfig();
+ config.setSource(new ListBackedIterator(entries));
+ config.setLimits(new DocumentMatchContext.Limits(10, 20, 4));
+ DocumentMatchContextFunction function = new DocumentMatchContextFunction(config);
+
+ Tuple3> result = function
+ .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap()));
+ DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME);
+
+ assertEquals(1, context.getDocumentEntries().size());
+ assertEquals("datatype\0uid\0BODY", context.getDocumentEntries().get(0).getKey().getColumnQualifier().toString());
+ }
+
+ /**
+ * Verifies that each application creates a fresh {@link DocumentMatchContext}, so mutable match state from a prior evaluation cannot leak into a later one.
+ */
+ @Test
+ public void testApplyCreatesFreshContextEachTime() {
+ List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes())));
+
+ DocumentMatchConfig config = new DocumentMatchConfig();
+ config.setSource(new ListBackedIterator(entries));
+ config.setLimits(new DocumentMatchContext.Limits(10, 20, 30));
+ DocumentMatchContextFunction function = new DocumentMatchContextFunction(config);
+
+ Tuple3> first = function
+ .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap()));
+ DocumentMatchContext firstContext = (DocumentMatchContext) first.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME);
+ firstContext.addMatches(entries.get(0).getKey(), "one", List.of(0));
+ assertEquals(1, firstContext.getMatches().size());
+
+ Tuple3> second = function
+ .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap()));
+ DocumentMatchContext secondContext = (DocumentMatchContext) second.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME);
+
+ assertNotSame(firstContext, secondContext);
+ assertTrue(secondContext.getMatches().isEmpty());
+ assertEquals(1, secondContext.getDocumentEntries().size());
+ }
+
+ private static class ListBackedIterator implements SortedKeyValueIterator {
+ private final List> entries;
+ private int index = -1;
+
+ private ListBackedIterator(List> entries) {
+ this.entries = entries;
+ }
+
+ @Override
+ public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) {}
+
+ @Override
+ public boolean hasTop() {
+ return index >= 0 && index < entries.size();
+ }
+
+ @Override
+ public void next() {
+ index++;
+ }
+
+ @Override
+ public void seek(Range range, java.util.Collection columnFamilies, boolean inclusive) {
+ index = 0;
+ while (index < entries.size() && !range.contains(entries.get(index).getKey())) {
+ index++;
+ }
+ }
+
+ @Override
+ public Key getTopKey() {
+ return entries.get(index).getKey();
+ }
+
+ @Override
+ public Value getTopValue() {
+ return entries.get(index).getValue();
+ }
+
+ @Override
+ public SortedKeyValueIterator deepCopy(IteratorEnvironment env) {
+ return new ListBackedIterator(entries);
+ }
+ }
+}
diff --git a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java
index b4af760243a..393dd11f70b 100644
--- a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java
@@ -1,19 +1,31 @@
package datawave.query.function;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import static datawave.query.function.DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE;
+import static datawave.query.function.DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
import org.apache.accumulo.core.data.Key;
-import org.junit.Test;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.security.ColumnVisibility;
+import org.apache.commons.jexl3.parser.ASTJexlScript;
+import org.junit.jupiter.api.Test;
import com.google.common.collect.Maps;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
import datawave.ingest.protobuf.TermWeightPosition;
import datawave.query.Constants;
@@ -24,22 +36,26 @@
import datawave.query.attributes.Numeric;
import datawave.query.jexl.DatawaveJexlContext;
import datawave.query.jexl.HitListArithmetic;
+import datawave.query.jexl.JexlASTHelper;
+import datawave.query.jexl.functions.DocumentFunctions;
import datawave.query.jexl.functions.TermFrequencyList;
+import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor;
+import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
import datawave.query.postprocessing.tf.TermOffsetMap;
import datawave.query.util.Tuple3;
public class JexlEvaluationTest {
+ public static final DocumentMatchContext.Limits TEST_DOCUMENT_MATCH_LIMITS = new DocumentMatchContext.Limits(1024, DEFAULT_MAX_DECODED_SIZE,
+ DEFAULT_MAX_ENCODED_CONTEXT_SIZE);
+
@Test
public void testSimpleQuery() {
String query = "FOO == 'bar'";
Document d = new Document();
d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true));
- DatawaveJexlContext context = new DatawaveJexlContext();
- d.visit(Collections.singleton("FOO"), context);
-
- assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context);
+ assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO")));
}
@Test
@@ -49,10 +65,7 @@ public void testRegexIntersection() {
d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true));
d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid"), true));
- DatawaveJexlContext context = new DatawaveJexlContext();
- d.visit(Collections.singleton("FOO"), context);
-
- assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context);
+ assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO")));
}
@Test
@@ -61,16 +74,15 @@ public void testRegexCaseIntersection() {
d.put("FOO", new Content("Bar", new Key("shard", "datatype\0uid"), true));
d.put("FOO", new Numeric("123", new Key("shard", "datatype\0uid"), true));
- DatawaveJexlContext context = new DatawaveJexlContext();
- d.visit(Collections.singleton("FOO"), context);
+ Supplier contextSupplier = contextFactory(d, Collections.singleton("FOO"));
// match the original value
String query = "FOO == 'bar' && FOO =~ '12.*'";
- assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context);
+ assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextSupplier);
// match the normalized value
query = "FOO == 'bar' && FOO =~ '\\+cE1\\.2.*'";
- assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context);
+ assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextSupplier);
}
@Test
@@ -81,10 +93,7 @@ public void testRegexUnion() {
d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true));
d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid"), true));
- DatawaveJexlContext context = new DatawaveJexlContext();
- d.visit(Collections.singleton("FOO"), context);
-
- assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context);
+ assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO")));
}
@Test
@@ -96,10 +105,7 @@ public void testHitTermSource() {
d.put("FOO", hitTermSource);
d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid2"), true));
- DatawaveJexlContext context = new DatawaveJexlContext();
- d.visit(Collections.singleton("FOO"), context);
-
- assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context);
+ assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO")));
Attributes hitTerm = (Attributes) d.getDictionary().get("HIT_TERM");
assertEquals(1, hitTerm.getAttributes().size());
@@ -151,10 +157,7 @@ public void testSomeFilterFunctions() {
// Assume fields are {ANCHOR, FOO, FOO2} and a constant doc key
private void evaluate(String query, Document d) {
- DatawaveJexlContext context = new DatawaveJexlContext();
- d.visit(Arrays.asList("ANCHOR", "FOO", "FOO2", "FOO3"), context);
-
- assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context);
+ assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Arrays.asList("ANCHOR", "FOO", "FOO2", "FOO3")));
}
@Test
@@ -166,9 +169,6 @@ public void testContentPhraseFunction() {
map.put("red", buildTfList("TOKFIELD", 2));
map.put("dog", buildTfList("TOKFIELD", 3));
- DatawaveJexlContext context = new DatawaveJexlContext();
- context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, new TermOffsetMap(map));
-
Key docKey = new Key("shard", "datatype\0uid");
Document d = new Document();
@@ -176,9 +176,8 @@ public void testContentPhraseFunction() {
d.put("TOKFIELD", new Content("big", docKey, true));
d.put("TOKFIELD", new Content("red", docKey, true));
d.put("TOKFIELD", new Content("dog", docKey, true));
- d.visit(Arrays.asList("FOO", "TOKFIELD"), context);
-
- assertEvaluation(query, docKey, d, context);
+ assertEvaluation(query, docKey, d, contextFactory(d, Arrays.asList("FOO", "TOKFIELD"),
+ ctx -> ctx.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, new TermOffsetMap(map))));
// assert that "big red dog" came back in the hit terms
boolean foundPhrase = false;
@@ -192,6 +191,128 @@ public void testContentPhraseFunction() {
assertTrue(foundPhrase);
}
+ @Test
+ public void testDocumentMatchAddsDocumentAttribute() {
+ String query = "FOO == 'bar' && document:match('car')";
+ Key docKey = new Key("shard", "datatype\0uid");
+ Document d = new Document();
+ d.put("FOO", new Content("bar", docKey, true));
+
+ final List> entries = List
+ .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car"))));
+ assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx
+ .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS))));
+ assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5))), getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES)));
+ assertEquals(new ColumnVisibility("A"), d.get(DocumentFunctions.DOCUMENT_MATCHES).getColumnVisibility());
+ }
+
+ @Test
+ public void testDocumentMatchAddsPerEntryDocumentAttributesAcrossCalls() {
+ String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('CONTENT2', 'lawyer')";
+ Key docKey = new Key("shard", "datatype\0uid");
+ Document d = new Document();
+ d.put("FOO", new Content("bar", docKey, true));
+
+ final List> entries = List.of(
+ Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car"))),
+ Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "A"), new Value(buildEncodedValue("lawyer car"))));
+ assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx
+ .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS))));
+ assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5)), "CONTENT2", Map.of("lawyer", List.of(0))),
+ getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES)));
+ }
+
+ @Test
+ public void testDocumentMatchAccumulatesCallsWithinSameEntry() {
+ String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('BODY', 'lawyer')";
+ Key docKey = new Key("shard", "datatype\0uid");
+ Document d = new Document();
+ d.put("FOO", new Content("bar", docKey, true));
+
+ final List> entries = List
+ .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car lawyer"))));
+ assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx
+ .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS))));
+ assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5), "lawyer", List.of(9))), getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES)));
+ }
+
+ @Test
+ public void testDocumentMatchPreservesPerEntryVisibilities() {
+ String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('CONTENT2', 'lawyer')";
+ Key docKey = new Key("shard", "datatype\0uid");
+ Document d = new Document();
+ d.put("FOO", new Content("bar", docKey, true));
+
+ final List> entries = List.of(
+ Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car"))),
+ Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "B"), new Value(buildEncodedValue("lawyer car"))));
+ assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx
+ .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS))));
+
+ assertEquals(Map.of("BODY", new ColumnVisibility("A"), "CONTENT2", new ColumnVisibility("B")),
+ getDocumentMatchVisibilitiesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES)));
+ }
+
+ private Map>> getDocumentMatchesByView(Attribute> attribute) {
+ Map>> values = new HashMap<>();
+ if (attribute instanceof Attributes) {
+ for (Attribute extends Comparable>> child : ((Attributes) attribute).getAttributes()) {
+ addDocumentMatch(values, ((Content) child).getContent());
+ }
+ } else {
+ addDocumentMatch(values, ((Content) attribute).getContent());
+ }
+ return values;
+ }
+
+ private Map getDocumentMatchVisibilitiesByView(Attribute> attribute) {
+ Map visibilities = new HashMap<>();
+ if (attribute instanceof Attributes) {
+ for (Attribute extends Comparable>> child : ((Attributes) attribute).getAttributes()) {
+ Content content = assertInstanceOf(Content.class, child);
+ visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility());
+ }
+ } else {
+ Content content = assertInstanceOf(Content.class, attribute);
+ visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility());
+ }
+ return visibilities;
+ }
+
+ private void addDocumentMatch(Map>> values, String json) {
+ JsonObject payload = JsonParser.parseString(json).getAsJsonObject();
+ String view = payload.get(DocumentMatchResults.VIEW_FIELD).getAsString();
+ JsonObject matches = payload.getAsJsonObject(DocumentMatchResults.MATCHES_FIELD);
+ Map> offsetsBySearch = new HashMap<>();
+ for (Map.Entry matchEntry : matches.entrySet()) {
+ List offsets = new ArrayList<>();
+ for (JsonElement offset : matchEntry.getValue().getAsJsonArray()) {
+ offsets.add(offset.getAsInt());
+ }
+ offsetsBySearch.put(matchEntry.getKey(), offsets);
+ }
+ values.put(view, offsetsBySearch);
+ }
+
+ private String getDocumentMatchView(String json) {
+ return JsonParser.parseString(json).getAsJsonObject().get(DocumentMatchResults.VIEW_FIELD).getAsString();
+ }
+
+ private byte[] buildEncodedValue(String content) {
+ try {
+ java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream();
+ java.io.OutputStream b64s = java.util.Base64.getEncoder().wrap(bos);
+ java.util.zip.GZIPOutputStream gzip = new java.util.zip.GZIPOutputStream(b64s);
+ gzip.write(content.getBytes());
+ gzip.close();
+ b64s.close();
+ bos.close();
+ return bos.toByteArray();
+ } catch (java.io.IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
@Test
public void testCompareFunction() {
// eq op
@@ -286,27 +407,48 @@ private void testCompare(String query, boolean expected) {
d.put("FIELD_C", new Content("zebra", docKey, true));
d.put("FIELD_C", new Content("zephyr", docKey, true));
- // populate context from doc
- DatawaveJexlContext context = new DatawaveJexlContext();
- d.visit(Arrays.asList("FOO", "FIELD_A", "FIELD_B", "FIELD_C"), context);
-
- assertEvaluation(query, docKey, d, context, expected);
+ assertEvaluation(query, docKey, d, contextFactory(d, Arrays.asList("FOO", "FIELD_A", "FIELD_B", "FIELD_C")), expected);
}
- private void assertEvaluation(String query, Key key, Document d, DatawaveJexlContext context) {
- assertEvaluation(query, key, d, context, true);
+ private void assertEvaluation(String query, Key key, Document d, Supplier contextSupplier) {
+ assertEvaluation(query, key, d, contextSupplier, true);
}
- private void assertEvaluation(String query, Key key, Document d, DatawaveJexlContext context, boolean expected) {
- JexlEvaluation evaluation = new JexlEvaluation(query);
- boolean result = evaluation.apply(new Tuple3<>(key, d, context));
+ private void assertEvaluation(String query, Key key, Document d, Supplier contextSupplier, boolean expected) {
+ JexlEvaluation evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query));
+ boolean result = evaluation.apply(new Tuple3<>(key, d, contextSupplier.get()));
assertEquals(expected, result);
- evaluation = new JexlEvaluation(query, new HitListArithmetic());
- result = evaluation.apply(new Tuple3<>(key, d, context));
+ evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query), new HitListArithmetic());
+ result = evaluation.apply(new Tuple3<>(key, d, contextSupplier.get()));
assertEquals(expected, result);
}
+ private Supplier contextFactory(Document document, Collection fields) {
+ return contextFactory(document, fields, context -> {});
+ }
+
+ private Supplier contextFactory(Document document, Collection fields, Consumer customizer) {
+ return () -> {
+ DatawaveJexlContext context = new DatawaveJexlContext();
+ document.visit(fields, context);
+ customizer.accept(context);
+ return context;
+ };
+ }
+
+ private String rewriteDocumentMatchFunctions(String query) {
+ try {
+ ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query);
+ if (!DocumentMatchFunctionVisitor.rewrite(script)) {
+ return query;
+ }
+ return JexlStringBuildingVisitor.buildQueryWithoutParse(script);
+ } catch (org.apache.commons.jexl3.parser.ParseException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
private TermFrequencyList buildTfList(String field, int... offsets) {
TermFrequencyList.Zone zone = buildZone(field);
List position = buildTermWeightPositions(offsets);
diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java
new file mode 100644
index 00000000000..6f8b9150bb3
--- /dev/null
+++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java
@@ -0,0 +1,319 @@
+package datawave.query.jexl.functions;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStream;
+import java.util.AbstractMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Value;
+import org.junit.jupiter.api.Test;
+
+import datawave.query.function.DocumentMatchContext;
+import datawave.query.function.DocumentMatchResults;
+
+/**
+ * Unit tests for {@link DocumentFunctions} covering view selection, matching semantics, payload limits, and per-{@code d}-column result accumulation.
+ */
+public class DocumentFunctionsTest {
+ /**
+ * Verifies that {@code document:match(STRING)} searches all available views and returns the matched search string when any view matches.
+ */
+ @Test
+ public void testMatchAcrossAllViews() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0META", "carpet")),
+ new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE,
+ DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ String result = DocumentFunctions.match(context, "car");
+
+ assertEquals("car", result);
+ assertEquals(2, context.getMatches().size());
+ }
+
+ /**
+ * Verifies that a trailing {@code *} in the requested view name performs prefix matching across views.
+ */
+ @Test
+ public void testWildcardViewMatch() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(
+ List.of(entry("test\0uid\0BODY", "car"), entry("test\0uid\0BODY_TEXT", "car car"), entry("test\0uid\0META", "car")),
+ new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE,
+ DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ String result = DocumentFunctions.match("BODY*", context, "car");
+
+ assertEquals("car", result);
+ }
+
+ /**
+ * Verifies that overlapping substring matches are reported with all starting offsets.
+ */
+ @Test
+ public void testOverlappingMatches() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "banana")), new DocumentMatchContext.Limits(1024,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ String result = DocumentFunctions.match("BODY", context, "ana");
+
+ assertEquals("ana", result);
+ }
+
+ /**
+ * Verifies that matching is case-sensitive.
+ */
+ @Test
+ public void testCaseSensitiveMatch() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertTrue(DocumentFunctions.match(context, "Car").isEmpty());
+ }
+
+ /**
+ * Verifies that a null context is treated as a non-match.
+ */
+ @Test
+ public void testNullContextIsNonMatch() {
+ assertTrue(DocumentFunctions.match(null, "car").isEmpty());
+ }
+
+ /**
+ * Verifies that a null search term is treated as a non-match.
+ */
+ @Test
+ public void testNullSearchIsNonMatch() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertTrue(DocumentFunctions.match(context, null).isEmpty());
+ assertTrue(DocumentFunctions.match("BODY", context, null).isEmpty());
+ }
+
+ /**
+ * Verifies that an empty search term is treated as a non-match.
+ */
+ @Test
+ public void testEmptySearchIsNonMatch() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertTrue(DocumentFunctions.match(context, "").isEmpty());
+ assertTrue(context.getMatches().isEmpty());
+ }
+
+ /**
+ * Verifies that encoded payloads larger than the configured limit are skipped as non-matching.
+ */
+ @Test
+ public void testOversizedPayloadIsNonMatch() throws Exception {
+ Map.Entry entry = entry("test\0uid\0BODY", "scar car");
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry), new DocumentMatchContext.Limits(entry.getValue().get().length - 1,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertTrue(DocumentFunctions.match(context, "car").isEmpty());
+ }
+
+ /**
+ * Verifies that decoded payloads larger than the configured limit are skipped as non-matching.
+ */
+ @Test
+ public void testOversizedDecodedPayloadIsNonMatch() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")),
+ new DocumentMatchContext.Limits(1024, 3, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertTrue(DocumentFunctions.match(context, "car").isEmpty());
+ }
+
+ /**
+ * Verifies that an empty {@code d}-entry set yields no match.
+ */
+ @Test
+ public void testNoDocumentEntriesIsNonMatch() {
+ assertTrue(DocumentFunctions.match(new DocumentMatchContext(List.of(), new DocumentMatchContext.Limits(1024,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)), "car").isEmpty());
+ }
+
+ /**
+ * Verifies that undecodable payloads are treated as non-matching rather than failing evaluation.
+ */
+ @Test
+ public void testDecodeFailureIsNonMatch() {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(Map.entry(new Key("row", "d", "test\0uid\0BODY"), new Value("not-base64".getBytes()))),
+ new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE,
+ DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertTrue(DocumentFunctions.match(context, "car").isEmpty());
+ }
+
+ /**
+ * Verifies that MIME-style base64 payloads with trailing CRLF line breaks still decode and match correctly.
+ */
+ @Test
+ public void testMatchWithBase64LineBreaks() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entryWithEncodedSuffix("test\0uid\0BODY", "/* Origins */ Fix.", "\r\n")),
+ new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE,
+ DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ String result = DocumentFunctions.match("BODY", context, "Origins");
+
+ assertEquals("Origins", result);
+ }
+
+ /**
+ * Verifies that payloads stored as plain base64-encoded UTF-8 text still decode and match when gzip expansion is not possible.
+ */
+ @Test
+ public void testMatchWithBase64OnlyPayload() {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(base64OnlyEntry("test\0uid\0BODY", "/* Origins */ Fix.")),
+ new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE,
+ DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ String result = DocumentFunctions.match("BODY", context, "Origins");
+
+ assertEquals("Origins", result);
+ }
+
+ /**
+ * Verifies that multiple {@code document:match(...)} calls accumulate results on a per-{@code d}-column basis for document output.
+ */
+ @Test
+ public void testMatchAccumulatesPerEntryResultsAcrossCalls() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")),
+ new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE,
+ DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertEquals("car", DocumentFunctions.match("BODY", context, "car"));
+ assertEquals("lawyer", DocumentFunctions.match("CONTENT2", context, "lawyer"));
+ assertEquals(2, context.getMatches().size());
+ assertEquals(List.of("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[0]}}"),
+ context.getMatches().stream().map(DocumentMatchResults::toJson).sorted().collect(Collectors.toList()));
+ }
+
+ /**
+ * Verifies that repeated {@code document:match(...)} calls against the same {@code d}-column accumulate beneath that single entry payload.
+ */
+ @Test
+ public void testMatchAccumulatesSameEntryAcrossCalls() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car lawyer")), new DocumentMatchContext.Limits(1024,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertEquals("car", DocumentFunctions.match("BODY", context, "car"));
+ assertEquals("lawyer", DocumentFunctions.match("BODY", context, "lawyer"));
+ assertEquals(1, context.getMatches().size());
+ assertEquals("BODY", context.getMatches().get(0).getView());
+ assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5],\"lawyer\":[9]}}", context.getMatches().get(0).toJson());
+ }
+
+ /**
+ * Verifies that repeated identical {@code document:match(...)} calls against the same {@code d}-column do not duplicate offsets for the same search term.
+ */
+ @Test
+ public void testMatchRepeatsSameSearchWithinEntryAcrossCalls() throws Exception {
+ DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024,
+ DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE));
+
+ assertEquals("car", DocumentFunctions.match("BODY", context, "car"));
+ assertEquals("car", DocumentFunctions.match("BODY", context, "car"));
+ assertEquals(1, context.getMatches().size());
+ assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", context.getMatches().get(0).toJson());
+ }
+
+ /**
+ * Builds a test {@code d}-column entry with an empty visibility.
+ *
+ * @param cq
+ * column qualifier to use
+ * @param content
+ * decoded content to encode into the value
+ * @return encoded test entry
+ * @throws Exception
+ * if test payload creation fails
+ */
+ private Map.Entry entry(String cq, String content) throws Exception {
+ return entry(cq, content, "");
+ }
+
+ /**
+ * Builds a test {@code d}-column entry with caller-supplied visibility and gzip+base64 encoded content.
+ *
+ * @param cq
+ * column qualifier to use
+ * @param content
+ * decoded content to encode into the value
+ * @param visibility
+ * column visibility to attach to the key
+ * @return encoded test entry
+ * @throws Exception
+ * if test payload creation fails
+ */
+ private Map.Entry entry(String cq, String content, String visibility) throws Exception {
+ return entryWithEncodedSuffix(cq, content, visibility, "");
+ }
+
+ /**
+ * Builds a test {@code d}-column entry with caller-supplied visibility and an optional suffix appended to the encoded payload.
+ *
+ * @param cq
+ * column qualifier to use
+ * @param content
+ * decoded content to encode into the value
+ * @param visibility
+ * column visibility to attach to the key
+ * @param encodedSuffix
+ * suffix bytes to append after base64 encoding, such as {@code \r\n}
+ * @return encoded test entry
+ * @throws Exception
+ * if test payload creation fails
+ */
+ private Map.Entry entryWithEncodedSuffix(String cq, String content, String visibility, String encodedSuffix) throws Exception {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ OutputStream b64s = java.util.Base64.getEncoder().wrap(bos);
+ GZIPOutputStream gzip = new GZIPOutputStream(b64s);
+ gzip.write(content.getBytes());
+ gzip.close();
+ b64s.close();
+ if (!encodedSuffix.isEmpty()) {
+ bos.write(encodedSuffix.getBytes());
+ }
+ bos.close();
+ return new AbstractMap.SimpleEntry<>(new Key("row", "d", cq, visibility), new Value(bos.toByteArray()));
+ }
+
+ /**
+ * Builds a test {@code d}-column entry with an empty visibility and an optional suffix appended to the encoded payload.
+ *
+ * @param cq
+ * column qualifier to use
+ * @param content
+ * decoded content to encode into the value
+ * @param encodedSuffix
+ * suffix bytes to append after base64 encoding, such as {@code \r\n}
+ * @return encoded test entry
+ * @throws Exception
+ * if test payload creation fails
+ */
+ private Map.Entry entryWithEncodedSuffix(String cq, String content, String encodedSuffix) throws Exception {
+ return entryWithEncodedSuffix(cq, content, "", encodedSuffix);
+ }
+
+ /**
+ * Builds a test {@code d}-column entry whose value is only base64-encoded UTF-8 text.
+ *
+ * @param cq
+ * column qualifier to use
+ * @param content
+ * decoded content to encode into the value
+ * @return encoded test entry
+ */
+ private Map.Entry base64OnlyEntry(String cq, String content) {
+ byte[] encoded = java.util.Base64.getEncoder().encode(content.getBytes());
+ return new AbstractMap.SimpleEntry<>(new Key("row", "d", cq), new Value(encoded));
+ }
+}
diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java
new file mode 100644
index 00000000000..29e2816aefc
--- /dev/null
+++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java
@@ -0,0 +1,81 @@
+package datawave.query.jexl.visitors;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.commons.jexl3.parser.ASTJexlScript;
+import org.apache.commons.jexl3.parser.ParseException;
+import org.junit.Test;
+
+import datawave.query.jexl.JexlASTHelper;
+
+/**
+ * Focused tests for {@link DocumentMatchFunctionVisitor}.
+ */
+public class DocumentMatchFunctionVisitorTest {
+
+ /**
+ * Verifies that the visitor reports when a query needs the reserved document-match context variable.
+ *
+ * @throws Exception
+ * if parsing fails
+ */
+ @Test
+ public void testRewriteReportsWhetherDocumentMatchContextIsRequired() throws Exception {
+ assertFalse(DocumentMatchFunctionVisitor.rewrite(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'")));
+ assertTrue(DocumentMatchFunctionVisitor.rewrite(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')")));
+ }
+
+ @Test
+ public void testRequiresDocumentMatchContext() throws Exception {
+ assertFalse(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'")));
+ assertTrue(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')")));
+ }
+
+ /**
+ * Verifies that the one-argument form is rewritten to include the reserved context variable as the first argument.
+ *
+ * @throws ParseException
+ * if parsing fails
+ */
+ @Test
+ public void testRewriteSingleArgumentFunction() throws ParseException {
+ assertRewrite("document:match(documentMatchContext, 'car')", "document:match('car')");
+ }
+
+ /**
+ * Verifies that the two-argument form keeps the view selector first and inserts the reserved context variable before the search string.
+ *
+ * @throws ParseException
+ * if parsing fails
+ */
+ @Test
+ public void testRewriteTwoArgumentFunction() throws ParseException {
+ assertRewrite("document:match('BODY', documentMatchContext, 'car')", "document:match('BODY', 'car')");
+ }
+
+ @Test
+ public void testRewriteMutatesOriginalScript() throws ParseException {
+ ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("document:match('car')");
+ assertTrue(DocumentMatchFunctionVisitor.rewrite(script));
+ assertEquals("document:match(documentMatchContext, 'car')", JexlStringBuildingVisitor.buildQueryWithoutParse(script));
+ }
+
+ /**
+ * Verifies that the input form is rewritten the expected input.
+ *
+ * @param expected
+ * the expected re-written form
+ * @param input
+ * the input to rewrite
+ * @throws ParseException
+ * if parsing fails
+ */
+ private static void assertRewrite(String expected, String input) throws ParseException {
+ ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(input);
+ DocumentMatchFunctionVisitor.rewrite(script);
+ String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(script);
+ assertEquals(expected, rewritten);
+ }
+}
diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java
index d5482f403eb..7a102b316a5 100644
--- a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java
+++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java
@@ -62,6 +62,16 @@ public void testMatchesInGroupFunctionQuoting() throws ParseException {
assertEquals("grouping:matchesInGroupLeft(FOO, 'foo', BAR, 'bar')", parseQuery("#MATCHES_IN_GROUP_LEFT(FOO, foo, BAR, bar)"));
}
+ @Test
+ public void testDocumentMatchFunctionTranslation() throws ParseException {
+ assertEquals("document:match('car')", parseQuery("#DOCUMENT_MATCH('car')"));
+ assertEquals("document:match('car')", parseQuery("#DOCUMENT_MATCH(car)"));
+ assertEquals("document:match('BODY', 'car')", parseQuery("#DOCUMENT_MATCH('BODY', 'car')"));
+ assertEquals("document:match('BODY', 'car')", parseQuery("#DOCUMENT_MATCH(BODY, car)"));
+ assertEquals("document:match('BODY*', 'car')", parseQuery("#DOCUMENT_MATCH(BODY*, car)"));
+ assertEquals("BODY == 'capone' && document:match('car')", parseQuery("BODY:capone AND #DOCUMENT_MATCH(car)"));
+ }
+
@Test
public void testComposableFunctions() throws ParseException {
assertEquals("filter:includeRegex(foo,bar).size() > 0", parseQuery("#JEXL(\"filter:includeRegex(foo,bar).size() > 0\")"));
diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java
index 192af5c6235..613f1e2bade 100644
--- a/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java
+++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java
@@ -204,6 +204,12 @@ public void testFunctions() throws ParseException {
luceneParser.parse("field:selector AND #include(field, testbade\\.scape)").getContents());
Assert.assertEquals("[AND,field:selector][posFilter: filter(true, AND, field, testbade\\.scape)]",
luceneParser.parse("field:selector AND #text(field, testbade\\.scape)").getContents());
+ Assert.assertEquals("[AND,field:selector][posFilter: document:match(car)]",
+ luceneParser.parse("field:selector AND #DOCUMENT_MATCH(car)").getContents());
+ Assert.assertEquals("[AND,field:selector][posFilter: document:match(BODY, car)]",
+ luceneParser.parse("field:selector AND #DOCUMENT_MATCH(BODY, car)").getContents());
+ Assert.assertEquals("[AND,field:selector][posFilter: document:match(BODY*, car)]",
+ luceneParser.parse("field:selector AND #DOCUMENT_MATCH(BODY*, car)").getContents());
}
@Test
diff --git a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java
index 203060ed03c..5c487ddb8c7 100644
--- a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java
@@ -1,12 +1,14 @@
package datawave.query.planner;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Set;
+import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.commons.jexl3.parser.ASTJexlScript;
import org.junit.jupiter.api.BeforeEach;
@@ -25,6 +27,8 @@
import datawave.query.config.ShardQueryConfiguration;
import datawave.query.exceptions.DatawaveFatalQueryException;
import datawave.query.exceptions.DatawaveQueryException;
+import datawave.query.iterator.QueryIterator;
+import datawave.query.iterator.QueryOptions;
import datawave.query.jexl.JexlASTHelper;
import datawave.query.tables.ScannerFactory;
import datawave.query.util.DateIndexHelper;
@@ -36,6 +40,46 @@
class DefaultQueryPlannerTest {
+ @Nested
+ class DocumentMatchOptionTests {
+
+ @Test
+ void testAddDocumentMatchOptionsWithoutContextRequired() {
+ DefaultQueryPlanner planner = new DefaultQueryPlanner();
+ ShardQueryConfiguration config = new ShardQueryConfiguration();
+ config.setDocumentMatchContextRequired(false);
+ config.setDocumentMatchMaxEncodedSize(111);
+ config.setDocumentMatchMaxDecodedSize(222);
+ config.setDocumentMatchMaxEncodedContextSize(333);
+ IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class);
+
+ planner.configureDocumentMatchOptions(config, cfg);
+
+ assertEquals("false", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED));
+ assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE));
+ assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE));
+ assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE));
+ }
+
+ @Test
+ void testAddDocumentMatchOptionsWithContextRequired() {
+ DefaultQueryPlanner planner = new DefaultQueryPlanner();
+ ShardQueryConfiguration config = new ShardQueryConfiguration();
+ config.setDocumentMatchContextRequired(true);
+ config.setDocumentMatchMaxEncodedSize(111);
+ config.setDocumentMatchMaxDecodedSize(222);
+ config.setDocumentMatchMaxEncodedContextSize(333);
+ IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class);
+
+ planner.configureDocumentMatchOptions(config, cfg);
+
+ assertEquals("true", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED));
+ assertEquals("111", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE));
+ assertEquals("222", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE));
+ assertEquals("333", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE));
+ }
+ }
+
/**
* Contains tests for
* {@link DefaultQueryPlanner#addDateFilters(ASTJexlScript, ScannerFactory, MetadataHelper, DateIndexHelper, ShardQueryConfiguration, Query)}
diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java
index 729c20a142e..ae169bea877 100644
--- a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java
@@ -85,8 +85,10 @@
import datawave.query.QueryParameters;
import datawave.query.QueryTestTableHelper;
import datawave.query.RebuildingScannerTestHelper;
+import datawave.query.config.ShardQueryConfiguration;
import datawave.query.config.annotation.AllHitsQueryConfig;
import datawave.query.config.annotation.AnnotationConfig;
+import datawave.query.function.DocumentMatchContext;
import datawave.query.function.deserializer.KryoDocumentDeserializer;
import datawave.query.planner.DefaultQueryPlanner;
import datawave.query.planner.TimedVisitorManager;
@@ -257,6 +259,63 @@ public void tearDown() throws Exception {
this.endDate = null;
}
+ /**
+ * Verifies that the Spring-configured {@link ShardQueryLogic} bean exposes the document-match limits through its accessor surface.
+ */
+ @Test
+ public void testDocumentMatchLimitsDefaultFromSpringConfig() {
+ assertEquals(DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE, logic.getDocumentMatchMaxEncodedSize());
+ assertEquals(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, logic.getDocumentMatchMaxDecodedSize());
+ assertEquals(DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE, logic.getDocumentMatchMaxEncodedContextSize());
+ }
+
+ /**
+ * Verifies that the bean-style setters update both the top-level logic and its backing configuration.
+ */
+ @Test
+ public void testDocumentMatchLimitSettersUpdateLogicAndConfig() {
+ int encoded = 1024;
+ int decoded = 2048;
+ int encodedContext = 4096;
+
+ logic.setDocumentMatchMaxEncodedSize(encoded);
+ logic.setDocumentMatchMaxDecodedSize(decoded);
+ logic.setDocumentMatchMaxEncodedContextSize(encodedContext);
+
+ assertEquals(encoded, logic.getDocumentMatchMaxEncodedSize());
+ assertEquals(decoded, logic.getDocumentMatchMaxDecodedSize());
+ assertEquals(encodedContext, logic.getDocumentMatchMaxEncodedContextSize());
+ assertEquals(encoded, logic.getConfig().getDocumentMatchMaxEncodedSize());
+ assertEquals(decoded, logic.getConfig().getDocumentMatchMaxDecodedSize());
+ assertEquals(encodedContext, logic.getConfig().getDocumentMatchMaxEncodedContextSize());
+ }
+
+ /**
+ * Verifies that document-match limit overrides survive {@link ShardQueryLogic#initialize(AccumuloClient, Query, Set)} and appear on the per-query config.
+ */
+ @Test
+ public void testDocumentMatchLimitsPropagateThroughInitialize() throws Exception {
+ int encoded = 4096;
+ int decoded = 8192;
+ int encodedContext = 16384;
+
+ logic.setDocumentMatchMaxEncodedSize(encoded);
+ logic.setDocumentMatchMaxDecodedSize(decoded);
+ logic.setDocumentMatchMaxEncodedContextSize(encodedContext);
+
+ this.query = "UUID == '" + caponeUID + "'";
+ this.startDate = dateFormat.parse("20091231");
+ this.endDate = dateFormat.parse("20150101");
+
+ Query settings = createSettings();
+ AccumuloClient client = createClient();
+ ShardQueryConfiguration config = (ShardQueryConfiguration) logic.initialize(client, settings, authSet);
+
+ assertEquals(encoded, config.getDocumentMatchMaxEncodedSize());
+ assertEquals(decoded, config.getDocumentMatchMaxDecodedSize());
+ assertEquals(encodedContext, config.getDocumentMatchMaxEncodedContextSize());
+ }
+
private AccumuloClient createClient() throws Exception {
AccumuloClient client = new QueryTestTableHelper(ShardRange.class.toString(), log, RebuildingScannerTestHelper.TEARDOWN.EVERY_OTHER_SANS_CONSISTENCY,
RebuildingScannerTestHelper.INTERRUPT.EVERY_OTHER).client;
diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml
index 5035b33c879..ded86c957fd 100644
--- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml
+++ b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml
@@ -28,6 +28,7 @@
+
@@ -400,6 +401,8 @@
+
+
diff --git a/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml b/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml
index 358193f223a..9ff297c3234 100644
--- a/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml
+++ b/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml
@@ -15,6 +15,7 @@
+
diff --git a/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml b/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml
index 960cbf64242..8251c66dcb7 100644
--- a/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml
+++ b/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml
@@ -24,6 +24,7 @@
+
@@ -410,6 +411,8 @@
+
+