From 9cdd70c40759a5b8de601a141b7c52a2ba92953e Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sat, 21 Mar 2026 10:20:15 -0400 Subject: [PATCH 1/9] Adds a `document:match` query function for substring matching against `d` column entries * Adds the `document:match(viewname, string)` and `document:match(string)` query functions that will scan the `d` columns of candidate documents at evaluation time and filter out those candidates whose values do not contain the specified string. * Exposed via Lucene syntax using the `#DOCUMENT_MATCH` operator. * If no view name is included as a function parameter, all `d` columns will be scanned. * The view name can be a prefix that ends with '*' to search all views with the specified prefix. * If the specified string is found, the view name and start offsets for matches will be stored as a JSON map in the DOCUMENT_MATCHES field in the result. This change includes: * Lucene-to-JEXL translation * Planner/iterator wiring * Runtime document-match evaluation * Configurable limits for `d` column sizes to prevent evaluation of large documents * Unit and integration tests While useful in its own right, this is a precursor to more advanced matching functions on `d` column payloads.
--- .../tests/EventQueryDocumentMatch.test | 87 +++++ .../JexlFunctionNamespaceRegistryContext.xml | 1 + .../JexlFunctionNamespaceRegistryContext.xml | 1 + .../query/config/ShardQueryConfiguration.java | 44 +++ .../query/function/DocumentMatchConfig.java | 49 +++ .../query/function/DocumentMatchContext.java | 247 ++++++++++++++ .../DocumentMatchContextFunction.java | 150 +++++++++ .../query/function/DocumentMatchFactory.java | 31 ++ .../function/EmptyDocumentMatchFunction.java | 20 ++ .../query/function/JexlEvaluation.java | 18 +- .../query/function/KeyToDocumentData.java | 41 ++- .../query/iterator/QueryIterator.java | 63 +++- .../datawave/query/iterator/QueryOptions.java | 71 +++- .../query/jexl/DatawaveInterpreter.java | 15 + .../jexl/functions/DocumentFunctions.java | 259 +++++++++++++++ .../DocumentFunctionsDescriptor.java | 107 ++++++ .../JexlFunctionNamespaceRegistry.java | 1 + ...ocumentMatchFunctionRebuildingVisitor.java | 61 ++++ .../lucene/FunctionQueryNodeBuilder.java | 4 +- .../functions/jexl/DocumentMatch.java | 61 ++++ .../functions/lucene/DocumentMatch.java | 102 ++++++ .../query/planner/DefaultQueryPlanner.java | 10 + .../predicate/EventDataQueryFieldFilter.java | 14 + .../query/tables/ShardQueryLogic.java | 16 + .../datawave/query/tld/TLDQueryIterator.java | 9 + .../query/DocumentMatchQueryTest.java | 278 ++++++++++++++++ .../config/ShardQueryConfigurationTest.java | 7 + .../DocumentMatchContextFunctionTest.java | 141 ++++++++ .../query/function/JexlEvaluationTest.java | 77 ++++- .../jexl/functions/DocumentFunctionsTest.java | 306 ++++++++++++++++++ ...entMatchFunctionRebuildingVisitorTest.java | 55 ++++ .../jexl/TestLuceneToJexlQueryParser.java | 10 + .../parser/lucene/TestLuceneQueryParser.java | 6 + .../query/tables/ShardQueryLogicTest.java | 51 +++ .../datawave/query/QueryLogicFactory.xml | 3 + .../JexlFunctionNamespaceRegistryContext.xml | 1 + .../datawave/query/QueryLogicFactory.xml | 3 + 37 files changed, 2409 insertions(+), 11 deletions(-) 
create mode 100644 contrib/datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryDocumentMatch.test create mode 100644 warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchConfig.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java create mode 100644 warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java create mode 100644 warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java create mode 100644 warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java create mode 100644 warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java diff --git a/contrib/datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryDocumentMatch.test b/contrib/datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryDocumentMatch.test new file mode 100644 index 00000000000..0b09e1ef0e0 --- /dev/null +++ 
b/contrib/datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryDocumentMatch.test @@ -0,0 +1,87 @@ + +################################################################ +# document:match query tests for EventQuery + +# These tests validate both the JEXL document:match(...) form and the +# Lucene #DOCUMENT_MATCH(...) form against a known Wikipedia event whose +# REVISION_COMMENT d-column contains the string "Origins". + +################################################################ +# JEXL create + +setCurlData query=$( urlencode "PAGE_TITLE == 'Anarchism' && document:match('REVISION_COMMENT', 'Origins')" ) \ + queryName=EventQueryDocumentMatchJexl \ + begin=20130301 \ + end=20130401 \ + pagesize=1 \ + auths=PUBLIC \ + columnVisibility=PRIVATE \ + query.syntax=JEXL + +configureTest \ + CreateDocumentMatchJexl \ + "Creates a JEXL EventQuery using document:match against REVISION_COMMENT d-column content" \ + "--header 'Content-Type: application/x-www-form-urlencoded' ${DW_CURL_DATA} -X POST ${URI_ROOT}/Query/EventQuery/create" \ + "application/xml;charset=UTF-8" \ + 200 + +runTest --set-query-id + +################################################################ +# JEXL next + +configureTest \ + DocumentMatchJexlPage1 \ + "Gets the first page of results for the JEXL document:match query in JSON format" \ + "--header 'Accept: application/json' -X GET ${URI_ROOT}/Query/${DW_QUERY_ID}/next" \ + application/json \ + 200 + +runTest + +################################################################ +# JEXL close + +configureCloseQueryTest ${DW_QUERY_ID} + +runTest + +################################################################ +# Lucene create + +setCurlData query=$( urlencode "PAGE_TITLE:Anarchism AND #DOCUMENT_MATCH(REVISION_COMMENT, Origins)" ) \ + queryName=EventQueryDocumentMatchLucene \ + begin=20130301 \ + end=20130401 \ + pagesize=1 \ + auths=PUBLIC \ + columnVisibility=PRIVATE \ + query.syntax=LUCENE + +configureTest \ + 
CreateDocumentMatchLucene \ + "Creates a Lucene EventQuery using #DOCUMENT_MATCH against REVISION_COMMENT d-column content" \ + "--header 'Content-Type: application/x-www-form-urlencoded' ${DW_CURL_DATA} -X POST ${URI_ROOT}/Query/EventQuery/create" \ + "application/xml;charset=UTF-8" \ + 200 + +runTest --set-query-id + +################################################################ +# Lucene next + +configureTest \ + DocumentMatchLucenePage1 \ + "Gets the first page of results for the Lucene #DOCUMENT_MATCH query in JSON format" \ + "--header 'Accept: application/json' -X GET ${URI_ROOT}/Query/${DW_QUERY_ID}/next" \ + application/json \ + 200 + +runTest + +################################################################ +# Lucene close + +configureCloseQueryTest ${DW_QUERY_ID} + +# This last test is executed by run.sh, as usual diff --git a/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml b/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml index ea1047f7fbb..56ef29bfa5a 100644 --- a/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml +++ b/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml @@ -15,6 +15,7 @@ + diff --git a/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml b/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml index 358193f223a..9ff297c3234 100644 --- a/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml +++ b/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml @@ -15,6 +15,7 @@ + diff --git a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java index 
f4c2256a407..a9f9d2ef58e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java @@ -53,6 +53,7 @@ import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.config.annotation.AllHitsQueryConfig; +import datawave.query.function.DocumentMatchContext; import datawave.query.function.DocumentPermutation; import datawave.query.iterator.QueryIterator; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; @@ -279,6 +280,8 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement private Set queryTermFrequencyFields = Collections.emptySet(); // Are we required to get term frequencies (i.e. does the query contain content functions) private boolean termFrequenciesRequired = false; + // Are we required to gather document-match context (i.e. does the query contain document:match functions) + private boolean documentMatchContextRequired = false; // Limit count of returned values for arbitrary fields. private Set limitFields = Collections.emptySet(); private Set matchingFieldSets = Collections.emptySet(); @@ -502,6 +505,14 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement * Term Frequency aggregations that exceed this threshold in milliseconds are logged as a warning */ private int tfAggregationThresholdMs = -1; + /** + * Maximum encoded d-column payload size, in bytes, to inspect for document:match evaluation + */ + private int documentMatchMaxEncodedSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE; + /** + * Maximum decoded d-column payload size, in bytes, to inspect for document:match evaluation + */ + private int documentMatchMaxDecodedSize = DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; /** * Flag to control query option pruning in the visitor function. 
Queries that see significant or varied pruning via the RangeStream may see a benefit from @@ -718,6 +729,7 @@ public void copyFrom(ShardQueryConfiguration other) { this.setSortedUIDs(other.isSortedUIDs()); this.setQueryTermFrequencyFields(null == other.getQueryTermFrequencyFields() ? null : Sets.newHashSet(other.getQueryTermFrequencyFields())); this.setTermFrequenciesRequired(other.isTermFrequenciesRequired()); + this.setDocumentMatchContextRequired(other.isDocumentMatchContextRequired()); this.setLimitFields(null == other.getLimitFields() ? null : Sets.newHashSet(other.getLimitFields())); this.setMatchingFieldSets(null == other.getMatchingFieldSets() ? null : Sets.newHashSet(other.getMatchingFieldSets())); this.setLimitFieldsPreQueryEvaluation(other.isLimitFieldsPreQueryEvaluation()); @@ -836,6 +848,8 @@ public void copyFrom(ShardQueryConfiguration other) { this.setLazySetMechanismEnabled(other.isLazySetMechanismEnabled()); this.setDocAggregationThresholdMs(other.getDocAggregationThresholdMs()); this.setTfAggregationThresholdMs(other.getTfAggregationThresholdMs()); + this.setDocumentMatchMaxEncodedSize(other.getDocumentMatchMaxEncodedSize()); + this.setDocumentMatchMaxDecodedSize(other.getDocumentMatchMaxDecodedSize()); this.setGroupFields(GroupFields.copyOf(other.getGroupFields())); this.setPruneQueryOptions(other.getPruneQueryOptions()); this.setSortQueryPreIndexWithImpliedCounts(other.isSortQueryPreIndexWithImpliedCounts()); @@ -2344,6 +2358,14 @@ public void setTermFrequenciesRequired(boolean termFrequenciesRequired) { this.termFrequenciesRequired = termFrequenciesRequired; } + public boolean isDocumentMatchContextRequired() { + return documentMatchContextRequired; + } + + public void setDocumentMatchContextRequired(boolean documentMatchContextRequired) { + this.documentMatchContextRequired = documentMatchContextRequired; + } + public void setLimitTermExpansionToModel(boolean shouldLimitTermExpansionToModel) { this.shouldLimitTermExpansionToModel = 
shouldLimitTermExpansionToModel; } @@ -2891,6 +2913,22 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) { this.tfAggregationThresholdMs = tfAggregationThresholdMs; } + public int getDocumentMatchMaxEncodedSize() { + return documentMatchMaxEncodedSize; + } + + public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) { + this.documentMatchMaxEncodedSize = documentMatchMaxEncodedSize; + } + + public int getDocumentMatchMaxDecodedSize() { + return documentMatchMaxDecodedSize; + } + + public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { + this.documentMatchMaxDecodedSize = documentMatchMaxDecodedSize; + } + public GroupFields getGroupFields() { return groupFields; } @@ -3050,6 +3088,7 @@ public boolean equals(Object o) { Float.compare(that.getCollapseDatePercentThreshold(), getCollapseDatePercentThreshold()) == 0 && isSortedUIDs() == that.isSortedUIDs() && isTermFrequenciesRequired() == that.isTermFrequenciesRequired() && + isDocumentMatchContextRequired() == that.isDocumentMatchContextRequired() && isLimitFieldsPreQueryEvaluation() == that.isLimitFieldsPreQueryEvaluation() && isHitList() == that.isHitList() && isDateIndexTimeTravel() == that.isDateIndexTimeTravel() && @@ -3205,6 +3244,8 @@ public boolean equals(Object o) { isLazySetMechanismEnabled() == that.isLazySetMechanismEnabled() && getDocAggregationThresholdMs() == that.getDocAggregationThresholdMs() && getTfAggregationThresholdMs() == that.getTfAggregationThresholdMs() && + getDocumentMatchMaxEncodedSize() == that.getDocumentMatchMaxEncodedSize() && + getDocumentMatchMaxDecodedSize() == that.getDocumentMatchMaxDecodedSize() && getPruneQueryOptions() == that.getPruneQueryOptions() && isSortQueryPreIndexWithImpliedCounts() == that.isSortQueryPreIndexWithImpliedCounts() && isSortQueryPreIndexWithFieldCounts() == that.isSortQueryPreIndexWithFieldCounts() && @@ -3338,6 +3379,7 @@ public int hashCode() { isSortedUIDs(), getQueryTermFrequencyFields(), 
isTermFrequenciesRequired(), + isDocumentMatchContextRequired(), getLimitFields(), getMatchingFieldSets(), isLimitFieldsPreQueryEvaluation(), @@ -3443,6 +3485,8 @@ public int hashCode() { isLazySetMechanismEnabled(), getDocAggregationThresholdMs(), getTfAggregationThresholdMs(), + getDocumentMatchMaxEncodedSize(), + getDocumentMatchMaxDecodedSize(), getPruneQueryOptions(), isSortQueryPreIndexWithImpliedCounts(), isSortQueryPreIndexWithFieldCounts(), diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchConfig.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchConfig.java new file mode 100644 index 00000000000..45fd023d527 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchConfig.java @@ -0,0 +1,49 @@ +package datawave.query.function; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; + +import datawave.query.predicate.TimeFilter; + +/** + * Configuration used to build the document-match context lookup function that runs immediately before JEXL evaluation. 
+ */ +public class DocumentMatchConfig { + private SortedKeyValueIterator source; + private TimeFilter timeFilter; + private DocumentMatchContext.Limits limits; + private boolean tld; + + public SortedKeyValueIterator getSource() { + return source; + } + + public void setSource(SortedKeyValueIterator source) { + this.source = source; + } + + public TimeFilter getTimeFilter() { + return timeFilter; + } + + public void setTimeFilter(TimeFilter timeFilter) { + this.timeFilter = timeFilter; + } + + public DocumentMatchContext.Limits getLimits() { + return limits; + } + + public void setLimits(DocumentMatchContext.Limits limits) { + this.limits = limits; + } + + public boolean isTld() { + return tld; + } + + public void setTld(boolean tld) { + this.tld = tld; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java new file mode 100644 index 00000000000..e81b97580ce --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java @@ -0,0 +1,247 @@ +package datawave.query.function; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.security.ColumnVisibility; + +import datawave.query.predicate.TimeFilter; + +/** + * Per-document runtime state used by {@code document:match(...)} evaluation. + *

+ * This context carries the raw {@code d}-column entries retained for a candidate document, the configured size limits used while decoding those payloads, the + * merged offset results accumulated across one or more {@code document:match(...)} calls, grouped first by matched string and then by view, and the first + * matched {@code d}-column key whose visibility should be applied to the derived {@code DOCUMENT_MATCHES} attribute. + */ +public class DocumentMatchContext { + public static final int DEFAULT_MAX_ENCODED_SIZE = 256 * 1024 * 1024; + public static final int DEFAULT_MAX_DECODED_SIZE = 384 * 1024 * 1024; + + /** + * Immutable runtime limits for {@code document:match(...)} payload processing. + */ + public static class Limits { + private final int maxEncodedValueSize; + private final int maxDecodedValueSize; + + /** + * @param maxEncodedValueSize + * maximum allowed encoded payload size in bytes + * @param maxDecodedValueSize + * maximum allowed decoded payload size in bytes + */ + public Limits(int maxEncodedValueSize, int maxDecodedValueSize) { + this.maxEncodedValueSize = maxEncodedValueSize; + this.maxDecodedValueSize = maxDecodedValueSize; + } + + /** + * @return the maximum encoded payload size, in bytes + */ + public int getMaxEncodedValueSize() { + return maxEncodedValueSize; + } + + /** + * @return the maximum decoded payload size, in bytes + */ + public int getMaxDecodedValueSize() { + return maxDecodedValueSize; + } + } + + private final List> dEntries; + private final Limits limits; + private final Map>> mergedMatches = new LinkedHashMap<>(); + private Key firstMatchingEntry; + private boolean visibilityMismatchLogged = false; + + public DocumentMatchContext(List> dEntries, Limits limits) { + this.dEntries = dEntries; + this.limits = limits; + } + + public DocumentMatchContext(List> dEntries, int maxEncodedValueSize) { + this(dEntries, new Limits(maxEncodedValueSize, DEFAULT_MAX_DECODED_SIZE)); + } + + public DocumentMatchContext(List> dEntries, 
int maxEncodedValueSize, int maxDecodedValueSize) { + this(dEntries, new Limits(maxEncodedValueSize, maxDecodedValueSize)); + } + + /** + * Builds a context from already-aggregated document entries using the default encoded and decoded payload limits. + * + * @param entries + * aggregated document entries + * @param timeFilter + * optional time filter to apply while selecting {@code d}-column entries + * @return a context containing only eligible {@code d}-column entries + */ + public static DocumentMatchContext from(List> entries, TimeFilter timeFilter) { + return from(entries, timeFilter, new Limits(DEFAULT_MAX_ENCODED_SIZE, DEFAULT_MAX_DECODED_SIZE)); + } + + /** + * Builds a context from already-aggregated document entries using a caller-supplied encoded payload limit and the default decoded payload limit. + * + * @param entries + * aggregated document entries + * @param timeFilter + * optional time filter to apply while selecting {@code d}-column entries + * @param maxEncodedValueSize + * maximum allowed encoded payload size in bytes + * @return a context containing only eligible {@code d}-column entries + */ + public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, int maxEncodedValueSize) { + return from(entries, timeFilter, new Limits(maxEncodedValueSize, DEFAULT_MAX_DECODED_SIZE)); + } + + /** + * Builds a context from already-aggregated document entries using explicit encoded and decoded payload limits. 
+ * + * @param entries + * aggregated document entries + * @param timeFilter + * optional time filter to apply while selecting {@code d}-column entries + * @param maxEncodedValueSize + * maximum allowed encoded payload size in bytes + * @param maxDecodedValueSize + * maximum allowed decoded payload size in bytes + * @return a context containing only eligible {@code d}-column entries + */ + public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, int maxEncodedValueSize, int maxDecodedValueSize) { + return from(entries, timeFilter, new Limits(maxEncodedValueSize, maxDecodedValueSize)); + } + + /** + * Builds a context from already-aggregated document entries using explicit runtime limits. + * + * @param entries + * aggregated document entries + * @param timeFilter + * optional time filter to apply while selecting {@code d}-column entries + * @param limits + * payload-processing limits + * @return a context containing only eligible {@code d}-column entries + */ + public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, Limits limits) { + List> dEntries = new ArrayList<>(); + for (Entry entry : entries) { + if (entry.getKey().getColumnFamily().toString().equals("d") && (timeFilter == null || timeFilter.apply(entry))) { + dEntries.add(entry); + } + } + return new DocumentMatchContext(dEntries, limits); + } + + public List> getdEntries() { + return Collections.unmodifiableList(dEntries); + } + + public int getMaxEncodedValueSize() { + return limits.getMaxEncodedValueSize(); + } + + public int getMaxDecodedValueSize() { + return limits.getMaxDecodedValueSize(); + } + + public Limits getLimits() { + return limits; + } + + /** + * Clears merged match state before evaluating a new document. + */ + public void clearMergedMatches() { + mergedMatches.clear(); + firstMatchingEntry = null; + visibilityMismatchLogged = false; + } + + /** + * Merges per-call matches into the document-wide result set. 
+ * + * @param search + * the literal string matched by the invocation + * @param matches + * matches produced by one {@code document:match(...)} invocation, keyed by view name + */ + public void mergeMatches(String search, Map> matches) { + Map> searchMatches = mergedMatches.computeIfAbsent(search, key -> new LinkedHashMap<>()); + for (Entry> entry : matches.entrySet()) { + searchMatches.computeIfAbsent(entry.getKey(), key -> new LinkedHashSet<>()).addAll(entry.getValue()); + } + } + + /** + * @return a defensive copy of the merged document-wide match results + */ + public Map>> getMergedMatches() { + Map>> matches = new LinkedHashMap<>(); + for (Entry>> searchEntry : mergedMatches.entrySet()) { + Map> viewMatches = new LinkedHashMap<>(); + for (Entry> viewEntry : searchEntry.getValue().entrySet()) { + viewMatches.put(viewEntry.getKey(), new ArrayList<>(viewEntry.getValue())); + } + matches.put(searchEntry.getKey(), viewMatches); + } + return matches; + } + + /** + * @return the first {@code d}-column key that matched during evaluation, or {@code null} if no match has been recorded yet + */ + public Key getFirstMatchingEntry() { + return firstMatchingEntry; + } + + /** + * @return the visibility from the first matched {@code d}-column key, or {@code null} if no match has been recorded yet + */ + public ColumnVisibility getFirstMatchingColumnVisibility() { + if (firstMatchingEntry == null) { + return null; + } + return firstMatchingEntry.getColumnVisibilityParsed(); + } + + /** + * Records a matched {@code d}-column key and detects whether its visibility differs from the first matched key for the document. 
+ * + * @param key + * the matched {@code d}-column key + * @return {@code true} if the key differs in visibility from the first matched key, otherwise {@code false} + */ + public boolean recordMatchingEntry(Key key) { + if (firstMatchingEntry == null) { + firstMatchingEntry = key; + return false; + } + return !firstMatchingEntry.getColumnVisibilityData().equals(key.getColumnVisibilityData()); + } + + /** + * @return {@code true} if a visibility mismatch has not yet been logged for the current document + */ + public boolean shouldLogVisibilityMismatch() { + return !visibilityMismatchLogged; + } + + /** + * Marks the current document as having already logged a visibility mismatch. + */ + public void markVisibilityMismatchLogged() { + visibilityMismatchLogged = true; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java new file mode 100644 index 00000000000..2740c36bf41 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java @@ -0,0 +1,150 @@ +package datawave.query.function; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.log4j.Logger; + +import com.google.common.base.Function; +import com.google.common.collect.Maps; + +import datawave.query.attributes.Attribute; +import datawave.query.attributes.Attributes; +import datawave.query.attributes.Document; +import datawave.query.attributes.DocumentKey; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.util.Tuple3; +import 
datawave.query.util.Tuples; + +/** + * Builds a {@link DocumentMatchContext} close to evaluation time and attaches it to the side-channel map used for JEXL context population. + */ +public class DocumentMatchContextFunction implements Function>,Tuple3>> { + private static final Logger log = Logger.getLogger(DocumentMatchContextFunction.class); + private final DocumentMatchConfig config; + + /** + * Creates a context-populating function from the supplied document-match configuration. + * + * @param config + * document-match configuration + */ + public DocumentMatchContextFunction(DocumentMatchConfig config) { + this.config = config; + } + + @Override + public Tuple3> apply(Tuple3> from) { + try { + Set documentKeys = getDocumentKeys(from.first(), from.second()); + if (log.isDebugEnabled()) { + log.debug("Collecting document-match context for tuple key " + from.first() + " using document keys " + documentKeys); + } + + List> dEntries = collectDocumentColumnAttributes(documentKeys); + DocumentMatchContext context = DocumentMatchContext.from(dEntries, config.getTimeFilter(), config.getLimits()); + if (log.isDebugEnabled()) { + log.debug("Collected " + dEntries.size() + " d-column entries for tuple key " + from.first()); + } + + Map map = from.third().isEmpty() ? 
new HashMap<>() : new HashMap<>(from.third()); + map.put(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, context); + return Tuples.tuple(from.first(), from.second(), map); + } catch (IOException e) { + throw new IllegalStateException("Unable to collect document-match context for " + from.first(), e); + } + } + + private List> collectDocumentColumnAttributes(Set documentKeys) throws IOException { + List> documentColumns = new ArrayList<>(); + for (Key documentKey : documentKeys) { + collectDocumentColumnAttributes(documentKey, documentColumns); + } + return documentColumns; + } + + private void collectDocumentColumnAttributes(Key documentKey, List> documentColumns) throws IOException { + String row = documentKey.getRow().toString(); + String datatypeAndUid = documentKey.getColumnFamily().toString(); + Key startKey = new Key(row, "d", datatypeAndUid + '\0'); + Key endKey = new Key(row, "d", datatypeAndUid + '\uffff'); + Range documentColumnRange = new Range(startKey, true, endKey, false); + if (log.isDebugEnabled()) { + log.debug("Seeking d-column range " + documentColumnRange + " for document key " + documentKey); + } + + config.getSource().seek(documentColumnRange, Collections.emptyList(), false); + + while (config.getSource().hasTop() && isDocumentColumn(config.getSource().getTopKey(), documentKey)) { + if (log.isDebugEnabled()) { + log.debug("Collected d-column entry " + config.getSource().getTopKey() + " for document key " + documentKey); + } + documentColumns.add(Maps.immutableEntry(config.getSource().getTopKey(), config.getSource().getTopValue())); + config.getSource().next(); + } + + if (log.isDebugEnabled()) { + log.debug("Finished d-column scan for document key " + documentKey + "; next top key is " + + (config.getSource().hasTop() ? config.getSource().getTopKey() : "")); + } + } + + private Set getDocumentKeys(Key tupleKey, Document document) { + Set docKeys = new HashSet<>((config.isTld()) ? 
4 : 1); + Attribute docKeyAttr = document.get(Document.DOCKEY_FIELD_NAME); + if (docKeyAttr == null) { + docKeys.add(tupleKey); + return docKeys; + } + + if (docKeyAttr instanceof DocumentKey) { + docKeys.add(((DocumentKey) docKeyAttr).getDocKey()); + } else if (docKeyAttr instanceof Attributes) { + for (Attribute docKey : ((Attributes) docKeyAttr).getAttributes()) { + if (docKey instanceof DocumentKey) { + docKeys.add(((DocumentKey) docKey).getDocKey()); + } else { + throw new IllegalStateException("Unexpected sub-Attribute type for " + Document.DOCKEY_FIELD_NAME + ": " + docKey.getClass()); + } + } + } else { + throw new IllegalStateException("Unexpected Attribute type for " + Document.DOCKEY_FIELD_NAME + ": " + docKeyAttr.getClass()); + } + + if (docKeys.isEmpty()) { + docKeys.add(tupleKey); + } + return docKeys; + } + + /** + * Determines whether a scanned key is a {@code d}-column for the supplied document key. + *

+ * The comparison intentionally checks the scanned key's column qualifier against the document key's column family. For event keys, the column family is + * {@code datatype\0uid}, while {@code d}-column qualifiers are laid out as {@code datatype\0uid\0view}. Matching on this prefix ensures that the collected + * {@code d}-column belongs to the same document identity as the event key. + * + * @param documentContentKey + * scanned 'd' column shard-table key + * @param documentKey + * event or document key whose {@code datatype\0uid} identifies the document + * @return {@code true} if the scanned key is a matching {@code d}-column entry for the document + */ + private boolean isDocumentColumn(Key documentContentKey, Key documentKey) { + // A document key's column family is datatype\0uid, and a d-column qualifier begins with that same datatype\0uid + // followed by \0view. This prefix comparison ties the d-column back to the document represented by the event key. + return documentContentKey.getColumnFamilyData().length() == 1 && documentContentKey.getColumnFamilyData().byteAt(0) == 'd' + && documentContentKey.getRow().equals(documentKey.getRow()) + && documentContentKey.getColumnQualifier().toString().startsWith(documentKey.getColumnFamily().toString() + '\0'); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java new file mode 100644 index 00000000000..ee5082f1a9b --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java @@ -0,0 +1,31 @@ +package datawave.query.function; + +import java.util.Map; + +import org.apache.accumulo.core.data.Key; + +import com.google.common.base.Function; + +import datawave.query.attributes.Document; +import datawave.query.util.Tuple3; + +/** + * Builds the pre-evaluation function that populates {@link DocumentMatchContext} for {@code document:match(...)} 
evaluation. + */ +public class DocumentMatchFactory { + private DocumentMatchFactory() {} + + /** + * Returns a context-populating function for document matching. + * + * @param config + * document-match configuration + * @return either a context-populating function or a no-op function when no source is available + */ + public static Function>,Tuple3>> getFunction(DocumentMatchConfig config) { + if (config == null || config.getSource() == null) { + return new EmptyDocumentMatchFunction(); + } + return new DocumentMatchContextFunction(config); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java new file mode 100644 index 00000000000..083838613d0 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java @@ -0,0 +1,20 @@ +package datawave.query.function; + +import java.util.Map; + +import org.apache.accumulo.core.data.Key; + +import com.google.common.base.Function; + +import datawave.query.attributes.Document; +import datawave.query.util.Tuple3; + +/** + * No-op document-match context function used when the query does not contain {@code document:match(...)}. 
+ */ +public class EmptyDocumentMatchFunction implements Function>,Tuple3>> { + @Override + public Tuple3> apply(Tuple3> from) { + return from; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java index 8e58173121f..aff353ae860 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java @@ -24,6 +24,7 @@ import datawave.query.jexl.DefaultArithmetic; import datawave.query.jexl.DelayedNonEventIndexContext; import datawave.query.jexl.HitListArithmetic; +import datawave.query.jexl.functions.DocumentFunctions; import datawave.query.postprocessing.tf.PhraseIndexes; import datawave.query.postprocessing.tf.TermOffsetMap; import datawave.query.transformer.ExcerptTransform; @@ -97,7 +98,12 @@ public boolean apply(Tuple3 input) { log.trace("Evaluating " + query + " against document " + input.second().getMetadata() + " with context " + input.third()); } - Object o = script.execute(input.third()); + DocumentMatchContext documentMatchContext = (DocumentMatchContext) input.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + if (documentMatchContext != null) { + documentMatchContext.clearMergedMatches(); + } + Object o; + o = script.execute(input.third()); if (log.isTraceEnabled()) { log.trace("Evaluation of " + query + " against document " + input.second().getMetadata() + " returned " + o); @@ -110,6 +116,16 @@ public boolean apply(Tuple3 input) { ((DelayedNonEventIndexContext) input.third()).populateDocument(input.second()); } + String documentMatches = (documentMatchContext == null) ? 
"" : DocumentFunctions.toJson(documentMatchContext.getMergedMatches()); + if (matched && !documentMatches.isEmpty()) { + Document document = input.second(); + Content matchesAttribute = new Content(documentMatches, document.getMetadata(), document.isToKeep()); + if (documentMatchContext != null && documentMatchContext.getFirstMatchingColumnVisibility() != null) { + matchesAttribute.setColumnVisibility(documentMatchContext.getFirstMatchingColumnVisibility()); + } + document.put(DocumentFunctions.DOCUMENT_MATCHES, matchesAttribute); + } + if (arithmetic instanceof HitListArithmetic) { HitListArithmetic hitListArithmetic = (HitListArithmetic) arithmetic; if (matched) { diff --git a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java index 5611ba051b0..7659b36a4f7 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java @@ -214,7 +214,7 @@ public List> collectDocumentAttributes(final Key documentStartK while (docAttrKey != null) { boolean seeked = false; - if (equality.partOf(documentStartKey, docAttrKey.get())) { + if (isPartOfDocument(documentStartKey, docAttrKey.get())) { if (filter == null || filter.keep(docAttrKey.get())) { docKeys.add(getDocKey(docAttrKey.get())); } @@ -254,15 +254,52 @@ public List> collectDocumentAttributes(final Key documentStartK return documentAttributes; } + private boolean isPartOfDocument(Key documentStartKey, Key candidateKey) { + return equality.partOf(documentStartKey, candidateKey); + } + // map the key to the dockey (only shard, datatype, uid) public static Key getDocKey(Key key) { final ByteSequence row = key.getRowData(); - final ByteSequence cf = key.getColumnFamilyData(); + final ByteSequence cf = getDocColumnFamily(key); final ByteSequence cv = key.getColumnVisibilityData(); return new 
Key(row.getBackingArray(), row.offset(), row.length(), cf.getBackingArray(), cf.offset(), cf.length(), EMPTY_BYTE_SEQUENCE.getBackingArray(), EMPTY_BYTE_SEQUENCE.offset(), EMPTY_BYTE_SEQUENCE.length(), cv.getBackingArray(), cv.offset(), cv.length(), key.getTimestamp()); } + /** + * extracts the proper column family byte sequence from a key regardless of whether it is an event key or a 'd' column key. + * + * @param key + * the key to process + * @return the column family, consisting of datatype and uid. + */ + private static ByteSequence getDocColumnFamily(Key key) { + final ByteSequence cf = key.getColumnFamilyData(); + if (!"d".equals(key.getColumnFamily().toString())) { + return cf; + } + + ByteSequence cq = key.getColumnQualifierData(); + int firstNull = -1; + int secondNull = -1; + for (int i = 0; i < cq.length(); i++) { + if (cq.byteAt(i) == 0x00) { + if (firstNull < 0) { + firstNull = i; + } else { + secondNull = i; + break; + } + } + } + if (firstNull < 0) { + return cf; + } + int end = (secondNull < 0) ? 
cq.length() : secondNull; + return cq.subSequence(0, end); + } + private static List> appendHierarchyFields(List> documentAttributes, Key key, Range seekRange, DescendantCountFunction function, boolean includeParent) { if (function != null || includeParent) { diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java index e9f9ea037fa..40c329bba65 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java @@ -68,6 +68,8 @@ import datawave.query.composite.CompositeMetadata; import datawave.query.function.Aggregation; import datawave.query.function.DataTypeAsField; +import datawave.query.function.DocumentMatchConfig; +import datawave.query.function.DocumentMatchFactory; import datawave.query.function.DocumentMetadata; import datawave.query.function.DocumentPermutation; import datawave.query.function.DocumentProjection; @@ -98,13 +100,16 @@ import datawave.query.iterator.profile.SourceTrackingIterator; import datawave.query.iterator.waitwindow.WaitWindowObserver; import datawave.query.iterator.waitwindow.WaitWindowOverseerIterator; +import datawave.query.jexl.ArithmeticJexlEngines; import datawave.query.jexl.DatawaveJexlContext; import datawave.query.jexl.StatefulArithmetic; import datawave.query.jexl.functions.FieldIndexAggregator; import datawave.query.jexl.functions.IdentityAggregator; import datawave.query.jexl.functions.KeyAdjudicator; import datawave.query.jexl.visitors.DelayedNonEventSubTreeVisitor; +import datawave.query.jexl.visitors.DocumentMatchFunctionRebuildingVisitor; import datawave.query.jexl.visitors.IteratorBuildingVisitor; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; import datawave.query.jexl.visitors.SatisfactionVisitor; import datawave.query.jexl.visitors.VariableNameVisitor; import 
datawave.query.postprocessing.tf.TFFactory; @@ -256,6 +261,7 @@ public void init(SortedKeyValueIterator source, Map op this.exceededOrEvaluationCache = new HashMap<>(); this.myEvaluationFunction = getJexlEvaluation(this.getQuery(), arithmetic); + this.setRetainDocumentColumnFamily(false); this.documentOptions = options; this.myEnvironment = env; @@ -975,7 +981,7 @@ protected Iterator> getEvaluation(NestedQueryIterator d // get the function we use for the tf functionality. Note we are // getting an additional source deep copy for this function - final Iterator>> itrWithContext; + Iterator>> itrWithContext; // TODO: this should be dynamic based on the query fields, not a flag passed to the iterator if (this.isTermFrequenciesRequired()) { @@ -997,6 +1003,17 @@ protected Iterator> getEvaluation(NestedQueryIterator d itrWithContext = Iterators.transform(tupleItr, new EmptyContext<>()); } + if (shouldCollectDocumentMatchContext(documentSource)) { + SortedKeyValueIterator documentMatchSource = getSourceDeepCopy("document-match context"); + DocumentMatchConfig documentMatchConfig = new DocumentMatchConfig(); + documentMatchConfig.setSource(documentMatchSource); + documentMatchConfig.setTimeFilter(getTimeFilter()); + documentMatchConfig.setLimits(getDocumentMatchLimits()); + Function>,Tuple3>> documentMatchFunction = buildDocumentMatchFunction( + documentMatchConfig); + itrWithContext = TraceIterators.transform(itrWithContext, documentMatchFunction, "Document Match Context Lookup"); + } + try { IteratorBuildingVisitor iteratorBuildingVisitor = createIteratorBuildingVisitor(getDocumentRange(documentSource), false, this.sortedUIDs); Multimap delayedNonEventFieldMap = DelayedNonEventSubTreeVisitor.getDelayedNonEventFieldMap(iteratorBuildingVisitor, @@ -1041,6 +1058,19 @@ protected Function,Tuple3>> return TFFactory.getFunction(tfConfig); } + /** + * This method exists so that extending classes can implement specific versions of the document-match context function. 
Specifically, so the + * {@link datawave.query.tld.TLDQueryIterator} can mark document-match collection as TLD-aware. + * + * @param documentMatchConfig + * a document-match configuration + * @return a document-match context function + */ + protected Function>,Tuple3>> buildDocumentMatchFunction( + DocumentMatchConfig documentMatchConfig) { + return DocumentMatchFactory.getFunction(documentMatchConfig); + } + private Range getDocumentRange(NestedQueryIterator documentSource) { if (null == documentSource) { return range; @@ -1076,16 +1106,18 @@ protected JexlEvaluation getJexlEvaluation(String query, NestedQueryIterator nestedQuery = documentSource.getNestedQuery(); if (null == nestedQuery) { - jexlEvaluationFunction = new JexlEvaluation(query, arithmetic); + jexlEvaluationFunction = new JexlEvaluation(rewrittenQuery, arithmetic); } else { jexlEvaluationFunction = nestedQuery.getEvaluation(); if (null == jexlEvaluationFunction) { - jexlEvaluationFunction = new JexlEvaluation(query, arithmetic); + jexlEvaluationFunction = new JexlEvaluation(rewriteDocumentMatchFunctions(nestedQuery.getQuery(), arithmetic), arithmetic); } } } @@ -1100,6 +1132,29 @@ protected JexlEvaluation getJexlEvaluation(String query, NestedQueryIterator documentSource) { + if (!isDocumentMatchContextRequired()) { + return false; + } + if (documentSource == null) { + return true; + } + NestedQuery nestedQuery = documentSource.getNestedQuery(); + if (nestedQuery == null || nestedQuery.getQuery() == null) { + return true; + } + ASTJexlScript nestedScript = ArithmeticJexlEngines.getEngine(getArithmetic()).parse(nestedQuery.getQuery()); + return DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(nestedScript); + } + protected LimitFields getLimitFields() { return new LimitFields(this.getLimitFieldsMap(), this.getMatchingFieldSets()); } diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java 
b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java index eb3e1485f99..c9782dc1681 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java @@ -67,6 +67,7 @@ import datawave.query.composite.CompositeMetadata; import datawave.query.exceptions.DatawaveFatalQueryException; import datawave.query.function.ConfiguredFunction; +import datawave.query.function.DocumentMatchContext; import datawave.query.function.DocumentPermutation; import datawave.query.function.Equality; import datawave.query.function.GetStartKey; @@ -284,6 +285,10 @@ public class QueryOptions implements OptionDescriber { public static final String TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS = "tf.agg.threshold"; + public static final String DOCUMENT_MATCH_CONTEXT_REQUIRED = "document.match.context.required"; + public static final String DOCUMENT_MATCH_MAX_ENCODED_SIZE = "document.match.max.encoded.size"; + public static final String DOCUMENT_MATCH_MAX_DECODED_SIZE = "document.match.max.decoded.size"; + public static final String FIELD_COUNTS = "field.counts"; public static final String TERM_COUNTS = "term.counts"; public static final String CARDINALITY_THRESHOLD = "cardinality.threshold"; @@ -339,6 +344,7 @@ public class QueryOptions implements OptionDescriber { protected EventDataQueryFilter eventEvaluationFilter; // filter specifically for event keys. 
required when performing a seeking aggregation protected EventDataQueryFilter eventFilter; + protected boolean retainDocumentColumnFamily = false; protected int maxEvaluationPipelines = 25; protected int maxPipelineCachedResults = 25; @@ -405,6 +411,7 @@ public class QueryOptions implements OptionDescriber { protected Map> nonIndexedDataTypeMap = Maps.newHashMap(); protected boolean termFrequenciesRequired = false; + protected boolean documentMatchContextRequired = false; protected Set termFrequencyFields = Collections.emptySet(); protected Set contentExpansionFields; @@ -467,6 +474,8 @@ public class QueryOptions implements OptionDescriber { // aggregation thresholds private int docAggregationThresholdMs = -1; private int tfAggregationThresholdMs = -1; + private int documentMatchMaxEncodedSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE; + private int documentMatchMaxDecodedSize = DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; private CountMap fieldCounts; private CountMap termCounts; @@ -533,6 +542,8 @@ public void deepCopy(QueryOptions other) { this.evaluationFilter = other.evaluationFilter; this.fiEvaluationFilter = other.fiEvaluationFilter; this.eventEvaluationFilter = other.eventEvaluationFilter; + this.eventFilter = other.eventFilter; + this.retainDocumentColumnFamily = other.retainDocumentColumnFamily; this.ivaratorCacheDirConfigs = (other.ivaratorCacheDirConfigs == null) ? 
null : new ArrayList<>(other.ivaratorCacheDirConfigs); this.hdfsSiteConfigURLs = other.hdfsSiteConfigURLs; @@ -563,6 +574,7 @@ public void deepCopy(QueryOptions other) { this.sortedUIDs = other.sortedUIDs; this.termFrequenciesRequired = other.termFrequenciesRequired; + this.documentMatchContextRequired = other.documentMatchContextRequired; this.termFrequencyFields = other.termFrequencyFields; this.contentExpansionFields = other.contentExpansionFields; @@ -592,6 +604,8 @@ public void deepCopy(QueryOptions other) { this.docAggregationThresholdMs = other.docAggregationThresholdMs; this.tfAggregationThresholdMs = other.tfAggregationThresholdMs; + this.documentMatchMaxEncodedSize = other.documentMatchMaxEncodedSize; + this.documentMatchMaxDecodedSize = other.documentMatchMaxDecodedSize; this.fieldCounts = other.fieldCounts; this.termCounts = other.termCounts; @@ -870,13 +884,25 @@ public EventDataQueryFilter getEventFilter() { // @formatter:off eventFilter = new EventDataQueryFieldFilter() .withFields(fields) - .withMaxNextCount(getEventNextSeek()); + .withMaxNextCount(getEventNextSeek()) + .withDocumentColumnFamily(retainDocumentColumnFamily); // @formatter:on } return eventFilter == null ? null : eventFilter.clone(); } + public void setRetainDocumentColumnFamily(boolean retainDocumentColumnFamily) { + if (this.retainDocumentColumnFamily != retainDocumentColumnFamily) { + this.retainDocumentColumnFamily = retainDocumentColumnFamily; + // invalidate the cached filters to force them to be rebuilt instead of + // caching stale clones created under the old settings. 
+ this.eventFilter = null; + this.evaluationFilter = null; + this.eventEvaluationFilter = null; + } + } + /** * Get the event fields to retain * @@ -1435,6 +1461,9 @@ public IteratorOptions describeOptions() { options.put(TF_NEXT_SEEK, "The number of next calls made by a Term Frequency data filter or aggregator before a seek is issued"); options.put(DOC_AGGREGATION_THRESHOLD_MS, "Document aggregations that exceed this threshold are logged as a warning"); options.put(TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS, "TermFrequency aggregations that exceed this threshold are logged as a warning"); + options.put(DOCUMENT_MATCH_CONTEXT_REQUIRED, "Whether the query requires gathering document-match context"); + options.put(DOCUMENT_MATCH_MAX_ENCODED_SIZE, "Maximum encoded d-column payload size, in bytes, to inspect for document:match evaluation"); + options.put(DOCUMENT_MATCH_MAX_DECODED_SIZE, "Maximum decoded d-column payload size, in bytes, to inspect for document:match evaluation"); options.put(FIELD_COUNTS, "Map of field counts from the global index"); options.put(TERM_COUNTS, "Map of term counts from the global index"); return new IteratorOptions(getClass().getSimpleName(), "Runs a query against the DATAWAVE tables", options, null); @@ -1655,6 +1684,18 @@ public boolean validateOptions(Map options) { this.tfAggregationThresholdMs = Integer.parseInt(options.get(TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS)); } + if (options.containsKey(DOCUMENT_MATCH_CONTEXT_REQUIRED)) { + this.documentMatchContextRequired = Boolean.parseBoolean(options.get(DOCUMENT_MATCH_CONTEXT_REQUIRED)); + } + + if (options.containsKey(DOCUMENT_MATCH_MAX_ENCODED_SIZE)) { + this.documentMatchMaxEncodedSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + } + + if (options.containsKey(DOCUMENT_MATCH_MAX_DECODED_SIZE)) { + this.documentMatchMaxDecodedSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_DECODED_SIZE)); + } + if (options.containsKey(DATATYPE_FILTER)) { String option = 
options.get(DATATYPE_FILTER); if (option != null && !option.isEmpty()) { @@ -2462,6 +2503,34 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) { this.tfAggregationThresholdMs = tfAggregationThresholdMs; } + public int getDocumentMatchMaxEncodedSize() { + return documentMatchMaxEncodedSize; + } + + public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) { + this.documentMatchMaxEncodedSize = documentMatchMaxEncodedSize; + } + + public int getDocumentMatchMaxDecodedSize() { + return documentMatchMaxDecodedSize; + } + + public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { + this.documentMatchMaxDecodedSize = documentMatchMaxDecodedSize; + } + + public boolean isDocumentMatchContextRequired() { + return documentMatchContextRequired; + } + + public void setDocumentMatchContextRequired(boolean documentMatchContextRequired) { + this.documentMatchContextRequired = documentMatchContextRequired; + } + + protected DocumentMatchContext.Limits getDocumentMatchLimits() { + return new DocumentMatchContext.Limits(documentMatchMaxEncodedSize, documentMatchMaxDecodedSize); + } + /** * Get an {@link Equality} * diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java index 7cba3801a7f..72a5b9fa04e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java @@ -51,6 +51,7 @@ import datawave.query.attributes.ValueTuple; import datawave.query.collections.FunctionalSet; import datawave.query.jexl.functions.ContentFunctionsDescriptor; +import datawave.query.jexl.functions.DocumentFunctions; import datawave.query.jexl.functions.QueryFunctions; import datawave.query.jexl.nodes.ExceededOr; import datawave.query.jexl.nodes.QueryPropertyMarker; @@ -128,6 +129,16 @@ public Object 
visit(ASTFunctionNode node, Object data) { addHits(result); + if (isDocumentMatchFunction(nodeString) && result instanceof String) { + if (hasSiblings(node)) { + resultMap.put(nodeString, result); + return result; + } + boolean matched = !((String) result).isEmpty(); + resultMap.put(nodeString, matched); + return matched; + } + // if the function stands alone, then it needs to return ag boolean // if the function is paired with a method that is called on its results (like 'size') then the // actual results must be returned. @@ -139,6 +150,10 @@ public Object visit(ASTFunctionNode node, Object data) { return result instanceof Collection ? !((Collection) result).isEmpty() : result; } + private boolean isDocumentMatchFunction(String nodeString) { + return nodeString.startsWith(DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE + ":" + DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME); + } + /** * Triggered when variable can not be resolved. * diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java new file mode 100644 index 00000000000..3c3baa192a9 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java @@ -0,0 +1,259 @@ +package datawave.query.jexl.functions; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Base64; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.zip.GZIPInputStream; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; +import org.apache.log4j.Logger; + +import com.google.gson.Gson; + +import datawave.query.function.DocumentMatchContext; + +/** + * Evaluation-phase JEXL functions for inspecting decoded shard-table 
{@code d}-column content. + *
<p>
+ * The current namespace exposes {@code document:match(...)} which decodes base64-encoded, gzip-compressed document payloads, performs case-sensitive literal + * substring matching, and returns a JSON object keyed first by matched string, then by view name, with starting character offsets as the leaf values. + * Per-document state is supplied explicitly through {@link DocumentMatchContext} by the surrounding evaluation flow. + */ +@JexlFunctions(descriptorFactory = "datawave.query.jexl.functions.DocumentFunctionsDescriptor") +public class DocumentFunctions { + private static final Logger log = Logger.getLogger(DocumentFunctions.class); + private static final Gson GSON = new Gson(); + + public static final String DOCUMENT_FUNCTION_NAMESPACE = "document"; + public static final String DOCUMENT_MATCH_FUNCTION_NAME = "match"; + public static final String DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME = "documentMatchContext"; + public static final String DOCUMENT_MATCHES = "DOCUMENT_MATCHES"; + private static final int DECODE_BUFFER_SIZE = 4096; + + /** + * Evaluates the internal form of {@code document:match(STRING)} across all eligible views for the current document. + * + * @param context + * per-document context supplied by the evaluation pipeline + * @param search + * literal substring to search for + * @return a JSON object for this invocation keyed by matched string and then by view name, or an empty string if no match is found + */ + public static String match(DocumentMatchContext context, String search) { + return match(null, context, search); + } + + /** + * Evaluates the internal form of {@code document:match(VIEWNAME, STRING)} against the current document. + *
<p>
+ * Matching is case-sensitive and literal. If {@code viewName} ends with {@code *}, it is treated as a prefix match against the view portion of the + * {@code d}-column qualifier. Oversized or undecodable payloads are skipped as non-matching. Matches from this invocation are merged into the document-wide + * result set stored in the supplied {@link DocumentMatchContext}. + * + * @param viewName + * optional exact or prefix-matched view selector; {@code null} means evaluate all views + * @param context + * per-document context supplied by the evaluation pipeline + * @param search + * literal substring to search for + * @return a JSON object for this invocation keyed by matched string and then by view name, or an empty string if no match is found + */ + public static String match(String viewName, DocumentMatchContext context, String search) { + if (context == null || search == null) { + if (log.isDebugEnabled()) { + log.debug("Skipping document:match evaluation because context or search term was null"); + } + return ""; + } + + if (log.isDebugEnabled()) { + log.debug("Evaluating document:match for search [" + search + "] view filter [" + viewName + "] across " + context.getdEntries().size() + + " d-column entries"); + } + + Map> matches = new LinkedHashMap<>(); + for (Entry entry : context.getdEntries()) { + String candidateView = extractViewName(entry.getKey()); + if (!matchesView(viewName, candidateView)) { + if (log.isDebugEnabled()) { + log.debug("Skipping d-column entry " + entry.getKey() + " because view [" + candidateView + "] does not match filter [" + viewName + "]"); + } + continue; + } + byte[] encoded = entry.getValue().get(); + if (encoded.length > context.getMaxEncodedValueSize()) { + log.debug("Skipping oversized d-column payload of " + encoded.length + " bytes for view " + candidateView); + continue; + } + + try { + String decoded = decode(encoded, context.getMaxDecodedValueSize()); + List offsets = findOffsets(decoded, search); + if 
(!offsets.isEmpty()) { + if (log.isDebugEnabled()) { + log.debug("document:match found offsets " + offsets + " for search [" + search + "] in view [" + candidateView + "] using key " + + entry.getKey()); + } + if (context.recordMatchingEntry(entry.getKey()) && context.shouldLogVisibilityMismatch()) { + log.info("document:match encountered differing d-column visibilities for document " + context.getFirstMatchingEntry().getRow() + '/' + + context.getFirstMatchingEntry().getColumnFamily() + "; using visibility from first matched d-column " + + context.getFirstMatchingEntry() + " and ignoring differing visibility on " + entry.getKey()); + context.markVisibilityMismatchLogged(); + } + matches.computeIfAbsent(candidateView, k -> new ArrayList<>()).addAll(offsets); + } else if (log.isDebugEnabled()) { + log.debug("document:match found no offsets for search [" + search + "] in view [" + candidateView + "] using key " + entry.getKey()); + } + } catch (IOException | IllegalArgumentException e) { + log.debug("Unable to decode d-column payload for view " + candidateView, e); + } + } + + context.mergeMatches(search, matches); + if (log.isDebugEnabled()) { + log.debug("document:match merged matches for search [" + search + "]: " + matches); + } + return toJson(search, matches); + } + + /** + * Extracts the view name from a {@code d}-column qualifier whose layout is expected to be {@code datatype\0uid\0view}. 
+ * + * @param key + * shard-table {@code d}-column key + * @return the extracted view name, or an empty string if the qualifier does not have the expected structure + */ + static String extractViewName(Key key) { + String cq = key.getColumnQualifier().toString(); + int firstNull = cq.indexOf('\0'); + if (firstNull < 0) { + return ""; + } + int secondNull = cq.indexOf('\0', firstNull + 1); + if (secondNull < 0 || secondNull + 1 >= cq.length()) { + return ""; + } + return cq.substring(secondNull + 1); + } + + /** + * Determines whether a candidate view satisfies the requested selector. + * + * @param expectedView + * requested view selector; {@code null} matches all views and a trailing {@code *} indicates prefix matching + * @param candidateView + * extracted view name for the current {@code d}-column + * @return {@code true} if the candidate view should be evaluated + */ + static boolean matchesView(String expectedView, String candidateView) { + if (expectedView == null) { + return true; + } + if (expectedView.endsWith("*")) { + String prefix = expectedView.substring(0, expectedView.length() - 1); + return candidateView.startsWith(prefix); + } + return expectedView.equals(candidateView); + } + + /** + * Decodes a base64-encoded, gzip-compressed {@code d}-column payload while enforcing a maximum decoded size. 
+ * + * @param encoded + * encoded payload bytes from the shard table + * @param maxDecodedValueSize + * maximum allowed decoded payload size in bytes + * @return the decoded UTF-8 content + * @throws IOException + * if the payload cannot be decoded or if the decoded size exceeds the configured limit + */ + static String decode(byte[] encoded, int maxDecodedValueSize) throws IOException { + byte[] decodedBytes; + try (ByteArrayInputStream bais = new ByteArrayInputStream(encoded)) { + decodedBytes = Base64.getMimeDecoder().decode(bais.readAllBytes()); + } + + try (ByteArrayInputStream decodedInput = new ByteArrayInputStream(decodedBytes); + GZIPInputStream gzipInputStream = new GZIPInputStream(decodedInput); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + byte[] buffer = new byte[DECODE_BUFFER_SIZE]; + int read; + int totalRead = 0; + while ((read = gzipInputStream.read(buffer)) >= 0) { + totalRead += read; + if (totalRead > maxDecodedValueSize) { + throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecodedValueSize + " bytes"); + } + baos.write(buffer, 0, read); + } + return baos.toString(StandardCharsets.UTF_8); + } catch (IOException e) { + if (decodedBytes.length > maxDecodedValueSize) { + throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecodedValueSize + " bytes", e); + } + return new String(decodedBytes, StandardCharsets.UTF_8); + } + } + + /** + * Finds all starting character offsets for a literal substring, including overlapping matches. 
+ * + * @param decoded + * decoded document content + * @param search + * literal substring to search for + * @return ordered starting offsets for each match + */ + static List findOffsets(String decoded, String search) { + List offsets = new ArrayList<>(); + if (search.isEmpty()) { + return offsets; + } + int index = decoded.indexOf(search); + while (index >= 0) { + offsets.add(index); + index = decoded.indexOf(search, index + 1); + } + return offsets; + } + + /** + * Serializes matches for one {@code document:match(...)} invocation to the JSON payload shape stored in {@code DOCUMENT_MATCHES}. + * + * @param search + * literal string matched by the invocation + * @param matches + * map of view name to ordered character offsets + * @return JSON string representation, or an empty string if the map is empty + */ + public static String toJson(String search, Map> matches) { + if (matches.isEmpty()) { + return ""; + } + Map>> payload = new LinkedHashMap<>(); + payload.put(search, matches); + return GSON.toJson(payload); + } + + /** + * Serializes merged document-wide matches to the JSON payload stored in {@code DOCUMENT_MATCHES}. 
+ * + * @param matches + * map of matched string to per-view ordered character offsets + * @return JSON string representation, or an empty string if the map is empty + */ + public static String toJson(Map>> matches) { + if (matches.isEmpty()) { + return ""; + } + return GSON.toJson(matches); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java new file mode 100644 index 00000000000..ea5cdf802d7 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java @@ -0,0 +1,107 @@ +package datawave.query.jexl.functions; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTFunctionNode; + +import datawave.query.attributes.AttributeFactory; +import datawave.query.config.ShardQueryConfiguration; +import datawave.query.jexl.ArithmeticJexlEngines; +import datawave.query.jexl.functions.arguments.JexlArgumentDescriptor; +import datawave.query.jexl.visitors.EventDataQueryExpressionVisitor; +import datawave.query.util.DateIndexHelper; +import datawave.query.util.MetadataHelper; +import datawave.webservice.query.exception.BadRequestQueryException; +import datawave.webservice.query.exception.DatawaveErrorCode; + +/** + * Argument-descriptor factory for the {@code document:*} JEXL namespace. + *

+ * {@code document:match(...)} is an evaluation-only function. It does not contribute field normalization rules, event-data filters, index expansion, or
+ * ivarator pushdown. This descriptor exists primarily to validate the namespace/function pairing and to return a descriptor that tells the planner to leave the
+ * function in the evaluation phase.
+ */
+@SuppressWarnings("unused")
+public class DocumentFunctionsDescriptor implements JexlFunctionArgumentDescriptorFactory {
+
+    /**
+     * Descriptor for {@code document:match(...)}.
+     * <p>
+     * The function is evaluated only after a candidate document has been materialized, so all index-planning hooks intentionally report no fields and no index
+     * query contribution.
+     */
+    public static class DocumentJexlArgumentDescriptor implements JexlArgumentDescriptor {
+        /**
+         * @return {@code TRUE_NODE}; the function adds nothing to the index query
+         */
+        @Override
+        public org.apache.commons.jexl3.parser.JexlNode getIndexQuery(ShardQueryConfiguration config, MetadataHelper helper, DateIndexHelper dateIndexHelper,
+                        Set<String> datatypeFilter) {
+            return TRUE_NODE;
+        }
+
+        /**
+         * Intentionally registers no event-data filters; {@code filterMap} is left untouched.
+         */
+        @Override
+        public void addFilters(AttributeFactory attributeFactory, Map<String,EventDataQueryExpressionVisitor.ExpressionFilter> filterMap) {}
+
+        /**
+         * @return an empty set; no argument names a field that needs normalization
+         */
+        @Override
+        public Set<String> fieldsForNormalization(MetadataHelper helper, Set<String> datatypeFilter, int arg) {
+            return Collections.emptySet();
+        }
+
+        /**
+         * @return an empty set; the function references no event fields
+         */
+        @Override
+        public Set<String> fields(MetadataHelper helper, Set<String> datatypeFilter) {
+            return Collections.emptySet();
+        }
+
+        /**
+         * @return an empty set; the function references no event fields
+         */
+        @Override
+        public Set<Set<String>> fieldSets(MetadataHelper helper, Set<String> datatypeFilter) {
+            return Collections.emptySet();
+        }
+
+        @Override
+        public boolean useOrForExpansion() {
+            return false;
+        }
+
+        @Override
+        public boolean regexArguments() {
+            return false;
+        }
+
+        @Override
+        public boolean allowIvaratorFiltering() {
+            return false;
+        }
+    }
+
+    /**
+     * Validates that the supplied function node represents {@code document:match(...)} and returns the evaluation-only descriptor for it.
+     *
+     * @param node
+     *            function node from the parsed JEXL tree
+     * @return descriptor describing the planning behavior for {@code document:match(...)}
+     * @throws IllegalArgumentException
+     *             if the namespace, function class, or argument count is invalid
+     */
+    @Override
+    public JexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) {
+        FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node);
+
+        if (!DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) {
+            BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.JEXLNODEDESCRIPTOR_NAMESPACE_UNEXPECTED,
+                            "Unexpected namespace " + visitor.namespace());
+            throw new IllegalArgumentException(qe);
+        }
+
+        // The lookup yields null when nothing is registered under the namespace. Comparing from the constant side turns that case into the
+        // documented IllegalArgumentException instead of a NullPointerException.
+        Class<?> functionClass = (Class<?>) ArithmeticJexlEngines.functions().get(visitor.namespace());
+        if (!DocumentFunctions.class.equals(functionClass)) {
+            BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.JEXLNODEDESCRIPTOR_NODE_FOR_FUNCTION,
+                            "Unexpected function class " + functionClass);
+            throw new IllegalArgumentException(qe);
+        }
+
+        // An unknown function name in this namespace is reported through the same error as a bad argument count.
+        // NOTE(review): DocumentMatchFunctionRebuildingVisitor (same patch) inserts a context identifier, so a rewritten two-argument call carries
+        // three arguments and would be rejected here. Confirm descriptors are only requested for trees that have not been rewritten.
+        if (!DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(visitor.name()) || visitor.args().isEmpty() || visitor.args().size() > 2) {
+            BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.WRONG_NUMBER_OF_ARGUMENTS,
+                            "Wrong number of arguments to document:match");
+            throw new IllegalArgumentException(qe);
+        }
+        return new DocumentJexlArgumentDescriptor();
+    }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java index f1a15d5a8b4..f3d03615de7 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java @@ -25,6 +25,7 @@ public class
JexlFunctionNamespaceRegistry { static { registeredFunctions.put(ContentFunctions.CONTENT_FUNCTION_NAMESPACE, ContentFunctions.class); + registeredFunctions.put(DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE, DocumentFunctions.class); registeredFunctions.put(NormalizationFunctions.NORMALIZATION_FUNCTION_NAMESPACE, NormalizationFunctions.class); registeredFunctions.put(EvaluationPhaseFilterFunctions.EVAL_PHASE_FUNCTION_NAMESPACE, EvaluationPhaseFilterFunctions.class); registeredFunctions.put(GroupingRequiredFilterFunctions.GROUPING_REQUIRED_FUNCTION_NAMESPACE, GroupingRequiredFilterFunctions.class); diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java new file mode 100644 index 00000000000..468ef44bf8f --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java @@ -0,0 +1,61 @@ +package datawave.query.jexl.visitors; + +import org.apache.commons.jexl3.parser.ASTFunctionNode; +import org.apache.commons.jexl3.parser.ASTJexlScript; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.JexlNodeFactory; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.jexl.functions.FunctionJexlNodeVisitor; + +/** + * Rewrites user-facing {@code document:match(...)} calls into the internal evaluation form that carries the reserved {@code documentMatchContext} argument + * explicitly. + *

+ * This mirrors the way {@code content:*} functions are evaluated with an explicit {@code termOffsetMap} argument, but preserves the external user syntax for
+ * document matching.
+ */
+public class DocumentMatchFunctionRebuildingVisitor extends RebuildingVisitor {
+
+    /**
+     * Reports whether the script calls {@code document:match(...)} anywhere.
+     *
+     * @param script
+     *            script to inspect
+     * @return {@code true} when at least one function node is a document-match call
+     */
+    public static boolean requiresDocumentMatchContext(ASTJexlScript script) {
+        return JexlASTHelper.getFunctionNodes(script).stream().map(FunctionJexlNodeVisitor::eval)
+                        .anyMatch(DocumentMatchFunctionRebuildingVisitor::isDocumentMatch);
+    }
+
+    /**
+     * Produces a copy of the script in which every {@code document:match(...)} call carries the reserved context identifier.
+     *
+     * @param script
+     *            script to rewrite
+     * @return rewritten script
+     */
+    public static ASTJexlScript rewrite(ASTJexlScript script) {
+        DocumentMatchFunctionRebuildingVisitor rebuilder = new DocumentMatchFunctionRebuildingVisitor();
+        return (ASTJexlScript) script.jjtAccept(rebuilder, null);
+    }
+
+    /**
+     * @param function
+     *            evaluated function node
+     * @return {@code true} when both the namespace and the function name identify {@code document:match}
+     */
+    private static boolean isDocumentMatch(FunctionJexlNodeVisitor function) {
+        return DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(function.namespace())
+                        && DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(function.name());
+    }
+
+    @Override
+    public Object visit(ASTFunctionNode node, Object data) {
+        FunctionJexlNodeVisitor function = FunctionJexlNodeVisitor.eval(node);
+        if (!isDocumentMatch(function)) {
+            return super.visit(node, data);
+        }
+
+        switch (function.args().size()) {
+            case 1:
+                // match(STRING) becomes match(context, STRING)
+                return FunctionJexlNodeVisitor.makeFunctionFrom(function.namespace(), function.name(),
+                                JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME),
+                                RebuildingVisitor.copy(function.args().get(0)));
+            case 2:
+                // match(VIEWNAME, STRING) becomes match(VIEWNAME, context, STRING)
+                return FunctionJexlNodeVisitor.makeFunctionFrom(function.namespace(), function.name(), RebuildingVisitor.copy(function.args().get(0)),
+                                JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME),
+                                RebuildingVisitor.copy(function.args().get(1)));
+            default:
+                // any other arity is copied through unchanged
+                return super.visit(node, data);
+        }
+    }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java b/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java index 7b9b3fea3a8..e343b531f06 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java @@ -30,6 +30,7 @@ import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.search.TermQuery; +import datawave.query.language.functions.lucene.DocumentMatch; import datawave.query.language.functions.lucene.EvaluationOnly; import datawave.query.language.functions.lucene.Exclude; import datawave.query.language.functions.lucene.Include; @@ -48,7 +49,7 @@ @Deprecated public class FunctionQueryNodeBuilder implements QueryBuilder { - private Map allowedFunctionMap = Collections.synchronizedMap(new HashMap<>()); + private final Map allowedFunctionMap = Collections.synchronizedMap(new HashMap<>()); public FunctionQueryNodeBuilder() { addFunction(new IsNull()); @@ -57,6 +58,7 @@ public FunctionQueryNodeBuilder() { addFunction(new Exclude()); addFunction(new Text()); addFunction(new Occurrence()); + addFunction(new DocumentMatch()); addFunction(new EvaluationOnly()); } diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java new file mode 100644 index 00000000000..0c54d1a0c59 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java @@ -0,0 +1,61 @@ +package
datawave.query.language.functions.jexl; + +import java.text.MessageFormat; +import java.util.ArrayList; + +import datawave.query.language.functions.QueryFunction; +import datawave.webservice.query.exception.BadRequestQueryException; +import datawave.webservice.query.exception.DatawaveErrorCode; + +/** + * JEXL-language representation of {@code document:match(...)}. + *

+ * This function is produced by the query-language layer after parsing or after Lucene-to-JEXL translation. It validates the supported one-argument and
+ * two-argument forms and renders the canonical JEXL syntax consumed by the runtime query planner.
+ */
+public class DocumentMatch extends JexlQueryFunction {
+    public DocumentMatch() {
+        super("DOCUMENT_MATCH", new ArrayList<>());
+    }
+
+    /**
+     * Accepts exactly one argument ({@code STRING}) or two arguments ({@code VIEWNAME, STRING}); anything else is rejected.
+     *
+     * @throws IllegalArgumentException
+     *             if the parameter list is missing, empty, or longer than two entries
+     */
+    @Override
+    public void validate() throws IllegalArgumentException {
+        final int argumentCount = (this.parameterList == null) ? 0 : this.parameterList.size();
+        if (argumentCount >= 1 && argumentCount <= 2) {
+            return;
+        }
+        BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name));
+        throw new IllegalArgumentException(qe);
+    }
+
+    /**
+     * Renders this function as {@code document:match(...)}, passing each argument through {@code escapeString} and separating arguments with {@code ", "}.
+     *
+     * @return JEXL representation of this function
+     */
+    @Override
+    public String toString() {
+        StringBuilder rendered = new StringBuilder("document:match(");
+        String separator = "";
+        for (int position = 0; position < parameterList.size(); position++) {
+            rendered.append(separator).append(escapeString(parameterList.get(position)));
+            // every argument after the first is preceded by a comma
+            separator = ", ";
+        }
+        return rendered.append(")").toString();
+    }
+
+    /**
+     * @return a new, argument-less instance of this function
+     */
+    @Override
+    public QueryFunction duplicate() {
+        return new DocumentMatch();
+    }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java new file mode 100644 index 00000000000..84c7a155a2d --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java @@ -0,0 +1,102 @@ +package datawave.query.language.functions.lucene; + +import java.text.MessageFormat; +import java.util.ArrayList; + +import datawave.query.language.functions.QueryFunction; +import datawave.query.search.WildcardFieldedFilter; +import datawave.webservice.query.exception.BadRequestQueryException; +import datawave.webservice.query.exception.DatawaveErrorCode; + +/** + * Lucene-language representation of {@code #DOCUMENT_MATCH(...)}. + *

+ * This class exists in the Lucene query-language layer so the parser can recognize the function and carry it through the same fielded-filter machinery used by
+ * other Lucene functions before the query is rendered into JEXL. The runtime semantics are still provided by {@code document:match(...)} in the evaluation
+ * phase.
+ */
+@Deprecated
+public class DocumentMatch extends LuceneQueryFunction {
+    /**
+     * Fielded filter on the synthetic field {@code document} whose string form is {@code document:} followed by the selector.
+     */
+    private static class DocumentMatchFilter extends WildcardFieldedFilter {
+        private final String renderedQuery;
+
+        /**
+         * @param selector
+         *            selector text, e.g. the output of {@code buildSelector()}
+         */
+        DocumentMatchFilter(String selector) {
+            super(true, WildcardFieldedFilter.BooleanType.AND);
+            setField("document");
+            setSelector(selector);
+            this.renderedQuery = "document:" + selector;
+            // keep the inherited query text identical to what toString() reports
+            this.query = renderedQuery;
+        }
+
+        @Override
+        public String toString() {
+            return renderedQuery;
+        }
+    }
+
+    public DocumentMatch() {
+        super("DOCUMENT_MATCH", new ArrayList<>());
+    }
+
+    /**
+     * Validates that {@code #DOCUMENT_MATCH(...)} received either one argument ({@code STRING}) or two arguments ({@code VIEWNAME, STRING}).
+     *
+     * @throws IllegalArgumentException
+     *             if the function has no arguments or more than two arguments
+     */
+    @Override
+    public void validate() throws IllegalArgumentException {
+        if (this.parameterList == null || this.parameterList.isEmpty() || this.parameterList.size() > 2) {
+            BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name));
+            throw new IllegalArgumentException(qe);
+        }
+    }
+
+    /**
+     * Initializes the Lucene-layer fielded-filter representation used during parsing.
+     * <p>
+     * The synthetic {@code document:match(...)} selector created here is a parser-level representation only; actual evaluation is deferred until the translated
+     * JEXL query runs against candidate documents.
+     *
+     * @param parameterList
+     *            parsed function arguments
+     * @param depth
+     *            function-node depth in the Lucene parse tree
+     * @param parent
+     *            parent query node
+     * @throws IllegalArgumentException
+     *             if initialization fails
+     */
+    @Override
+    public void initialize(java.util.List<String> parameterList, int depth, org.apache.lucene.queryparser.flexible.core.nodes.QueryNode parent)
+                    throws IllegalArgumentException {
+        super.initialize(parameterList, depth, parent);
+        // the selector is built from the parameter list after the superclass has processed the arguments
+        this.fieldedFilter = new DocumentMatchFilter(buildSelector());
+    }
+
+    /**
+     * Builds the parser-layer selector text {@code match(...)} from the raw Lucene arguments, joined with {@code ", "} and without any escaping.
+     *
+     * @return selector text used by the synthetic fielded filter
+     */
+    private String buildSelector() {
+        return "match(" + String.join(", ", parameterList) + ")";
+    }
+
+    /**
+     * @return a fresh function instance for parser duplication
+     */
+    @Override
+    public QueryFunction duplicate() {
+        return new DocumentMatch();
+    }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 3c20abd2401..a06fd608091 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -119,6 +119,7 @@ import datawave.query.jexl.visitors.ConjunctionEliminationVisitor; import datawave.query.jexl.visitors.DepthVisitor; import datawave.query.jexl.visitors.DisjunctionEliminationVisitor; +import
datawave.query.jexl.visitors.DocumentMatchFunctionRebuildingVisitor; import datawave.query.jexl.visitors.ExecutableDeterminationVisitor; import datawave.query.jexl.visitors.ExecutableDeterminationVisitor.STATE; import datawave.query.jexl.visitors.ExecutableExpansionVisitor; @@ -720,6 +721,7 @@ private void configureIterator(ShardQueryConfiguration config, IteratorSetting c addOption(cfg, QueryOptions.HIT_LIST, Boolean.toString(config.isHitList()), false); addOption(cfg, QueryOptions.TERM_FREQUENCY_FIELDS, Joiner.on(',').join(config.getQueryTermFrequencyFields()), false); addOption(cfg, QueryOptions.TERM_FREQUENCIES_REQUIRED, Boolean.toString(config.isTermFrequenciesRequired()), false); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED, Boolean.toString(config.isDocumentMatchContextRequired()), false); addOption(cfg, QueryOptions.QUERY, newQueryString, false); addOption(cfg, QueryOptions.QUERY_ID, config.getQuery().getId().toString(), false); addOption(cfg, QueryOptions.FULL_TABLE_SCAN_ONLY, Boolean.toString(isFullTable), false); @@ -1512,6 +1514,12 @@ protected void timedCheckForTokenizedFields(QueryStopwatch timers, String stage, } } + config.setDocumentMatchContextRequired(DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(config.getQueryTree())); + if (log.isDebugEnabled()) { + logQuery(config.getQueryTree(), "Computed that the query " + (config.isDocumentMatchContextRequired() ? 
"requires" : "does not require") + + " document-match context lookup"); + } + stopwatch.stop(); } @@ -2432,6 +2440,8 @@ protected Future loadQueryIterator(final MetadataHelper metadat addOption(cfg, QueryOptions.MAX_PIPELINE_CACHED_RESULTS, Integer.toString(config.getMaxPipelineCachedResults()), false); addOption(cfg, QueryOptions.MAX_IVARATOR_SOURCES, Integer.toString(config.getMaxIvaratorSources()), false); addOption(cfg, QueryOptions.MAX_IVARATOR_SOURCE_WAIT, Long.toString(config.getMaxIvaratorSourceWait()), false); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedSize()), false); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE, Integer.toString(config.getDocumentMatchMaxDecodedSize()), false); if (config.getYieldThresholdMs() != Long.MAX_VALUE && config.getYieldThresholdMs() > 0) { addOption(cfg, QueryOptions.YIELD_THRESHOLD_MS, Long.toString(config.getYieldThresholdMs()), false); diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java index 34e61842ada..1628da3c4bc 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java @@ -21,6 +21,7 @@ * This filter only operates on event keys. 
*/ public class EventDataQueryFieldFilter implements EventDataQueryFilter { + private static final String DOCUMENT_COLUMN_FAMILY = "d"; private Key document = null; // the number of times next is called before issuing a seek @@ -33,6 +34,7 @@ public class EventDataQueryFieldFilter implements EventDataQueryFilter { // the set of fields to retain private TreeSet fields; private final EventKey parser; + private boolean retainDocumentColumnFamily = false; /** * Default constructor @@ -53,6 +55,7 @@ public EventDataQueryFieldFilter(EventDataQueryFieldFilter other) { } this.maxNextCount = other.maxNextCount; this.fields = new TreeSet<>(other.fields); + this.retainDocumentColumnFamily = other.retainDocumentColumnFamily; // need to create a separate parser as the parser is not thread safe this.parser = new EventKey(); // do not copy nextCount or currentField because that is internal state @@ -84,6 +87,11 @@ public EventDataQueryFieldFilter withMaxNextCount(int maxNextCount) { return this; } + public EventDataQueryFieldFilter withDocumentColumnFamily(boolean retainDocumentColumnFamily) { + this.retainDocumentColumnFamily = retainDocumentColumnFamily; + return this; + } + @Override public void startNewDocument(Key document) { this.document = document; @@ -126,6 +134,12 @@ public boolean peek(@Nullable Map.Entry entry) { * @return true if the key should be retained */ private boolean apply(Key key, boolean update) { + if (retainDocumentColumnFamily && DOCUMENT_COLUMN_FAMILY.equals(key.getColumnFamily().toString())) { + nextCount = 0; + currentField = null; + return true; + } + parser.parse(key); String field = parser.getField(); field = JexlASTHelper.deconstructIdentifier(field); diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java index 2a060393b8f..33dabcb78ee 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java +++ 
b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java @@ -3447,6 +3447,22 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) { getConfig().setTfAggregationThresholdMs(tfAggregationThresholdMs); } + public int getDocumentMatchMaxEncodedSize() { + return getConfig().getDocumentMatchMaxEncodedSize(); + } + + public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) { + getConfig().setDocumentMatchMaxEncodedSize(documentMatchMaxEncodedSize); + } + + public int getDocumentMatchMaxDecodedSize() { + return getConfig().getDocumentMatchMaxDecodedSize(); + } + + public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { + getConfig().setDocumentMatchMaxDecodedSize(documentMatchMaxDecodedSize); + } + public boolean getPruneQueryOptions() { return getConfig().getPruneQueryOptions(); } diff --git a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java index a6ecc78aebf..72457b20f1f 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java @@ -27,6 +27,8 @@ import datawave.query.attributes.AttributeFactory; import datawave.query.attributes.Document; +import datawave.query.function.DocumentMatchConfig; +import datawave.query.function.DocumentMatchFactory; import datawave.query.function.Equality; import datawave.query.function.RangeProvider; import datawave.query.function.TLDEquality; @@ -266,6 +268,13 @@ protected Function,Tuple3>> return TFFactory.getFunction(tfConfig); } + @Override + protected Function>,Tuple3>> buildDocumentMatchFunction( + DocumentMatchConfig documentMatchConfig) { + documentMatchConfig.setTld(true); + return DocumentMatchFactory.getFunction(documentMatchConfig); + } + /** * Get a {@link TLDRangeProvider} * diff --git 
a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java new file mode 100644 index 00000000000..ef85ec77738 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java @@ -0,0 +1,278 @@ +package datawave.query; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.net.URL; +import java.nio.file.Path; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.TimeZone; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.security.tokens.PasswordToken; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.minicluster.MiniAccumuloCluster; +import org.apache.accumulo.minicluster.MiniAccumuloConfig; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.ComponentScan; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import com.google.common.base.Preconditions; + +import datawave.ingest.data.TypeRegistry; +import datawave.query.attributes.Attribute; +import datawave.query.attributes.Document; +import datawave.query.function.DocumentMatchContext; +import 
datawave.query.iterator.ivarator.IvaratorCacheDirConfig; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.tables.ShardQueryLogic; +import datawave.query.util.AbstractQueryTest; +import datawave.query.util.WiseGuysIngest; + +@ExtendWith(SpringExtension.class) +@ComponentScan(basePackages = "datawave.query") +// @formatter:off +@ContextConfiguration(locations = { + "classpath:datawave/query/QueryLogicFactory.xml", + "classpath:beanRefContext.xml", + "classpath:MarkingFunctionsContext.xml", + "classpath:MetadataHelperContext.xml", + "classpath:CacheContext.xml"}) +// @formatter:on +/** + * MiniAccumulo-backed integration tests for {@code document:match(...)}. + *

+ * These tests exercise the full query path, including query parsing, planner wiring, shard-table document materialization, evaluation-phase document matching,
+ * and publication of the {@code DOCUMENT_MATCHES} attribute on returned documents.
+ */
+public class DocumentMatchQueryTest extends AbstractQueryTest {
+
+    private static final Logger log = Logger.getLogger(DocumentMatchQueryTest.class);
+    private static final Authorizations auths = new Authorizations("ALL");
+    private static final String PASSWORD = "password";
+
+    @TempDir
+    public static Path folder;
+
+    protected static MiniAccumuloCluster mac;
+    protected static AccumuloClient client;
+
+    @Autowired
+    @Qualifier("EventQuery")
+    protected ShardQueryLogic logic;
+
+    // UUID -> exact DOCUMENT_MATCHES JSON expected on that result; results whose UUID is absent are not checked
+    private final Map<String,String> expectedDocumentMatches = new HashMap<>();
+    // null means "do not assert on the planner flag" for the current test
+    private Boolean expectedDocumentMatchContextRequired;
+
+    @Override
+    public ShardQueryLogic getLogic() {
+        return logic;
+    }
+
+    @Override
+    public Authorizations getAuths() {
+        return auths;
+    }
+
+    /**
+     * Starts a single-tserver MiniAccumulo cluster under the temp directory and ingests the WiseGuys data set once for the whole class.
+     */
+    @BeforeAll
+    public static void beforeAll() throws Exception {
+        System.setProperty("type.metadata.dir", folder.toFile().getAbsolutePath());
+
+        MiniAccumuloConfig cfg = new MiniAccumuloConfig(folder.toFile(), PASSWORD);
+        cfg.setNumTservers(1);
+        mac = new MiniAccumuloCluster(cfg);
+        mac.start();
+
+        client = mac.createAccumuloClient("root", new PasswordToken(PASSWORD));
+        client.securityOperations().changeUserAuthorizations("root", auths);
+        new QueryTestTableHelper(client, log);
+        WiseGuysIngest.writeItAll(client, WiseGuysIngest.WhatKindaRange.DOCUMENT);
+    }
+
+    /**
+     * Resets the shared query logic to the baseline used by every test, including the default decoded-size limit that one test lowers.
+     */
+    @BeforeEach
+    public void beforeEach() {
+        TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
+        setClientForTest(client);
+
+        URL hadoopConfig = this.getClass().getResource("/testhadoop.config");
+        Preconditions.checkNotNull(hadoopConfig);
+        logic.setHdfsSiteConfigURLs(hadoopConfig.toExternalForm());
+        logic.setIvaratorCacheDirConfigs(Collections.singletonList(new IvaratorCacheDirConfig(folder.toUri().toString())));
+        logic.setMaxFieldIndexRangeSplit(1);
+        logic.setCollapseUids(false);
+        logic.setFullTableScanEnabled(true);
+        logic.setDocumentMatchMaxDecodedSize(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE);
+
+        givenParameter(QueryParameters.HIT_LIST, "true");
+        logic.setHitList(true);
+        givenDate("20091231", "20150101");
+    }
+
+    @AfterEach
+    public void afterEach() {
+        super.afterEach();
+        expectedDocumentMatches.clear();
+        expectedDocumentMatchContextRequired = null;
+    }
+
+    /**
+     * Releases the client and the cluster; the type registry is reset even when shutdown fails.
+     */
+    @AfterAll
+    public static void afterAll() throws Exception {
+        try {
+            if (client != null) {
+                client.close();
+            }
+            if (mac != null) {
+                mac.stop();
+            }
+        } finally {
+            TypeRegistry.reset();
+        }
+    }
+
+    @Override
+    protected void extraConfigurations() {
+        // no-op
+    }
+
+    /**
+     * Verifies that returned documents expose the expected {@code DOCUMENT_MATCHES} payload when the current test configured one.
+     */
+    @Override
+    protected void extraAssertions() {
+        if (expectedDocumentMatchContextRequired != null) {
+            if (expectedDocumentMatchContextRequired) {
+                assertTrue(logic.getConfig().isDocumentMatchContextRequired(), "planned query did not require document-match context lookup");
+            } else {
+                assertFalse(logic.getConfig().isDocumentMatchContextRequired(), "planned query unexpectedly required document-match context lookup");
+            }
+        }
+
+        for (Document result : results) {
+            Attribute<?> uuid = result.get("UUID");
+            assertNotNull(uuid, "result did not contain UUID");
+
+            String uuidValue = getUUID(uuid);
+            String expected = expectedDocumentMatches.get(uuidValue);
+            if (expected != null) {
+                Attribute<?> matches = result.get(DocumentFunctions.DOCUMENT_MATCHES);
+                assertNotNull(matches, "result did not contain DOCUMENT_MATCHES");
+                assertEquals(expected, matches.getData().toString());
+            }
+        }
+    }
+
+    /**
+     * Verifies that JEXL {@code document:match(STRING)} evaluates across all views and returns the expected offsets.
+     */
+    @Test
+    public void testDocumentMatchJexlAllViews() throws Exception {
+        disableQueryPlanAssertion();
+        givenQuery("UUID == 'CAPONE' && document:match('can')");
+        expectedDocumentMatchContextRequired = true;
+        expectResultCount(1);
+        expectUUIDs(java.util.Set.of("CAPONE"));
+        expectedDocumentMatches.put("CAPONE", "{\"can\":{\"CONTENT\":[4,61],\"CONTENT2\":[27]}}");
+        planAndExecuteQuery();
+    }
+
+    /**
+     * Verifies that JEXL {@code document:match(VIEWNAME, STRING)} restricts evaluation to the named view.
+     */
+    @Test
+    public void testDocumentMatchJexlSpecificView() throws Exception {
+        disableQueryPlanAssertion();
+        givenQuery("UUID == 'CAPONE' && document:match('CONTENT2', 'lawyer')");
+        expectedDocumentMatchContextRequired = true;
+        expectResultCount(1);
+        expectUUIDs(java.util.Set.of("CAPONE"));
+        expectedDocumentMatches.put("CAPONE", "{\"lawyer\":{\"CONTENT2\":[2]}}");
+        planAndExecuteQuery();
+    }
+
+    /**
+     * Verifies that multiple JEXL {@code document:match(...)} calls contribute to one merged {@code DOCUMENT_MATCHES} payload.
+     */
+    @Test
+    public void testDocumentMatchJexlMergesMatchesAcrossCalls() throws Exception {
+        disableQueryPlanAssertion();
+        givenQuery("UUID == 'CAPONE' && document:match('CONTENT', 'can') && document:match('CONTENT2', 'lawyer')");
+        expectedDocumentMatchContextRequired = true;
+        expectResultCount(1);
+        expectUUIDs(java.util.Set.of("CAPONE"));
+        expectedDocumentMatches.put("CAPONE", "{\"can\":{\"CONTENT\":[4,61]},\"lawyer\":{\"CONTENT2\":[2]}}");
+        planAndExecuteQuery();
+    }
+
+    /**
+     * Verifies Lucene {@code #DOCUMENT_MATCH(...)} translation and wildcard view-prefix behavior in the full query path.
+     */
+    @Test
+    public void testDocumentMatchLuceneWildcardView() throws Exception {
+        disableQueryPlanAssertion();
+        givenParameter(QueryParameters.QUERY_SYNTAX, "LUCENE");
+        givenQuery("UUID:CAPONE AND #DOCUMENT_MATCH(CONTENT*,can)");
+        expectedDocumentMatchContextRequired = true;
+        expectResultCount(1);
+        expectUUIDs(java.util.Set.of("CAPONE"));
+        expectedDocumentMatches.put("CAPONE", "{\"can\":{\"CONTENT\":[4,61],\"CONTENT2\":[27]}}");
+        planAndExecuteQuery();
+    }
+
+    /**
+     * Verifies that a non-matching document-match term filters the document out of the result set.
+     */
+    @Test
+    public void testDocumentMatchNoMatchFiltersDocument() throws Exception {
+        disableQueryPlanAssertion();
+        givenQuery("UUID == 'CAPONE' && document:match('missing')");
+        expectedDocumentMatchContextRequired = true;
+        expectResultCount(0);
+        planAndExecuteQuery();
+    }
+
+    /**
+     * Verifies that document-match is case-sensitive in the full query path.
+     */
+    @Test
+    public void testDocumentMatchIsCaseSensitive() throws Exception {
+        disableQueryPlanAssertion();
+        givenQuery("UUID == 'CAPONE' && document:match('Can')");
+        expectedDocumentMatchContextRequired = true;
+        expectResultCount(0);
+        planAndExecuteQuery();
+    }
+
+    /**
+     * Verifies that decoded payloads larger than the configured limit are skipped as non-matching during end-to-end query execution.
+     */
+    @Test
+    public void testDocumentMatchOversizedDecodedPayloadIsSkipped() throws Exception {
+        disableQueryPlanAssertion();
+        logic.setDocumentMatchMaxDecodedSize(8);
+        givenQuery("UUID == 'CAPONE' && document:match('can')");
+        expectedDocumentMatchContextRequired = true;
+        expectResultCount(0);
+        planAndExecuteQuery();
+    }
+
+    /**
+     * Verifies that queries without {@code document:match(...)} do not request document-match context lookup in the integration harness.
+     */
+    @Test
+    public void testQueryWithoutDocumentMatchDoesNotRequireContext() throws Exception {
+        disableQueryPlanAssertion();
+        givenQuery("UUID == 'CAPONE'");
+        expectedDocumentMatchContextRequired = false;
+        expectResultCount(1);
+        expectUUIDs(java.util.Set.of("CAPONE"));
+        planAndExecuteQuery();
+    }
+}
diff --git a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java index 1499ce689c1..186dedd591c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java @@ -46,6 +46,7 @@ import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.config.annotation.AllHitsQueryConfig; +import datawave.query.function.DocumentMatchContext; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; import datawave.query.iterator.logic.ContentSummaryIterator; import datawave.query.iterator.logic.TermFrequencyExcerptIterator; @@ -665,6 +666,12 @@ public void setUp() throws Exception { updatedValues.put("allHitsQueryConfig", new AllHitsQueryConfig()); defaultValues.put("originalJexlQuery", null); updatedValues.put("originalJexlQuery", "FIELD == 'VALUE'"); + defaultValues.put("documentMatchMaxEncodedSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE); + updatedValues.put("documentMatchMaxEncodedSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE + 1); + defaultValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE); + updatedValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE + 1); + defaultValues.put("documentMatchContextRequired", false); + updatedValues.put("documentMatchContextRequired", true); } private Query createQuery(String query) { diff --git
a/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java new file mode 100644 index 00000000000..718b526897e --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java @@ -0,0 +1,141 @@ +package datawave.query.function; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.junit.Test; + +import com.google.common.collect.Lists; + +import datawave.query.attributes.Document; +import datawave.query.attributes.DocumentKey; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.util.Tuple3; +import datawave.query.util.Tuples; + +/** + * Focused tests for {@link DocumentMatchContextFunction}. + */ +public class DocumentMatchContextFunctionTest { + + /** + * Verifies that only matching {@code d}-column entries for the current document key are added to the evaluation side-channel. 
+ */ + @Test + public void testCollectsOnlyCurrentDocumentColumns() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0uid\0META"), new Value("two".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0other\0BODY"), new Value("skip".getBytes())), + Map.entry(new Key("20240101_0", "tf", "datatype\0uid\0BODY"), new Value("skip".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(1234, 5678)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertEquals(2, context.getdEntries().size()); + assertEquals(1234, context.getMaxEncodedValueSize()); + assertEquals(5678, context.getMaxDecodedValueSize()); + } + + /** + * Verifies that the function produces an empty context entry when a document has no retained {@code d}-column values. 
+ */ + @Test + public void testCollectsEmptyContextWhenNoDocumentColumnsExist() { + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(Collections.emptyList())); + config.setLimits(new DocumentMatchContext.Limits(10, 20)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertTrue(context.getdEntries().isEmpty()); + } + + /** + * Verifies that document-match context collection honors explicit {@code DOCKEY} attributes instead of assuming that the tuple key is the only event key. + */ + @Test + public void testCollectsColumnsForDocumentKeysFromDocument() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0child\0BODY"), new Value("two".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0other\0BODY"), new Value("skip".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(10, 20)); + config.setTld(true); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Document document = new Document(); + document.put(Document.DOCKEY_FIELD_NAME, new DocumentKey(new Key("20240101_0", "datatype\0uid"), false)); + document.put(Document.DOCKEY_FIELD_NAME, new DocumentKey(new Key("20240101_0", "datatype\0child"), false)); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0root"), document, Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) 
result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertEquals(2, context.getdEntries().size()); + } + + private static class ListBackedIterator implements SortedKeyValueIterator { + private final List> entries; + private int index = -1; + + private ListBackedIterator(List> entries) { + this.entries = entries; + } + + @Override + public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) {} + + @Override + public boolean hasTop() { + return index >= 0 && index < entries.size(); + } + + @Override + public void next() { + index++; + } + + @Override + public void seek(Range range, java.util.Collection columnFamilies, boolean inclusive) { + index = 0; + while (index < entries.size() && !range.contains(entries.get(index).getKey())) { + index++; + } + } + + @Override + public Key getTopKey() { + return entries.get(index).getKey(); + } + + @Override + public Value getTopValue() { + return entries.get(index).getValue(); + } + + @Override + public SortedKeyValueIterator deepCopy(IteratorEnvironment env) { + return new ListBackedIterator(entries); + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java index b4af760243a..138c106bd7f 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java @@ -11,6 +11,8 @@ import java.util.Map; import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.commons.jexl3.parser.ASTJexlScript; import org.junit.Test; import com.google.common.collect.Maps; @@ -24,7 +26,11 @@ import datawave.query.attributes.Numeric; import datawave.query.jexl.DatawaveJexlContext; import datawave.query.jexl.HitListArithmetic; +import datawave.query.jexl.JexlASTHelper; +import 
datawave.query.jexl.functions.DocumentFunctions; import datawave.query.jexl.functions.TermFrequencyList; +import datawave.query.jexl.visitors.DocumentMatchFunctionRebuildingVisitor; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; import datawave.query.postprocessing.tf.TermOffsetMap; import datawave.query.util.Tuple3; @@ -192,6 +198,61 @@ public void testContentPhraseFunction() { assertTrue(foundPhrase); } + @Test + public void testDocumentMatchAddsDocumentAttribute() { + String query = "FOO == 'bar' && document:match('car')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + DatawaveJexlContext context = new DatawaveJexlContext(); + d.visit(Collections.singleton("FOO"), context); + context.set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, + new DocumentMatchContext(List.of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), + new org.apache.accumulo.core.data.Value(buildEncodedValue("scar car")))), 1024)); + + assertEvaluation(query, docKey, d, context); + assertEquals("{\"car\":{\"BODY\":[1,5]}}", ((Content) d.get(DocumentFunctions.DOCUMENT_MATCHES)).getContent()); + assertEquals(new ColumnVisibility("A"), d.get(DocumentFunctions.DOCUMENT_MATCHES).getColumnVisibility()); + } + + @Test + public void testDocumentMatchMergesDocumentAttributeAcrossCalls() { + String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('CONTENT2', 'lawyer')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + DatawaveJexlContext context = new DatawaveJexlContext(); + d.visit(Collections.singleton("FOO"), context); + context.set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, + new DocumentMatchContext(List.of( + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), + new 
org.apache.accumulo.core.data.Value(buildEncodedValue("scar car"))), + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "A"), + new org.apache.accumulo.core.data.Value(buildEncodedValue("lawyer car")))), + 1024)); + + assertEvaluation(query, docKey, d, context); + assertEquals("{\"car\":{\"BODY\":[1,5]},\"lawyer\":{\"CONTENT2\":[0]}}", ((Content) d.get(DocumentFunctions.DOCUMENT_MATCHES)).getContent()); + assertEquals(new ColumnVisibility("A"), d.get(DocumentFunctions.DOCUMENT_MATCHES).getColumnVisibility()); + } + + private byte[] buildEncodedValue(String content) { + try { + java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); + java.io.OutputStream b64s = java.util.Base64.getEncoder().wrap(bos); + java.util.zip.GZIPOutputStream gzip = new java.util.zip.GZIPOutputStream(b64s); + gzip.write(content.getBytes()); + gzip.close(); + b64s.close(); + bos.close(); + return bos.toByteArray(); + } catch (java.io.IOException e) { + throw new RuntimeException(e); + } + } + @Test public void testCompareFunction() { // eq op @@ -298,15 +359,27 @@ private void assertEvaluation(String query, Key key, Document d, DatawaveJexlCon } private void assertEvaluation(String query, Key key, Document d, DatawaveJexlContext context, boolean expected) { - JexlEvaluation evaluation = new JexlEvaluation(query); + JexlEvaluation evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query)); boolean result = evaluation.apply(new Tuple3<>(key, d, context)); assertEquals(expected, result); - evaluation = new JexlEvaluation(query, new HitListArithmetic()); + evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query), new HitListArithmetic()); result = evaluation.apply(new Tuple3<>(key, d, context)); assertEquals(expected, result); } + private String rewriteDocumentMatchFunctions(String query) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + if 
(!DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(script)) { + return query; + } + return JexlStringBuildingVisitor.buildQueryWithoutParse(DocumentMatchFunctionRebuildingVisitor.rewrite(script)); + } catch (org.apache.commons.jexl3.parser.ParseException e) { + throw new RuntimeException(e); + } + } + private TermFrequencyList buildTfList(String field, int... offsets) { TermFrequencyList.Zone zone = buildZone(field); List position = buildTermWeightPositions(offsets); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java new file mode 100644 index 00000000000..102b916451c --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java @@ -0,0 +1,306 @@ +package datawave.query.jexl.functions; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.OutputStream; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPOutputStream; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.spi.LoggingEvent; +import org.junit.Test; + +import datawave.query.function.DocumentMatchContext; + +/** + * Unit tests for {@link DocumentFunctions} covering view selection, matching semantics, payload limits, merged results, and visibility handling. 
+ */ +public class DocumentFunctionsTest { + private final Logger logger = Logger.getLogger(DocumentFunctions.class); + + /** + * Verifies that {@code document:match(STRING)} searches all available views and records offsets per view beneath the matched-string key. + */ + @Test + public void testMatchAcrossAllViews() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0META", "carpet")), 1024); + + String result = DocumentFunctions.match(context, "car"); + + assertEquals("{\"car\":{\"BODY\":[1,5],\"META\":[0]}}", result); + assertEquals(result, DocumentFunctions.toJson(context.getMergedMatches())); + } + + /** + * Verifies that a trailing {@code *} in the requested view name performs prefix matching across views. + */ + @Test + public void testWildcardViewMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext( + List.of(entry("test\0uid\0BODY", "car"), entry("test\0uid\0BODY_TEXT", "car car"), entry("test\0uid\0META", "car")), 1024); + + String result = DocumentFunctions.match("BODY*", context, "car"); + + assertEquals("{\"car\":{\"BODY\":[0],\"BODY_TEXT\":[0,4]}}", result); + } + + /** + * Verifies that overlapping substring matches are reported with all starting offsets. + */ + @Test + public void testOverlappingMatches() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "banana")), 1024); + + String result = DocumentFunctions.match("BODY", context, "ana"); + + assertEquals("{\"ana\":{\"BODY\":[1,3]}}", result); + } + + /** + * Verifies that matching is case-sensitive. 
+ */ + @Test + public void testCaseSensitiveMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), 1024); + + assertTrue(DocumentFunctions.match(context, "Car").isEmpty()); + } + + /** + * Verifies that encoded payloads larger than the configured limit are skipped as non-matching. + */ + @Test + public void testOversizedPayloadIsNonMatch() throws Exception { + Map.Entry entry = entry("test\0uid\0BODY", "scar car"); + DocumentMatchContext context = new DocumentMatchContext(List.of(entry), entry.getValue().get().length - 1); + + assertTrue(DocumentFunctions.match(context, "car").isEmpty()); + } + + /** + * Verifies that decoded payloads larger than the configured limit are skipped as non-matching. + */ + @Test + public void testOversizedDecodedPayloadIsNonMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), 1024, 3); + + assertTrue(DocumentFunctions.match(context, "car").isEmpty()); + } + + /** + * Verifies that an empty {@code d}-entry set yields no match. + */ + @Test + public void testNoDocumentEntriesIsNonMatch() { + assertTrue(DocumentFunctions.match(new DocumentMatchContext(List.of(), 1024), "car").isEmpty()); + } + + /** + * Verifies that undecodable payloads are treated as non-matching rather than failing evaluation. + */ + @Test + public void testDecodeFailureIsNonMatch() { + DocumentMatchContext context = new DocumentMatchContext(List.of(Map.entry(new Key("row", "d", "test\0uid\0BODY"), new Value("not-base64".getBytes()))), + 1024); + + assertTrue(DocumentFunctions.match(context, "car").isEmpty()); + } + + /** + * Verifies that MIME-style base64 payloads with trailing CRLF line breaks still decode and match correctly. 
+ */ + @Test + public void testMatchWithBase64LineBreaks() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entryWithEncodedSuffix("test\0uid\0BODY", "/* Origins */ Fix.", "\r\n")), 1024); + + String result = DocumentFunctions.match("BODY", context, "Origins"); + + assertEquals("{\"Origins\":{\"BODY\":[3]}}", result); + } + + /** + * Verifies that payloads stored as plain base64-encoded UTF-8 text still decode and match when gzip expansion is not possible. + */ + @Test + public void testMatchWithBase64OnlyPayload() { + DocumentMatchContext context = new DocumentMatchContext(List.of(base64OnlyEntry("test\0uid\0BODY", "/* Origins */ Fix.")), 1024); + + String result = DocumentFunctions.match("BODY", context, "Origins"); + + assertEquals("{\"Origins\":{\"BODY\":[3]}}", result); + } + + /** + * Verifies that multiple {@code document:match(...)} calls merge their offsets into the document-wide result set. + */ + @Test + public void testMatchMergesResultsAcrossCalls() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")), + 1024); + + assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); + assertEquals("{\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.match("CONTENT2", context, "lawyer")); + assertEquals("{\"car\":{\"BODY\":[1,5]},\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.toJson(context.getMergedMatches())); + } + + /** + * Verifies that repeated {@code document:match(...)} calls for the same string merge under one top-level match-string key. 
+ */ + @Test + public void testMatchMergesSameSearchAcrossCalls() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")), + 1024); + + assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); + assertEquals("{\"car\":{\"CONTENT2\":[7]}}", DocumentFunctions.match("CONTENT2", context, "car")); + assertEquals("{\"car\":{\"BODY\":[1,5],\"CONTENT2\":[7]}}", DocumentFunctions.toJson(context.getMergedMatches())); + } + + /** + * Verifies that the first matched {@code d}-column visibility is retained and that a later mismatch produces a single info-level log message. + */ + @Test + public void testMatchLogsVisibilityMismatchAndKeepsFirstVisibility() throws Exception { + TestAppender appender = new TestAppender(); + Level originalLevel = logger.getLevel(); + logger.addAppender(appender); + logger.setLevel(Level.INFO); + try { + DocumentMatchContext context = new DocumentMatchContext( + List.of(entry("test\0uid\0BODY", "scar car", "A"), entry("test\0uid\0CONTENT2", "lawyer car", "B")), 1024); + + assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); + assertEquals("{\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.match("CONTENT2", context, "lawyer")); + assertEquals(new ColumnVisibility("A"), context.getFirstMatchingColumnVisibility()); + assertEquals(1, appender.infoMessages.size()); + assertTrue(appender.infoMessages.get(0).contains("differing d-column visibilities")); + } finally { + logger.removeAppender(appender); + logger.setLevel(originalLevel); + } + } + + /** + * Builds a test {@code d}-column entry with an empty visibility. 
+ * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entry(String cq, String content) throws Exception { + return entry(cq, content, ""); + } + + /** + * Builds a test {@code d}-column entry with caller-supplied visibility and gzip+base64 encoded content. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @param visibility + * column visibility to attach to the key + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entry(String cq, String content, String visibility) throws Exception { + return entryWithEncodedSuffix(cq, content, visibility, ""); + } + + /** + * Builds a test {@code d}-column entry with caller-supplied visibility and an optional suffix appended to the encoded payload. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @param visibility + * column visibility to attach to the key + * @param encodedSuffix + * suffix bytes to append after base64 encoding, such as {@code \r\n} + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entryWithEncodedSuffix(String cq, String content, String visibility, String encodedSuffix) throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + OutputStream b64s = java.util.Base64.getEncoder().wrap(bos); + GZIPOutputStream gzip = new GZIPOutputStream(b64s); + gzip.write(content.getBytes()); + gzip.close(); + b64s.close(); + if (!encodedSuffix.isEmpty()) { + bos.write(encodedSuffix.getBytes()); + } + bos.close(); + return new AbstractMap.SimpleEntry<>(new Key("row", "d", cq, visibility), new Value(bos.toByteArray())); + } + + /** + * Builds a test {@code d}-column entry with an empty 
visibility and an optional suffix appended to the encoded payload. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @param encodedSuffix + * suffix bytes to append after base64 encoding, such as {@code \r\n} + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entryWithEncodedSuffix(String cq, String content, String encodedSuffix) throws Exception { + return entryWithEncodedSuffix(cq, content, "", encodedSuffix); + } + + /** + * Builds a test {@code d}-column entry whose value is only base64-encoded UTF-8 text. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @return encoded test entry + */ + private Map.Entry base64OnlyEntry(String cq, String content) { + byte[] encoded = java.util.Base64.getEncoder().encode(content.getBytes()); + return new AbstractMap.SimpleEntry<>(new Key("row", "d", cq), new Value(encoded)); + } + + /** + * Minimal log4j appender used to capture info-level visibility-mismatch messages. 
+ */ + private static class TestAppender extends AppenderSkeleton { + private final List infoMessages = new ArrayList<>(); + + @Override + protected void append(LoggingEvent event) { + if (Level.INFO.equals(event.getLevel())) { + infoMessages.add(event.getRenderedMessage()); + } + } + + @Override + public void close() {} + + @Override + public boolean requiresLayout() { + return false; + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java new file mode 100644 index 00000000000..1fd49c3d86a --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java @@ -0,0 +1,55 @@ +package datawave.query.jexl.visitors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.junit.Test; + +import datawave.query.jexl.JexlASTHelper; + +/** + * Focused tests for {@link DocumentMatchFunctionRebuildingVisitor}. + */ +public class DocumentMatchFunctionRebuildingVisitorTest { + + /** + * Verifies that the visitor reports when a query needs the reserved document-match context variable. + * + * @throws Exception + * if parsing fails + */ + @Test + public void testRequiresDocumentMatchContext() throws Exception { + assertFalse(DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'"))); + assertTrue(DocumentMatchFunctionRebuildingVisitor + .requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')"))); + } + + /** + * Verifies that the one-argument form is rewritten to include the reserved context variable as the first argument. 
+ * + * @throws Exception + * if parsing fails + */ + @Test + public void testRewriteSingleArgumentFunction() throws Exception { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("document:match('car')"); + String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(DocumentMatchFunctionRebuildingVisitor.rewrite(script)); + assertEquals("document:match(documentMatchContext, 'car')", rewritten); + } + + /** + * Verifies that the two-argument form keeps the view selector first and inserts the reserved context variable before the search string. + * + * @throws Exception + * if parsing fails + */ + @Test + public void testRewriteTwoArgumentFunction() throws Exception { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("document:match('BODY', 'car')"); + String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(DocumentMatchFunctionRebuildingVisitor.rewrite(script)); + assertEquals("document:match('BODY', documentMatchContext, 'car')", rewritten); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java index d5482f403eb..7a102b316a5 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java @@ -62,6 +62,16 @@ public void testMatchesInGroupFunctionQuoting() throws ParseException { assertEquals("grouping:matchesInGroupLeft(FOO, 'foo', BAR, 'bar')", parseQuery("#MATCHES_IN_GROUP_LEFT(FOO, foo, BAR, bar)")); } + @Test + public void testDocumentMatchFunctionTranslation() throws ParseException { + assertEquals("document:match('car')", parseQuery("#DOCUMENT_MATCH('car')")); + assertEquals("document:match('car')", parseQuery("#DOCUMENT_MATCH(car)")); + assertEquals("document:match('BODY', 'car')", 
parseQuery("#DOCUMENT_MATCH('BODY', 'car')")); + assertEquals("document:match('BODY', 'car')", parseQuery("#DOCUMENT_MATCH(BODY, car)")); + assertEquals("document:match('BODY*', 'car')", parseQuery("#DOCUMENT_MATCH(BODY*, car)")); + assertEquals("BODY == 'capone' && document:match('car')", parseQuery("BODY:capone AND #DOCUMENT_MATCH(car)")); + } + @Test public void testComposableFunctions() throws ParseException { assertEquals("filter:includeRegex(foo,bar).size() > 0", parseQuery("#JEXL(\"filter:includeRegex(foo,bar).size() > 0\")")); diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java index 192af5c6235..613f1e2bade 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java @@ -204,6 +204,12 @@ public void testFunctions() throws ParseException { luceneParser.parse("field:selector AND #include(field, testbade\\.scape)").getContents()); Assert.assertEquals("[AND,field:selector][posFilter: filter(true, AND, field, testbade\\.scape)]", luceneParser.parse("field:selector AND #text(field, testbade\\.scape)").getContents()); + Assert.assertEquals("[AND,field:selector][posFilter: document:match(car)]", + luceneParser.parse("field:selector AND #DOCUMENT_MATCH(car)").getContents()); + Assert.assertEquals("[AND,field:selector][posFilter: document:match(BODY, car)]", + luceneParser.parse("field:selector AND #DOCUMENT_MATCH(BODY, car)").getContents()); + Assert.assertEquals("[AND,field:selector][posFilter: document:match(BODY*, car)]", + luceneParser.parse("field:selector AND #DOCUMENT_MATCH(BODY*, car)").getContents()); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java 
b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java index 729c20a142e..f997019bbf2 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java @@ -85,8 +85,10 @@ import datawave.query.QueryParameters; import datawave.query.QueryTestTableHelper; import datawave.query.RebuildingScannerTestHelper; +import datawave.query.config.ShardQueryConfiguration; import datawave.query.config.annotation.AllHitsQueryConfig; import datawave.query.config.annotation.AnnotationConfig; +import datawave.query.function.DocumentMatchContext; import datawave.query.function.deserializer.KryoDocumentDeserializer; import datawave.query.planner.DefaultQueryPlanner; import datawave.query.planner.TimedVisitorManager; @@ -257,6 +259,55 @@ public void tearDown() throws Exception { this.endDate = null; } + /** + * Verifies that the Spring-configured {@link ShardQueryLogic} bean exposes the document-match limits through its accessor surface. + */ + @Test + public void testDocumentMatchLimitsDefaultFromSpringConfig() { + assertEquals(DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE, logic.getDocumentMatchMaxEncodedSize()); + assertEquals(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, logic.getDocumentMatchMaxDecodedSize()); + } + + /** + * Verifies that the bean-style setters update both the top-level logic and its backing configuration. 
+ */ + @Test + public void testDocumentMatchLimitSettersUpdateLogicAndConfig() { + int encoded = 1024; + int decoded = 2048; + + logic.setDocumentMatchMaxEncodedSize(encoded); + logic.setDocumentMatchMaxDecodedSize(decoded); + + assertEquals(encoded, logic.getDocumentMatchMaxEncodedSize()); + assertEquals(decoded, logic.getDocumentMatchMaxDecodedSize()); + assertEquals(encoded, logic.getConfig().getDocumentMatchMaxEncodedSize()); + assertEquals(decoded, logic.getConfig().getDocumentMatchMaxDecodedSize()); + } + + /** + * Verifies that document-match limit overrides survive {@link ShardQueryLogic#initialize(AccumuloClient, Query, Set)} and appear on the per-query config. + */ + @Test + public void testDocumentMatchLimitsPropagateThroughInitialize() throws Exception { + int encoded = 4096; + int decoded = 8192; + + logic.setDocumentMatchMaxEncodedSize(encoded); + logic.setDocumentMatchMaxDecodedSize(decoded); + + this.query = "UUID == '" + caponeUID + "'"; + this.startDate = dateFormat.parse("20091231"); + this.endDate = dateFormat.parse("20150101"); + + Query settings = createSettings(); + AccumuloClient client = createClient(); + ShardQueryConfiguration config = (ShardQueryConfiguration) logic.initialize(client, settings, authSet); + + assertEquals(encoded, config.getDocumentMatchMaxEncodedSize()); + assertEquals(decoded, config.getDocumentMatchMaxDecodedSize()); + } + private AccumuloClient createClient() throws Exception { AccumuloClient client = new QueryTestTableHelper(ShardRange.class.toString(), log, RebuildingScannerTestHelper.TEARDOWN.EVERY_OTHER_SANS_CONSISTENCY, RebuildingScannerTestHelper.INTERRUPT.EVERY_OTHER).client; diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml index 5035b33c879..ded86c957fd 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml +++ 
b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml @@ -28,6 +28,7 @@ + @@ -400,6 +401,8 @@ + + diff --git a/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml b/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml index 358193f223a..9ff297c3234 100644 --- a/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml +++ b/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml @@ -15,6 +15,7 @@ + diff --git a/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml b/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml index 960cbf64242..8251c66dcb7 100644 --- a/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml +++ b/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml @@ -24,6 +24,7 @@ + @@ -410,6 +411,8 @@ + + From acfa5f979a1a30b2b4fffe8fa16199fc8585304f Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Thu, 26 Mar 2026 20:54:16 -0400 Subject: [PATCH 2/9] Initial round of updates to document:match code per code review --- .../DocumentMatchContextFunction.java | 18 +++++---- .../function/IndexOnlyKeyToDocumentData.java | 4 +- .../query/function/JexlEvaluation.java | 3 +- .../query/function/KeyToDocumentData.java | 10 ++--- ...ocumentMatchFunctionRebuildingVisitor.java | 40 +++++++++++++------ .../query/DocumentMatchQueryTest.java | 2 +- ...entMatchFunctionRebuildingVisitorTest.java | 32 ++++++++++----- 7 files changed, 70 insertions(+), 39 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java index 2740c36bf41..718713a4d7f 100644 --- 
a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java @@ -13,6 +13,7 @@ import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.log4j.Logger; import com.google.common.base.Function; @@ -32,6 +33,7 @@ public class DocumentMatchContextFunction implements Function>,Tuple3>> { private static final Logger log = Logger.getLogger(DocumentMatchContextFunction.class); private final DocumentMatchConfig config; + private final SortedKeyValueIterator source; /** * Creates a context-populating function from the supplied document-match configuration. @@ -41,6 +43,7 @@ public class DocumentMatchContextFunction implements Function")); + log.debug("Finished d-column scan for document key " + documentKey + "; next top key is " + (source.hasTop() ? 
source.getTopKey() : "")); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java b/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java index 6ff31065b71..4c1ad8ef90c 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java @@ -157,7 +157,7 @@ public Entry apply(final Entry from) { } // get the document key - Key docKey = getDocKey(from.getKey()); + Key docKey = getDocumentKey(from.getKey()); // Ensure that we have a non-empty column qualifier final Key stopKey = new Key(from.getKey().getRow().toString(), from.getKey().getColumnFamily().toString(), @@ -482,7 +482,7 @@ public Entry next() { if (null != next) { final List> keyValues = new LinkedList<>(); keyValues.add(next); - Key docKey = getDocKey(next.getKey()); + Key docKey = getDocumentKey(next.getKey()); final DocumentData documentData = new DocumentData(this.iteratorDocumentKey, Collections.singleton(docKey), keyValues, true); entry = Maps.immutableEntry(documentData, this.iteratorDocument); } else if (next == ITERATOR_COMPLETE_KEY) { diff --git a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java index aff353ae860..74b51712441 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java @@ -102,8 +102,7 @@ public boolean apply(Tuple3 input) { if (documentMatchContext != null) { documentMatchContext.clearMergedMatches(); } - Object o; - o = script.execute(input.third()); + Object o = script.execute(input.third()); if (log.isTraceEnabled()) { log.trace("Evaluation of " + query + " against document " + input.second().getMetadata() + " returned " + o); diff 
--git a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java index 7659b36a4f7..652c8238be0 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java @@ -216,7 +216,7 @@ public List> collectDocumentAttributes(final Key documentStartK boolean seeked = false; if (isPartOfDocument(documentStartKey, docAttrKey.get())) { if (filter == null || filter.keep(docAttrKey.get())) { - docKeys.add(getDocKey(docAttrKey.get())); + docKeys.add(getDocumentKey(docAttrKey.get())); } if (filter == null || filter.apply(Maps.immutableEntry(docAttrKey.get(), StringUtils.EMPTY))) { @@ -258,10 +258,10 @@ private boolean isPartOfDocument(Key documentStartKey, Key candidateKey) { return equality.partOf(documentStartKey, candidateKey); } - // map the key to the dockey (only shard, datatype, uid) - public static Key getDocKey(Key key) { + // map the key to the document key (only shard, datatype, uid) + public static Key getDocumentKey(Key key) { final ByteSequence row = key.getRowData(); - final ByteSequence cf = getDocColumnFamily(key); + final ByteSequence cf = getDocumentColumnFamily(key); final ByteSequence cv = key.getColumnVisibilityData(); return new Key(row.getBackingArray(), row.offset(), row.length(), cf.getBackingArray(), cf.offset(), cf.length(), EMPTY_BYTE_SEQUENCE.getBackingArray(), EMPTY_BYTE_SEQUENCE.offset(), EMPTY_BYTE_SEQUENCE.length(), cv.getBackingArray(), cv.offset(), cv.length(), key.getTimestamp()); @@ -274,7 +274,7 @@ public static Key getDocKey(Key key) { * the key to process * @return the column family, consisting of datatype and uid. 
*/ - private static ByteSequence getDocColumnFamily(Key key) { + private static ByteSequence getDocumentColumnFamily(Key key) { final ByteSequence cf = key.getColumnFamilyData(); if (!"d".equals(key.getColumnFamily().toString())) { return cf; diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java index 468ef44bf8f..70b31f11c39 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java @@ -2,6 +2,7 @@ import org.apache.commons.jexl3.parser.ASTFunctionNode; import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.log4j.Logger; import datawave.query.jexl.JexlASTHelper; import datawave.query.jexl.JexlNodeFactory; @@ -16,6 +17,11 @@ * document matching. */ public class DocumentMatchFunctionRebuildingVisitor extends RebuildingVisitor { + protected static final Logger log = Logger.getLogger(DocumentMatchFunctionRebuildingVisitor.class); + + private DocumentMatchFunctionRebuildingVisitor() { + // no-op, local construction only. + } /** * Determines whether the supplied script contains any {@code document:match(...)} calls. 
@@ -44,18 +50,28 @@ public static ASTJexlScript rewrite(ASTJexlScript script) { @Override public Object visit(ASTFunctionNode node, Object data) { FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node); - if (DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace()) - && DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(visitor.name())) { - if (visitor.args().size() == 1) { - return FunctionJexlNodeVisitor.makeFunctionFrom(visitor.namespace(), visitor.name(), - JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), - RebuildingVisitor.copy(visitor.args().get(0))); - } else if (visitor.args().size() == 2) { - return FunctionJexlNodeVisitor.makeFunctionFrom(visitor.namespace(), visitor.name(), RebuildingVisitor.copy(visitor.args().get(0)), - JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), - RebuildingVisitor.copy(visitor.args().get(1))); - } + if (DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) { + return handeDocumentFunction(visitor, data); + } + return data; // no-op + } + + protected Object handeDocumentFunction(FunctionJexlNodeVisitor visitor, Object data) { + // noinspection SwitchStatementWithTooFewBranches - placeholder for future expansion + switch (visitor.name()) { + case DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME: + if (visitor.args().size() == 1) { + return FunctionJexlNodeVisitor.makeFunctionFrom(visitor.namespace(), visitor.name(), + JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), + RebuildingVisitor.copy(visitor.args().get(0))); + } else if (visitor.args().size() == 2) { + return FunctionJexlNodeVisitor.makeFunctionFrom(visitor.namespace(), visitor.name(), RebuildingVisitor.copy(visitor.args().get(0)), + JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), + RebuildingVisitor.copy(visitor.args().get(1))); + } + 
default: + log.warn("unknown document function:" + visitor.name()); + return data; // no-op } - return super.visit(node, data); } } diff --git a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java index ef85ec77738..82ca55f5c39 100644 --- a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java @@ -114,7 +114,7 @@ public void beforeEach() { logic.setIvaratorCacheDirConfigs(Collections.singletonList(new IvaratorCacheDirConfig(folder.toUri().toString()))); logic.setMaxFieldIndexRangeSplit(1); logic.setCollapseUids(false); - logic.setFullTableScanEnabled(true); + logic.setFullTableScanEnabled(false); logic.setDocumentMatchMaxDecodedSize(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE); givenParameter(QueryParameters.HIT_LIST, "true"); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java index 1fd49c3d86a..2e2b3fd94b2 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java @@ -5,6 +5,7 @@ import static org.junit.Assert.assertTrue; import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ParseException; import org.junit.Test; import datawave.query.jexl.JexlASTHelper; @@ -30,26 +31,39 @@ public void testRequiresDocumentMatchContext() throws Exception { /** * Verifies that the one-argument form is rewritten to include the reserved context variable as the first argument. 
* - * @throws Exception + * @throws ParseException * if parsing fails */ @Test - public void testRewriteSingleArgumentFunction() throws Exception { - ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("document:match('car')"); - String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(DocumentMatchFunctionRebuildingVisitor.rewrite(script)); - assertEquals("document:match(documentMatchContext, 'car')", rewritten); + public void testRewriteSingleArgumentFunction() throws ParseException { + assertRewrite("document:match(documentMatchContext, 'car')", "document:match('car')"); + } /** * Verifies that the two-argument form keeps the view selector first and inserts the reserved context variable before the search string. * - * @throws Exception + * @throws ParseException * if parsing fails */ @Test - public void testRewriteTwoArgumentFunction() throws Exception { - ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("document:match('BODY', 'car')"); + public void testRewriteTwoArgumentFunction() throws ParseException { + assertRewrite("document:match('BODY', documentMatchContext, 'car')", "document:match('BODY', 'car')"); + } + + /** + * Verifies that the input form is rewritten the expected input. 
+ * + * @param expected + * the expected re-written form + * @param input + * the input to rewrite + * @throws ParseException + * if parsing fails + */ + private static void assertRewrite(String expected, String input) throws ParseException { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(input); String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(DocumentMatchFunctionRebuildingVisitor.rewrite(script)); - assertEquals("document:match('BODY', documentMatchContext, 'car')", rewritten); + assertEquals(expected, rewritten); } } From 365fc9e30763a5dec007672a1b5a4e39d329d9a4 Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sun, 29 Mar 2026 18:20:25 -0400 Subject: [PATCH 3/9] Additional updates to document:match code per review --- .../query/iterator/QueryIterator.java | 42 ++++--- .../DocumentFunctionsDescriptor.java | 2 +- ...ocumentMatchFunctionRebuildingVisitor.java | 77 ------------- .../DocumentMatchFunctionVisitor.java | 107 ++++++++++++++++++ .../query/planner/DefaultQueryPlanner.java | 24 ++-- .../query/function/JexlEvaluationTest.java | 6 +- ... 
=> DocumentMatchFunctionVisitorTest.java} | 26 +++-- 7 files changed, 172 insertions(+), 112 deletions(-) delete mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java rename warehouse/query-core/src/test/java/datawave/query/jexl/visitors/{DocumentMatchFunctionRebuildingVisitorTest.java => DocumentMatchFunctionVisitorTest.java} (62%) diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java index 40c329bba65..f3b8415c3f1 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java @@ -107,9 +107,8 @@ import datawave.query.jexl.functions.IdentityAggregator; import datawave.query.jexl.functions.KeyAdjudicator; import datawave.query.jexl.visitors.DelayedNonEventSubTreeVisitor; -import datawave.query.jexl.visitors.DocumentMatchFunctionRebuildingVisitor; +import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor; import datawave.query.jexl.visitors.IteratorBuildingVisitor; -import datawave.query.jexl.visitors.JexlStringBuildingVisitor; import datawave.query.jexl.visitors.SatisfactionVisitor; import datawave.query.jexl.visitors.VariableNameVisitor; import datawave.query.postprocessing.tf.TFFactory; @@ -1106,18 +1105,16 @@ protected JexlEvaluation getJexlEvaluation(String query, NestedQueryIterator nestedQuery = documentSource.getNestedQuery(); if (null == nestedQuery) { - jexlEvaluationFunction = new JexlEvaluation(rewrittenQuery, arithmetic); + jexlEvaluationFunction = new JexlEvaluation(query, arithmetic); } else { jexlEvaluationFunction = nestedQuery.getEvaluation(); if (null == jexlEvaluationFunction) { - jexlEvaluationFunction = new 
JexlEvaluation(rewriteDocumentMatchFunctions(nestedQuery.getQuery(), arithmetic), arithmetic); + jexlEvaluationFunction = new JexlEvaluation(nestedQuery.getQuery(), arithmetic); } } } @@ -1132,18 +1129,25 @@ protected JexlEvaluation getJexlEvaluation(String query, NestedQueryIterator + * The top-level iterator option tells us whether the planned query requires document-match context anywhere. When a nested query is being evaluated, this + * method narrows that decision to the nested query so we only collect document-match context when the query actually being evaluated still contains + * {@code document:match(...)}. + * + * @param documentSource + * the nested query source for the current evaluation pass, if any + * @return true if document-match context should be collected for the current evaluation + */ protected boolean shouldCollectDocumentMatchContext(NestedQueryIterator documentSource) { if (!isDocumentMatchContextRequired()) { return false; } + + // At this point the planned query requires document-match context. If there is no nested source, or no nested + // query payload to inspect, we cannot narrow that requirement to a smaller subquery, so we conservatively keep + // document-match context collection enabled and return true in these cases. 
if (documentSource == null) { return true; } @@ -1151,8 +1155,12 @@ protected boolean shouldCollectDocumentMatchContext(NestedQueryIterator doc if (nestedQuery == null || nestedQuery.getQuery() == null) { return true; } - ASTJexlScript nestedScript = ArithmeticJexlEngines.getEngine(getArithmetic()).parse(nestedQuery.getQuery()); - return DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(nestedScript); + + ASTJexlScript nestedScript = nestedQuery.getScript(); + if (nestedScript == null) { + nestedScript = ArithmeticJexlEngines.getEngine(getArithmetic()).parse(nestedQuery.getQuery()); + } + return DocumentMatchFunctionVisitor.requiresDocumentMatchContext(nestedScript); } protected LimitFields getLimitFields() { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java index ea5cdf802d7..75124f58fc3 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java @@ -97,7 +97,7 @@ public JexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) { "Unexpected function class " + functionClass); throw new IllegalArgumentException(qe); } - if (!DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(visitor.name()) || visitor.args().isEmpty() || visitor.args().size() > 2) { + if (!DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(visitor.name()) || visitor.args().isEmpty() || visitor.args().size() > 3) { BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.WRONG_NUMBER_OF_ARGUMENTS, "Wrong number of arguments to document:match"); throw new IllegalArgumentException(qe); diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java 
b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java deleted file mode 100644 index 70b31f11c39..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitor.java +++ /dev/null @@ -1,77 +0,0 @@ -package datawave.query.jexl.visitors; - -import org.apache.commons.jexl3.parser.ASTFunctionNode; -import org.apache.commons.jexl3.parser.ASTJexlScript; -import org.apache.log4j.Logger; - -import datawave.query.jexl.JexlASTHelper; -import datawave.query.jexl.JexlNodeFactory; -import datawave.query.jexl.functions.DocumentFunctions; -import datawave.query.jexl.functions.FunctionJexlNodeVisitor; - -/** - * Rewrites user-facing {@code document:match(...)} calls into the internal evaluation form that carries the reserved {@code documentMatchContext} argument - * explicitly. - *

- * This mirrors the way {@code content:*} functions are evaluated with an explicit {@code termOffsetMap} argument, but preserves the external user syntax for - * document matching. - */ -public class DocumentMatchFunctionRebuildingVisitor extends RebuildingVisitor { - protected static final Logger log = Logger.getLogger(DocumentMatchFunctionRebuildingVisitor.class); - - private DocumentMatchFunctionRebuildingVisitor() { - // no-op, local construction only. - } - - /** - * Determines whether the supplied script contains any {@code document:match(...)} calls. - * - * @param script - * script to inspect - * @return {@code true} if any document-match functions are present - */ - public static boolean requiresDocumentMatchContext(ASTJexlScript script) { - return JexlASTHelper.getFunctionNodes(script).stream().map(FunctionJexlNodeVisitor::eval) - .anyMatch(function -> DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(function.namespace()) - && DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(function.name())); - } - - /** - * Rewrites all {@code document:match(...)} calls in the supplied script to include the reserved context identifier. 
- * - * @param script - * script to rewrite - * @return rewritten script - */ - public static ASTJexlScript rewrite(ASTJexlScript script) { - return (ASTJexlScript) script.jjtAccept(new DocumentMatchFunctionRebuildingVisitor(), null); - } - - @Override - public Object visit(ASTFunctionNode node, Object data) { - FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node); - if (DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) { - return handeDocumentFunction(visitor, data); - } - return data; // no-op - } - - protected Object handeDocumentFunction(FunctionJexlNodeVisitor visitor, Object data) { - // noinspection SwitchStatementWithTooFewBranches - placeholder for future expansion - switch (visitor.name()) { - case DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME: - if (visitor.args().size() == 1) { - return FunctionJexlNodeVisitor.makeFunctionFrom(visitor.namespace(), visitor.name(), - JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), - RebuildingVisitor.copy(visitor.args().get(0))); - } else if (visitor.args().size() == 2) { - return FunctionJexlNodeVisitor.makeFunctionFrom(visitor.namespace(), visitor.name(), RebuildingVisitor.copy(visitor.args().get(0)), - JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), - RebuildingVisitor.copy(visitor.args().get(1))); - } - default: - log.warn("unknown document function:" + visitor.name()); - return data; // no-op - } - } -} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java new file mode 100644 index 00000000000..0dbfbc17f5d --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java @@ -0,0 +1,107 @@ +package datawave.query.jexl.visitors; + +import 
org.apache.commons.jexl3.parser.ASTArguments; +import org.apache.commons.jexl3.parser.ASTFunctionNode; +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ASTNamespaceIdentifier; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.ParserTreeConstants; +import org.apache.log4j.Logger; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.JexlNodeFactory; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.jexl.functions.FunctionJexlNodeVisitor; + +/** + * Rewrites user-facing {@code document:match(...)} calls into the internal evaluation form that carries the reserved {@code documentMatchContext} argument + * explicitly. + *

+ * This mirrors the way {@code content:*} functions are evaluated with an explicit {@code termOffsetMap} argument, but preserves the external user syntax for + * document matching. + */ +public class DocumentMatchFunctionVisitor extends BaseVisitor { + protected static final Logger log = Logger.getLogger(DocumentMatchFunctionVisitor.class); + private boolean documentMatchContextRequired = false; + + private DocumentMatchFunctionVisitor() { + // no-op, local construction only. + } + + /** + * Determines whether the supplied script contains any {@code document:match(...)} calls. + * + * @param script + * script to inspect + * @return {@code true} if any document-match functions are present + */ + public static boolean requiresDocumentMatchContext(ASTJexlScript script) { + return JexlASTHelper.getFunctionNodes(script).stream().map(FunctionJexlNodeVisitor::eval) + .anyMatch(function -> DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(function.namespace()) + && DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(function.name())); + } + + /** + * Rewrites all {@code document:match(...)} calls in the supplied script to include the reserved context identifier. 
+ * + * @param script + * script to rewrite + * @return {@code true} if any document-match functions were found + */ + public static boolean rewrite(ASTJexlScript script) { + DocumentMatchFunctionVisitor visitor = new DocumentMatchFunctionVisitor(); + script.jjtAccept(visitor, null); + return visitor.documentMatchContextRequired; + } + + @Override + public Object visit(ASTFunctionNode node, Object data) { + FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node); + if (DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) { + rewriteDocumentFunction(node, visitor); + return data; + } + return super.visit(node, data); + } + + protected void rewriteDocumentFunction(ASTFunctionNode node, FunctionJexlNodeVisitor visitor) { + switch (visitor.name()) { + case DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME: + documentMatchContextRequired = true; + if (visitor.args().size() == 1) { + JexlASTHelper.replaceNodeSafely(node, + buildFunction(visitor.namespace(), visitor.name(), + JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), + RebuildingVisitor.copy(visitor.args().get(0)))); + } else if (visitor.args().size() == 2) { + JexlASTHelper.replaceNodeSafely(node, + buildFunction(visitor.namespace(), visitor.name(), RebuildingVisitor.copy(visitor.args().get(0)), + JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), + RebuildingVisitor.copy(visitor.args().get(1)))); + } + return; + default: + log.warn("unknown document function:" + visitor.name()); + } + } + + private static ASTFunctionNode buildFunction(String namespace, String functionName, JexlNode... 
arguments) { + ASTFunctionNode functionNode = new ASTFunctionNode(ParserTreeConstants.JJTFUNCTIONNODE); + + ASTNamespaceIdentifier namespaceNode = new ASTNamespaceIdentifier(ParserTreeConstants.JJTNAMESPACEIDENTIFIER); + namespaceNode.setNamespace(namespace, functionName); + functionNode.jjtAddChild(namespaceNode, 0); + namespaceNode.jjtSetParent(functionNode); + + ASTArguments argsNode = new ASTArguments(ParserTreeConstants.JJTARGUMENTS); + functionNode.jjtAddChild(argsNode, 1); + argsNode.jjtSetParent(functionNode); + + for (int i = 0; i < arguments.length; i++) { + argsNode.jjtAddChild(arguments[i], i); + arguments[i].jjtSetParent(argsNode); + } + + return functionNode; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index a06fd608091..f1561d7fbe0 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -119,7 +119,7 @@ import datawave.query.jexl.visitors.ConjunctionEliminationVisitor; import datawave.query.jexl.visitors.DepthVisitor; import datawave.query.jexl.visitors.DisjunctionEliminationVisitor; -import datawave.query.jexl.visitors.DocumentMatchFunctionRebuildingVisitor; +import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor; import datawave.query.jexl.visitors.ExecutableDeterminationVisitor; import datawave.query.jexl.visitors.ExecutableDeterminationVisitor.STATE; import datawave.query.jexl.visitors.ExecutableExpansionVisitor; @@ -1186,6 +1186,9 @@ protected ASTJexlScript processTree(final ASTJexlScript originalQueryTree, Shard expandPushdownPullup(config, metadataHelper, timers, scannerFactory); } + // rewrite document:match() functions to include the documentMatchContext variable. 
+ config.setQueryTree(timedRewriteDocumentMatchFunctions(timers, config)); + return config.getQueryTree(); } @@ -1514,12 +1517,6 @@ protected void timedCheckForTokenizedFields(QueryStopwatch timers, String stage, } } - config.setDocumentMatchContextRequired(DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(config.getQueryTree())); - if (log.isDebugEnabled()) { - logQuery(config.getQueryTree(), "Computed that the query " + (config.isDocumentMatchContextRequired() ? "requires" : "does not require") - + " document-match context lookup"); - } - stopwatch.stop(); } @@ -1647,6 +1644,19 @@ protected ASTJexlScript timedRewriteNullFunctions(QueryStopwatch timers, ASTJexl return visitorManager.timedVisit(timers, "Rewrite Null Functions", () -> RewriteNullFunctionsVisitor.rewriteNullFunctions(queryTree)); } + protected ASTJexlScript timedRewriteDocumentMatchFunctions(QueryStopwatch timers, ShardQueryConfiguration config) throws DatawaveQueryException { + return visitorManager.timedVisit(timers, "Rewrite Document Match Functions", () -> { + ASTJexlScript queryTree = config.getQueryTree(); + DocumentMatchFunctionVisitor.rewrite(queryTree); + config.setDocumentMatchContextRequired(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(queryTree)); + if (log.isDebugEnabled()) { + logQuery(queryTree, "Computed that the query " + (config.isDocumentMatchContextRequired() ? 
"requires" : "does not require") + + " document-match context lookup"); + } + return queryTree; + }); + } + protected ASTJexlScript timedEnforceUniqueTermsWithinExpressions(QueryStopwatch timers, final ASTJexlScript script) throws DatawaveQueryException { return visitorManager.timedVisit(timers, "Enforce Unique Terms within AND and OR expressions", () -> (UniqueExpressionTermsVisitor.enforce(script))); } diff --git a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java index 138c106bd7f..0166964627c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java @@ -29,7 +29,7 @@ import datawave.query.jexl.JexlASTHelper; import datawave.query.jexl.functions.DocumentFunctions; import datawave.query.jexl.functions.TermFrequencyList; -import datawave.query.jexl.visitors.DocumentMatchFunctionRebuildingVisitor; +import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor; import datawave.query.jexl.visitors.JexlStringBuildingVisitor; import datawave.query.postprocessing.tf.TermOffsetMap; import datawave.query.util.Tuple3; @@ -371,10 +371,10 @@ private void assertEvaluation(String query, Key key, Document d, DatawaveJexlCon private String rewriteDocumentMatchFunctions(String query) { try { ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); - if (!DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(script)) { + if (!DocumentMatchFunctionVisitor.rewrite(script)) { return query; } - return JexlStringBuildingVisitor.buildQueryWithoutParse(DocumentMatchFunctionRebuildingVisitor.rewrite(script)); + return JexlStringBuildingVisitor.buildQueryWithoutParse(script); } catch (org.apache.commons.jexl3.parser.ParseException e) { throw new RuntimeException(e); } diff --git 
a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java similarity index 62% rename from warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java rename to warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java index 2e2b3fd94b2..29e2816aefc 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionRebuildingVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java @@ -11,9 +11,9 @@ import datawave.query.jexl.JexlASTHelper; /** - * Focused tests for {@link DocumentMatchFunctionRebuildingVisitor}. + * Focused tests for {@link DocumentMatchFunctionVisitor}. */ -public class DocumentMatchFunctionRebuildingVisitorTest { +public class DocumentMatchFunctionVisitorTest { /** * Verifies that the visitor reports when a query needs the reserved document-match context variable. 
@@ -21,11 +21,16 @@ public class DocumentMatchFunctionRebuildingVisitorTest { * @throws Exception * if parsing fails */ + @Test + public void testRewriteReportsWhetherDocumentMatchContextIsRequired() throws Exception { + assertFalse(DocumentMatchFunctionVisitor.rewrite(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'"))); + assertTrue(DocumentMatchFunctionVisitor.rewrite(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')"))); + } + @Test public void testRequiresDocumentMatchContext() throws Exception { - assertFalse(DocumentMatchFunctionRebuildingVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'"))); - assertTrue(DocumentMatchFunctionRebuildingVisitor - .requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')"))); + assertFalse(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'"))); + assertTrue(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')"))); } /** @@ -37,7 +42,6 @@ public void testRequiresDocumentMatchContext() throws Exception { @Test public void testRewriteSingleArgumentFunction() throws ParseException { assertRewrite("document:match(documentMatchContext, 'car')", "document:match('car')"); - } /** @@ -51,6 +55,13 @@ public void testRewriteTwoArgumentFunction() throws ParseException { assertRewrite("document:match('BODY', documentMatchContext, 'car')", "document:match('BODY', 'car')"); } + @Test + public void testRewriteMutatesOriginalScript() throws ParseException { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("document:match('car')"); + assertTrue(DocumentMatchFunctionVisitor.rewrite(script)); + assertEquals("document:match(documentMatchContext, 'car')", JexlStringBuildingVisitor.buildQueryWithoutParse(script)); + } + /** * Verifies that the input form is rewritten the 
expected input. * @@ -63,7 +74,8 @@ public void testRewriteTwoArgumentFunction() throws ParseException { */ private static void assertRewrite(String expected, String input) throws ParseException { ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(input); - String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(DocumentMatchFunctionRebuildingVisitor.rewrite(script)); + DocumentMatchFunctionVisitor.rewrite(script); + String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(script); assertEquals(expected, rewritten); } } From e48ef3334d0f8b1d623d115c87614398c9e5bd18 Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sun, 29 Mar 2026 19:05:45 -0400 Subject: [PATCH 4/9] Unit test updates per document:match code review --- .../query/DocumentMatchQueryTest.java | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java index 82ca55f5c39..edcf2f1b226 100644 --- a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java @@ -43,6 +43,12 @@ import datawave.query.util.AbstractQueryTest; import datawave.query.util.WiseGuysIngest; +/** + * MiniAccumulo-backed integration tests for {@code document:match(...)}. + *

+ * These tests exercise the full query path, including query parsing, planner wiring, shard-table document materialization, evaluation-phase document matching, + * and publication of the {@code DOCUMENT_MATCHES} attribute on returned documents. + */ @ExtendWith(SpringExtension.class) @ComponentScan(basePackages = "datawave.query") // @formatter:off @@ -53,12 +59,6 @@ "classpath:MetadataHelperContext.xml", "classpath:CacheContext.xml"}) // @formatter:on -/** - * MiniAccumulo-backed integration tests for {@code document:match(...)}. - *

- * These tests exercise the full query path, including query parsing, planner wiring, shard-table document materialization, evaluation-phase document matching, - * and publication of the {@code DOCUMENT_MATCHES} attribute on returned documents. - */ public class DocumentMatchQueryTest extends AbstractQueryTest { private static final Logger log = Logger.getLogger(DocumentMatchQueryTest.class); @@ -174,8 +174,8 @@ protected void extraAssertions() { */ @Test public void testDocumentMatchJexlAllViews() throws Exception { - disableQueryPlanAssertion(); givenQuery("UUID == 'CAPONE' && document:match('can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')"); expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); @@ -188,8 +188,8 @@ public void testDocumentMatchJexlAllViews() throws Exception { */ @Test public void testDocumentMatchJexlSpecificView() throws Exception { - disableQueryPlanAssertion(); givenQuery("UUID == 'CAPONE' && document:match('CONTENT2', 'lawyer')"); + expectPlan("UUID == 'capone' && document:match('CONTENT2', documentMatchContext, 'lawyer')"); expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); @@ -202,8 +202,8 @@ public void testDocumentMatchJexlSpecificView() throws Exception { */ @Test public void testDocumentMatchJexlMergesMatchesAcrossCalls() throws Exception { - disableQueryPlanAssertion(); givenQuery("UUID == 'CAPONE' && document:match('CONTENT', 'can') && document:match('CONTENT2', 'lawyer')"); + expectPlan("UUID == 'capone' && document:match('CONTENT', documentMatchContext, 'can') && document:match('CONTENT2', documentMatchContext, 'lawyer')"); expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); @@ -216,9 +216,9 @@ public void testDocumentMatchJexlMergesMatchesAcrossCalls() throws Exception { */ @Test public void 
testDocumentMatchLuceneWildcardView() throws Exception { - disableQueryPlanAssertion(); givenParameter(QueryParameters.QUERY_SYNTAX, "LUCENE"); givenQuery("UUID:CAPONE AND #DOCUMENT_MATCH(CONTENT*,can)"); + expectPlan("UUID == 'capone' && document:match('CONTENT*', documentMatchContext, 'can')"); expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); @@ -231,8 +231,8 @@ public void testDocumentMatchLuceneWildcardView() throws Exception { */ @Test public void testDocumentMatchNoMatchFiltersDocument() throws Exception { - disableQueryPlanAssertion(); givenQuery("UUID == 'CAPONE' && document:match('missing')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'missing')"); expectedDocumentMatchContextRequired = true; expectResultCount(0); planAndExecuteQuery(); @@ -243,8 +243,8 @@ public void testDocumentMatchNoMatchFiltersDocument() throws Exception { */ @Test public void testDocumentMatchIsCaseSensitive() throws Exception { - disableQueryPlanAssertion(); givenQuery("UUID == 'CAPONE' && document:match('Can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'Can')"); expectedDocumentMatchContextRequired = true; expectResultCount(0); planAndExecuteQuery(); @@ -255,9 +255,9 @@ public void testDocumentMatchIsCaseSensitive() throws Exception { */ @Test public void testDocumentMatchOversizedDecodedPayloadIsSkipped() throws Exception { - disableQueryPlanAssertion(); logic.setDocumentMatchMaxDecodedSize(8); givenQuery("UUID == 'CAPONE' && document:match('can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')"); expectedDocumentMatchContextRequired = true; expectResultCount(0); planAndExecuteQuery(); @@ -268,8 +268,8 @@ public void testDocumentMatchOversizedDecodedPayloadIsSkipped() throws Exception */ @Test public void testQueryWithoutDocumentMatchDoesNotRequireContext() throws Exception { - disableQueryPlanAssertion(); givenQuery("UUID == 
'CAPONE'"); + expectPlan("UUID == 'capone'"); expectedDocumentMatchContextRequired = false; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); From 472b64d4c499afbcd123e439c5137c0898bd70ba Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sun, 29 Mar 2026 19:51:22 -0400 Subject: [PATCH 5/9] Updated document:match iterator configuration per code review feedback --- .../query/planner/DefaultQueryPlanner.java | 15 +++++-- .../planner/DefaultQueryPlannerTest.java | 39 +++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index f1561d7fbe0..0cfaa5d723d 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -721,13 +721,15 @@ private void configureIterator(ShardQueryConfiguration config, IteratorSetting c addOption(cfg, QueryOptions.HIT_LIST, Boolean.toString(config.isHitList()), false); addOption(cfg, QueryOptions.TERM_FREQUENCY_FIELDS, Joiner.on(',').join(config.getQueryTermFrequencyFields()), false); addOption(cfg, QueryOptions.TERM_FREQUENCIES_REQUIRED, Boolean.toString(config.isTermFrequenciesRequired()), false); - addOption(cfg, QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED, Boolean.toString(config.isDocumentMatchContextRequired()), false); addOption(cfg, QueryOptions.QUERY, newQueryString, false); addOption(cfg, QueryOptions.QUERY_ID, config.getQuery().getId().toString(), false); addOption(cfg, QueryOptions.FULL_TABLE_SCAN_ONLY, Boolean.toString(isFullTable), false); addOption(cfg, QueryOptions.TRACK_SIZES, Boolean.toString(config.isTrackSizes()), false); addOption(cfg, QueryOptions.ACTIVE_QUERY_LOG_NAME, config.getActiveQueryLogName(), false); + // Add the thresholds for document matching if required + 
configureDocumentMatchOptions(config, cfg); + // Set the start and end dates configureTypeMappings(config, cfg, metadataHelper, getCompressOptionMappings(), false); } @@ -2403,6 +2405,15 @@ protected void configureAdditionalOptions(ShardQueryConfiguration config, Iterat // no-op } + protected void configureDocumentMatchOptions(ShardQueryConfiguration config, IteratorSetting cfg) { + boolean documentMatchContextRequired = config.isDocumentMatchContextRequired(); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED, Boolean.toString(documentMatchContextRequired), false); + if (documentMatchContextRequired) { + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedSize()), false); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE, Integer.toString(config.getDocumentMatchMaxDecodedSize()), false); + } + } + protected Future loadQueryIterator(final MetadataHelper metadataHelper, final ShardQueryConfiguration config, final Boolean isFullTable, boolean isPreload) { @@ -2450,8 +2461,6 @@ protected Future loadQueryIterator(final MetadataHelper metadat addOption(cfg, QueryOptions.MAX_PIPELINE_CACHED_RESULTS, Integer.toString(config.getMaxPipelineCachedResults()), false); addOption(cfg, QueryOptions.MAX_IVARATOR_SOURCES, Integer.toString(config.getMaxIvaratorSources()), false); addOption(cfg, QueryOptions.MAX_IVARATOR_SOURCE_WAIT, Long.toString(config.getMaxIvaratorSourceWait()), false); - addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedSize()), false); - addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE, Integer.toString(config.getDocumentMatchMaxDecodedSize()), false); if (config.getYieldThresholdMs() != Long.MAX_VALUE && config.getYieldThresholdMs() > 0) { addOption(cfg, QueryOptions.YIELD_THRESHOLD_MS, Long.toString(config.getYieldThresholdMs()), false); diff --git 
a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java index 203060ed03c..3f4fd6c3e61 100644 --- a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java @@ -7,6 +7,7 @@ import java.util.Date; import java.util.Set; +import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.commons.jexl3.parser.ASTJexlScript; import org.junit.jupiter.api.BeforeEach; @@ -25,6 +26,8 @@ import datawave.query.config.ShardQueryConfiguration; import datawave.query.exceptions.DatawaveFatalQueryException; import datawave.query.exceptions.DatawaveQueryException; +import datawave.query.iterator.QueryIterator; +import datawave.query.iterator.QueryOptions; import datawave.query.jexl.JexlASTHelper; import datawave.query.tables.ScannerFactory; import datawave.query.util.DateIndexHelper; @@ -36,6 +39,42 @@ class DefaultQueryPlannerTest { + @Nested + class DocumentMatchOptionTests { + + @Test + void testAddDocumentMatchOptionsWithoutContextRequired() { + DefaultQueryPlanner planner = new DefaultQueryPlanner(); + ShardQueryConfiguration config = new ShardQueryConfiguration(); + config.setDocumentMatchContextRequired(false); + config.setDocumentMatchMaxEncodedSize(111); + config.setDocumentMatchMaxDecodedSize(222); + IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class); + + planner.configureDocumentMatchOptions(config, cfg); + + Assertions.assertEquals("false", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); + Assertions.assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + Assertions.assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + } + + @Test + void 
testAddDocumentMatchOptionsWithContextRequired() { + DefaultQueryPlanner planner = new DefaultQueryPlanner(); + ShardQueryConfiguration config = new ShardQueryConfiguration(); + config.setDocumentMatchContextRequired(true); + config.setDocumentMatchMaxEncodedSize(111); + config.setDocumentMatchMaxDecodedSize(222); + IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class); + + planner.configureDocumentMatchOptions(config, cfg); + + Assertions.assertEquals("true", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); + Assertions.assertEquals("111", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + Assertions.assertEquals("222", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + } + } + /** * Contains tests for * {@link DefaultQueryPlanner#addDateFilters(ASTJexlScript, ScannerFactory, MetadataHelper, DateIndexHelper, ShardQueryConfiguration, Query)} From d1af3c3936ec4fcb56ed84a10b706e0322ea8af0 Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sat, 11 Apr 2026 10:21:31 -0400 Subject: [PATCH 6/9] Applied first round of feedback from code review: * Added javadoc regarding TRUE_NODE to JexlFunctionArgumentDescriptorFactory that shows this should be used when index searching should be skipped for a function * Added documentMatchMaxEncodedContextSize to limit total size of encoded d columns collected in DocumentMatchContextFunction. 
--- .../query/config/ShardQueryConfiguration.java | 15 ++++ .../query/function/DocumentMatchContext.java | 71 +++++-------------- .../DocumentMatchContextFunction.java | 53 ++++++++++++-- .../datawave/query/iterator/QueryOptions.java | 19 ++++- ...JexlFunctionArgumentDescriptorFactory.java | 6 ++ .../arguments/JexlArgumentDescriptor.java | 3 +- .../query/planner/DefaultQueryPlanner.java | 1 + .../query/tables/ShardQueryLogic.java | 8 +++ .../config/ShardQueryConfigurationTest.java | 2 + .../DocumentMatchContextFunctionTest.java | 29 +++++++- .../query/function/JexlEvaluationTest.java | 18 +++-- .../jexl/functions/DocumentFunctionsTest.java | 44 ++++++++---- .../planner/DefaultQueryPlannerTest.java | 17 +++-- .../query/tables/ShardQueryLogicTest.java | 8 +++ 14 files changed, 205 insertions(+), 89 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java index a9f9d2ef58e..c48cafb4c2a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java @@ -513,6 +513,10 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement * Maximum decoded d-column payload size, in bytes, to inspect for document:match evaluation */ private int documentMatchMaxDecodedSize = DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; + /** + * Maximum aggregate encoded d-column payload size, in bytes, to retain in memory for document:match evaluation + */ + private int documentMatchMaxEncodedContextSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE; /** * Flag to control query option pruning in the visitor function. 
Queries that see significant or varied pruning via the RangeStream may see a benefit from @@ -850,6 +854,7 @@ public void copyFrom(ShardQueryConfiguration other) { this.setTfAggregationThresholdMs(other.getTfAggregationThresholdMs()); this.setDocumentMatchMaxEncodedSize(other.getDocumentMatchMaxEncodedSize()); this.setDocumentMatchMaxDecodedSize(other.getDocumentMatchMaxDecodedSize()); + this.setDocumentMatchMaxEncodedContextSize(other.getDocumentMatchMaxEncodedContextSize()); this.setGroupFields(GroupFields.copyOf(other.getGroupFields())); this.setPruneQueryOptions(other.getPruneQueryOptions()); this.setSortQueryPreIndexWithImpliedCounts(other.isSortQueryPreIndexWithImpliedCounts()); @@ -2929,6 +2934,14 @@ public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { this.documentMatchMaxDecodedSize = documentMatchMaxDecodedSize; } + public int getDocumentMatchMaxEncodedContextSize() { + return documentMatchMaxEncodedContextSize; + } + + public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) { + this.documentMatchMaxEncodedContextSize = documentMatchMaxEncodedContextSize; + } + public GroupFields getGroupFields() { return groupFields; } @@ -3246,6 +3259,7 @@ public boolean equals(Object o) { getTfAggregationThresholdMs() == that.getTfAggregationThresholdMs() && getDocumentMatchMaxEncodedSize() == that.getDocumentMatchMaxEncodedSize() && getDocumentMatchMaxDecodedSize() == that.getDocumentMatchMaxDecodedSize() && + getDocumentMatchMaxEncodedContextSize() == that.getDocumentMatchMaxEncodedContextSize() && getPruneQueryOptions() == that.getPruneQueryOptions() && isSortQueryPreIndexWithImpliedCounts() == that.isSortQueryPreIndexWithImpliedCounts() && isSortQueryPreIndexWithFieldCounts() == that.isSortQueryPreIndexWithFieldCounts() && @@ -3487,6 +3501,7 @@ public int hashCode() { getTfAggregationThresholdMs(), getDocumentMatchMaxEncodedSize(), getDocumentMatchMaxDecodedSize(), + getDocumentMatchMaxEncodedContextSize(), 
getPruneQueryOptions(), isSortQueryPreIndexWithImpliedCounts(), isSortQueryPreIndexWithFieldCounts(), diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java index e81b97580ce..81d426c8c84 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java @@ -25,6 +25,7 @@ public class DocumentMatchContext { public static final int DEFAULT_MAX_ENCODED_SIZE = 256 * 1024 * 1024; public static final int DEFAULT_MAX_DECODED_SIZE = 384 * 1024 * 1024; + public static final int DEFAULT_MAX_ENCODED_CONTEXT_SIZE = 256 * 1024 * 1024; /** * Immutable runtime limits for {@code document:match(...)} payload processing. @@ -32,16 +33,20 @@ public class DocumentMatchContext { public static class Limits { private final int maxEncodedValueSize; private final int maxDecodedValueSize; + private final int maxEncodedContextSize; /** * @param maxEncodedValueSize * maximum allowed encoded payload size in bytes * @param maxDecodedValueSize * maximum allowed decoded payload size in bytes + * @param maxEncodedContextSize + * maximum allowed aggregate encoded payload size retained for a document, in bytes */ - public Limits(int maxEncodedValueSize, int maxDecodedValueSize) { + public Limits(int maxEncodedValueSize, int maxDecodedValueSize, int maxEncodedContextSize) { this.maxEncodedValueSize = maxEncodedValueSize; this.maxDecodedValueSize = maxDecodedValueSize; + this.maxEncodedContextSize = maxEncodedContextSize; } /** @@ -57,6 +62,13 @@ public int getMaxEncodedValueSize() { public int getMaxDecodedValueSize() { return maxDecodedValueSize; } + + /** + * @return the maximum aggregate encoded payload size retained for a document, in bytes + */ + public int getMaxEncodedContextSize() { + return maxEncodedContextSize; + } } private final List> 
dEntries; @@ -70,59 +82,6 @@ public DocumentMatchContext(List> dEntries, Limits limits) { this.limits = limits; } - public DocumentMatchContext(List> dEntries, int maxEncodedValueSize) { - this(dEntries, new Limits(maxEncodedValueSize, DEFAULT_MAX_DECODED_SIZE)); - } - - public DocumentMatchContext(List> dEntries, int maxEncodedValueSize, int maxDecodedValueSize) { - this(dEntries, new Limits(maxEncodedValueSize, maxDecodedValueSize)); - } - - /** - * Builds a context from already-aggregated document entries using the default encoded and decoded payload limits. - * - * @param entries - * aggregated document entries - * @param timeFilter - * optional time filter to apply while selecting {@code d}-column entries - * @return a context containing only eligible {@code d}-column entries - */ - public static DocumentMatchContext from(List> entries, TimeFilter timeFilter) { - return from(entries, timeFilter, new Limits(DEFAULT_MAX_ENCODED_SIZE, DEFAULT_MAX_DECODED_SIZE)); - } - - /** - * Builds a context from already-aggregated document entries using a caller-supplied encoded payload limit and the default decoded payload limit. - * - * @param entries - * aggregated document entries - * @param timeFilter - * optional time filter to apply while selecting {@code d}-column entries - * @param maxEncodedValueSize - * maximum allowed encoded payload size in bytes - * @return a context containing only eligible {@code d}-column entries - */ - public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, int maxEncodedValueSize) { - return from(entries, timeFilter, new Limits(maxEncodedValueSize, DEFAULT_MAX_DECODED_SIZE)); - } - - /** - * Builds a context from already-aggregated document entries using explicit encoded and decoded payload limits. 
- * - * @param entries - * aggregated document entries - * @param timeFilter - * optional time filter to apply while selecting {@code d}-column entries - * @param maxEncodedValueSize - * maximum allowed encoded payload size in bytes - * @param maxDecodedValueSize - * maximum allowed decoded payload size in bytes - * @return a context containing only eligible {@code d}-column entries - */ - public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, int maxEncodedValueSize, int maxDecodedValueSize) { - return from(entries, timeFilter, new Limits(maxEncodedValueSize, maxDecodedValueSize)); - } - /** * Builds a context from already-aggregated document entries using explicit runtime limits. * @@ -156,6 +115,10 @@ public int getMaxDecodedValueSize() { return limits.getMaxDecodedValueSize(); } + public int getMaxEncodedContextSize() { + return limits.getMaxEncodedContextSize(); + } + public Limits getLimits() { return limits; } diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java index 718713a4d7f..d9baa868630 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java @@ -24,6 +24,7 @@ import datawave.query.attributes.Document; import datawave.query.attributes.DocumentKey; import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.predicate.TimeFilter; import datawave.query.util.Tuple3; import datawave.query.util.Tuples; @@ -70,13 +71,17 @@ public Tuple3> apply(Tuple3> collectDocumentColumnAttributes(Set documentKeys) throws IOException { List> documentColumns = new ArrayList<>(); + long retainedBytes = 0L; for (Key documentKey : documentKeys) { - collectDocumentColumnAttributes(documentKey, documentColumns); + retainedBytes = 
collectDocumentColumnAttributes(documentKey, documentColumns, retainedBytes); + if (retainedBytes >= config.getLimits().getMaxEncodedContextSize()) { + break; + } } return documentColumns; } - private void collectDocumentColumnAttributes(Key documentKey, List> documentColumns) throws IOException { + private long collectDocumentColumnAttributes(Key documentKey, List> documentColumns, long retainedBytes) throws IOException { String row = documentKey.getRow().toString(); String datatypeAndUid = documentKey.getColumnFamily().toString(); Key startKey = new Key(row, "d", datatypeAndUid + '\0'); @@ -89,16 +94,56 @@ private void collectDocumentColumnAttributes(Key documentKey, List entry = Maps.immutableEntry(source.getTopKey(), source.getTopValue()); + if (!shouldRetainDocumentColumn(entry, documentKey)) { + source.next(); + continue; + } + + int encodedLength = entry.getValue().get().length; + long retainedBytesWithEntry = retainedBytes + encodedLength; + if (retainedBytesWithEntry > config.getLimits().getMaxEncodedContextSize()) { + if (log.isDebugEnabled()) { + log.debug("Reached aggregate encoded document-match context limit of " + config.getLimits().getMaxEncodedContextSize() + + " bytes while collecting d-column entry " + entry.getKey() + " for document key " + documentKey + + "; skipping this and remaining d-column entries"); + } + return config.getLimits().getMaxEncodedContextSize(); + } + if (log.isDebugEnabled()) { - log.debug("Collected d-column entry " + source.getTopKey() + " for document key " + documentKey); + log.debug("Collected d-column entry " + entry.getKey() + " for document key " + documentKey); } - documentColumns.add(Maps.immutableEntry(source.getTopKey(), source.getTopValue())); + documentColumns.add(entry); + retainedBytes = retainedBytesWithEntry; source.next(); } if (log.isDebugEnabled()) { log.debug("Finished d-column scan for document key " + documentKey + "; next top key is " + (source.hasTop() ? 
source.getTopKey() : "")); } + return retainedBytes; + } + + private boolean shouldRetainDocumentColumn(Entry entry, Key documentKey) { + TimeFilter timeFilter = config.getTimeFilter(); + if (timeFilter != null && !timeFilter.apply(entry)) { + if (log.isDebugEnabled()) { + log.debug("Skipping d-column entry " + entry.getKey() + " for document key " + documentKey + " because it did not match the time filter"); + } + return false; + } + + int encodedLength = entry.getValue().get().length; + if (encodedLength > config.getLimits().getMaxEncodedValueSize()) { + if (log.isDebugEnabled()) { + log.debug("Skipping oversized d-column entry " + entry.getKey() + " for document key " + documentKey + " because encoded payload size " + + encodedLength + " exceeds configured limit of " + config.getLimits().getMaxEncodedValueSize() + " bytes"); + } + return false; + } + + return true; } private Set getDocumentKeys(Key tupleKey, Document document) { diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java index c9782dc1681..47c0d2aea1b 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java @@ -288,6 +288,7 @@ public class QueryOptions implements OptionDescriber { public static final String DOCUMENT_MATCH_CONTEXT_REQUIRED = "document.match.context.required"; public static final String DOCUMENT_MATCH_MAX_ENCODED_SIZE = "document.match.max.encoded.size"; public static final String DOCUMENT_MATCH_MAX_DECODED_SIZE = "document.match.max.decoded.size"; + public static final String DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE = "document.match.max.encoded.context.size"; public static final String FIELD_COUNTS = "field.counts"; public static final String TERM_COUNTS = "term.counts"; @@ -476,6 +477,7 @@ public class QueryOptions implements OptionDescriber { private int 
tfAggregationThresholdMs = -1; private int documentMatchMaxEncodedSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE; private int documentMatchMaxDecodedSize = DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; + private int documentMatchMaxEncodedContextSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE; private CountMap fieldCounts; private CountMap termCounts; @@ -606,6 +608,7 @@ public void deepCopy(QueryOptions other) { this.tfAggregationThresholdMs = other.tfAggregationThresholdMs; this.documentMatchMaxEncodedSize = other.documentMatchMaxEncodedSize; this.documentMatchMaxDecodedSize = other.documentMatchMaxDecodedSize; + this.documentMatchMaxEncodedContextSize = other.documentMatchMaxEncodedContextSize; this.fieldCounts = other.fieldCounts; this.termCounts = other.termCounts; @@ -1464,6 +1467,8 @@ public IteratorOptions describeOptions() { options.put(DOCUMENT_MATCH_CONTEXT_REQUIRED, "Whether the query requires gathering document-match context"); options.put(DOCUMENT_MATCH_MAX_ENCODED_SIZE, "Maximum encoded d-column payload size, in bytes, to inspect for document:match evaluation"); options.put(DOCUMENT_MATCH_MAX_DECODED_SIZE, "Maximum decoded d-column payload size, in bytes, to inspect for document:match evaluation"); + options.put(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE, + "Maximum aggregate encoded d-column payload size, in bytes, to retain in memory for document:match evaluation"); options.put(FIELD_COUNTS, "Map of field counts from the global index"); options.put(TERM_COUNTS, "Map of term counts from the global index"); return new IteratorOptions(getClass().getSimpleName(), "Runs a query against the DATAWAVE tables", options, null); @@ -1696,6 +1701,10 @@ public boolean validateOptions(Map options) { this.documentMatchMaxDecodedSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_DECODED_SIZE)); } + if (options.containsKey(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)) { + this.documentMatchMaxEncodedContextSize = 
Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)); + } + if (options.containsKey(DATATYPE_FILTER)) { String option = options.get(DATATYPE_FILTER); if (option != null && !option.isEmpty()) { @@ -2519,6 +2528,14 @@ public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { this.documentMatchMaxDecodedSize = documentMatchMaxDecodedSize; } + public int getDocumentMatchMaxEncodedContextSize() { + return documentMatchMaxEncodedContextSize; + } + + public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) { + this.documentMatchMaxEncodedContextSize = documentMatchMaxEncodedContextSize; + } + public boolean isDocumentMatchContextRequired() { return documentMatchContextRequired; } @@ -2528,7 +2545,7 @@ public void setDocumentMatchContextRequired(boolean documentMatchContextRequired } protected DocumentMatchContext.Limits getDocumentMatchLimits() { - return new DocumentMatchContext.Limits(documentMatchMaxEncodedSize, documentMatchMaxDecodedSize); + return new DocumentMatchContext.Limits(documentMatchMaxEncodedSize, documentMatchMaxDecodedSize, documentMatchMaxEncodedContextSize); } /** diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java index 9c456a36001..ac39b284c24 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java @@ -9,6 +9,8 @@ import datawave.query.jexl.ArithmeticJexlEngines; import datawave.query.jexl.functions.arguments.JexlArgumentDescriptor; +import datawave.query.util.DateIndexHelper; +import datawave.query.util.MetadataHelper; import datawave.webservice.query.exception.BadRequestQueryException; import 
datawave.webservice.query.exception.DatawaveErrorCode; import datawave.webservice.query.exception.QueryException; @@ -27,6 +29,10 @@ public interface JexlFunctionArgumentDescriptorFactory { */ JexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node); + /** + * Returned by {@link JexlArgumentDescriptor#getIndexQuery(datawave.query.config.ShardQueryConfiguration, MetadataHelper, DateIndexHelper, java.util.Set)} + * when index searching should be skipped for a function. + */ JexlNode TRUE_NODE = new ASTTrueNode(ParserTreeConstants.JJTTRUENODE); /** An encapsulation of methods that can be used with this interface */ diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java index f586d46c951..864051631e9 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java @@ -35,7 +35,8 @@ public interface JexlArgumentDescriptor { * the datatype filter * @param settings * the config settings - * @return The query which will be used against the global index + * @return the query which will be used against the global index, or {@link datawave.query.jexl.functions.JexlFunctionArgumentDescriptorFactory#TRUE_NODE} + * if index searching should be skipped for this function */ JexlNode getIndexQuery(ShardQueryConfiguration settings, MetadataHelper metadataHelper, DateIndexHelper dateIndexHelper, Set datatypeFilter); diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 0cfaa5d723d..bcdd386dfa6 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ 
b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -2411,6 +2411,7 @@ protected void configureDocumentMatchOptions(ShardQueryConfiguration config, Ite if (documentMatchContextRequired) { addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedSize()), false); addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE, Integer.toString(config.getDocumentMatchMaxDecodedSize()), false); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedContextSize()), false); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java index 33dabcb78ee..c2ef06fad4c 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java @@ -3463,6 +3463,14 @@ public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { getConfig().setDocumentMatchMaxDecodedSize(documentMatchMaxDecodedSize); } + public int getDocumentMatchMaxEncodedContextSize() { + return getConfig().getDocumentMatchMaxEncodedContextSize(); + } + + public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) { + getConfig().setDocumentMatchMaxEncodedContextSize(documentMatchMaxEncodedContextSize); + } + public boolean getPruneQueryOptions() { return getConfig().getPruneQueryOptions(); } diff --git a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java index 186dedd591c..51028b27764 100644 --- a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java +++ 
b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java @@ -670,6 +670,8 @@ public void setUp() throws Exception { updatedValues.put("documentMatchMaxEncodedSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE + 1); defaultValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE); updatedValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE + 1); + defaultValues.put("documentMatchMaxEncodedContextSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE); + updatedValues.put("documentMatchMaxEncodedContextSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE + 1); defaultValues.put("documentMatchContextRequired", false); updatedValues.put("documentMatchContextRequired", true); } diff --git a/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java index 718b526897e..2e6d6644d06 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java @@ -39,7 +39,7 @@ public void testCollectsOnlyCurrentDocumentColumns() { DocumentMatchConfig config = new DocumentMatchConfig(); config.setSource(new ListBackedIterator(entries)); - config.setLimits(new DocumentMatchContext.Limits(1234, 5678)); + config.setLimits(new DocumentMatchContext.Limits(1234, 5678, 9012)); DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); Tuple3> result = function @@ -49,6 +49,7 @@ public void testCollectsOnlyCurrentDocumentColumns() { assertEquals(2, context.getdEntries().size()); assertEquals(1234, context.getMaxEncodedValueSize()); assertEquals(5678, context.getMaxDecodedValueSize()); + assertEquals(9012, context.getMaxEncodedContextSize()); } /** @@ -58,7 +59,7 @@ public void 
testCollectsOnlyCurrentDocumentColumns() { public void testCollectsEmptyContextWhenNoDocumentColumnsExist() { DocumentMatchConfig config = new DocumentMatchConfig(); config.setSource(new ListBackedIterator(Collections.emptyList())); - config.setLimits(new DocumentMatchContext.Limits(10, 20)); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 30)); DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); Tuple3> result = function @@ -79,7 +80,7 @@ public void testCollectsColumnsForDocumentKeysFromDocument() { DocumentMatchConfig config = new DocumentMatchConfig(); config.setSource(new ListBackedIterator(entries)); - config.setLimits(new DocumentMatchContext.Limits(10, 20)); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 30)); config.setTld(true); DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); @@ -94,6 +95,28 @@ public void testCollectsColumnsForDocumentKeysFromDocument() { assertEquals(2, context.getdEntries().size()); } + /** + * Verifies that collection skips individually oversized payloads and stops once the retained encoded bytes would exceed the configured aggregate limit. 
+ */ + @Test + public void testCollectsOnlyEntriesWithinAggregateEncodedContextLimit() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("1234".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0uid\0META"), new Value("12345".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0uid\0TAIL"), new Value("12".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 4)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertEquals(1, context.getdEntries().size()); + assertEquals("datatype\0uid\0BODY", context.getdEntries().get(0).getKey().getColumnQualifier().toString()); + } + private static class ListBackedIterator implements SortedKeyValueIterator { private final List> entries; private int index = -1; diff --git a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java index 0166964627c..4a810b09d3d 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java @@ -208,8 +208,11 @@ public void testDocumentMatchAddsDocumentAttribute() { DatawaveJexlContext context = new DatawaveJexlContext(); d.visit(Collections.singleton("FOO"), context); context.set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, - new DocumentMatchContext(List.of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), - new 
org.apache.accumulo.core.data.Value(buildEncodedValue("scar car")))), 1024)); + new DocumentMatchContext( + List.of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), + new org.apache.accumulo.core.data.Value(buildEncodedValue("scar car")))), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE))); assertEvaluation(query, docKey, d, context); assertEquals("{\"car\":{\"BODY\":[1,5]}}", ((Content) d.get(DocumentFunctions.DOCUMENT_MATCHES)).getContent()); @@ -226,12 +229,13 @@ public void testDocumentMatchMergesDocumentAttributeAcrossCalls() { DatawaveJexlContext context = new DatawaveJexlContext(); d.visit(Collections.singleton("FOO"), context); context.set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, - new DocumentMatchContext(List.of( - Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), + new DocumentMatchContext( + List.of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new org.apache.accumulo.core.data.Value(buildEncodedValue("scar car"))), - Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "A"), - new org.apache.accumulo.core.data.Value(buildEncodedValue("lawyer car")))), - 1024)); + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "A"), + new org.apache.accumulo.core.data.Value(buildEncodedValue("lawyer car")))), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE))); assertEvaluation(query, docKey, d, context); assertEquals("{\"car\":{\"BODY\":[1,5]},\"lawyer\":{\"CONTENT2\":[0]}}", ((Content) d.get(DocumentFunctions.DOCUMENT_MATCHES)).getContent()); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java index 102b916451c..396020665f7 
100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java @@ -33,7 +33,9 @@ public class DocumentFunctionsTest { */ @Test public void testMatchAcrossAllViews() throws Exception { - DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0META", "carpet")), 1024); + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0META", "carpet")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); String result = DocumentFunctions.match(context, "car"); @@ -47,7 +49,9 @@ public void testMatchAcrossAllViews() throws Exception { @Test public void testWildcardViewMatch() throws Exception { DocumentMatchContext context = new DocumentMatchContext( - List.of(entry("test\0uid\0BODY", "car"), entry("test\0uid\0BODY_TEXT", "car car"), entry("test\0uid\0META", "car")), 1024); + List.of(entry("test\0uid\0BODY", "car"), entry("test\0uid\0BODY_TEXT", "car car"), entry("test\0uid\0META", "car")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); String result = DocumentFunctions.match("BODY*", context, "car"); @@ -59,7 +63,8 @@ public void testWildcardViewMatch() throws Exception { */ @Test public void testOverlappingMatches() throws Exception { - DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "banana")), 1024); + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "banana")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); String result = 
DocumentFunctions.match("BODY", context, "ana"); @@ -71,7 +76,8 @@ public void testOverlappingMatches() throws Exception { */ @Test public void testCaseSensitiveMatch() throws Exception { - DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), 1024); + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); assertTrue(DocumentFunctions.match(context, "Car").isEmpty()); } @@ -82,7 +88,8 @@ public void testCaseSensitiveMatch() throws Exception { @Test public void testOversizedPayloadIsNonMatch() throws Exception { Map.Entry entry = entry("test\0uid\0BODY", "scar car"); - DocumentMatchContext context = new DocumentMatchContext(List.of(entry), entry.getValue().get().length - 1); + DocumentMatchContext context = new DocumentMatchContext(List.of(entry), new DocumentMatchContext.Limits(entry.getValue().get().length - 1, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); assertTrue(DocumentFunctions.match(context, "car").isEmpty()); } @@ -92,7 +99,8 @@ public void testOversizedPayloadIsNonMatch() throws Exception { */ @Test public void testOversizedDecodedPayloadIsNonMatch() throws Exception { - DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), 1024, 3); + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), + new DocumentMatchContext.Limits(1024, 3, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); assertTrue(DocumentFunctions.match(context, "car").isEmpty()); } @@ -102,7 +110,8 @@ public void testOversizedDecodedPayloadIsNonMatch() throws Exception { */ @Test public void testNoDocumentEntriesIsNonMatch() { - assertTrue(DocumentFunctions.match(new 
DocumentMatchContext(List.of(), 1024), "car").isEmpty()); + assertTrue(DocumentFunctions.match(new DocumentMatchContext(List.of(), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)), "car").isEmpty()); } /** @@ -111,7 +120,8 @@ public void testNoDocumentEntriesIsNonMatch() { @Test public void testDecodeFailureIsNonMatch() { DocumentMatchContext context = new DocumentMatchContext(List.of(Map.entry(new Key("row", "d", "test\0uid\0BODY"), new Value("not-base64".getBytes()))), - 1024); + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); assertTrue(DocumentFunctions.match(context, "car").isEmpty()); } @@ -121,7 +131,9 @@ public void testDecodeFailureIsNonMatch() { */ @Test public void testMatchWithBase64LineBreaks() throws Exception { - DocumentMatchContext context = new DocumentMatchContext(List.of(entryWithEncodedSuffix("test\0uid\0BODY", "/* Origins */ Fix.", "\r\n")), 1024); + DocumentMatchContext context = new DocumentMatchContext(List.of(entryWithEncodedSuffix("test\0uid\0BODY", "/* Origins */ Fix.", "\r\n")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); String result = DocumentFunctions.match("BODY", context, "Origins"); @@ -133,7 +145,9 @@ public void testMatchWithBase64LineBreaks() throws Exception { */ @Test public void testMatchWithBase64OnlyPayload() { - DocumentMatchContext context = new DocumentMatchContext(List.of(base64OnlyEntry("test\0uid\0BODY", "/* Origins */ Fix.")), 1024); + DocumentMatchContext context = new DocumentMatchContext(List.of(base64OnlyEntry("test\0uid\0BODY", "/* Origins */ Fix.")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); String result = 
DocumentFunctions.match("BODY", context, "Origins"); @@ -146,7 +160,8 @@ public void testMatchWithBase64OnlyPayload() { @Test public void testMatchMergesResultsAcrossCalls() throws Exception { DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")), - 1024); + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); assertEquals("{\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.match("CONTENT2", context, "lawyer")); @@ -159,7 +174,8 @@ public void testMatchMergesResultsAcrossCalls() throws Exception { @Test public void testMatchMergesSameSearchAcrossCalls() throws Exception { DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")), - 1024); + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); assertEquals("{\"car\":{\"CONTENT2\":[7]}}", DocumentFunctions.match("CONTENT2", context, "car")); @@ -177,7 +193,9 @@ public void testMatchLogsVisibilityMismatchAndKeepsFirstVisibility() throws Exce logger.setLevel(Level.INFO); try { DocumentMatchContext context = new DocumentMatchContext( - List.of(entry("test\0uid\0BODY", "scar car", "A"), entry("test\0uid\0CONTENT2", "lawyer car", "B")), 1024); + List.of(entry("test\0uid\0BODY", "scar car", "A"), entry("test\0uid\0CONTENT2", "lawyer car", "B")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); 
assertEquals("{\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.match("CONTENT2", context, "lawyer")); diff --git a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java index 3f4fd6c3e61..5c487ddb8c7 100644 --- a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java @@ -1,6 +1,7 @@ package datawave.query.planner; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import java.text.SimpleDateFormat; @@ -49,13 +50,15 @@ void testAddDocumentMatchOptionsWithoutContextRequired() { config.setDocumentMatchContextRequired(false); config.setDocumentMatchMaxEncodedSize(111); config.setDocumentMatchMaxDecodedSize(222); + config.setDocumentMatchMaxEncodedContextSize(333); IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class); planner.configureDocumentMatchOptions(config, cfg); - Assertions.assertEquals("false", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); - Assertions.assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); - Assertions.assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + assertEquals("false", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); + assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)); } @Test @@ -65,13 +68,15 @@ void testAddDocumentMatchOptionsWithContextRequired() { config.setDocumentMatchContextRequired(true); 
config.setDocumentMatchMaxEncodedSize(111); config.setDocumentMatchMaxDecodedSize(222); + config.setDocumentMatchMaxEncodedContextSize(333); IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class); planner.configureDocumentMatchOptions(config, cfg); - Assertions.assertEquals("true", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); - Assertions.assertEquals("111", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); - Assertions.assertEquals("222", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + assertEquals("true", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); + assertEquals("111", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + assertEquals("222", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + assertEquals("333", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)); } } diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java index f997019bbf2..ae169bea877 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java @@ -266,6 +266,7 @@ public void tearDown() throws Exception { public void testDocumentMatchLimitsDefaultFromSpringConfig() { assertEquals(DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE, logic.getDocumentMatchMaxEncodedSize()); assertEquals(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, logic.getDocumentMatchMaxDecodedSize()); + assertEquals(DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE, logic.getDocumentMatchMaxEncodedContextSize()); } /** @@ -275,14 +276,18 @@ public void testDocumentMatchLimitsDefaultFromSpringConfig() { public void testDocumentMatchLimitSettersUpdateLogicAndConfig() { int encoded = 1024; int decoded = 2048; + int 
encodedContext = 4096; logic.setDocumentMatchMaxEncodedSize(encoded); logic.setDocumentMatchMaxDecodedSize(decoded); + logic.setDocumentMatchMaxEncodedContextSize(encodedContext); assertEquals(encoded, logic.getDocumentMatchMaxEncodedSize()); assertEquals(decoded, logic.getDocumentMatchMaxDecodedSize()); + assertEquals(encodedContext, logic.getDocumentMatchMaxEncodedContextSize()); assertEquals(encoded, logic.getConfig().getDocumentMatchMaxEncodedSize()); assertEquals(decoded, logic.getConfig().getDocumentMatchMaxDecodedSize()); + assertEquals(encodedContext, logic.getConfig().getDocumentMatchMaxEncodedContextSize()); } /** @@ -292,9 +297,11 @@ public void testDocumentMatchLimitSettersUpdateLogicAndConfig() { public void testDocumentMatchLimitsPropagateThroughInitialize() throws Exception { int encoded = 4096; int decoded = 8192; + int encodedContext = 16384; logic.setDocumentMatchMaxEncodedSize(encoded); logic.setDocumentMatchMaxDecodedSize(decoded); + logic.setDocumentMatchMaxEncodedContextSize(encodedContext); this.query = "UUID == '" + caponeUID + "'"; this.startDate = dateFormat.parse("20091231"); @@ -306,6 +313,7 @@ public void testDocumentMatchLimitsPropagateThroughInitialize() throws Exception assertEquals(encoded, config.getDocumentMatchMaxEncodedSize()); assertEquals(decoded, config.getDocumentMatchMaxDecodedSize()); + assertEquals(encodedContext, config.getDocumentMatchMaxEncodedContextSize()); } private AccumuloClient createClient() throws Exception { From de3eecbd82de9727ee7c9b2e13d336e904612de0 Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sat, 11 Apr 2026 11:26:57 -0400 Subject: [PATCH 7/9] A second round of addressing PR comments * Clean up duplicate d column decode paths by tailoring the decode methods in ContentKeyValueFactory * Improve handling for documentMatchFunction cases in DatawaveInterpreter * Employ constants where possible --- .../DocumentMatchContextFunction.java | 12 ++++-- .../query/jexl/DatawaveInterpreter.java | 6 +-- 
.../jexl/functions/DocumentFunctions.java | 33 +-------------- .../functions/jexl/DocumentMatch.java | 10 ++++- .../functions/lucene/DocumentMatch.java | 13 ++++-- .../table/parser/ContentKeyValueFactory.java | 41 ++++++++++++++++++- 6 files changed, 68 insertions(+), 47 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java index d9baa868630..fe024a77b16 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java @@ -33,6 +33,9 @@ */ public class DocumentMatchContextFunction implements Function>,Tuple3>> { private static final Logger log = Logger.getLogger(DocumentMatchContextFunction.class); + private static final String DOCUMENT_COLUMN_FAMILY_STRING = "d"; + private static final char DOCUMENT_COLUMN_FAMILY_CHAR = 'd'; + private static final char NULL = '\0'; private final DocumentMatchConfig config; private final SortedKeyValueIterator source; @@ -84,8 +87,9 @@ private List> collectDocumentColumnAttributes(Set document private long collectDocumentColumnAttributes(Key documentKey, List> documentColumns, long retainedBytes) throws IOException { String row = documentKey.getRow().toString(); String datatypeAndUid = documentKey.getColumnFamily().toString(); - Key startKey = new Key(row, "d", datatypeAndUid + '\0'); - Key endKey = config.isTld() ? new Key(row, "d", datatypeAndUid + '\uffff') : new Key(row, "d", datatypeAndUid + '.'); + Key startKey = new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + NULL); + Key endKey = config.isTld() ? 
new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + '\uffff') + : new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + '.'); Range documentColumnRange = new Range(startKey, true, endKey, false); if (log.isDebugEnabled()) { log.debug("Seeking d-column range " + documentColumnRange + " for document key " + documentKey); @@ -190,8 +194,8 @@ private Set getDocumentKeys(Key tupleKey, Document document) { private boolean isDocumentColumn(Key documentContentKey, Key documentKey) { // A document key's column family is datatype\0uid, and a d-column qualifier begins with that same datatype\0uid // followed by \0view. This prefix comparison ties the d-column back to the document represented by the event key. - return documentContentKey.getColumnFamilyData().length() == 1 && documentContentKey.getColumnFamilyData().byteAt(0) == 'd' + return documentContentKey.getColumnFamilyData().length() == 1 && documentContentKey.getColumnFamilyData().byteAt(0) == DOCUMENT_COLUMN_FAMILY_CHAR && documentContentKey.getRow().equals(documentKey.getRow()) - && documentContentKey.getColumnQualifier().toString().startsWith(documentKey.getColumnFamily().toString() + '\0'); + && documentContentKey.getColumnQualifier().toString().startsWith(documentKey.getColumnFamily().toString() + NULL); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java index 72a5b9fa04e..e904bc81047 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java @@ -129,11 +129,7 @@ public Object visit(ASTFunctionNode node, Object data) { addHits(result); - if (isDocumentMatchFunction(nodeString) && result instanceof String) { - if (hasSiblings(node)) { - resultMap.put(nodeString, result); - return result; - } + if (isDocumentMatchFunction(nodeString) && result instanceof 
String && !hasSiblings(node)) { boolean matched = !((String) result).isEmpty(); resultMap.put(nodeString, matched); return matched; diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java index 3c3baa192a9..aea21efd19a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java @@ -1,16 +1,11 @@ package datawave.query.jexl.functions; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Base64; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.zip.GZIPInputStream; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Value; @@ -19,6 +14,7 @@ import com.google.gson.Gson; import datawave.query.function.DocumentMatchContext; +import datawave.query.table.parser.ContentKeyValueFactory; /** * Evaluation-phase JEXL functions for inspecting decoded shard-table {@code d}-column content. @@ -36,7 +32,6 @@ public class DocumentFunctions { public static final String DOCUMENT_MATCH_FUNCTION_NAME = "match"; public static final String DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME = "documentMatchContext"; public static final String DOCUMENT_MATCHES = "DOCUMENT_MATCHES"; - private static final int DECODE_BUFFER_SIZE = 4096; /** * Evaluates the internal form of {@code document:match(STRING)} across all eligible views for the current document. 
@@ -176,31 +171,7 @@ static boolean matchesView(String expectedView, String candidateView) { * if the payload cannot be decoded or if the decoded size exceeds the configured limit */ static String decode(byte[] encoded, int maxDecodedValueSize) throws IOException { - byte[] decodedBytes; - try (ByteArrayInputStream bais = new ByteArrayInputStream(encoded)) { - decodedBytes = Base64.getMimeDecoder().decode(bais.readAllBytes()); - } - - try (ByteArrayInputStream decodedInput = new ByteArrayInputStream(decodedBytes); - GZIPInputStream gzipInputStream = new GZIPInputStream(decodedInput); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - byte[] buffer = new byte[DECODE_BUFFER_SIZE]; - int read; - int totalRead = 0; - while ((read = gzipInputStream.read(buffer)) >= 0) { - totalRead += read; - if (totalRead > maxDecodedValueSize) { - throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecodedValueSize + " bytes"); - } - baos.write(buffer, 0, read); - } - return baos.toString(StandardCharsets.UTF_8); - } catch (IOException e) { - if (decodedBytes.length > maxDecodedValueSize) { - throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecodedValueSize + " bytes", e); - } - return new String(decodedBytes, StandardCharsets.UTF_8); - } + return ContentKeyValueFactory.decodeAndDecompressContentAsString(encoded, maxDecodedValueSize); } /** diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java index 0c54d1a0c59..c7dcd136844 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java @@ -14,8 +14,14 @@ * two-argument forms and renders the canonical JEXL syntax consumed by the runtime query planner. 
*/ public class DocumentMatch extends JexlQueryFunction { + + public static final String DOCUMENT_MATCH_FUNCTION = "DOCUMENT_MATCH"; + public static final String DOCUMENT_FIELD = "document"; + public static final String DOCUMENT_NAMESPACE = DOCUMENT_FIELD + ":"; + public static final String MATCH_FUNCTION = "match"; + public DocumentMatch() { - super("DOCUMENT_MATCH", new ArrayList<>()); + super(DOCUMENT_MATCH_FUNCTION, new ArrayList<>()); } /** @@ -40,7 +46,7 @@ public void validate() throws IllegalArgumentException { @Override public String toString() { StringBuilder sb = new StringBuilder(); - sb.append("document:match("); + sb.append(DOCUMENT_NAMESPACE + MATCH_FUNCTION + "("); for (int i = 0; i < parameterList.size(); i++) { if (i > 0) { sb.append(", "); diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java index 84c7a155a2d..96c99fcd687 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java @@ -1,5 +1,10 @@ package datawave.query.language.functions.lucene; +import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_FIELD; +import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_MATCH_FUNCTION; +import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_NAMESPACE; +import static datawave.query.language.functions.jexl.DocumentMatch.MATCH_FUNCTION; + import java.text.MessageFormat; import java.util.ArrayList; @@ -22,9 +27,9 @@ private static class DocumentMatchFilter extends WildcardFieldedFilter { DocumentMatchFilter(String selector) { super(true, WildcardFieldedFilter.BooleanType.AND); - setField("document"); + setField(DOCUMENT_FIELD); setSelector(selector); - this.renderedQuery = "document:" + selector; + 
this.renderedQuery = DOCUMENT_NAMESPACE + selector; this.query = renderedQuery; } @@ -35,7 +40,7 @@ public String toString() { } public DocumentMatch() { - super("DOCUMENT_MATCH", new ArrayList<>()); + super(DOCUMENT_MATCH_FUNCTION, new ArrayList<>()); } /** @@ -81,7 +86,7 @@ public void initialize(java.util.List parameterList, int depth, org.apac */ private String buildSelector() { StringBuilder sb = new StringBuilder(); - sb.append("match("); + sb.append(MATCH_FUNCTION + "("); for (int i = 0; i < parameterList.size(); i++) { if (i > 0) { sb.append(", "); diff --git a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java index 8ed69bf246d..826b365f4e3 100644 --- a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java +++ b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java @@ -1,7 +1,9 @@ package datawave.query.table.parser; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Base64; import java.util.zip.GZIPInputStream; @@ -16,6 +18,7 @@ import datawave.query.table.parser.EventKeyValueFactory.EventKeyValue; public class ContentKeyValueFactory { + private static final int DECODE_BUFFER_SIZE = 4096; private static final Logger log = Logger.getLogger(ContentKeyValueFactory.class); @@ -55,7 +58,7 @@ public static ContentKeyValue parse(Key key, Value value, Authorizations auths, public static byte[] decodeAndDecompressContent(byte[] contents) { try { - contents = decompress(Base64.getMimeDecoder().decode(contents)); + contents = decodeAndDecompressContent(contents, Integer.MAX_VALUE); } catch (IOException e) { log.error("Error decompressing Base64 encoded GZIPInputStream", e); } catch (Exception e) { @@ -69,6 +72,22 @@ public static byte[] 
decodeAndDecompressContent(byte[] contents) { return contents; } + public static String decodeAndDecompressContentAsString(byte[] contents, int maxDecodedSize) throws IOException { + return new String(decodeAndDecompressContent(contents, maxDecodedSize), StandardCharsets.UTF_8); + } + + public static byte[] decodeAndDecompressContent(byte[] contents, int maxDecodedSize) throws IOException { + byte[] decoded = Base64.getMimeDecoder().decode(contents); + try { + return decompress(decoded, maxDecodedSize); + } catch (IOException e) { + if (decoded.length > maxDecodedSize) { + throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecodedSize + " bytes", e); + } + return decoded; + } + } + private static boolean isCompressed(byte[] compressed) { return (compressed[0] == (byte) (GZIPInputStream.GZIP_MAGIC)) && (compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8)); } @@ -83,6 +102,26 @@ private static byte[] decompress(byte[] compressed) throws IOException { return decompressed; } + private static byte[] decompress(byte[] compressed, int maxDecompressedSize) throws IOException { + if (!isCompressed(compressed)) { + return compressed; + } + + try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressed)); ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + byte[] buffer = new byte[DECODE_BUFFER_SIZE]; + int read; + int totalRead = 0; + while ((read = gzip.read(buffer)) >= 0) { + totalRead += read; + if (totalRead > maxDecompressedSize) { + throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecompressedSize + " bytes"); + } + baos.write(buffer, 0, read); + } + return baos.toByteArray(); + } + } + public static class ContentKeyValue extends EventKeyValue { protected String viewName = null; From 3caf506973c4bfce406766e20b7fdf8250678538 Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sun, 12 Apr 2026 18:02:18 -0400 Subject: [PATCH 8/9] Another round of addressing PR comments 
* Avoid clearing documentMatchContext in JexlEvaluation; added tests to validate this is the right thing to do * Avoid merging all results into a single Attribute and choosing the first visibility, adds multiple values for the DOCUMENT_MATCHES field with the appropriate visibility based on the original d-column. * Significant refactoring of the return format as a result of avoiding merges - adds DocumentMatchResults object to hold results. * Updated the document match function to return the matched string if there's a successful match, an empty string if not. There was no need to return a full JSON object containing all matches because this comes from the DocumentMatchContext. * Properly deduplicates offsets in cases where multiple document match functions against the same query string return the same offsets for a document. * Updated unit tests to reflect new conditions, edge cases, incorrect input. --- .../datawave/query/attributes/Content.java | 6 + .../query/function/DocumentMatchContext.java | 130 ++++--------- .../query/function/DocumentMatchResults.java | 100 ++++++++++ .../query/function/JexlEvaluation.java | 19 +- .../jexl/functions/DocumentFunctions.java | 88 +++------ .../query/DocumentMatchQueryTest.java | 99 +++++++++- .../DocumentMatchContextFunctionTest.java | 44 ++++- .../query/function/JexlEvaluationTest.java | 178 +++++++++++------- .../jexl/functions/DocumentFunctionsTest.java | 146 +++++++------- 9 files changed, 483 insertions(+), 327 deletions(-) create mode 100644 warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java diff --git a/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java b/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java index 8b1f1aea793..2debde33060 100644 --- a/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java +++ b/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java @@ -42,6 +42,12 @@ public Content(String content, 
Key docKey, boolean toKeep, Attribute source) this.source = source; } + public static Content withKeyMetadata(String content, Key metadata, boolean toKeep) { + Content attribute = new Content(content, null, toKeep, null); + attribute.metadata = metadata; + return attribute; + } + @Override public long sizeInBytes() { if (sizeInBytes == Long.MIN_VALUE) { diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java index 81d426c8c84..2febb7c4cd9 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java @@ -3,24 +3,22 @@ import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; -import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.Set; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.security.ColumnVisibility; import datawave.query.predicate.TimeFilter; /** * Per-document runtime state used by {@code document:match(...)} evaluation. *

- * This context carries the raw {@code d}-column entries retained for a candidate document, the configured size limits used while decoding those payloads, the - * merged offset results accumulated across one or more {@code document:match(...)} calls, grouped first by matched string and then by view, and the first - * matched {@code d}-column key whose visibility should be applied to the derived {@code DOCUMENT_MATCHES} attribute. + * This context carries the raw {@code d}-column entries retained for a candidate document, the configured size limits used while decoding those payloads, and + * the per-{@code d}-column {@link DocumentMatchResults} accumulated across one or more {@code document:match(...)} calls for a single evaluation. Context + * instances are expected to be created fresh for each evaluation pass rather than reused across multiple documents or repeated evaluations of the same + * document. */ public class DocumentMatchContext { public static final int DEFAULT_MAX_ENCODED_SIZE = 256 * 1024 * 1024; @@ -71,14 +69,20 @@ public int getMaxEncodedContextSize() { } } - private final List> dEntries; + private final List> documentEntries; private final Limits limits; - private final Map>> mergedMatches = new LinkedHashMap<>(); - private Key firstMatchingEntry; - private boolean visibilityMismatchLogged = false; + private final Map matches = new LinkedHashMap<>(); - public DocumentMatchContext(List> dEntries, Limits limits) { - this.dEntries = dEntries; + /** + * Creates a per-evaluation match context for the retained {@code d}-column entries of a single candidate document. 
+ * + * @param documentEntries + * retained {@code d}-column entries for the document being evaluated + * @param limits + * payload-processing limits applied during decode and match extraction + */ + public DocumentMatchContext(List> documentEntries, Limits limits) { + this.documentEntries = documentEntries; this.limits = limits; } @@ -91,20 +95,23 @@ public DocumentMatchContext(List> dEntries, Limits limits) { * optional time filter to apply while selecting {@code d}-column entries * @param limits * payload-processing limits - * @return a context containing only eligible {@code d}-column entries + * @return a fresh context containing only eligible {@code d}-column entries for a single evaluation pass */ public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, Limits limits) { - List> dEntries = new ArrayList<>(); + List> documentEntries = new ArrayList<>(); for (Entry entry : entries) { if (entry.getKey().getColumnFamily().toString().equals("d") && (timeFilter == null || timeFilter.apply(entry))) { - dEntries.add(entry); + documentEntries.add(entry); } } - return new DocumentMatchContext(dEntries, limits); + return new DocumentMatchContext(documentEntries, limits); } - public List> getdEntries() { - return Collections.unmodifiableList(dEntries); + /** + * @return the retained {@code d}-column entries available to {@code document:match(...)} during the current evaluation + */ + public List> getDocumentEntries() { + return Collections.unmodifiableList(documentEntries); } public int getMaxEncodedValueSize() { @@ -119,92 +126,33 @@ public int getMaxEncodedContextSize() { return limits.getMaxEncodedContextSize(); } - public Limits getLimits() { - return limits; - } - /** - * Clears merged match state before evaluating a new document. 
+ * @return the payload-processing limits associated with this evaluation context */ - public void clearMergedMatches() { - mergedMatches.clear(); - firstMatchingEntry = null; - visibilityMismatchLogged = false; + public Limits getLimits() { + return limits; } /** - * Merges per-call matches into the document-wide result set. + * Records per-call matches in the per-{@code d}-column document-wide result set. * + * @param key + * the matched {@code d}-column key * @param search * the literal string matched by the invocation - * @param matches - * matches produced by one {@code document:match(...)} invocation, keyed by view name - */ - public void mergeMatches(String search, Map> matches) { - Map> searchMatches = mergedMatches.computeIfAbsent(search, key -> new LinkedHashMap<>()); - for (Entry> entry : matches.entrySet()) { - searchMatches.computeIfAbsent(entry.getKey(), key -> new LinkedHashSet<>()).addAll(entry.getValue()); - } - } - - /** - * @return a defensive copy of the merged document-wide match results + * @param offsets + * starting offsets found in the matched view */ - public Map>> getMergedMatches() { - Map>> matches = new LinkedHashMap<>(); - for (Entry>> searchEntry : mergedMatches.entrySet()) { - Map> viewMatches = new LinkedHashMap<>(); - for (Entry> viewEntry : searchEntry.getValue().entrySet()) { - viewMatches.put(viewEntry.getKey(), new ArrayList<>(viewEntry.getValue())); - } - matches.put(searchEntry.getKey(), viewMatches); - } - return matches; + public void addMatches(Key key, String search, List offsets) { + matches.computeIfAbsent(key, DocumentMatchResults::new).addMatches(search, offsets); } /** - * @return the first {@code d}-column key that matched during evaluation, or {@code null} if no match has been recorded yet - */ - public Key getFirstMatchingEntry() { - return firstMatchingEntry; - } - - /** - * @return the visibility from the first matched {@code d}-column key, or {@code null} if no match has been recorded yet - */ - public 
ColumnVisibility getFirstMatchingColumnVisibility() { - if (firstMatchingEntry == null) { - return null; - } - return firstMatchingEntry.getColumnVisibilityParsed(); - } - - /** - * Records a matched {@code d}-column key and detects whether its visibility differs from the first matched key for the document. + * Returns the per-entry match results accumulated during this evaluation. * - * @param key - * the matched {@code d}-column key - * @return {@code true} if the key differs in visibility from the first matched key, otherwise {@code false} - */ - public boolean recordMatchingEntry(Key key) { - if (firstMatchingEntry == null) { - firstMatchingEntry = key; - return false; - } - return !firstMatchingEntry.getColumnVisibilityData().equals(key.getColumnVisibilityData()); - } - - /** - * @return {@code true} if a visibility mismatch has not yet been logged for the current document - */ - public boolean shouldLogVisibilityMismatch() { - return !visibilityMismatchLogged; - } - - /** - * Marks the current document as having already logged a visibility mismatch. 
+ * @return an immutable snapshot view of the accumulated per-{@code d}-column match results */ - public void markVisibilityMismatchLogged() { - visibilityMismatchLogged = true; + public List getMatches() { + return List.copyOf(matches.values()); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java new file mode 100644 index 00000000000..7ce4551332c --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java @@ -0,0 +1,100 @@ +package datawave.query.function; + +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.accumulo.core.data.Key; + +import com.google.common.annotations.VisibleForTesting; + +import datawave.query.data.parsers.DatawaveKey; + +/** + * Match results for a single matched {@code d}-column entry. + *

+ * A {@code d}-column entry has a single view name, so matches are grouped only by search string within that view. + */ +public class DocumentMatchResults { + public static final String VIEW_FIELD = "view"; + public static final String MATCHES_FIELD = "matches"; + + private final Key key; + private final Map> matches = new LinkedHashMap<>(); + + /** + * Creates an empty result container for a single matched {@code d}-column entry. + * + * @param key + * the matched {@code d}-column key + */ + public DocumentMatchResults(Key key) { + this.key = key; + } + + private DocumentMatchResults(DocumentMatchResults other) { + this.key = other.key; + for (Map.Entry> searchEntry : other.matches.entrySet()) { + this.matches.put(searchEntry.getKey(), new TreeSet<>(searchEntry.getValue())); + } + } + + public Key getKey() { + return key; + } + + /** + * @return the single view name associated with this matched {@code d}-column entry + */ + @VisibleForTesting + public String getView() { + return Objects.toString(new DatawaveKey(key).getFieldName(), ""); + } + + /** + * Records offsets for a literal search string within this entry's view. + * + * @param search + * the matched literal string + * @param offsets + * ordered starting offsets where the string was found + */ + public void addMatches(String search, List offsets) { + matches.computeIfAbsent(search, ignored -> new TreeSet<>()).addAll(offsets); + } + + /** + * @param search + * a matched literal string + * @return {@code true} if this entry contains offsets for the supplied search string + */ + public boolean containsSearch(String search) { + return matches.containsKey(search); + } + + /** + * Builds the JSON-ready payload for this entry in the form {@code {"view":"...","matches":{search:[offsets]}}}. 
+ * + * @return a payload map suitable for serialization into the {@code DOCUMENT_MATCHES} attribute, or an empty map if no matches are present + */ + public Map getPayload() { + Map payload = new LinkedHashMap<>(); + String view = getView(); + if (view == null || matches.isEmpty()) { + return payload; + } + payload.put(VIEW_FIELD, view); + payload.put(MATCHES_FIELD, matches); + return payload; + } + + /** + * @return a defensive copy of this entry's match results + */ + public DocumentMatchResults copy() { + return new DocumentMatchResults(this); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java index 74b51712441..83ae3db0c2d 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java @@ -99,9 +99,6 @@ public boolean apply(Tuple3 input) { } DocumentMatchContext documentMatchContext = (DocumentMatchContext) input.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); - if (documentMatchContext != null) { - documentMatchContext.clearMergedMatches(); - } Object o = script.execute(input.third()); if (log.isTraceEnabled()) { @@ -115,14 +112,18 @@ public boolean apply(Tuple3 input) { ((DelayedNonEventIndexContext) input.third()).populateDocument(input.second()); } - String documentMatches = (documentMatchContext == null) ? 
"" : DocumentFunctions.toJson(documentMatchContext.getMergedMatches()); - if (matched && !documentMatches.isEmpty()) { + if (matched && documentMatchContext != null) { Document document = input.second(); - Content matchesAttribute = new Content(documentMatches, document.getMetadata(), document.isToKeep()); - if (documentMatchContext != null && documentMatchContext.getFirstMatchingColumnVisibility() != null) { - matchesAttribute.setColumnVisibility(documentMatchContext.getFirstMatchingColumnVisibility()); + for (DocumentMatchResults entry : documentMatchContext.getMatches()) { + String documentMatches = DocumentFunctions.toDocumentMatchesJson(entry.getPayload()); + if (documentMatches.isEmpty()) { + continue; + } + + Content matchesAttribute = Content.withKeyMetadata(documentMatches, entry.getKey(), document.isToKeep()); + matchesAttribute.setColumnVisibility(entry.getKey().getColumnVisibilityParsed()); + document.put(DocumentFunctions.DOCUMENT_MATCHES, matchesAttribute); } - document.put(DocumentFunctions.DOCUMENT_MATCHES, matchesAttribute); } if (arithmetic instanceof HitListArithmetic) { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java index aea21efd19a..c6d5b79dd32 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java @@ -2,10 +2,10 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Objects; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Value; @@ -13,6 +13,7 @@ import com.google.gson.Gson; +import datawave.query.data.parsers.DatawaveKey; import datawave.query.function.DocumentMatchContext; import 
datawave.query.table.parser.ContentKeyValueFactory; @@ -20,8 +21,8 @@ * Evaluation-phase JEXL functions for inspecting decoded shard-table {@code d}-column content. *

* The current namespace exposes {@code document:match(...)} which decodes base64-encoded, gzip-compressed document payloads, performs case-sensitive literal - * substring matching, and returns a JSON object keyed first by matched string, then by view name, with starting character offsets as the leaf values. - * Per-document state is supplied explicitly through {@link DocumentMatchContext} by the surrounding evaluation flow. + * substring matching, and returns the matched search string when any eligible {@code d}-column matches. Detailed per-entry offsets are accumulated in the + * supplied {@link DocumentMatchContext} and later serialized into {@code DOCUMENT_MATCHES} attributes by the surrounding evaluation flow. */ @JexlFunctions(descriptorFactory = "datawave.query.jexl.functions.DocumentFunctionsDescriptor") public class DocumentFunctions { @@ -40,7 +41,7 @@ public class DocumentFunctions { * per-document context supplied by the evaluation pipeline * @param search * literal substring to search for - * @return a JSON object for this invocation keyed by matched string and then by view name, or an empty string if no match is found + * @return the matched search string if any eligible {@code d}-column matches, or an empty string if no match is found */ public static String match(DocumentMatchContext context, String search) { return match(null, context, search); @@ -50,8 +51,8 @@ public static String match(DocumentMatchContext context, String search) { * Evaluates the internal form of {@code document:match(VIEWNAME, STRING)} against the current document. *

* Matching is case-sensitive and literal. If {@code viewName} ends with {@code *}, it is treated as a prefix match against the view portion of the - * {@code d}-column qualifier. Oversized or undecodable payloads are skipped as non-matching. Matches from this invocation are merged into the document-wide - * result set stored in the supplied {@link DocumentMatchContext}. + * {@code d}-column qualifier. Oversized or undecodable payloads are skipped as non-matching. Matches from this invocation are accumulated in the supplied + * {@link DocumentMatchContext} on a per-{@code d}-column basis so the resulting {@code DOCUMENT_MATCHES} attributes can preserve each source visibility. * * @param viewName * optional exact or prefix-matched view selector; {@code null} means evaluate all views @@ -59,7 +60,7 @@ public static String match(DocumentMatchContext context, String search) { * per-document context supplied by the evaluation pipeline * @param search * literal substring to search for - * @return a JSON object for this invocation keyed by matched string and then by view name, or an empty string if no match is found + * @return the matched search string if any eligible {@code d}-column matches, or an empty string if no match is found */ public static String match(String viewName, DocumentMatchContext context, String search) { if (context == null || search == null) { @@ -70,13 +71,13 @@ public static String match(String viewName, DocumentMatchContext context, String } if (log.isDebugEnabled()) { - log.debug("Evaluating document:match for search [" + search + "] view filter [" + viewName + "] across " + context.getdEntries().size() + log.debug("Evaluating document:match for search [" + search + "] view filter [" + viewName + "] across " + context.getDocumentEntries().size() + " d-column entries"); } - Map> matches = new LinkedHashMap<>(); - for (Entry entry : context.getdEntries()) { - String candidateView = extractViewName(entry.getKey()); + boolean matched = false; + for 
(Entry entry : context.getDocumentEntries()) { + String candidateView = Objects.toString(new DatawaveKey(entry.getKey()).getFieldName(), ""); if (!matchesView(viewName, candidateView)) { if (log.isDebugEnabled()) { log.debug("Skipping d-column entry " + entry.getKey() + " because view [" + candidateView + "] does not match filter [" + viewName + "]"); @@ -97,13 +98,8 @@ public static String match(String viewName, DocumentMatchContext context, String log.debug("document:match found offsets " + offsets + " for search [" + search + "] in view [" + candidateView + "] using key " + entry.getKey()); } - if (context.recordMatchingEntry(entry.getKey()) && context.shouldLogVisibilityMismatch()) { - log.info("document:match encountered differing d-column visibilities for document " + context.getFirstMatchingEntry().getRow() + '/' - + context.getFirstMatchingEntry().getColumnFamily() + "; using visibility from first matched d-column " - + context.getFirstMatchingEntry() + " and ignoring differing visibility on " + entry.getKey()); - context.markVisibilityMismatchLogged(); - } - matches.computeIfAbsent(candidateView, k -> new ArrayList<>()).addAll(offsets); + context.addMatches(entry.getKey(), search, offsets); + matched = true; } else if (log.isDebugEnabled()) { log.debug("document:match found no offsets for search [" + search + "] in view [" + candidateView + "] using key " + entry.getKey()); } @@ -111,32 +107,10 @@ public static String match(String viewName, DocumentMatchContext context, String log.debug("Unable to decode d-column payload for view " + candidateView, e); } } - - context.mergeMatches(search, matches); if (log.isDebugEnabled()) { - log.debug("document:match merged matches for search [" + search + "]: " + matches); + log.debug("document:match produced matched=" + matched + " for search [" + search + "]"); } - return toJson(search, matches); - } - - /** - * Extracts the view name from a {@code d}-column qualifier whose layout is expected to be {@code 
datatype\0uid\0view}. - * - * @param key - * shard-table {@code d}-column key - * @return the extracted view name, or an empty string if the qualifier does not have the expected structure - */ - static String extractViewName(Key key) { - String cq = key.getColumnQualifier().toString(); - int firstNull = cq.indexOf('\0'); - if (firstNull < 0) { - return ""; - } - int secondNull = cq.indexOf('\0', firstNull + 1); - if (secondNull < 0 || secondNull + 1 >= cq.length()) { - return ""; - } - return cq.substring(secondNull + 1); + return matched ? search : ""; } /** @@ -197,34 +171,16 @@ static List findOffsets(String decoded, String search) { } /** - * Serializes matches for one {@code document:match(...)} invocation to the JSON payload shape stored in {@code DOCUMENT_MATCHES}. + * Serializes one per-entry {@code DOCUMENT_MATCHES} payload in the form {@code {"view":"...","matches":{search:[offsets]}}}. * - * @param search - * literal string matched by the invocation - * @param matches - * map of view name to ordered character offsets - * @return JSON string representation, or an empty string if the map is empty + * @param payload + * per-entry payload built from a single matched {@code d}-column + * @return JSON string representation, or an empty string if the payload is empty */ - public static String toJson(String search, Map> matches) { - if (matches.isEmpty()) { + public static String toDocumentMatchesJson(Map payload) { + if (payload.isEmpty()) { return ""; } - Map>> payload = new LinkedHashMap<>(); - payload.put(search, matches); return GSON.toJson(payload); } - - /** - * Serializes merged document-wide matches to the JSON payload stored in {@code DOCUMENT_MATCHES}. 
- * - * @param matches - * map of matched string to per-view ordered character offsets - * @return JSON string representation, or an empty string if the map is empty - */ - public static String toJson(Map>> matches) { - if (matches.isEmpty()) { - return ""; - } - return GSON.toJson(matches); - } } diff --git a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java index edcf2f1b226..c18470eadac 100644 --- a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java @@ -3,18 +3,22 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.net.URL; import java.nio.file.Path; import java.util.Collections; import java.util.HashMap; +import java.util.LinkedHashSet; import java.util.Map; +import java.util.Set; import java.util.TimeZone; import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.accumulo.minicluster.MiniAccumuloCluster; import org.apache.accumulo.minicluster.MiniAccumuloConfig; import org.apache.log4j.Logger; @@ -35,6 +39,8 @@ import datawave.ingest.data.TypeRegistry; import datawave.query.attributes.Attribute; +import datawave.query.attributes.Attributes; +import datawave.query.attributes.Content; import datawave.query.attributes.Document; import datawave.query.function.DocumentMatchContext; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; @@ -75,7 +81,8 @@ public class 
DocumentMatchQueryTest extends AbstractQueryTest { @Qualifier("EventQuery") protected ShardQueryLogic logic; - private final Map expectedDocumentMatches = new HashMap<>(); + private final Map> expectedDocumentMatches = new HashMap<>(); + private final Map> expectedDocumentMatchVisibilities = new HashMap<>(); private Boolean expectedDocumentMatchContextRequired; @Override @@ -126,6 +133,7 @@ public void beforeEach() { public void afterEach() { super.afterEach(); expectedDocumentMatches.clear(); + expectedDocumentMatchVisibilities.clear(); expectedDocumentMatchContextRequired = null; } @@ -160,11 +168,18 @@ protected void extraAssertions() { assertNotNull(uuid, "result did not contain UUID"); String uuidValue = getUUID(uuid); - String expected = expectedDocumentMatches.get(uuidValue); + Set expected = expectedDocumentMatches.get(uuidValue); if (expected != null) { Attribute matches = result.get(DocumentFunctions.DOCUMENT_MATCHES); assertNotNull(matches, "result did not contain DOCUMENT_MATCHES"); - assertEquals(expected, matches.getData().toString()); + assertEquals(expected, getDocumentMatchContents(matches)); + } + + Map expectedVisibilities = expectedDocumentMatchVisibilities.get(uuidValue); + if (expectedVisibilities != null) { + Attribute matches = result.get(DocumentFunctions.DOCUMENT_MATCHES); + assertNotNull(matches, "result did not contain DOCUMENT_MATCHES"); + assertEquals(expectedVisibilities, getDocumentMatchVisibilities(matches)); } } } @@ -179,7 +194,8 @@ public void testDocumentMatchJexlAllViews() throws Exception { expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", "{\"can\":{\"CONTENT\":[4,61],\"CONTENT2\":[27]}}"); + expectedDocumentMatches.put("CAPONE", + Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}")); planAndExecuteQuery(); } @@ -193,21 +209,56 @@ public void 
testDocumentMatchJexlSpecificView() throws Exception { expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", "{\"lawyer\":{\"CONTENT2\":[2]}}"); + expectedDocumentMatches.put("CAPONE", Collections.singleton("{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[2]}}")); planAndExecuteQuery(); } /** - * Verifies that multiple JEXL {@code document:match(...)} calls contribute to one merged {@code DOCUMENT_MATCHES} payload. + * Verifies that multiple JEXL {@code document:match(...)} calls contribute one {@code DOCUMENT_MATCHES} value per matched {@code d}-column. */ @Test - public void testDocumentMatchJexlMergesMatchesAcrossCalls() throws Exception { + public void testDocumentMatchJexlAddsPerEntryMatchesAcrossCalls() throws Exception { givenQuery("UUID == 'CAPONE' && document:match('CONTENT', 'can') && document:match('CONTENT2', 'lawyer')"); expectPlan("UUID == 'capone' && document:match('CONTENT', documentMatchContext, 'can') && document:match('CONTENT2', documentMatchContext, 'lawyer')"); expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", "{\"can\":{\"CONTENT\":[4,61]},\"lawyer\":{\"CONTENT2\":[2]}}"); + expectedDocumentMatches.put("CAPONE", + Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[2]}}")); + planAndExecuteQuery(); + } + + /** + * Verifies that end-to-end {@code DOCUMENT_MATCHES} values preserve the visibilities carried by their source {@code d}-column entries. 
+ */ + @Test + public void testDocumentMatchJexlPreservesPerEntryVisibilities() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", + Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}")); + expectedDocumentMatchVisibilities.put("CAPONE", Map.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", new ColumnVisibility("ALL"), + "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}", new ColumnVisibility("ALL"))); + planAndExecuteQuery(); + } + + /** + * Verifies that a wildcard view match combined with a second targeted call accumulates per-entry matches without cross-entry merging. + */ + @Test + public void testDocumentMatchJexlWildcardThenSpecificViewAccumulatesPerEntry() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('CONTENT*', 'can') && document:match('CONTENT2', 'lawyer')"); + expectPlan("UUID == 'capone' && document:match('CONTENT*', documentMatchContext, 'can') && document:match('CONTENT2', documentMatchContext, 'lawyer')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", + Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27],\"lawyer\":[2]}}")); + expectedDocumentMatchVisibilities.put("CAPONE", Map.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", new ColumnVisibility("ALL"), + "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27],\"lawyer\":[2]}}", new ColumnVisibility("ALL"))); planAndExecuteQuery(); } @@ -222,10 +273,37 @@ public void testDocumentMatchLuceneWildcardView() throws Exception { expectedDocumentMatchContextRequired = true; 
expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", "{\"can\":{\"CONTENT\":[4,61],\"CONTENT2\":[27]}}"); + expectedDocumentMatches.put("CAPONE", + Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}")); planAndExecuteQuery(); } + private Set getDocumentMatchContents(Attribute attribute) { + Set values = new LinkedHashSet<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + values.add(((Content) child).getContent()); + } + } else { + values.add(((Content) attribute).getContent()); + } + return values; + } + + private Map getDocumentMatchVisibilities(Attribute attribute) { + Map visibilities = new HashMap<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + Content content = (Content) child; + visibilities.put(content.getContent(), content.getColumnVisibility()); + } + } else { + Content content = (Content) attribute; + visibilities.put(content.getContent(), content.getColumnVisibility()); + } + return visibilities; + } + /** * Verifies that a non-matching document-match term filters the document out of the result set. 
*/ @@ -274,5 +352,8 @@ public void testQueryWithoutDocumentMatchDoesNotRequireContext() throws Exceptio expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); planAndExecuteQuery(); + assertEquals(1, results.size()); + Document result = results.iterator().next(); + assertNull(result.get(DocumentFunctions.DOCUMENT_MATCHES), "query without document:match unexpectedly emitted DOCUMENT_MATCHES"); } } diff --git a/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java index 2e6d6644d06..1d98c6bbdcb 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java @@ -1,7 +1,8 @@ package datawave.query.function; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.Collections; import java.util.List; @@ -12,7 +13,7 @@ import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.IteratorEnvironment; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; -import org.junit.Test; +import org.junit.jupiter.api.Test; import com.google.common.collect.Lists; @@ -46,7 +47,7 @@ public void testCollectsOnlyCurrentDocumentColumns() { .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); - assertEquals(2, context.getdEntries().size()); + assertEquals(2, context.getDocumentEntries().size()); assertEquals(1234, context.getMaxEncodedValueSize()); 
assertEquals(5678, context.getMaxDecodedValueSize()); assertEquals(9012, context.getMaxEncodedContextSize()); @@ -66,7 +67,7 @@ public void testCollectsEmptyContextWhenNoDocumentColumnsExist() { .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); - assertTrue(context.getdEntries().isEmpty()); + assertTrue(context.getDocumentEntries().isEmpty()); } /** @@ -92,7 +93,7 @@ public void testCollectsColumnsForDocumentKeysFromDocument() { .apply(Tuples.tuple(new Key("20240101_0", "datatype\0root"), document, Collections.emptyMap())); DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); - assertEquals(2, context.getdEntries().size()); + assertEquals(2, context.getDocumentEntries().size()); } /** @@ -113,8 +114,35 @@ public void testCollectsOnlyEntriesWithinAggregateEncodedContextLimit() { .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); - assertEquals(1, context.getdEntries().size()); - assertEquals("datatype\0uid\0BODY", context.getdEntries().get(0).getKey().getColumnQualifier().toString()); + assertEquals(1, context.getDocumentEntries().size()); + assertEquals("datatype\0uid\0BODY", context.getDocumentEntries().get(0).getKey().getColumnQualifier().toString()); + } + + /** + * Verifies that each application creates a fresh {@link DocumentMatchContext}, so mutable match state from a prior evaluation cannot leak into a later one. 
+ */ + @Test + public void testApplyCreatesFreshContextEachTime() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 30)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> first = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext firstContext = (DocumentMatchContext) first.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + firstContext.addMatches(entries.get(0).getKey(), "one", List.of(0)); + assertEquals(1, firstContext.getMatches().size()); + + Tuple3> second = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext secondContext = (DocumentMatchContext) second.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertNotSame(firstContext, secondContext); + assertTrue(secondContext.getMatches().isEmpty()); + assertEquals(1, secondContext.getDocumentEntries().size()); } private static class ListBackedIterator implements SortedKeyValueIterator { diff --git a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java index 4a810b09d3d..3c05522c78c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java @@ -1,19 +1,28 @@ package datawave.query.function; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static datawave.query.function.DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; 
+import static datawave.query.function.DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.function.Consumer; +import java.util.function.Supplier; import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.commons.jexl3.parser.ASTJexlScript; -import org.junit.Test; +import org.junit.jupiter.api.Test; import com.google.common.collect.Maps; @@ -36,16 +45,16 @@ public class JexlEvaluationTest { + public static final DocumentMatchContext.Limits TEST_DOCUMENT_MATCH_LIMITS = new DocumentMatchContext.Limits(1024, DEFAULT_MAX_DECODED_SIZE, + DEFAULT_MAX_ENCODED_CONTEXT_SIZE); + @Test public void testSimpleQuery() { String query = "FOO == 'bar'"; Document d = new Document(); d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); } @Test @@ -55,10 +64,7 @@ public void testRegexIntersection() { d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true)); d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, 
context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); } @Test @@ -67,16 +73,15 @@ public void testRegexCaseIntersection() { d.put("FOO", new Content("Bar", new Key("shard", "datatype\0uid"), true)); d.put("FOO", new Numeric("123", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); + Supplier contextSupplier = contextFactory(d, Collections.singleton("FOO")); // match the original value String query = "FOO == 'bar' && FOO =~ '12.*'"; - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextSupplier); // match the normalized value query = "FOO == 'bar' && FOO =~ '\\+cE1\\.2.*'"; - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextSupplier); } @Test @@ -87,10 +92,7 @@ public void testRegexUnion() { d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true)); d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); } @Test @@ -102,10 +104,7 @@ public void testHitTermSource() { d.put("FOO", hitTermSource); d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid2"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); Attributes hitTerm = 
(Attributes) d.getDictionary().get("HIT_TERM"); assertEquals(1, hitTerm.getAttributes().size()); @@ -157,10 +156,7 @@ public void testSomeFilterFunctions() { // Assume fields are {ANCHOR, FOO, FOO2} and a constant doc key private void evaluate(String query, Document d) { - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Arrays.asList("ANCHOR", "FOO", "FOO2", "FOO3"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Arrays.asList("ANCHOR", "FOO", "FOO2", "FOO3"))); } @Test @@ -172,9 +168,6 @@ public void testContentPhraseFunction() { map.put("red", buildTfList("TOKFIELD", 2)); map.put("dog", buildTfList("TOKFIELD", 3)); - DatawaveJexlContext context = new DatawaveJexlContext(); - context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, new TermOffsetMap(map)); - Key docKey = new Key("shard", "datatype\0uid"); Document d = new Document(); @@ -182,9 +175,8 @@ public void testContentPhraseFunction() { d.put("TOKFIELD", new Content("big", docKey, true)); d.put("TOKFIELD", new Content("red", docKey, true)); d.put("TOKFIELD", new Content("dog", docKey, true)); - d.visit(Arrays.asList("FOO", "TOKFIELD"), context); - - assertEvaluation(query, docKey, d, context); + assertEvaluation(query, docKey, d, contextFactory(d, Arrays.asList("FOO", "TOKFIELD"), + ctx -> ctx.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, new TermOffsetMap(map)))); // assert that "big red dog" came back in the hit terms boolean foundPhrase = false; @@ -205,41 +197,80 @@ public void testDocumentMatchAddsDocumentAttribute() { Document d = new Document(); d.put("FOO", new Content("bar", docKey, true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - context.set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, - new DocumentMatchContext( - List.of(Maps.immutableEntry(new Key("row", "d", 
"datatype\0uid\0BODY", "A"), - new org.apache.accumulo.core.data.Value(buildEncodedValue("scar car")))), - new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, - DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE))); - - assertEvaluation(query, docKey, d, context); - assertEquals("{\"car\":{\"BODY\":[1,5]}}", ((Content) d.get(DocumentFunctions.DOCUMENT_MATCHES)).getContent()); + final List> entries = List + .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + assertEquals(Collections.singleton("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}"), + getDocumentMatchContents(d.get(DocumentFunctions.DOCUMENT_MATCHES))); assertEquals(new ColumnVisibility("A"), d.get(DocumentFunctions.DOCUMENT_MATCHES).getColumnVisibility()); } @Test - public void testDocumentMatchMergesDocumentAttributeAcrossCalls() { + public void testDocumentMatchAddsPerEntryDocumentAttributesAcrossCalls() { String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('CONTENT2', 'lawyer')"; Key docKey = new Key("shard", "datatype\0uid"); Document d = new Document(); d.put("FOO", new Content("bar", docKey, true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - context.set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, - new DocumentMatchContext( - List.of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), - new org.apache.accumulo.core.data.Value(buildEncodedValue("scar car"))), - Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "A"), - new org.apache.accumulo.core.data.Value(buildEncodedValue("lawyer car")))), - new 
DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, - DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE))); - - assertEvaluation(query, docKey, d, context); - assertEquals("{\"car\":{\"BODY\":[1,5]},\"lawyer\":{\"CONTENT2\":[0]}}", ((Content) d.get(DocumentFunctions.DOCUMENT_MATCHES)).getContent()); - assertEquals(new ColumnVisibility("A"), d.get(DocumentFunctions.DOCUMENT_MATCHES).getColumnVisibility()); + final List> entries = List.of( + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car"))), + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "A"), new Value(buildEncodedValue("lawyer car")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + assertEquals(Set.of("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[0]}}"), + getDocumentMatchContents(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + } + + @Test + public void testDocumentMatchAccumulatesCallsWithinSameEntry() { + String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('BODY', 'lawyer')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + final List> entries = List + .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car lawyer")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + assertEquals(Collections.singleton("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5],\"lawyer\":[9]}}"), + 
getDocumentMatchContents(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + } + + @Test + public void testDocumentMatchPreservesPerEntryVisibilities() { + String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('CONTENT2', 'lawyer')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + final List> entries = List.of( + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car"))), + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "B"), new Value(buildEncodedValue("lawyer car")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + + Attributes matches = assertInstanceOf(Attributes.class, d.get(DocumentFunctions.DOCUMENT_MATCHES)); + Map visibilitiesByPayload = new HashMap<>(); + for (Attribute> attribute : matches.getAttributes()) { + Content content = assertInstanceOf(Content.class, attribute); + visibilitiesByPayload.put(content.getContent(), content.getColumnVisibility()); + } + + assertEquals(new ColumnVisibility("A"), visibilitiesByPayload.get("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}")); + assertEquals(new ColumnVisibility("B"), visibilitiesByPayload.get("{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[0]}}")); + } + + private Set getDocumentMatchContents(Attribute attribute) { + Set values = new LinkedHashSet<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + values.add(((Content) child).getContent()); + } + } else { + values.add(((Content) attribute).getContent()); + } + return values; } private byte[] buildEncodedValue(String content) { @@ -351,27 +382,36 @@ private void testCompare(String query, boolean expected) { d.put("FIELD_C", new 
Content("zebra", docKey, true)); d.put("FIELD_C", new Content("zephyr", docKey, true)); - // populate context from doc - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Arrays.asList("FOO", "FIELD_A", "FIELD_B", "FIELD_C"), context); - - assertEvaluation(query, docKey, d, context, expected); + assertEvaluation(query, docKey, d, contextFactory(d, Arrays.asList("FOO", "FIELD_A", "FIELD_B", "FIELD_C")), expected); } - private void assertEvaluation(String query, Key key, Document d, DatawaveJexlContext context) { - assertEvaluation(query, key, d, context, true); + private void assertEvaluation(String query, Key key, Document d, Supplier contextSupplier) { + assertEvaluation(query, key, d, contextSupplier, true); } - private void assertEvaluation(String query, Key key, Document d, DatawaveJexlContext context, boolean expected) { + private void assertEvaluation(String query, Key key, Document d, Supplier contextSupplier, boolean expected) { JexlEvaluation evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query)); - boolean result = evaluation.apply(new Tuple3<>(key, d, context)); + boolean result = evaluation.apply(new Tuple3<>(key, d, contextSupplier.get())); assertEquals(expected, result); evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query), new HitListArithmetic()); - result = evaluation.apply(new Tuple3<>(key, d, context)); + result = evaluation.apply(new Tuple3<>(key, d, contextSupplier.get())); assertEquals(expected, result); } + private Supplier contextFactory(Document document, Collection fields) { + return contextFactory(document, fields, context -> {}); + } + + private Supplier contextFactory(Document document, Collection fields, Consumer customizer) { + return () -> { + DatawaveJexlContext context = new DatawaveJexlContext(); + document.visit(fields, context); + customizer.accept(context); + return context; + }; + } + private String rewriteDocumentMatchFunctions(String query) { try { ASTJexlScript script = 
JexlASTHelper.parseAndFlattenJexlQuery(query); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java index 396020665f7..7faa3d4cda0 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java @@ -1,35 +1,27 @@ package datawave.query.jexl.functions; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayOutputStream; import java.io.OutputStream; import java.util.AbstractMap; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.zip.GZIPOutputStream; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.security.ColumnVisibility; -import org.apache.log4j.AppenderSkeleton; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.spi.LoggingEvent; -import org.junit.Test; +import org.junit.jupiter.api.Test; import datawave.query.function.DocumentMatchContext; /** - * Unit tests for {@link DocumentFunctions} covering view selection, matching semantics, payload limits, merged results, and visibility handling. + * Unit tests for {@link DocumentFunctions} covering view selection, matching semantics, payload limits, and per-{@code d}-column result accumulation. */ public class DocumentFunctionsTest { - private final Logger logger = Logger.getLogger(DocumentFunctions.class); - /** - * Verifies that {@code document:match(STRING)} searches all available views and records offsets per view beneath the matched-string key. 
+ * Verifies that {@code document:match(STRING)} searches all available views and returns the matched search string when any view matches. */ @Test public void testMatchAcrossAllViews() throws Exception { @@ -39,8 +31,8 @@ public void testMatchAcrossAllViews() throws Exception { String result = DocumentFunctions.match(context, "car"); - assertEquals("{\"car\":{\"BODY\":[1,5],\"META\":[0]}}", result); - assertEquals(result, DocumentFunctions.toJson(context.getMergedMatches())); + assertEquals("car", result); + assertEquals(2, context.getMatches().size()); } /** @@ -55,7 +47,7 @@ public void testWildcardViewMatch() throws Exception { String result = DocumentFunctions.match("BODY*", context, "car"); - assertEquals("{\"car\":{\"BODY\":[0],\"BODY_TEXT\":[0,4]}}", result); + assertEquals("car", result); } /** @@ -68,7 +60,7 @@ public void testOverlappingMatches() throws Exception { String result = DocumentFunctions.match("BODY", context, "ana"); - assertEquals("{\"ana\":{\"BODY\":[1,3]}}", result); + assertEquals("ana", result); } /** @@ -82,6 +74,38 @@ public void testCaseSensitiveMatch() throws Exception { assertTrue(DocumentFunctions.match(context, "Car").isEmpty()); } + /** + * Verifies that a null context is treated as a non-match. + */ + @Test + public void testNullContextIsNonMatch() { + assertTrue(DocumentFunctions.match(null, "car").isEmpty()); + } + + /** + * Verifies that a null search term is treated as a non-match. + */ + @Test + public void testNullSearchIsNonMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, null).isEmpty()); + assertTrue(DocumentFunctions.match("BODY", context, null).isEmpty()); + } + + /** + * Verifies that an empty search term is treated as a non-match. 
+ */ + @Test + public void testEmptySearchIsNonMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, "").isEmpty()); + assertTrue(context.getMatches().isEmpty()); + } + /** * Verifies that encoded payloads larger than the configured limit are skipped as non-matching. */ @@ -137,7 +161,7 @@ public void testMatchWithBase64LineBreaks() throws Exception { String result = DocumentFunctions.match("BODY", context, "Origins"); - assertEquals("{\"Origins\":{\"BODY\":[3]}}", result); + assertEquals("Origins", result); } /** @@ -151,61 +175,55 @@ public void testMatchWithBase64OnlyPayload() { String result = DocumentFunctions.match("BODY", context, "Origins"); - assertEquals("{\"Origins\":{\"BODY\":[3]}}", result); + assertEquals("Origins", result); } /** - * Verifies that multiple {@code document:match(...)} calls merge their offsets into the document-wide result set. + * Verifies that multiple {@code document:match(...)} calls accumulate results on a per-{@code d}-column basis for document output. 
*/ @Test - public void testMatchMergesResultsAcrossCalls() throws Exception { + public void testMatchAccumulatesPerEntryResultsAcrossCalls() throws Exception { DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")), new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); - assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); - assertEquals("{\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.match("CONTENT2", context, "lawyer")); - assertEquals("{\"car\":{\"BODY\":[1,5]},\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.toJson(context.getMergedMatches())); + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals("lawyer", DocumentFunctions.match("CONTENT2", context, "lawyer")); + assertEquals(2, context.getMatches().size()); + assertTrue(context.getMatches().stream().anyMatch(matches -> matches.containsSearch("car"))); + assertTrue(context.getMatches().stream().anyMatch(matches -> matches.containsSearch("lawyer"))); + assertTrue(context.getMatches().stream().anyMatch(matches -> "BODY".equals(matches.getView()))); + assertTrue(context.getMatches().stream().anyMatch(matches -> "CONTENT2".equals(matches.getView()))); } /** - * Verifies that repeated {@code document:match(...)} calls for the same string merge under one top-level match-string key. + * Verifies that repeated {@code document:match(...)} calls against the same {@code d}-column accumulate beneath that single entry payload. 
*/ @Test - public void testMatchMergesSameSearchAcrossCalls() throws Exception { - DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")), - new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, - DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + public void testMatchAccumulatesSameEntryAcrossCalls() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car lawyer")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); - assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); - assertEquals("{\"car\":{\"CONTENT2\":[7]}}", DocumentFunctions.match("CONTENT2", context, "car")); - assertEquals("{\"car\":{\"BODY\":[1,5],\"CONTENT2\":[7]}}", DocumentFunctions.toJson(context.getMergedMatches())); + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals("lawyer", DocumentFunctions.match("BODY", context, "lawyer")); + assertEquals(1, context.getMatches().size()); + assertEquals("BODY", context.getMatches().get(0).getView()); + assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5],\"lawyer\":[9]}}", + DocumentFunctions.toDocumentMatchesJson(context.getMatches().get(0).getPayload())); } /** - * Verifies that the first matched {@code d}-column visibility is retained and that a later mismatch produces a single info-level log message. + * Verifies that repeated identical {@code document:match(...)} calls against the same {@code d}-column do not duplicate offsets for the same search term. 
*/ @Test - public void testMatchLogsVisibilityMismatchAndKeepsFirstVisibility() throws Exception { - TestAppender appender = new TestAppender(); - Level originalLevel = logger.getLevel(); - logger.addAppender(appender); - logger.setLevel(Level.INFO); - try { - DocumentMatchContext context = new DocumentMatchContext( - List.of(entry("test\0uid\0BODY", "scar car", "A"), entry("test\0uid\0CONTENT2", "lawyer car", "B")), - new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, - DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); - - assertEquals("{\"car\":{\"BODY\":[1,5]}}", DocumentFunctions.match("BODY", context, "car")); - assertEquals("{\"lawyer\":{\"CONTENT2\":[0]}}", DocumentFunctions.match("CONTENT2", context, "lawyer")); - assertEquals(new ColumnVisibility("A"), context.getFirstMatchingColumnVisibility()); - assertEquals(1, appender.infoMessages.size()); - assertTrue(appender.infoMessages.get(0).contains("differing d-column visibilities")); - } finally { - logger.removeAppender(appender); - logger.setLevel(originalLevel); - } + public void testMatchRepeatsSameSearchWithinEntryAcrossCalls() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals(1, context.getMatches().size()); + assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", DocumentFunctions.toDocumentMatchesJson(context.getMatches().get(0).getPayload())); } /** @@ -299,26 +317,4 @@ private Map.Entry base64OnlyEntry(String cq, String content) { byte[] encoded = java.util.Base64.getEncoder().encode(content.getBytes()); return new AbstractMap.SimpleEntry<>(new Key("row", "d", cq), new 
Value(encoded)); } - - /** - * Minimal log4j appender used to capture info-level visibility-mismatch messages. - */ - private static class TestAppender extends AppenderSkeleton { - private final List infoMessages = new ArrayList<>(); - - @Override - protected void append(LoggingEvent event) { - if (Level.INFO.equals(event.getLevel())) { - infoMessages.add(event.getRenderedMessage()); - } - } - - @Override - public void close() {} - - @Override - public boolean requiresLayout() { - return false; - } - } } From 28bd000acaf4ec3adcc1f339e1efbe2aab91ac98 Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Sun, 12 Apr 2026 22:16:54 -0400 Subject: [PATCH 9/9] Cleanup pass post review refactoring * Consolidate serialization to DocumentMatchResults * Removed the dead DocumentMatchFactory and EmptyDocumentMatchFunction, updated QueryIterator and TLDQueryIterator to construct DocumentMatchContextFunction directly * Removed dead code from DocumentMatchResults (copy, contained search, payload builder) * Removed unnecessary Content.withKeyMetadata helper * Cleaned up some brittleness in the tests related to JSON assertions - now assert the structure instead of exact string matches * Additional validation of visibility in unit tests --- .../datawave/query/attributes/Content.java | 6 -- .../query/function/DocumentMatchFactory.java | 31 --------- .../query/function/DocumentMatchResults.java | 40 ++++++----- .../function/EmptyDocumentMatchFunction.java | 20 ------ .../function/IndexOnlyKeyToDocumentData.java | 4 +- .../query/function/JexlEvaluation.java | 4 +- .../query/function/KeyToDocumentData.java | 47 ++----------- .../query/iterator/QueryIterator.java | 5 +- .../datawave/query/iterator/QueryOptions.java | 16 +---- .../jexl/functions/DocumentFunctions.java | 17 ----- .../predicate/EventDataQueryFieldFilter.java | 15 ----- .../datawave/query/tld/TLDQueryIterator.java | 4 +- .../query/DocumentMatchQueryTest.java | 67 ++++++++++++------- .../query/function/JexlEvaluationTest.java | 67
+++++++++++++------ .../jexl/functions/DocumentFunctionsTest.java | 13 ++-- 15 files changed, 126 insertions(+), 230 deletions(-) delete mode 100644 warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java delete mode 100644 warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java diff --git a/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java b/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java index 2debde33060..8b1f1aea793 100644 --- a/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java +++ b/warehouse/query-core/src/main/java/datawave/query/attributes/Content.java @@ -42,12 +42,6 @@ public Content(String content, Key docKey, boolean toKeep, Attribute source) this.source = source; } - public static Content withKeyMetadata(String content, Key metadata, boolean toKeep) { - Content attribute = new Content(content, null, toKeep, null); - attribute.metadata = metadata; - return attribute; - } - @Override public long sizeInBytes() { if (sizeInBytes == Long.MIN_VALUE) { diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java deleted file mode 100644 index ee5082f1a9b..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchFactory.java +++ /dev/null @@ -1,31 +0,0 @@ -package datawave.query.function; - -import java.util.Map; - -import org.apache.accumulo.core.data.Key; - -import com.google.common.base.Function; - -import datawave.query.attributes.Document; -import datawave.query.util.Tuple3; - -/** - * Builds the pre-evaluation function that populates {@link DocumentMatchContext} for {@code document:match(...)} evaluation. - */ -public class DocumentMatchFactory { - private DocumentMatchFactory() {} - - /** - * Returns a context-populating function for document matching. 
- * - * @param config - * document-match configuration - * @return either a context-populating function or a no-op function when no source is available - */ - public static Function>,Tuple3>> getFunction(DocumentMatchConfig config) { - if (config == null || config.getSource() == null) { - return new EmptyDocumentMatchFunction(); - } - return new DocumentMatchContextFunction(config); - } -} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java index 7ce4551332c..0879a9480fc 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java @@ -1,5 +1,6 @@ package datawave.query.function; +import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -10,6 +11,7 @@ import org.apache.accumulo.core.data.Key; import com.google.common.annotations.VisibleForTesting; +import com.google.gson.Gson; import datawave.query.data.parsers.DatawaveKey; @@ -19,6 +21,8 @@ * A {@code d}-column entry has a single view name, so matches are grouped only by search string within that view. 
*/ public class DocumentMatchResults { + private static final Gson GSON = new Gson(); + public static final String VIEW_FIELD = "view"; public static final String MATCHES_FIELD = "matches"; @@ -35,13 +39,6 @@ public DocumentMatchResults(Key key) { this.key = key; } - private DocumentMatchResults(DocumentMatchResults other) { - this.key = other.key; - for (Map.Entry> searchEntry : other.matches.entrySet()) { - this.matches.put(searchEntry.getKey(), new TreeSet<>(searchEntry.getValue())); - } - } - public Key getKey() { return key; } @@ -66,35 +63,36 @@ public void addMatches(String search, List offsets) { matches.computeIfAbsent(search, ignored -> new TreeSet<>()).addAll(offsets); } - /** - * @param search - * a matched literal string - * @return {@code true} if this entry contains offsets for the supplied search string - */ - public boolean containsSearch(String search) { - return matches.containsKey(search); - } - /** * Builds the JSON-ready payload for this entry in the form {@code {"view":"...","matches":{search:[offsets]}}}. * * @return a payload map suitable for serialization into the {@code DOCUMENT_MATCHES} attribute, or an empty map if no matches are present */ - public Map getPayload() { + private Map getPayload() { Map payload = new LinkedHashMap<>(); String view = getView(); if (view == null || matches.isEmpty()) { return payload; } payload.put(VIEW_FIELD, view); - payload.put(MATCHES_FIELD, matches); + Map> jsonMatches = new LinkedHashMap<>(); + for (Map.Entry> matchEntry : matches.entrySet()) { + jsonMatches.put(matchEntry.getKey(), new ArrayList<>(matchEntry.getValue())); + } + payload.put(MATCHES_FIELD, jsonMatches); return payload; } /** - * @return a defensive copy of this entry's match results + * Serializes this entry's payload into the {@code DOCUMENT_MATCHES} JSON representation. 
+ * + * @return JSON string representation, or an empty string if no matches were recorded for the entry */ - public DocumentMatchResults copy() { - return new DocumentMatchResults(this); + public String toJson() { + Map payload = getPayload(); + if (payload.isEmpty()) { + return ""; + } + return GSON.toJson(payload); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java deleted file mode 100644 index 083838613d0..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/function/EmptyDocumentMatchFunction.java +++ /dev/null @@ -1,20 +0,0 @@ -package datawave.query.function; - -import java.util.Map; - -import org.apache.accumulo.core.data.Key; - -import com.google.common.base.Function; - -import datawave.query.attributes.Document; -import datawave.query.util.Tuple3; - -/** - * No-op document-match context function used when the query does not contain {@code document:match(...)}. 
- */ -public class EmptyDocumentMatchFunction implements Function>,Tuple3>> { - @Override - public Tuple3> apply(Tuple3> from) { - return from; - } -} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java b/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java index 4c1ad8ef90c..6ff31065b71 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java @@ -157,7 +157,7 @@ public Entry apply(final Entry from) { } // get the document key - Key docKey = getDocumentKey(from.getKey()); + Key docKey = getDocKey(from.getKey()); // Ensure that we have a non-empty column qualifier final Key stopKey = new Key(from.getKey().getRow().toString(), from.getKey().getColumnFamily().toString(), @@ -482,7 +482,7 @@ public Entry next() { if (null != next) { final List> keyValues = new LinkedList<>(); keyValues.add(next); - Key docKey = getDocumentKey(next.getKey()); + Key docKey = getDocKey(next.getKey()); final DocumentData documentData = new DocumentData(this.iteratorDocumentKey, Collections.singleton(docKey), keyValues, true); entry = Maps.immutableEntry(documentData, this.iteratorDocument); } else if (next == ITERATOR_COMPLETE_KEY) { diff --git a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java index 83ae3db0c2d..682df0febdb 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java @@ -115,12 +115,12 @@ public boolean apply(Tuple3 input) { if (matched && documentMatchContext != null) { Document document = input.second(); for (DocumentMatchResults entry : documentMatchContext.getMatches()) { - String documentMatches = 
DocumentFunctions.toDocumentMatchesJson(entry.getPayload()); + String documentMatches = entry.toJson(); if (documentMatches.isEmpty()) { continue; } - Content matchesAttribute = Content.withKeyMetadata(documentMatches, entry.getKey(), document.isToKeep()); + Content matchesAttribute = new Content(documentMatches, entry.getKey(), document.isToKeep()); matchesAttribute.setColumnVisibility(entry.getKey().getColumnVisibilityParsed()); document.put(DocumentFunctions.DOCUMENT_MATCHES, matchesAttribute); } diff --git a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java index 652c8238be0..5611ba051b0 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java @@ -214,9 +214,9 @@ public List> collectDocumentAttributes(final Key documentStartK while (docAttrKey != null) { boolean seeked = false; - if (isPartOfDocument(documentStartKey, docAttrKey.get())) { + if (equality.partOf(documentStartKey, docAttrKey.get())) { if (filter == null || filter.keep(docAttrKey.get())) { - docKeys.add(getDocumentKey(docAttrKey.get())); + docKeys.add(getDocKey(docAttrKey.get())); } if (filter == null || filter.apply(Maps.immutableEntry(docAttrKey.get(), StringUtils.EMPTY))) { @@ -254,52 +254,15 @@ public List> collectDocumentAttributes(final Key documentStartK return documentAttributes; } - private boolean isPartOfDocument(Key documentStartKey, Key candidateKey) { - return equality.partOf(documentStartKey, candidateKey); - } - - // map the key to the document key (only shard, datatype, uid) - public static Key getDocumentKey(Key key) { + // map the key to the dockey (only shard, datatype, uid) + public static Key getDocKey(Key key) { final ByteSequence row = key.getRowData(); - final ByteSequence cf = getDocumentColumnFamily(key); + final ByteSequence cf = 
key.getColumnFamilyData(); final ByteSequence cv = key.getColumnVisibilityData(); return new Key(row.getBackingArray(), row.offset(), row.length(), cf.getBackingArray(), cf.offset(), cf.length(), EMPTY_BYTE_SEQUENCE.getBackingArray(), EMPTY_BYTE_SEQUENCE.offset(), EMPTY_BYTE_SEQUENCE.length(), cv.getBackingArray(), cv.offset(), cv.length(), key.getTimestamp()); } - /** - * extracts the proper column family byte sequence from a key regardless of whether it is an event key or a 'd' column key. - * - * @param key - * the key to process - * @return the column family, consisting of datatype and uid. - */ - private static ByteSequence getDocumentColumnFamily(Key key) { - final ByteSequence cf = key.getColumnFamilyData(); - if (!"d".equals(key.getColumnFamily().toString())) { - return cf; - } - - ByteSequence cq = key.getColumnQualifierData(); - int firstNull = -1; - int secondNull = -1; - for (int i = 0; i < cq.length(); i++) { - if (cq.byteAt(i) == 0x00) { - if (firstNull < 0) { - firstNull = i; - } else { - secondNull = i; - break; - } - } - } - if (firstNull < 0) { - return cf; - } - int end = (secondNull < 0) ? 
cq.length() : secondNull; - return cq.subSequence(0, end); - } - private static List> appendHierarchyFields(List> documentAttributes, Key key, Range seekRange, DescendantCountFunction function, boolean includeParent) { if (function != null || includeParent) { diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java index f3b8415c3f1..f561d81a5a2 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java @@ -69,7 +69,7 @@ import datawave.query.function.Aggregation; import datawave.query.function.DataTypeAsField; import datawave.query.function.DocumentMatchConfig; -import datawave.query.function.DocumentMatchFactory; +import datawave.query.function.DocumentMatchContextFunction; import datawave.query.function.DocumentMetadata; import datawave.query.function.DocumentPermutation; import datawave.query.function.DocumentProjection; @@ -260,7 +260,6 @@ public void init(SortedKeyValueIterator source, Map op this.exceededOrEvaluationCache = new HashMap<>(); this.myEvaluationFunction = getJexlEvaluation(this.getQuery(), arithmetic); - this.setRetainDocumentColumnFamily(false); this.documentOptions = options; this.myEnvironment = env; @@ -1067,7 +1066,7 @@ protected Function,Tuple3>> */ protected Function>,Tuple3>> buildDocumentMatchFunction( DocumentMatchConfig documentMatchConfig) { - return DocumentMatchFactory.getFunction(documentMatchConfig); + return new DocumentMatchContextFunction(documentMatchConfig); } private Range getDocumentRange(NestedQueryIterator documentSource) { diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java index 47c0d2aea1b..595b9f5c0da 100644 --- 
a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java @@ -345,7 +345,6 @@ public class QueryOptions implements OptionDescriber { protected EventDataQueryFilter eventEvaluationFilter; // filter specifically for event keys. required when performing a seeking aggregation protected EventDataQueryFilter eventFilter; - protected boolean retainDocumentColumnFamily = false; protected int maxEvaluationPipelines = 25; protected int maxPipelineCachedResults = 25; @@ -545,7 +544,6 @@ public void deepCopy(QueryOptions other) { this.fiEvaluationFilter = other.fiEvaluationFilter; this.eventEvaluationFilter = other.eventEvaluationFilter; this.eventFilter = other.eventFilter; - this.retainDocumentColumnFamily = other.retainDocumentColumnFamily; this.ivaratorCacheDirConfigs = (other.ivaratorCacheDirConfigs == null) ? null : new ArrayList<>(other.ivaratorCacheDirConfigs); this.hdfsSiteConfigURLs = other.hdfsSiteConfigURLs; @@ -887,25 +885,13 @@ public EventDataQueryFilter getEventFilter() { // @formatter:off eventFilter = new EventDataQueryFieldFilter() .withFields(fields) - .withMaxNextCount(getEventNextSeek()) - .withDocumentColumnFamily(retainDocumentColumnFamily); + .withMaxNextCount(getEventNextSeek()); // @formatter:on } return eventFilter == null ? null : eventFilter.clone(); } - public void setRetainDocumentColumnFamily(boolean retainDocumentColumnFamily) { - if (this.retainDocumentColumnFamily != retainDocumentColumnFamily) { - this.retainDocumentColumnFamily = retainDocumentColumnFamily; - // invalidate the cached filters to force them to be rebuilt instead of - // caching stale clones created under the old settings. 
- this.eventFilter = null; - this.evaluationFilter = null; - this.eventEvaluationFilter = null; - } - } - /** * Get the event fields to retain * diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java index c6d5b79dd32..8b0984193fc 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java @@ -3,7 +3,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.Map.Entry; import java.util.Objects; @@ -11,8 +10,6 @@ import org.apache.accumulo.core.data.Value; import org.apache.log4j.Logger; -import com.google.gson.Gson; - import datawave.query.data.parsers.DatawaveKey; import datawave.query.function.DocumentMatchContext; import datawave.query.table.parser.ContentKeyValueFactory; @@ -27,7 +24,6 @@ @JexlFunctions(descriptorFactory = "datawave.query.jexl.functions.DocumentFunctionsDescriptor") public class DocumentFunctions { private static final Logger log = Logger.getLogger(DocumentFunctions.class); - private static final Gson GSON = new Gson(); public static final String DOCUMENT_FUNCTION_NAMESPACE = "document"; public static final String DOCUMENT_MATCH_FUNCTION_NAME = "match"; @@ -170,17 +166,4 @@ static List findOffsets(String decoded, String search) { return offsets; } - /** - * Serializes one per-entry {@code DOCUMENT_MATCHES} payload in the form {@code {"view":"...","matches":{search:[offsets]}}}. 
- * - * @param payload - * per-entry payload built from a single matched {@code d}-column - * @return JSON string representation, or an empty string if the payload is empty - */ - public static String toDocumentMatchesJson(Map payload) { - if (payload.isEmpty()) { - return ""; - } - return GSON.toJson(payload); - } } diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java index 1628da3c4bc..be37b72c584 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java @@ -21,8 +21,6 @@ * This filter only operates on event keys. */ public class EventDataQueryFieldFilter implements EventDataQueryFilter { - private static final String DOCUMENT_COLUMN_FAMILY = "d"; - private Key document = null; // the number of times next is called before issuing a seek private int maxNextCount = -1; @@ -34,7 +32,6 @@ public class EventDataQueryFieldFilter implements EventDataQueryFilter { // the set of fields to retain private TreeSet fields; private final EventKey parser; - private boolean retainDocumentColumnFamily = false; /** * Default constructor @@ -55,7 +52,6 @@ public EventDataQueryFieldFilter(EventDataQueryFieldFilter other) { } this.maxNextCount = other.maxNextCount; this.fields = new TreeSet<>(other.fields); - this.retainDocumentColumnFamily = other.retainDocumentColumnFamily; // need to create a separate parser as the parser is not thread safe this.parser = new EventKey(); // do not copy nextCount or currentField because that is internal state @@ -87,11 +83,6 @@ public EventDataQueryFieldFilter withMaxNextCount(int maxNextCount) { return this; } - public EventDataQueryFieldFilter withDocumentColumnFamily(boolean retainDocumentColumnFamily) { - this.retainDocumentColumnFamily = 
retainDocumentColumnFamily; - return this; - } - @Override public void startNewDocument(Key document) { this.document = document; @@ -134,12 +125,6 @@ public boolean peek(@Nullable Map.Entry entry) { * @return true if the key should be retained */ private boolean apply(Key key, boolean update) { - if (retainDocumentColumnFamily && DOCUMENT_COLUMN_FAMILY.equals(key.getColumnFamily().toString())) { - nextCount = 0; - currentField = null; - return true; - } - parser.parse(key); String field = parser.getField(); field = JexlASTHelper.deconstructIdentifier(field); diff --git a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java index 72457b20f1f..645d5ad60f6 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java @@ -28,7 +28,7 @@ import datawave.query.attributes.AttributeFactory; import datawave.query.attributes.Document; import datawave.query.function.DocumentMatchConfig; -import datawave.query.function.DocumentMatchFactory; +import datawave.query.function.DocumentMatchContextFunction; import datawave.query.function.Equality; import datawave.query.function.RangeProvider; import datawave.query.function.TLDEquality; @@ -272,7 +272,7 @@ protected Function,Tuple3>> protected Function>,Tuple3>> buildDocumentMatchFunction( DocumentMatchConfig documentMatchConfig) { documentMatchConfig.setTld(true); - return DocumentMatchFactory.getFunction(documentMatchConfig); + return new DocumentMatchContextFunction(documentMatchConfig); } /** diff --git a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java index c18470eadac..e6c32f1c26a 100644 --- a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java +++ 
b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java @@ -10,9 +10,8 @@ import java.nio.file.Path; import java.util.Collections; import java.util.HashMap; -import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; -import java.util.Set; import java.util.TimeZone; import org.apache.accumulo.core.client.AccumuloClient; @@ -36,6 +35,9 @@ import org.springframework.test.context.junit.jupiter.SpringExtension; import com.google.common.base.Preconditions; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; import datawave.ingest.data.TypeRegistry; import datawave.query.attributes.Attribute; @@ -43,6 +45,7 @@ import datawave.query.attributes.Content; import datawave.query.attributes.Document; import datawave.query.function.DocumentMatchContext; +import datawave.query.function.DocumentMatchResults; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; import datawave.query.jexl.functions.DocumentFunctions; import datawave.query.tables.ShardQueryLogic; @@ -81,7 +84,7 @@ public class DocumentMatchQueryTest extends AbstractQueryTest { @Qualifier("EventQuery") protected ShardQueryLogic logic; - private final Map> expectedDocumentMatches = new HashMap<>(); + private final Map>>> expectedDocumentMatches = new HashMap<>(); private final Map> expectedDocumentMatchVisibilities = new HashMap<>(); private Boolean expectedDocumentMatchContextRequired; @@ -168,11 +171,11 @@ protected void extraAssertions() { assertNotNull(uuid, "result did not contain UUID"); String uuidValue = getUUID(uuid); - Set expected = expectedDocumentMatches.get(uuidValue); + Map>> expected = expectedDocumentMatches.get(uuidValue); if (expected != null) { Attribute matches = result.get(DocumentFunctions.DOCUMENT_MATCHES); assertNotNull(matches, "result did not contain DOCUMENT_MATCHES"); - assertEquals(expected, getDocumentMatchContents(matches)); + assertEquals(expected, 
getDocumentMatchesByView(matches)); } Map expectedVisibilities = expectedDocumentMatchVisibilities.get(uuidValue); @@ -194,8 +197,7 @@ public void testDocumentMatchJexlAllViews() throws Exception { expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", - Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27)))); planAndExecuteQuery(); } @@ -209,7 +211,7 @@ public void testDocumentMatchJexlSpecificView() throws Exception { expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", Collections.singleton("{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[2]}}")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT2", Map.of("lawyer", List.of(2)))); planAndExecuteQuery(); } @@ -223,8 +225,7 @@ public void testDocumentMatchJexlAddsPerEntryMatchesAcrossCalls() throws Excepti expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", - Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[2]}}")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("lawyer", List.of(2)))); planAndExecuteQuery(); } @@ -238,10 +239,8 @@ public void testDocumentMatchJexlPreservesPerEntryVisibilities() throws Exceptio expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", - Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}")); - 
expectedDocumentMatchVisibilities.put("CAPONE", Map.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", new ColumnVisibility("ALL"), - "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}", new ColumnVisibility("ALL"))); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27)))); + expectedDocumentMatchVisibilities.put("CAPONE", Map.of("CONTENT", new ColumnVisibility("ALL"), "CONTENT2", new ColumnVisibility("ALL"))); planAndExecuteQuery(); } @@ -255,10 +254,8 @@ public void testDocumentMatchJexlWildcardThenSpecificViewAccumulatesPerEntry() t expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", - Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27],\"lawyer\":[2]}}")); - expectedDocumentMatchVisibilities.put("CAPONE", Map.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", new ColumnVisibility("ALL"), - "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27],\"lawyer\":[2]}}", new ColumnVisibility("ALL"))); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27), "lawyer", List.of(2)))); + expectedDocumentMatchVisibilities.put("CAPONE", Map.of("CONTENT", new ColumnVisibility("ALL"), "CONTENT2", new ColumnVisibility("ALL"))); planAndExecuteQuery(); } @@ -273,19 +270,18 @@ public void testDocumentMatchLuceneWildcardView() throws Exception { expectedDocumentMatchContextRequired = true; expectResultCount(1); expectUUIDs(java.util.Set.of("CAPONE")); - expectedDocumentMatches.put("CAPONE", - Set.of("{\"view\":\"CONTENT\",\"matches\":{\"can\":[4,61]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"can\":[27]}}")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27)))); planAndExecuteQuery(); } - private Set 
getDocumentMatchContents(Attribute attribute) { - Set values = new LinkedHashSet<>(); + private Map>> getDocumentMatchesByView(Attribute attribute) { + Map>> values = new HashMap<>(); if (attribute instanceof Attributes) { for (Attribute> child : ((Attributes) attribute).getAttributes()) { - values.add(((Content) child).getContent()); + addDocumentMatch(values, ((Content) child).getContent()); } } else { - values.add(((Content) attribute).getContent()); + addDocumentMatch(values, ((Content) attribute).getContent()); } return values; } @@ -295,15 +291,34 @@ private Map getDocumentMatchVisibilities(Attribute a if (attribute instanceof Attributes) { for (Attribute> child : ((Attributes) attribute).getAttributes()) { Content content = (Content) child; - visibilities.put(content.getContent(), content.getColumnVisibility()); + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); } } else { Content content = (Content) attribute; - visibilities.put(content.getContent(), content.getColumnVisibility()); + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); } return visibilities; } + private void addDocumentMatch(Map>> values, String json) { + JsonObject payload = JsonParser.parseString(json).getAsJsonObject(); + String view = payload.get(DocumentMatchResults.VIEW_FIELD).getAsString(); + JsonObject matches = payload.getAsJsonObject(DocumentMatchResults.MATCHES_FIELD); + Map> offsetsBySearch = new HashMap<>(); + for (Map.Entry matchEntry : matches.entrySet()) { + List offsets = new java.util.ArrayList<>(); + for (JsonElement offset : matchEntry.getValue().getAsJsonArray()) { + offsets.add(offset.getAsInt()); + } + offsetsBySearch.put(matchEntry.getKey(), offsets); + } + values.put(view, offsetsBySearch); + } + + private String getDocumentMatchView(String json) { + return JsonParser.parseString(json).getAsJsonObject().get(DocumentMatchResults.VIEW_FIELD).getAsString(); + } + /** * Verifies that a 
non-matching document-match term filters the document out of the result set. */ diff --git a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java index 3c05522c78c..393dd11f70b 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java @@ -11,10 +11,8 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.LinkedHashSet; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.function.Consumer; import java.util.function.Supplier; @@ -25,6 +23,9 @@ import org.junit.jupiter.api.Test; import com.google.common.collect.Maps; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; import datawave.ingest.protobuf.TermWeightPosition; import datawave.query.Constants; @@ -201,8 +202,7 @@ public void testDocumentMatchAddsDocumentAttribute() { .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car")))); assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); - assertEquals(Collections.singleton("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}"), - getDocumentMatchContents(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5))), getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); assertEquals(new ColumnVisibility("A"), d.get(DocumentFunctions.DOCUMENT_MATCHES).getColumnVisibility()); } @@ -218,8 +218,8 @@ public void testDocumentMatchAddsPerEntryDocumentAttributesAcrossCalls() { Maps.immutableEntry(new Key("row", "d", 
"datatype\0uid\0CONTENT2", "A"), new Value(buildEncodedValue("lawyer car")))); assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); - assertEquals(Set.of("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[0]}}"), - getDocumentMatchContents(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5)), "CONTENT2", Map.of("lawyer", List.of(0))), + getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); } @Test @@ -233,8 +233,7 @@ public void testDocumentMatchAccumulatesCallsWithinSameEntry() { .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car lawyer")))); assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); - assertEquals(Collections.singleton("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5],\"lawyer\":[9]}}"), - getDocumentMatchContents(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5), "lawyer", List.of(9))), getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); } @Test @@ -250,29 +249,55 @@ public void testDocumentMatchPreservesPerEntryVisibilities() { assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); - Attributes matches = assertInstanceOf(Attributes.class, d.get(DocumentFunctions.DOCUMENT_MATCHES)); - Map visibilitiesByPayload = new HashMap<>(); - for (Attribute> attribute : matches.getAttributes()) { - Content content = 
assertInstanceOf(Content.class, attribute); - visibilitiesByPayload.put(content.getContent(), content.getColumnVisibility()); - } - - assertEquals(new ColumnVisibility("A"), visibilitiesByPayload.get("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}")); - assertEquals(new ColumnVisibility("B"), visibilitiesByPayload.get("{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[0]}}")); + assertEquals(Map.of("BODY", new ColumnVisibility("A"), "CONTENT2", new ColumnVisibility("B")), + getDocumentMatchVisibilitiesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); } - private Set getDocumentMatchContents(Attribute attribute) { - Set values = new LinkedHashSet<>(); + private Map>> getDocumentMatchesByView(Attribute attribute) { + Map>> values = new HashMap<>(); if (attribute instanceof Attributes) { for (Attribute> child : ((Attributes) attribute).getAttributes()) { - values.add(((Content) child).getContent()); + addDocumentMatch(values, ((Content) child).getContent()); } } else { - values.add(((Content) attribute).getContent()); + addDocumentMatch(values, ((Content) attribute).getContent()); } return values; } + private Map getDocumentMatchVisibilitiesByView(Attribute attribute) { + Map visibilities = new HashMap<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + Content content = assertInstanceOf(Content.class, child); + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); + } + } else { + Content content = assertInstanceOf(Content.class, attribute); + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); + } + return visibilities; + } + + private void addDocumentMatch(Map>> values, String json) { + JsonObject payload = JsonParser.parseString(json).getAsJsonObject(); + String view = payload.get(DocumentMatchResults.VIEW_FIELD).getAsString(); + JsonObject matches = payload.getAsJsonObject(DocumentMatchResults.MATCHES_FIELD); + Map> 
offsetsBySearch = new HashMap<>(); + for (Map.Entry matchEntry : matches.entrySet()) { + List offsets = new ArrayList<>(); + for (JsonElement offset : matchEntry.getValue().getAsJsonArray()) { + offsets.add(offset.getAsInt()); + } + offsetsBySearch.put(matchEntry.getKey(), offsets); + } + values.put(view, offsetsBySearch); + } + + private String getDocumentMatchView(String json) { + return JsonParser.parseString(json).getAsJsonObject().get(DocumentMatchResults.VIEW_FIELD).getAsString(); + } + private byte[] buildEncodedValue(String content) { try { java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java index 7faa3d4cda0..6f8b9150bb3 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java @@ -8,6 +8,7 @@ import java.util.AbstractMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import java.util.zip.GZIPOutputStream; import org.apache.accumulo.core.data.Key; @@ -15,6 +16,7 @@ import org.junit.jupiter.api.Test; import datawave.query.function.DocumentMatchContext; +import datawave.query.function.DocumentMatchResults; /** * Unit tests for {@link DocumentFunctions} covering view selection, matching semantics, payload limits, and per-{@code d}-column result accumulation. 
@@ -190,10 +192,8 @@ public void testMatchAccumulatesPerEntryResultsAcrossCalls() throws Exception { assertEquals("car", DocumentFunctions.match("BODY", context, "car")); assertEquals("lawyer", DocumentFunctions.match("CONTENT2", context, "lawyer")); assertEquals(2, context.getMatches().size()); - assertTrue(context.getMatches().stream().anyMatch(matches -> matches.containsSearch("car"))); - assertTrue(context.getMatches().stream().anyMatch(matches -> matches.containsSearch("lawyer"))); - assertTrue(context.getMatches().stream().anyMatch(matches -> "BODY".equals(matches.getView()))); - assertTrue(context.getMatches().stream().anyMatch(matches -> "CONTENT2".equals(matches.getView()))); + assertEquals(List.of("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[0]}}"), + context.getMatches().stream().map(DocumentMatchResults::toJson).sorted().collect(Collectors.toList())); } /** @@ -208,8 +208,7 @@ public void testMatchAccumulatesSameEntryAcrossCalls() throws Exception { assertEquals("lawyer", DocumentFunctions.match("BODY", context, "lawyer")); assertEquals(1, context.getMatches().size()); assertEquals("BODY", context.getMatches().get(0).getView()); - assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5],\"lawyer\":[9]}}", - DocumentFunctions.toDocumentMatchesJson(context.getMatches().get(0).getPayload())); + assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5],\"lawyer\":[9]}}", context.getMatches().get(0).toJson()); } /** @@ -223,7 +222,7 @@ public void testMatchRepeatsSameSearchWithinEntryAcrossCalls() throws Exception assertEquals("car", DocumentFunctions.match("BODY", context, "car")); assertEquals("car", DocumentFunctions.match("BODY", context, "car")); assertEquals(1, context.getMatches().size()); - assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", DocumentFunctions.toDocumentMatchesJson(context.getMatches().get(0).getPayload())); + 
assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", context.getMatches().get(0).toJson()); } /**