diff --git a/contrib/datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryDocumentMatch.test b/contrib/datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryDocumentMatch.test new file mode 100644 index 00000000000..0b09e1ef0e0 --- /dev/null +++ b/contrib/datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryDocumentMatch.test @@ -0,0 +1,87 @@ + +################################################################ +# document:match query tests for EventQuery + +# These tests validate both the JEXL document:match(...) form and the +# Lucene #DOCUMENT_MATCH(...) form against a known Wikipedia event whose +# REVISION_COMMENT d-column contains the string "Origins". + +################################################################ +# JEXL create + +setCurlData query=$( urlencode "PAGE_TITLE == 'Anarchism' && document:match('REVISION_COMMENT', 'Origins')" ) \ + queryName=EventQueryDocumentMatchJexl \ + begin=20130301 \ + end=20130401 \ + pagesize=1 \ + auths=PUBLIC \ + columnVisibility=PRIVATE \ + query.syntax=JEXL + +configureTest \ + CreateDocumentMatchJexl \ + "Creates a JEXL EventQuery using document:match against REVISION_COMMENT d-column content" \ + "--header 'Content-Type: application/x-www-form-urlencoded' ${DW_CURL_DATA} -X POST ${URI_ROOT}/Query/EventQuery/create" \ + "application/xml;charset=UTF-8" \ + 200 + +runTest --set-query-id + +################################################################ +# JEXL next + +configureTest \ + DocumentMatchJexlPage1 \ + "Gets the first page of results for the JEXL document:match query in JSON format" \ + "--header 'Accept: application/json' -X GET ${URI_ROOT}/Query/${DW_QUERY_ID}/next" \ + application/json \ + 200 + +runTest + +################################################################ +# JEXL close + +configureCloseQueryTest ${DW_QUERY_ID} + +runTest + +################################################################ +# Lucene create + +setCurlData query=$( urlencode "PAGE_TITLE:Anarchism AND #DOCUMENT_MATCH(REVISION_COMMENT, Origins)" ) \ + queryName=EventQueryDocumentMatchLucene \ + begin=20130301 \ + end=20130401 \ + pagesize=1 \ + auths=PUBLIC \ + columnVisibility=PRIVATE \ + query.syntax=LUCENE + +configureTest \ + CreateDocumentMatchLucene \ + "Creates a Lucene EventQuery using #DOCUMENT_MATCH against REVISION_COMMENT d-column content" \ + "--header 'Content-Type: application/x-www-form-urlencoded' ${DW_CURL_DATA} -X POST ${URI_ROOT}/Query/EventQuery/create" \ + "application/xml;charset=UTF-8" \ + 200 + +runTest --set-query-id + +################################################################ +# Lucene next + +configureTest \ + DocumentMatchLucenePage1 \ + "Gets the first page of results for the Lucene #DOCUMENT_MATCH query in JSON format" \ + "--header 'Accept: application/json' -X GET ${URI_ROOT}/Query/${DW_QUERY_ID}/next" \ + application/json \ + 200 + +runTest + +################################################################ +# Lucene close + +configureCloseQueryTest ${DW_QUERY_ID} + +# This last test is executed by run.sh, as usual diff --git a/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml b/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml index ea1047f7fbb..56ef29bfa5a 100644 --- a/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml +++ b/microservices/services/query-executor/service/src/main/resources/JexlFunctionNamespaceRegistryContext.xml @@ -15,6 +15,7 @@ + diff --git a/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml b/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml index 358193f223a..9ff297c3234 100644 --- a/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml +++ b/warehouse/ingest-configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml @@ -15,6 +15,7 @@ + diff --git a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java index f4c2256a407..c48cafb4c2a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java @@ -53,6 +53,7 @@ import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.config.annotation.AllHitsQueryConfig; +import datawave.query.function.DocumentMatchContext; import datawave.query.function.DocumentPermutation; import datawave.query.iterator.QueryIterator; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; @@ -279,6 +280,8 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement private Set queryTermFrequencyFields = Collections.emptySet(); // Are we required to get term frequencies (i.e. does the query contain content functions) private boolean termFrequenciesRequired = false; + // Are we required to gather document-match context (i.e. does the query contain document:match functions) + private boolean documentMatchContextRequired = false; // Limit count of returned values for arbitrary fields. private Set limitFields = Collections.emptySet(); private Set matchingFieldSets = Collections.emptySet(); @@ -502,6 +505,18 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement * Term Frequency aggregations that exceed this threshold in milliseconds are logged as a warning */ private int tfAggregationThresholdMs = -1; + /** + * Maximum encoded d-column payload size, in bytes, to inspect for document:match evaluation + */ + private int documentMatchMaxEncodedSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE; + /** + * Maximum decoded d-column payload size, in bytes, to inspect for document:match evaluation + */ + private int documentMatchMaxDecodedSize = DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; + /** + * Maximum aggregate encoded d-column payload size, in bytes, to retain in memory for document:match evaluation + */ + private int documentMatchMaxEncodedContextSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE; /** * Flag to control query option pruning in the visitor function. Queries that see significant or varied pruning via the RangeStream may see a benefit from @@ -718,6 +733,7 @@ public void copyFrom(ShardQueryConfiguration other) { this.setSortedUIDs(other.isSortedUIDs()); this.setQueryTermFrequencyFields(null == other.getQueryTermFrequencyFields() ? null : Sets.newHashSet(other.getQueryTermFrequencyFields())); this.setTermFrequenciesRequired(other.isTermFrequenciesRequired()); + this.setDocumentMatchContextRequired(other.isDocumentMatchContextRequired()); this.setLimitFields(null == other.getLimitFields() ? null : Sets.newHashSet(other.getLimitFields())); this.setMatchingFieldSets(null == other.getMatchingFieldSets() ? null : Sets.newHashSet(other.getMatchingFieldSets())); this.setLimitFieldsPreQueryEvaluation(other.isLimitFieldsPreQueryEvaluation()); @@ -836,6 +852,9 @@ public void copyFrom(ShardQueryConfiguration other) { this.setLazySetMechanismEnabled(other.isLazySetMechanismEnabled()); this.setDocAggregationThresholdMs(other.getDocAggregationThresholdMs()); this.setTfAggregationThresholdMs(other.getTfAggregationThresholdMs()); + this.setDocumentMatchMaxEncodedSize(other.getDocumentMatchMaxEncodedSize()); + this.setDocumentMatchMaxDecodedSize(other.getDocumentMatchMaxDecodedSize()); + this.setDocumentMatchMaxEncodedContextSize(other.getDocumentMatchMaxEncodedContextSize()); this.setGroupFields(GroupFields.copyOf(other.getGroupFields())); this.setPruneQueryOptions(other.getPruneQueryOptions()); this.setSortQueryPreIndexWithImpliedCounts(other.isSortQueryPreIndexWithImpliedCounts()); @@ -2344,6 +2363,14 @@ public void setTermFrequenciesRequired(boolean termFrequenciesRequired) { this.termFrequenciesRequired = termFrequenciesRequired; } + public boolean isDocumentMatchContextRequired() { + return documentMatchContextRequired; + } + + public void setDocumentMatchContextRequired(boolean documentMatchContextRequired) { + this.documentMatchContextRequired = documentMatchContextRequired; + } + public void setLimitTermExpansionToModel(boolean shouldLimitTermExpansionToModel) { this.shouldLimitTermExpansionToModel = shouldLimitTermExpansionToModel; } @@ -2891,6 +2918,30 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) { this.tfAggregationThresholdMs = tfAggregationThresholdMs; } + public int getDocumentMatchMaxEncodedSize() { + return documentMatchMaxEncodedSize; + } + + public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) { + this.documentMatchMaxEncodedSize = documentMatchMaxEncodedSize; + } + + public int getDocumentMatchMaxDecodedSize() { + return documentMatchMaxDecodedSize; + } + + public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { + this.documentMatchMaxDecodedSize = documentMatchMaxDecodedSize; + } + + public int getDocumentMatchMaxEncodedContextSize() { + return documentMatchMaxEncodedContextSize; + } + + public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) { + this.documentMatchMaxEncodedContextSize = documentMatchMaxEncodedContextSize; + } + public GroupFields getGroupFields() { return groupFields; } @@ -3050,6 +3101,7 @@ public boolean equals(Object o) { Float.compare(that.getCollapseDatePercentThreshold(), getCollapseDatePercentThreshold()) == 0 && isSortedUIDs() == that.isSortedUIDs() && isTermFrequenciesRequired() == that.isTermFrequenciesRequired() && + isDocumentMatchContextRequired() == that.isDocumentMatchContextRequired() && isLimitFieldsPreQueryEvaluation() == that.isLimitFieldsPreQueryEvaluation() && isHitList() == that.isHitList() && isDateIndexTimeTravel() == that.isDateIndexTimeTravel() && @@ -3205,6 +3257,9 @@ public boolean equals(Object o) { isLazySetMechanismEnabled() == that.isLazySetMechanismEnabled() && getDocAggregationThresholdMs() == that.getDocAggregationThresholdMs() && getTfAggregationThresholdMs() == that.getTfAggregationThresholdMs() && + getDocumentMatchMaxEncodedSize() == that.getDocumentMatchMaxEncodedSize() && + getDocumentMatchMaxDecodedSize() == that.getDocumentMatchMaxDecodedSize() && + getDocumentMatchMaxEncodedContextSize() == that.getDocumentMatchMaxEncodedContextSize() && getPruneQueryOptions() == that.getPruneQueryOptions() && isSortQueryPreIndexWithImpliedCounts() == that.isSortQueryPreIndexWithImpliedCounts() && isSortQueryPreIndexWithFieldCounts() == that.isSortQueryPreIndexWithFieldCounts() && @@ -3338,6 +3393,7 @@ public int hashCode() { isSortedUIDs(), getQueryTermFrequencyFields(), isTermFrequenciesRequired(), + isDocumentMatchContextRequired(), getLimitFields(), getMatchingFieldSets(), isLimitFieldsPreQueryEvaluation(), @@ -3443,6 +3499,9 @@ public int hashCode() { isLazySetMechanismEnabled(), getDocAggregationThresholdMs(), getTfAggregationThresholdMs(), + getDocumentMatchMaxEncodedSize(), + getDocumentMatchMaxDecodedSize(), + getDocumentMatchMaxEncodedContextSize(), getPruneQueryOptions(), isSortQueryPreIndexWithImpliedCounts(), isSortQueryPreIndexWithFieldCounts(), diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchConfig.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchConfig.java new file mode 100644 index 00000000000..45fd023d527 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchConfig.java @@ -0,0 +1,49 @@ +package datawave.query.function; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; + +import datawave.query.predicate.TimeFilter; + +/** + * Configuration used to build the document-match context lookup function that runs immediately before JEXL evaluation. + */ +public class DocumentMatchConfig { + private SortedKeyValueIterator source; + private TimeFilter timeFilter; + private DocumentMatchContext.Limits limits; + private boolean tld; + + public SortedKeyValueIterator getSource() { + return source; + } + + public void setSource(SortedKeyValueIterator source) { + this.source = source; + } + + public TimeFilter getTimeFilter() { + return timeFilter; + } + + public void setTimeFilter(TimeFilter timeFilter) { + this.timeFilter = timeFilter; + } + + public DocumentMatchContext.Limits getLimits() { + return limits; + } + + public void setLimits(DocumentMatchContext.Limits limits) { + this.limits = limits; + } + + public boolean isTld() { + return tld; + } + + public void setTld(boolean tld) { + this.tld = tld; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java new file mode 100644 index 00000000000..2febb7c4cd9 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContext.java @@ -0,0 +1,158 @@ +package datawave.query.function; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; + +import datawave.query.predicate.TimeFilter; + +/** + * Per-document runtime state used by {@code document:match(...)} evaluation. + *

+ * This context carries the raw {@code d}-column entries retained for a candidate document, the configured size limits used while decoding those payloads, and + * the per-{@code d}-column {@link DocumentMatchResults} accumulated across one or more {@code document:match(...)} calls for a single evaluation. Context + * instances are expected to be created fresh for each evaluation pass rather than reused across multiple documents or repeated evaluations of the same + * document. + */ +public class DocumentMatchContext { + public static final int DEFAULT_MAX_ENCODED_SIZE = 256 * 1024 * 1024; + public static final int DEFAULT_MAX_DECODED_SIZE = 384 * 1024 * 1024; + public static final int DEFAULT_MAX_ENCODED_CONTEXT_SIZE = 256 * 1024 * 1024; + + /** + * Immutable runtime limits for {@code document:match(...)} payload processing. + */ + public static class Limits { + private final int maxEncodedValueSize; + private final int maxDecodedValueSize; + private final int maxEncodedContextSize; + + /** + * @param maxEncodedValueSize + * maximum allowed encoded payload size in bytes + * @param maxDecodedValueSize + * maximum allowed decoded payload size in bytes + * @param maxEncodedContextSize + * maximum allowed aggregate encoded payload size retained for a document, in bytes + */ + public Limits(int maxEncodedValueSize, int maxDecodedValueSize, int maxEncodedContextSize) { + this.maxEncodedValueSize = maxEncodedValueSize; + this.maxDecodedValueSize = maxDecodedValueSize; + this.maxEncodedContextSize = maxEncodedContextSize; + } + + /** + * @return the maximum encoded payload size, in bytes + */ + public int getMaxEncodedValueSize() { + return maxEncodedValueSize; + } + + /** + * @return the maximum decoded payload size, in bytes + */ + public int getMaxDecodedValueSize() { + return maxDecodedValueSize; + } + + /** + * @return the maximum aggregate encoded payload size retained for a document, in bytes + */ + public int getMaxEncodedContextSize() { + return maxEncodedContextSize; + } + } + + private final List> documentEntries; + private final Limits limits; + private final Map matches = new LinkedHashMap<>(); + + /** + * Creates a per-evaluation match context for the retained {@code d}-column entries of a single candidate document. + * + * @param documentEntries + * retained {@code d}-column entries for the document being evaluated + * @param limits + * payload-processing limits applied during decode and match extraction + */ + public DocumentMatchContext(List> documentEntries, Limits limits) { + this.documentEntries = documentEntries; + this.limits = limits; + } + + /** + * Builds a context from already-aggregated document entries using explicit runtime limits. + * + * @param entries + * aggregated document entries + * @param timeFilter + * optional time filter to apply while selecting {@code d}-column entries + * @param limits + * payload-processing limits + * @return a fresh context containing only eligible {@code d}-column entries for a single evaluation pass + */ + public static DocumentMatchContext from(List> entries, TimeFilter timeFilter, Limits limits) { + List> documentEntries = new ArrayList<>(); + for (Entry entry : entries) { + if (entry.getKey().getColumnFamily().toString().equals("d") && (timeFilter == null || timeFilter.apply(entry))) { + documentEntries.add(entry); + } + } + return new DocumentMatchContext(documentEntries, limits); + } + + /** + * @return the retained {@code d}-column entries available to {@code document:match(...)} during the current evaluation + */ + public List> getDocumentEntries() { + return Collections.unmodifiableList(documentEntries); + } + + public int getMaxEncodedValueSize() { + return limits.getMaxEncodedValueSize(); + } + + public int getMaxDecodedValueSize() { + return limits.getMaxDecodedValueSize(); + } + + public int getMaxEncodedContextSize() { + return limits.getMaxEncodedContextSize(); + } + + /** + * @return the payload-processing limits associated with this evaluation context + */ + public Limits getLimits() { + return limits; + } + + /** + * Records per-call matches in the per-{@code d}-column document-wide result set. + * + * @param key + * the matched {@code d}-column key + * @param search + * the literal string matched by the invocation + * @param offsets + * starting offsets found in the matched view + */ + public void addMatches(Key key, String search, List offsets) { + matches.computeIfAbsent(key, DocumentMatchResults::new).addMatches(search, offsets); + } + + /** + * Returns the per-entry match results accumulated during this evaluation. + * + * @return an immutable snapshot view of the accumulated per-{@code d}-column match results + */ + public List getMatches() { + return List.copyOf(matches.values()); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java new file mode 100644 index 00000000000..fe024a77b16 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchContextFunction.java @@ -0,0 +1,201 @@ +package datawave.query.function; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.apache.log4j.Logger; + +import com.google.common.base.Function; +import com.google.common.collect.Maps; + +import datawave.query.attributes.Attribute; +import datawave.query.attributes.Attributes; +import datawave.query.attributes.Document; +import datawave.query.attributes.DocumentKey; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.predicate.TimeFilter; +import datawave.query.util.Tuple3; +import datawave.query.util.Tuples; + +/** + * Builds a {@link DocumentMatchContext} close to evaluation time and attaches it to the side-channel map used for JEXL context population. + */ +public class DocumentMatchContextFunction implements Function>,Tuple3>> { + private static final Logger log = Logger.getLogger(DocumentMatchContextFunction.class); + private static final String DOCUMENT_COLUMN_FAMILY_STRING = "d"; + private static final char DOCUMENT_COLUMN_FAMILY_CHAR = 'd'; + private static final char NULL = '\0'; + private final DocumentMatchConfig config; + private final SortedKeyValueIterator source; + + /** + * Creates a context-populating function from the supplied document-match configuration. + * + * @param config + * document-match configuration + */ + public DocumentMatchContextFunction(DocumentMatchConfig config) { + this.config = config; + this.source = config.getSource(); // let's keep things clean. + } + + @Override + public Tuple3> apply(Tuple3> from) { + try { + Set documentKeys = getDocumentKeys(from.first(), from.second()); + if (log.isDebugEnabled()) { + log.debug("Collecting document-match context for tuple key " + from.first() + " using document keys " + documentKeys); + } + + List> dEntries = collectDocumentColumnAttributes(documentKeys); + DocumentMatchContext context = DocumentMatchContext.from(dEntries, config.getTimeFilter(), config.getLimits()); + if (log.isDebugEnabled()) { + log.debug("Collected " + dEntries.size() + " d-column entries for tuple key " + from.first()); + } + + Map map = from.third().isEmpty() ? new HashMap<>() : new HashMap<>(from.third()); + map.put(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, context); + return Tuples.tuple(from.first(), from.second(), map); + } catch (IOException e) { + throw new IllegalStateException("Unable to collect document-match context for " + from.first(), e); + } + } + + private List> collectDocumentColumnAttributes(Set documentKeys) throws IOException { + List> documentColumns = new ArrayList<>(); + long retainedBytes = 0L; + for (Key documentKey : documentKeys) { + retainedBytes = collectDocumentColumnAttributes(documentKey, documentColumns, retainedBytes); + if (retainedBytes >= config.getLimits().getMaxEncodedContextSize()) { + break; + } + } + return documentColumns; + } + + private long collectDocumentColumnAttributes(Key documentKey, List> documentColumns, long retainedBytes) throws IOException { + String row = documentKey.getRow().toString(); + String datatypeAndUid = documentKey.getColumnFamily().toString(); + Key startKey = new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + NULL); + Key endKey = config.isTld() ? new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + '\uffff') + : new Key(row, DOCUMENT_COLUMN_FAMILY_STRING, datatypeAndUid + '.'); + Range documentColumnRange = new Range(startKey, true, endKey, false); + if (log.isDebugEnabled()) { + log.debug("Seeking d-column range " + documentColumnRange + " for document key " + documentKey); + } + + source.seek(documentColumnRange, Collections.emptyList(), false); + + while (source.hasTop() && isDocumentColumn(source.getTopKey(), documentKey)) { + Entry entry = Maps.immutableEntry(source.getTopKey(), source.getTopValue()); + if (!shouldRetainDocumentColumn(entry, documentKey)) { + source.next(); + continue; + } + + int encodedLength = entry.getValue().get().length; + long retainedBytesWithEntry = retainedBytes + encodedLength; + if (retainedBytesWithEntry > config.getLimits().getMaxEncodedContextSize()) { + if (log.isDebugEnabled()) { + log.debug("Reached aggregate encoded document-match context limit of " + config.getLimits().getMaxEncodedContextSize() + + " bytes while collecting d-column entry " + entry.getKey() + " for document key " + documentKey + + "; skipping this and remaining d-column entries"); + } + return config.getLimits().getMaxEncodedContextSize(); + } + + if (log.isDebugEnabled()) { + log.debug("Collected d-column entry " + entry.getKey() + " for document key " + documentKey); + } + documentColumns.add(entry); + retainedBytes = retainedBytesWithEntry; + source.next(); + } + + if (log.isDebugEnabled()) { + log.debug("Finished d-column scan for document key " + documentKey + "; next top key is " + (source.hasTop() ? source.getTopKey() : "")); + } + return retainedBytes; + } + + private boolean shouldRetainDocumentColumn(Entry entry, Key documentKey) { + TimeFilter timeFilter = config.getTimeFilter(); + if (timeFilter != null && !timeFilter.apply(entry)) { + if (log.isDebugEnabled()) { + log.debug("Skipping d-column entry " + entry.getKey() + " for document key " + documentKey + " because it did not match the time filter"); + } + return false; + } + + int encodedLength = entry.getValue().get().length; + if (encodedLength > config.getLimits().getMaxEncodedValueSize()) { + if (log.isDebugEnabled()) { + log.debug("Skipping oversized d-column entry " + entry.getKey() + " for document key " + documentKey + " because encoded payload size " + + encodedLength + " exceeds configured limit of " + config.getLimits().getMaxEncodedValueSize() + " bytes"); + } + return false; + } + + return true; + } + + private Set getDocumentKeys(Key tupleKey, Document document) { + Set docKeys = new HashSet<>((config.isTld()) ? 4 : 1); + Attribute docKeyAttr = document.get(Document.DOCKEY_FIELD_NAME); + if (docKeyAttr == null) { + docKeys.add(tupleKey); + return docKeys; + } + + if (docKeyAttr instanceof DocumentKey) { + docKeys.add(((DocumentKey) docKeyAttr).getDocKey()); + } else if (docKeyAttr instanceof Attributes) { + for (Attribute docKey : ((Attributes) docKeyAttr).getAttributes()) { + if (docKey instanceof DocumentKey) { + docKeys.add(((DocumentKey) docKey).getDocKey()); + } else { + throw new IllegalStateException("Unexpected sub-Attribute type for " + Document.DOCKEY_FIELD_NAME + ": " + docKey.getClass()); + } + } + } else { + throw new IllegalStateException("Unexpected Attribute type for " + Document.DOCKEY_FIELD_NAME + ": " + docKeyAttr.getClass()); + } + + if (docKeys.isEmpty()) { + docKeys.add(tupleKey); + } + return docKeys; + } + + /** + * Determines whether a scanned key is a {@code d}-column for the supplied document key. + *

+ * The comparison intentionally checks the scanned key's column qualifier against the document key's column family. For event keys, the column family is + * {@code datatype\0uid}, while {@code d}-column qualifiers are laid out as {@code datatype\0uid\0view}. Matching on this prefix ensures that the collected + * {@code d}-column belongs to the same document identity as the event key. + * + * @param documentContentKey + * scanned 'd' column shard-table key + * @param documentKey + * event or document key whose {@code datatype\0uid} identifies the document + * @return {@code true} if the scanned key is a matching {@code d}-column entry for the document + */ + private boolean isDocumentColumn(Key documentContentKey, Key documentKey) { + // A document key's column family is datatype\0uid, and a d-column qualifier begins with that same datatype\0uid + // followed by \0view. This prefix comparison ties the d-column back to the document represented by the event key. + return documentContentKey.getColumnFamilyData().length() == 1 && documentContentKey.getColumnFamilyData().byteAt(0) == DOCUMENT_COLUMN_FAMILY_CHAR + && documentContentKey.getRow().equals(documentKey.getRow()) + && documentContentKey.getColumnQualifier().toString().startsWith(documentKey.getColumnFamily().toString() + NULL); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java new file mode 100644 index 00000000000..0879a9480fc --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/function/DocumentMatchResults.java @@ -0,0 +1,98 @@ +package datawave.query.function; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.accumulo.core.data.Key; + +import com.google.common.annotations.VisibleForTesting; +import com.google.gson.Gson; + +import datawave.query.data.parsers.DatawaveKey; + +/** + * Match results for a single matched {@code d}-column entry. + *

+ * A {@code d}-column entry has a single view name, so matches are grouped only by search string within that view. + */ +public class DocumentMatchResults { + private static final Gson GSON = new Gson(); + + public static final String VIEW_FIELD = "view"; + public static final String MATCHES_FIELD = "matches"; + + private final Key key; + private final Map> matches = new LinkedHashMap<>(); + + /** + * Creates an empty result container for a single matched {@code d}-column entry. + * + * @param key + * the matched {@code d}-column key + */ + public DocumentMatchResults(Key key) { + this.key = key; + } + + public Key getKey() { + return key; + } + + /** + * @return the single view name associated with this matched {@code d}-column entry + */ + @VisibleForTesting + public String getView() { + return Objects.toString(new DatawaveKey(key).getFieldName(), ""); + } + + /** + * Records offsets for a literal search string within this entry's view. + * + * @param search + * the matched literal string + * @param offsets + * ordered starting offsets where the string was found + */ + public void addMatches(String search, List offsets) { + matches.computeIfAbsent(search, ignored -> new TreeSet<>()).addAll(offsets); + } + + /** + * Builds the JSON-ready payload for this entry in the form {@code {"view":"...","matches":{search:[offsets]}}}. + * + * @return a payload map suitable for serialization into the {@code DOCUMENT_MATCHES} attribute, or an empty map if no matches are present + */ + private Map getPayload() { + Map payload = new LinkedHashMap<>(); + String view = getView(); + if (view == null || matches.isEmpty()) { + return payload; + } + payload.put(VIEW_FIELD, view); + Map> jsonMatches = new LinkedHashMap<>(); + for (Map.Entry> matchEntry : matches.entrySet()) { + jsonMatches.put(matchEntry.getKey(), new ArrayList<>(matchEntry.getValue())); + } + payload.put(MATCHES_FIELD, jsonMatches); + return payload; + } + + /** + * Serializes this entry's payload into the {@code DOCUMENT_MATCHES} JSON representation. + * + * @return JSON string representation, or an empty string if no matches were recorded for the entry + */ + public String toJson() { + Map payload = getPayload(); + if (payload.isEmpty()) { + return ""; + } + return GSON.toJson(payload); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java index 8e58173121f..682df0febdb 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/JexlEvaluation.java @@ -24,6 +24,7 @@ import datawave.query.jexl.DefaultArithmetic; import datawave.query.jexl.DelayedNonEventIndexContext; import datawave.query.jexl.HitListArithmetic; +import datawave.query.jexl.functions.DocumentFunctions; import datawave.query.postprocessing.tf.PhraseIndexes; import datawave.query.postprocessing.tf.TermOffsetMap; import datawave.query.transformer.ExcerptTransform; @@ -97,6 +98,7 @@ public boolean apply(Tuple3 input) { log.trace("Evaluating " + query + " against document " + input.second().getMetadata() + " with context " + input.third()); } + DocumentMatchContext documentMatchContext = (DocumentMatchContext) input.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); Object o = script.execute(input.third()); if (log.isTraceEnabled()) { @@ -110,6 +112,20 @@ public boolean apply(Tuple3 input) { ((DelayedNonEventIndexContext) input.third()).populateDocument(input.second()); } + if (matched && documentMatchContext != null) { + Document document = input.second(); + for (DocumentMatchResults entry : documentMatchContext.getMatches()) { + String documentMatches = entry.toJson(); + if (documentMatches.isEmpty()) { + continue; + } + + Content matchesAttribute = new Content(documentMatches, entry.getKey(), document.isToKeep()); + matchesAttribute.setColumnVisibility(entry.getKey().getColumnVisibilityParsed()); + document.put(DocumentFunctions.DOCUMENT_MATCHES, matchesAttribute); + } + } + if (arithmetic instanceof HitListArithmetic) { HitListArithmetic hitListArithmetic = (HitListArithmetic) arithmetic; if (matched) { diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java index e9f9ea037fa..f561d81a5a2 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java @@ -68,6 +68,8 @@ import datawave.query.composite.CompositeMetadata; import datawave.query.function.Aggregation; import datawave.query.function.DataTypeAsField; +import datawave.query.function.DocumentMatchConfig; +import datawave.query.function.DocumentMatchContextFunction; import datawave.query.function.DocumentMetadata; import datawave.query.function.DocumentPermutation; import datawave.query.function.DocumentProjection; @@ -98,12 +100,14 @@ import datawave.query.iterator.profile.SourceTrackingIterator; import datawave.query.iterator.waitwindow.WaitWindowObserver; import datawave.query.iterator.waitwindow.WaitWindowOverseerIterator; +import datawave.query.jexl.ArithmeticJexlEngines; import datawave.query.jexl.DatawaveJexlContext; import datawave.query.jexl.StatefulArithmetic; import datawave.query.jexl.functions.FieldIndexAggregator; import datawave.query.jexl.functions.IdentityAggregator; import datawave.query.jexl.functions.KeyAdjudicator; import datawave.query.jexl.visitors.DelayedNonEventSubTreeVisitor; +import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor; import datawave.query.jexl.visitors.IteratorBuildingVisitor; import datawave.query.jexl.visitors.SatisfactionVisitor; import datawave.query.jexl.visitors.VariableNameVisitor; @@ -975,7 +979,7 @@ protected Iterator> getEvaluation(NestedQueryIterator d // get the function we use for the tf functionality. Note we are // getting an additional source deep copy for this function - final Iterator>> itrWithContext; + Iterator>> itrWithContext; // TODO: this should be dynamic based on the query fields, not a flag passed to the iterator if (this.isTermFrequenciesRequired()) { @@ -997,6 +1001,17 @@ protected Iterator> getEvaluation(NestedQueryIterator d itrWithContext = Iterators.transform(tupleItr, new EmptyContext<>()); } + if (shouldCollectDocumentMatchContext(documentSource)) { + SortedKeyValueIterator documentMatchSource = getSourceDeepCopy("document-match context"); + DocumentMatchConfig documentMatchConfig = new DocumentMatchConfig(); + documentMatchConfig.setSource(documentMatchSource); + documentMatchConfig.setTimeFilter(getTimeFilter()); + documentMatchConfig.setLimits(getDocumentMatchLimits()); + Function>,Tuple3>> documentMatchFunction = buildDocumentMatchFunction( + documentMatchConfig); + itrWithContext = TraceIterators.transform(itrWithContext, documentMatchFunction, "Document Match Context Lookup"); + } + try { IteratorBuildingVisitor iteratorBuildingVisitor = createIteratorBuildingVisitor(getDocumentRange(documentSource), false, this.sortedUIDs); Multimap delayedNonEventFieldMap = DelayedNonEventSubTreeVisitor.getDelayedNonEventFieldMap(iteratorBuildingVisitor, @@ -1041,6 +1056,19 @@ protected Function,Tuple3>> return TFFactory.getFunction(tfConfig); } + /** + * This method exists so that extending classes can implement specific versions of the document-match context function. Specifically, so the + * {@link datawave.query.tld.TLDQueryIterator} can mark document-match collection as TLD-aware. + * + * @param documentMatchConfig + * a document-match configuration + * @return a document-match context function + */ + protected Function>,Tuple3>> buildDocumentMatchFunction( + DocumentMatchConfig documentMatchConfig) { + return new DocumentMatchContextFunction(documentMatchConfig); + } + private Range getDocumentRange(NestedQueryIterator documentSource) { if (null == documentSource) { return range; @@ -1085,7 +1113,7 @@ protected JexlEvaluation getJexlEvaluation(String query, NestedQueryIterator + * The top-level iterator option tells us whether the planned query requires document-match context anywhere. When a nested query is being evaluated, this + * method narrows that decision to the nested query so we only collect document-match context when the query actually being evaluated still contains + * {@code document:match(...)}. + * + * @param documentSource + * the nested query source for the current evaluation pass, if any + * @return true if document-match context should be collected for the current evaluation + */ + protected boolean shouldCollectDocumentMatchContext(NestedQueryIterator documentSource) { + if (!isDocumentMatchContextRequired()) { + return false; + } + + // At this point the planned query requires document-match context. If there is no nested source, or no nested + // query payload to inspect, we cannot narrow that requirement to a smaller subquery, so we conservatively keep + // document-match context collection enabled and return true in these cases. + if (documentSource == null) { + return true; + } + NestedQuery nestedQuery = documentSource.getNestedQuery(); + if (nestedQuery == null || nestedQuery.getQuery() == null) { + return true; + } + + ASTJexlScript nestedScript = nestedQuery.getScript(); + if (nestedScript == null) { + nestedScript = ArithmeticJexlEngines.getEngine(getArithmetic()).parse(nestedQuery.getQuery()); + } + return DocumentMatchFunctionVisitor.requiresDocumentMatchContext(nestedScript); + } + protected LimitFields getLimitFields() { return new LimitFields(this.getLimitFieldsMap(), this.getMatchingFieldSets()); } diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java index eb3e1485f99..595b9f5c0da 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java @@ -67,6 +67,7 @@ import datawave.query.composite.CompositeMetadata; import datawave.query.exceptions.DatawaveFatalQueryException; import datawave.query.function.ConfiguredFunction; +import datawave.query.function.DocumentMatchContext; import datawave.query.function.DocumentPermutation; import datawave.query.function.Equality; import datawave.query.function.GetStartKey; @@ -284,6 +285,11 @@ public class QueryOptions implements OptionDescriber { public static final String TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS = "tf.agg.threshold"; + public static final String DOCUMENT_MATCH_CONTEXT_REQUIRED = "document.match.context.required"; + public static final String DOCUMENT_MATCH_MAX_ENCODED_SIZE = "document.match.max.encoded.size"; + public static final String DOCUMENT_MATCH_MAX_DECODED_SIZE = "document.match.max.decoded.size"; + public static final String DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE = "document.match.max.encoded.context.size"; + public static final String FIELD_COUNTS = "field.counts"; public static final String TERM_COUNTS = "term.counts"; public static final String CARDINALITY_THRESHOLD = "cardinality.threshold"; @@ -405,6 +411,7 @@ public class QueryOptions implements OptionDescriber { protected Map> nonIndexedDataTypeMap = Maps.newHashMap(); protected boolean termFrequenciesRequired = false; + protected boolean documentMatchContextRequired = false; protected Set termFrequencyFields = Collections.emptySet(); protected Set contentExpansionFields; @@ -467,6 +474,9 @@ public class QueryOptions implements OptionDescriber { // aggregation thresholds private int docAggregationThresholdMs = -1; private int tfAggregationThresholdMs = -1; + private int documentMatchMaxEncodedSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE; + private int documentMatchMaxDecodedSize = DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; + private int documentMatchMaxEncodedContextSize = DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE; private CountMap fieldCounts; private CountMap termCounts; @@ -533,6 +543,7 @@ public void deepCopy(QueryOptions other) { this.evaluationFilter = other.evaluationFilter; this.fiEvaluationFilter = other.fiEvaluationFilter; this.eventEvaluationFilter = other.eventEvaluationFilter; + this.eventFilter = other.eventFilter; this.ivaratorCacheDirConfigs = (other.ivaratorCacheDirConfigs == null) ? null : new ArrayList<>(other.ivaratorCacheDirConfigs); this.hdfsSiteConfigURLs = other.hdfsSiteConfigURLs; @@ -563,6 +574,7 @@ public void deepCopy(QueryOptions other) { this.sortedUIDs = other.sortedUIDs; this.termFrequenciesRequired = other.termFrequenciesRequired; + this.documentMatchContextRequired = other.documentMatchContextRequired; this.termFrequencyFields = other.termFrequencyFields; this.contentExpansionFields = other.contentExpansionFields; @@ -592,6 +604,9 @@ public void deepCopy(QueryOptions other) { this.docAggregationThresholdMs = other.docAggregationThresholdMs; this.tfAggregationThresholdMs = other.tfAggregationThresholdMs; + this.documentMatchMaxEncodedSize = other.documentMatchMaxEncodedSize; + this.documentMatchMaxDecodedSize = other.documentMatchMaxDecodedSize; + this.documentMatchMaxEncodedContextSize = other.documentMatchMaxEncodedContextSize; this.fieldCounts = other.fieldCounts; this.termCounts = other.termCounts; @@ -1435,6 +1450,11 @@ public IteratorOptions describeOptions() { options.put(TF_NEXT_SEEK, "The number of next calls made by a Term Frequency data filter or aggregator before a seek is issued"); options.put(DOC_AGGREGATION_THRESHOLD_MS, "Document aggregations that exceed this threshold are logged as a warning"); options.put(TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS, "TermFrequency aggregations that exceed this threshold are logged as a warning"); + options.put(DOCUMENT_MATCH_CONTEXT_REQUIRED, "Whether the query requires gathering document-match context"); + options.put(DOCUMENT_MATCH_MAX_ENCODED_SIZE, "Maximum encoded d-column payload size, in bytes, to inspect for document:match evaluation"); + options.put(DOCUMENT_MATCH_MAX_DECODED_SIZE, "Maximum decoded d-column payload size, in bytes, to inspect for document:match evaluation"); + options.put(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE, + "Maximum aggregate encoded d-column payload size, in bytes, to retain in memory for document:match evaluation"); options.put(FIELD_COUNTS, "Map of field counts from the global index"); options.put(TERM_COUNTS, "Map of term counts from the global index"); return new IteratorOptions(getClass().getSimpleName(), "Runs a query against the DATAWAVE tables", options, null); @@ -1655,6 +1675,22 @@ public boolean validateOptions(Map options) { this.tfAggregationThresholdMs = Integer.parseInt(options.get(TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS)); } + if (options.containsKey(DOCUMENT_MATCH_CONTEXT_REQUIRED)) { + this.documentMatchContextRequired = Boolean.parseBoolean(options.get(DOCUMENT_MATCH_CONTEXT_REQUIRED)); + } + + if (options.containsKey(DOCUMENT_MATCH_MAX_ENCODED_SIZE)) { + this.documentMatchMaxEncodedSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + } + + if (options.containsKey(DOCUMENT_MATCH_MAX_DECODED_SIZE)) { + this.documentMatchMaxDecodedSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_DECODED_SIZE)); + } + + if (options.containsKey(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)) { + this.documentMatchMaxEncodedContextSize = Integer.parseInt(options.get(DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)); + } + if (options.containsKey(DATATYPE_FILTER)) { String option = options.get(DATATYPE_FILTER); if (option != null && !option.isEmpty()) { @@ -2462,6 +2498,42 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) { this.tfAggregationThresholdMs = tfAggregationThresholdMs; } + public int getDocumentMatchMaxEncodedSize() { + return documentMatchMaxEncodedSize; + } + + public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) { + this.documentMatchMaxEncodedSize = documentMatchMaxEncodedSize; + } + + public int getDocumentMatchMaxDecodedSize() { + return documentMatchMaxDecodedSize; + } + + public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { + this.documentMatchMaxDecodedSize = documentMatchMaxDecodedSize; + } + + public int getDocumentMatchMaxEncodedContextSize() { + return documentMatchMaxEncodedContextSize; + } + + public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) { + this.documentMatchMaxEncodedContextSize = documentMatchMaxEncodedContextSize; + } + + public boolean isDocumentMatchContextRequired() { + return documentMatchContextRequired; + } + + public void setDocumentMatchContextRequired(boolean documentMatchContextRequired) { + this.documentMatchContextRequired = documentMatchContextRequired; + } + + protected DocumentMatchContext.Limits getDocumentMatchLimits() { + return new DocumentMatchContext.Limits(documentMatchMaxEncodedSize, documentMatchMaxDecodedSize, documentMatchMaxEncodedContextSize); + } + /** * Get an {@link Equality} * diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java index 7cba3801a7f..e904bc81047 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/DatawaveInterpreter.java @@ -51,6 +51,7 @@ import datawave.query.attributes.ValueTuple; import datawave.query.collections.FunctionalSet; import datawave.query.jexl.functions.ContentFunctionsDescriptor; +import datawave.query.jexl.functions.DocumentFunctions; import datawave.query.jexl.functions.QueryFunctions; import datawave.query.jexl.nodes.ExceededOr; import datawave.query.jexl.nodes.QueryPropertyMarker; @@ -128,6 +129,12 @@ public Object visit(ASTFunctionNode node, Object data) { addHits(result); + if (isDocumentMatchFunction(nodeString) && result instanceof String && !hasSiblings(node)) { + boolean matched = !((String) result).isEmpty(); + resultMap.put(nodeString, matched); + return matched; + } + // if the function stands alone, then it needs to return ag boolean // if the function is paired with a method that is called on its results (like 'size') then the // actual results must be returned. @@ -139,6 +146,10 @@ public Object visit(ASTFunctionNode node, Object data) { return result instanceof Collection ? !((Collection) result).isEmpty() : result; } + private boolean isDocumentMatchFunction(String nodeString) { + return nodeString.startsWith(DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE + ":" + DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME); + } + /** * Triggered when variable can not be resolved. * diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java new file mode 100644 index 00000000000..8b0984193fc --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctions.java @@ -0,0 +1,169 @@ +package datawave.query.jexl.functions; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map.Entry; +import java.util.Objects; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; +import org.apache.log4j.Logger; + +import datawave.query.data.parsers.DatawaveKey; +import datawave.query.function.DocumentMatchContext; +import datawave.query.table.parser.ContentKeyValueFactory; + +/** + * Evaluation-phase JEXL functions for inspecting decoded shard-table {@code d}-column content. + *

+ * The current namespace exposes {@code document:match(...)} which decodes base64-encoded, gzip-compressed document payloads, performs case-sensitive literal + * substring matching, and returns the matched search string when any eligible {@code d}-column matches. Detailed per-entry offsets are accumulated in the + * supplied {@link DocumentMatchContext} and later serialized into {@code DOCUMENT_MATCHES} attributes by the surrounding evaluation flow. + */ +@JexlFunctions(descriptorFactory = "datawave.query.jexl.functions.DocumentFunctionsDescriptor") +public class DocumentFunctions { + private static final Logger log = Logger.getLogger(DocumentFunctions.class); + + public static final String DOCUMENT_FUNCTION_NAMESPACE = "document"; + public static final String DOCUMENT_MATCH_FUNCTION_NAME = "match"; + public static final String DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME = "documentMatchContext"; + public static final String DOCUMENT_MATCHES = "DOCUMENT_MATCHES"; + + /** + * Evaluates the internal form of {@code document:match(STRING)} across all eligible views for the current document. + * + * @param context + * per-document context supplied by the evaluation pipeline + * @param search + * literal substring to search for + * @return the matched search string if any eligible {@code d}-column matches, or an empty string if no match is found + */ + public static String match(DocumentMatchContext context, String search) { + return match(null, context, search); + } + + /** + * Evaluates the internal form of {@code document:match(VIEWNAME, STRING)} against the current document. + *

+ * Matching is case-sensitive and literal. If {@code viewName} ends with {@code *}, it is treated as a prefix match against the view portion of the + * {@code d}-column qualifier. Oversized or undecodable payloads are skipped as non-matching. Matches from this invocation are accumulated in the supplied + * {@link DocumentMatchContext} on a per-{@code d}-column basis so the resulting {@code DOCUMENT_MATCHES} attributes can preserve each source visibility. + * + * @param viewName + * optional exact or prefix-matched view selector; {@code null} means evaluate all views + * @param context + * per-document context supplied by the evaluation pipeline + * @param search + * literal substring to search for + * @return the matched search string if any eligible {@code d}-column matches, or an empty string if no match is found + */ + public static String match(String viewName, DocumentMatchContext context, String search) { + if (context == null || search == null) { + if (log.isDebugEnabled()) { + log.debug("Skipping document:match evaluation because context or search term was null"); + } + return ""; + } + + if (log.isDebugEnabled()) { + log.debug("Evaluating document:match for search [" + search + "] view filter [" + viewName + "] across " + context.getDocumentEntries().size() + + " d-column entries"); + } + + boolean matched = false; + for (Entry entry : context.getDocumentEntries()) { + String candidateView = Objects.toString(new DatawaveKey(entry.getKey()).getFieldName(), ""); + if (!matchesView(viewName, candidateView)) { + if (log.isDebugEnabled()) { + log.debug("Skipping d-column entry " + entry.getKey() + " because view [" + candidateView + "] does not match filter [" + viewName + "]"); + } + continue; + } + byte[] encoded = entry.getValue().get(); + if (encoded.length > context.getMaxEncodedValueSize()) { + log.debug("Skipping oversized d-column payload of " + encoded.length + " bytes for view " + candidateView); + continue; + } + + try { + String decoded = decode(encoded, context.getMaxDecodedValueSize()); + List offsets = findOffsets(decoded, search); + if (!offsets.isEmpty()) { + if (log.isDebugEnabled()) { + log.debug("document:match found offsets " + offsets + " for search [" + search + "] in view [" + candidateView + "] using key " + + entry.getKey()); + } + context.addMatches(entry.getKey(), search, offsets); + matched = true; + } else if (log.isDebugEnabled()) { + log.debug("document:match found no offsets for search [" + search + "] in view [" + candidateView + "] using key " + entry.getKey()); + } + } catch (IOException | IllegalArgumentException e) { + log.debug("Unable to decode d-column payload for view " + candidateView, e); + } + } + if (log.isDebugEnabled()) { + log.debug("document:match produced matched=" + matched + " for search [" + search + "]"); + } + return matched ? search : ""; + } + + /** + * Determines whether a candidate view satisfies the requested selector. + * + * @param expectedView + * requested view selector; {@code null} matches all views and a trailing {@code *} indicates prefix matching + * @param candidateView + * extracted view name for the current {@code d}-column + * @return {@code true} if the candidate view should be evaluated + */ + static boolean matchesView(String expectedView, String candidateView) { + if (expectedView == null) { + return true; + } + if (expectedView.endsWith("*")) { + String prefix = expectedView.substring(0, expectedView.length() - 1); + return candidateView.startsWith(prefix); + } + return expectedView.equals(candidateView); + } + + /** + * Decodes a base64-encoded, gzip-compressed {@code d}-column payload while enforcing a maximum decoded size. + * + * @param encoded + * encoded payload bytes from the shard table + * @param maxDecodedValueSize + * maximum allowed decoded payload size in bytes + * @return the decoded UTF-8 content + * @throws IOException + * if the payload cannot be decoded or if the decoded size exceeds the configured limit + */ + static String decode(byte[] encoded, int maxDecodedValueSize) throws IOException { + return ContentKeyValueFactory.decodeAndDecompressContentAsString(encoded, maxDecodedValueSize); + } + + /** + * Finds all starting character offsets for a literal substring, including overlapping matches. + * + * @param decoded + * decoded document content + * @param search + * literal substring to search for + * @return ordered starting offsets for each match + */ + static List findOffsets(String decoded, String search) { + List offsets = new ArrayList<>(); + if (search.isEmpty()) { + return offsets; + } + int index = decoded.indexOf(search); + while (index >= 0) { + offsets.add(index); + index = decoded.indexOf(search, index + 1); + } + return offsets; + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java new file mode 100644 index 00000000000..75124f58fc3 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/DocumentFunctionsDescriptor.java @@ -0,0 +1,107 @@ +package datawave.query.jexl.functions; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTFunctionNode; + +import datawave.query.attributes.AttributeFactory; +import datawave.query.config.ShardQueryConfiguration; +import datawave.query.jexl.ArithmeticJexlEngines; +import datawave.query.jexl.functions.arguments.JexlArgumentDescriptor; +import datawave.query.jexl.visitors.EventDataQueryExpressionVisitor; +import datawave.query.util.DateIndexHelper; +import datawave.query.util.MetadataHelper; +import datawave.webservice.query.exception.BadRequestQueryException; +import datawave.webservice.query.exception.DatawaveErrorCode; + +/** + * Argument-descriptor factory for the {@code document:*} JEXL namespace. + *

+ * {@code document:match(...)} is an evaluation-only function. It does not contribute field normalization rules, event-data filters, index expansion, or + * ivarator pushdown. This descriptor exists primarily to validate the namespace/function pairing and to return a descriptor that tells the planner to leave the + * function in the evaluation phase. + */ +@SuppressWarnings("unused") +public class DocumentFunctionsDescriptor implements JexlFunctionArgumentDescriptorFactory { + + /** + * Descriptor for {@code document:match(...)}. + *

+ * The function is evaluated only after a candidate document has been materialized, so all index-planning hooks intentionally report no fields and no index + * query contribution. + */ + public static class DocumentJexlArgumentDescriptor implements JexlArgumentDescriptor { + @Override + public org.apache.commons.jexl3.parser.JexlNode getIndexQuery(ShardQueryConfiguration config, MetadataHelper helper, DateIndexHelper dateIndexHelper, + Set datatypeFilter) { + return TRUE_NODE; + } + + @Override + public void addFilters(AttributeFactory attributeFactory, Map filterMap) {} + + @Override + public Set fieldsForNormalization(MetadataHelper helper, Set datatypeFilter, int arg) { + return Collections.emptySet(); + } + + @Override + public Set fields(MetadataHelper helper, Set datatypeFilter) { + return Collections.emptySet(); + } + + @Override + public Set> fieldSets(MetadataHelper helper, Set datatypeFilter) { + return Collections.emptySet(); + } + + @Override + public boolean useOrForExpansion() { + return false; + } + + @Override + public boolean regexArguments() { + return false; + } + + @Override + public boolean allowIvaratorFiltering() { + return false; + } + } + + /** + * Validates that the supplied function node represents {@code document:match(...)} and returns the evaluation-only descriptor for it. + * + * @param node + * function node from the parsed JEXL tree + * @return descriptor describing the planning behavior for {@code document:match(...)} + * @throws IllegalArgumentException + * if the namespace, function class, or argument count is invalid + */ + @Override + public JexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) { + FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node); + Class functionClass = (Class) ArithmeticJexlEngines.functions().get(visitor.namespace()); + + if (!DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.JEXLNODEDESCRIPTOR_NAMESPACE_UNEXPECTED, + "Unexpected namespace " + visitor.namespace()); + throw new IllegalArgumentException(qe); + } + if (!functionClass.equals(DocumentFunctions.class)) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.JEXLNODEDESCRIPTOR_NODE_FOR_FUNCTION, + "Unexpected function class " + functionClass); + throw new IllegalArgumentException(qe); + } + if (!DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(visitor.name()) || visitor.args().isEmpty() || visitor.args().size() > 3) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.WRONG_NUMBER_OF_ARGUMENTS, + "Wrong number of arguments to document:match"); + throw new IllegalArgumentException(qe); + } + return new DocumentJexlArgumentDescriptor(); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java index 9c456a36001..ac39b284c24 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionArgumentDescriptorFactory.java @@ -9,6 +9,8 @@ import datawave.query.jexl.ArithmeticJexlEngines; import datawave.query.jexl.functions.arguments.JexlArgumentDescriptor; +import datawave.query.util.DateIndexHelper; +import datawave.query.util.MetadataHelper; import datawave.webservice.query.exception.BadRequestQueryException; import datawave.webservice.query.exception.DatawaveErrorCode; import datawave.webservice.query.exception.QueryException; @@ -27,6 +29,10 @@ public interface JexlFunctionArgumentDescriptorFactory { */ JexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node); + /** + * Returned by {@link JexlArgumentDescriptor#getIndexQuery(datawave.query.config.ShardQueryConfiguration, MetadataHelper, DateIndexHelper, java.util.Set)} + * when index searching should be skipped for a function. + */ JexlNode TRUE_NODE = new ASTTrueNode(ParserTreeConstants.JJTTRUENODE); /** An encapsulation of methods that can be used with this interface */ diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java index f1a15d5a8b4..f3d03615de7 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/JexlFunctionNamespaceRegistry.java @@ -25,6 +25,7 @@ public class JexlFunctionNamespaceRegistry { static { registeredFunctions.put(ContentFunctions.CONTENT_FUNCTION_NAMESPACE, ContentFunctions.class); + registeredFunctions.put(DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE, DocumentFunctions.class); registeredFunctions.put(NormalizationFunctions.NORMALIZATION_FUNCTION_NAMESPACE, NormalizationFunctions.class); registeredFunctions.put(EvaluationPhaseFilterFunctions.EVAL_PHASE_FUNCTION_NAMESPACE, EvaluationPhaseFilterFunctions.class); registeredFunctions.put(GroupingRequiredFilterFunctions.GROUPING_REQUIRED_FUNCTION_NAMESPACE, GroupingRequiredFilterFunctions.class); diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java index f586d46c951..864051631e9 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/arguments/JexlArgumentDescriptor.java @@ -35,7 +35,8 @@ public interface JexlArgumentDescriptor { * the datatype filter * @param settings * the config settings - * @return The query which will be used against the global index + * @return the query which will be used against the global index, or {@link datawave.query.jexl.functions.JexlFunctionArgumentDescriptorFactory#TRUE_NODE} + * if index searching should be skipped for this function */ JexlNode getIndexQuery(ShardQueryConfiguration settings, MetadataHelper metadataHelper, DateIndexHelper dateIndexHelper, Set datatypeFilter); diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java new file mode 100644 index 00000000000..0dbfbc17f5d --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitor.java @@ -0,0 +1,107 @@ +package datawave.query.jexl.visitors; + +import org.apache.commons.jexl3.parser.ASTArguments; +import org.apache.commons.jexl3.parser.ASTFunctionNode; +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ASTNamespaceIdentifier; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.ParserTreeConstants; +import org.apache.log4j.Logger; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.JexlNodeFactory; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.jexl.functions.FunctionJexlNodeVisitor; + +/** + * Rewrites user-facing {@code document:match(...)} calls into the internal evaluation form that carries the reserved {@code documentMatchContext} argument + * explicitly. + *

+ * This mirrors the way {@code content:*} functions are evaluated with an explicit {@code termOffsetMap} argument, but preserves the external user syntax for + * document matching. + */ +public class DocumentMatchFunctionVisitor extends BaseVisitor { + protected static final Logger log = Logger.getLogger(DocumentMatchFunctionVisitor.class); + private boolean documentMatchContextRequired = false; + + private DocumentMatchFunctionVisitor() { + // no-op, local construction only. + } + + /** + * Determines whether the supplied script contains any {@code document:match(...)} calls. + * + * @param script + * script to inspect + * @return {@code true} if any document-match functions are present + */ + public static boolean requiresDocumentMatchContext(ASTJexlScript script) { + return JexlASTHelper.getFunctionNodes(script).stream().map(FunctionJexlNodeVisitor::eval) + .anyMatch(function -> DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(function.namespace()) + && DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME.equals(function.name())); + } + + /** + * Rewrites all {@code document:match(...)} calls in the supplied script to include the reserved context identifier. + * + * @param script + * script to rewrite + * @return {@code true} if any document-match functions were found + */ + public static boolean rewrite(ASTJexlScript script) { + DocumentMatchFunctionVisitor visitor = new DocumentMatchFunctionVisitor(); + script.jjtAccept(visitor, null); + return visitor.documentMatchContextRequired; + } + + @Override + public Object visit(ASTFunctionNode node, Object data) { + FunctionJexlNodeVisitor visitor = FunctionJexlNodeVisitor.eval(node); + if (DocumentFunctions.DOCUMENT_FUNCTION_NAMESPACE.equals(visitor.namespace())) { + rewriteDocumentFunction(node, visitor); + return data; + } + return super.visit(node, data); + } + + protected void rewriteDocumentFunction(ASTFunctionNode node, FunctionJexlNodeVisitor visitor) { + switch (visitor.name()) { + case DocumentFunctions.DOCUMENT_MATCH_FUNCTION_NAME: + documentMatchContextRequired = true; + if (visitor.args().size() == 1) { + JexlASTHelper.replaceNodeSafely(node, + buildFunction(visitor.namespace(), visitor.name(), + JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), + RebuildingVisitor.copy(visitor.args().get(0)))); + } else if (visitor.args().size() == 2) { + JexlASTHelper.replaceNodeSafely(node, + buildFunction(visitor.namespace(), visitor.name(), RebuildingVisitor.copy(visitor.args().get(0)), + JexlNodeFactory.buildIdentifier(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME), + RebuildingVisitor.copy(visitor.args().get(1)))); + } + return; + default: + log.warn("unknown document function:" + visitor.name()); + } + } + + private static ASTFunctionNode buildFunction(String namespace, String functionName, JexlNode... arguments) { + ASTFunctionNode functionNode = new ASTFunctionNode(ParserTreeConstants.JJTFUNCTIONNODE); + + ASTNamespaceIdentifier namespaceNode = new ASTNamespaceIdentifier(ParserTreeConstants.JJTNAMESPACEIDENTIFIER); + namespaceNode.setNamespace(namespace, functionName); + functionNode.jjtAddChild(namespaceNode, 0); + namespaceNode.jjtSetParent(functionNode); + + ASTArguments argsNode = new ASTArguments(ParserTreeConstants.JJTARGUMENTS); + functionNode.jjtAddChild(argsNode, 1); + argsNode.jjtSetParent(functionNode); + + for (int i = 0; i < arguments.length; i++) { + argsNode.jjtAddChild(arguments[i], i); + arguments[i].jjtSetParent(argsNode); + } + + return functionNode; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java b/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java index 7b9b3fea3a8..e343b531f06 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/builder/lucene/FunctionQueryNodeBuilder.java @@ -30,6 +30,7 @@ import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.search.TermQuery; +import datawave.query.language.functions.lucene.DocumentMatch; import datawave.query.language.functions.lucene.EvaluationOnly; import datawave.query.language.functions.lucene.Exclude; import datawave.query.language.functions.lucene.Include; @@ -48,7 +49,7 @@ @Deprecated public class FunctionQueryNodeBuilder implements QueryBuilder { - private Map allowedFunctionMap = Collections.synchronizedMap(new HashMap<>()); + private final Map allowedFunctionMap = Collections.synchronizedMap(new HashMap<>()); public FunctionQueryNodeBuilder() { addFunction(new IsNull()); @@ -57,6 +58,7 @@ public FunctionQueryNodeBuilder() { addFunction(new Exclude()); addFunction(new Text()); addFunction(new Occurrence()); + addFunction(new DocumentMatch()); addFunction(new EvaluationOnly()); } diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java new file mode 100644 index 00000000000..c7dcd136844 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/DocumentMatch.java @@ -0,0 +1,67 @@ +package datawave.query.language.functions.jexl; + +import java.text.MessageFormat; +import java.util.ArrayList; + +import datawave.query.language.functions.QueryFunction; +import datawave.webservice.query.exception.BadRequestQueryException; +import datawave.webservice.query.exception.DatawaveErrorCode; + +/** + * JEXL-language representation of {@code document:match(...)}. + *

+ * This function is produced by the query-language layer after parsing or after Lucene-to-JEXL translation. It validates the supported one-argument and + * two-argument forms and renders the canonical JEXL syntax consumed by the runtime query planner. + */ +public class DocumentMatch extends JexlQueryFunction { + + public static final String DOCUMENT_MATCH_FUNCTION = "DOCUMENT_MATCH"; + public static final String DOCUMENT_FIELD = "document"; + public static final String DOCUMENT_NAMESPACE = DOCUMENT_FIELD + ":"; + public static final String MATCH_FUNCTION = "match"; + + public DocumentMatch() { + super(DOCUMENT_MATCH_FUNCTION, new ArrayList<>()); + } + + /** + * Validates that {@code document:match(...)} received either one argument ({@code STRING}) or two arguments ({@code VIEWNAME, STRING}). + * + * @throws IllegalArgumentException + * if the function has no arguments or more than two arguments + */ + @Override + public void validate() throws IllegalArgumentException { + if (this.parameterList == null || this.parameterList.isEmpty() || this.parameterList.size() > 2) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name)); + throw new IllegalArgumentException(qe); + } + } + + /** + * Renders the canonical JEXL form {@code document:match(...)} with escaped arguments. + * + * @return JEXL representation of this function + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(DOCUMENT_NAMESPACE + MATCH_FUNCTION + "("); + for (int i = 0; i < parameterList.size(); i++) { + if (i > 0) { + sb.append(", "); + } + sb.append(escapeString(parameterList.get(i))); + } + sb.append(")"); + return sb.toString(); + } + + /** + * @return a fresh function instance for parser duplication + */ + @Override + public QueryFunction duplicate() { + return new DocumentMatch(); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java new file mode 100644 index 00000000000..96c99fcd687 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/lucene/DocumentMatch.java @@ -0,0 +1,107 @@ +package datawave.query.language.functions.lucene; + +import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_FIELD; +import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_MATCH_FUNCTION; +import static datawave.query.language.functions.jexl.DocumentMatch.DOCUMENT_NAMESPACE; +import static datawave.query.language.functions.jexl.DocumentMatch.MATCH_FUNCTION; + +import java.text.MessageFormat; +import java.util.ArrayList; + +import datawave.query.language.functions.QueryFunction; +import datawave.query.search.WildcardFieldedFilter; +import datawave.webservice.query.exception.BadRequestQueryException; +import datawave.webservice.query.exception.DatawaveErrorCode; + +/** + * Lucene-language representation of {@code #DOCUMENT_MATCH(...)}. + *

+ * This class exists in the Lucene query-language layer so the parser can recognize the function and carry it through the same fielded-filter machinery used by + * other Lucene functions before the query is rendered into JEXL. The runtime semantics are still provided by {@code document:match(...)} in the evaluation + * phase. + */ +@Deprecated +public class DocumentMatch extends LuceneQueryFunction { + private static class DocumentMatchFilter extends WildcardFieldedFilter { + private final String renderedQuery; + + DocumentMatchFilter(String selector) { + super(true, WildcardFieldedFilter.BooleanType.AND); + setField(DOCUMENT_FIELD); + setSelector(selector); + this.renderedQuery = DOCUMENT_NAMESPACE + selector; + this.query = renderedQuery; + } + + @Override + public String toString() { + return renderedQuery; + } + } + + public DocumentMatch() { + super(DOCUMENT_MATCH_FUNCTION, new ArrayList<>()); + } + + /** + * Validates that {@code #DOCUMENT_MATCH(...)} received either one argument ({@code STRING}) or two arguments ({@code VIEWNAME, STRING}). + * + * @throws IllegalArgumentException + * if the function has no arguments or more than two arguments + */ + @Override + public void validate() throws IllegalArgumentException { + if (this.parameterList == null || this.parameterList.isEmpty() || this.parameterList.size() > 2) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name)); + throw new IllegalArgumentException(qe); + } + } + + /** + * Initializes the Lucene-layer fielded-filter representation used during parsing. + *

+ * The synthetic {@code document:match(...)} selector created here is a parser-level representation only; actual evaluation is deferred until the translated + * JEXL query runs against candidate documents. + * + * @param parameterList + * parsed function arguments + * @param depth + * function-node depth in the Lucene parse tree + * @param parent + * parent query node + * @throws IllegalArgumentException + * if initialization fails + */ + @Override + public void initialize(java.util.List parameterList, int depth, org.apache.lucene.queryparser.flexible.core.nodes.QueryNode parent) + throws IllegalArgumentException { + super.initialize(parameterList, depth, parent); + this.fieldedFilter = new DocumentMatchFilter(buildSelector()); + } + + /** + * Builds the parser-layer selector text {@code match(...)} from the raw Lucene arguments. + * + * @return selector text used by the synthetic fielded filter + */ + private String buildSelector() { + StringBuilder sb = new StringBuilder(); + sb.append(MATCH_FUNCTION + "("); + for (int i = 0; i < parameterList.size(); i++) { + if (i > 0) { + sb.append(", "); + } + sb.append(parameterList.get(i)); + } + sb.append(")"); + return sb.toString(); + } + + /** + * @return a fresh function instance for parser duplication + */ + @Override + public QueryFunction duplicate() { + return new DocumentMatch(); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 3c20abd2401..bcdd386dfa6 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -119,6 +119,7 @@ import datawave.query.jexl.visitors.ConjunctionEliminationVisitor; import datawave.query.jexl.visitors.DepthVisitor; import datawave.query.jexl.visitors.DisjunctionEliminationVisitor; +import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor; import datawave.query.jexl.visitors.ExecutableDeterminationVisitor; import datawave.query.jexl.visitors.ExecutableDeterminationVisitor.STATE; import datawave.query.jexl.visitors.ExecutableExpansionVisitor; @@ -726,6 +727,9 @@ private void configureIterator(ShardQueryConfiguration config, IteratorSetting c addOption(cfg, QueryOptions.TRACK_SIZES, Boolean.toString(config.isTrackSizes()), false); addOption(cfg, QueryOptions.ACTIVE_QUERY_LOG_NAME, config.getActiveQueryLogName(), false); + // Add the thresholds for document matching if required + configureDocumentMatchOptions(config, cfg); + // Set the start and end dates configureTypeMappings(config, cfg, metadataHelper, getCompressOptionMappings(), false); } @@ -1184,6 +1188,9 @@ protected ASTJexlScript processTree(final ASTJexlScript originalQueryTree, Shard expandPushdownPullup(config, metadataHelper, timers, scannerFactory); } + // rewrite document:match() functions to include the documentMatchContext variable. + config.setQueryTree(timedRewriteDocumentMatchFunctions(timers, config)); + return config.getQueryTree(); } @@ -1639,6 +1646,19 @@ protected ASTJexlScript timedRewriteNullFunctions(QueryStopwatch timers, ASTJexl return visitorManager.timedVisit(timers, "Rewrite Null Functions", () -> RewriteNullFunctionsVisitor.rewriteNullFunctions(queryTree)); } + protected ASTJexlScript timedRewriteDocumentMatchFunctions(QueryStopwatch timers, ShardQueryConfiguration config) throws DatawaveQueryException { + return visitorManager.timedVisit(timers, "Rewrite Document Match Functions", () -> { + ASTJexlScript queryTree = config.getQueryTree(); + DocumentMatchFunctionVisitor.rewrite(queryTree); + config.setDocumentMatchContextRequired(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(queryTree)); + if (log.isDebugEnabled()) { + logQuery(queryTree, "Computed that the query " + (config.isDocumentMatchContextRequired() ? "requires" : "does not require") + + " document-match context lookup"); + } + return queryTree; + }); + } + protected ASTJexlScript timedEnforceUniqueTermsWithinExpressions(QueryStopwatch timers, final ASTJexlScript script) throws DatawaveQueryException { return visitorManager.timedVisit(timers, "Enforce Unique Terms within AND and OR expressions", () -> (UniqueExpressionTermsVisitor.enforce(script))); } @@ -2385,6 +2405,16 @@ protected void configureAdditionalOptions(ShardQueryConfiguration config, Iterat // no-op } + protected void configureDocumentMatchOptions(ShardQueryConfiguration config, IteratorSetting cfg) { + boolean documentMatchContextRequired = config.isDocumentMatchContextRequired(); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED, Boolean.toString(documentMatchContextRequired), false); + if (documentMatchContextRequired) { + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedSize()), false); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE, Integer.toString(config.getDocumentMatchMaxDecodedSize()), false); + addOption(cfg, QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE, Integer.toString(config.getDocumentMatchMaxEncodedContextSize()), false); + } + } + protected Future loadQueryIterator(final MetadataHelper metadataHelper, final ShardQueryConfiguration config, final Boolean isFullTable, boolean isPreload) { diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java index 34e61842ada..be37b72c584 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java @@ -21,7 +21,6 @@ * This filter only operates on event keys. */ public class EventDataQueryFieldFilter implements EventDataQueryFilter { - private Key document = null; // the number of times next is called before issuing a seek private int maxNextCount = -1; diff --git a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java index 8ed69bf246d..826b365f4e3 100644 --- a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java +++ b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java @@ -1,7 +1,9 @@ package datawave.query.table.parser; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Base64; import java.util.zip.GZIPInputStream; @@ -16,6 +18,7 @@ import datawave.query.table.parser.EventKeyValueFactory.EventKeyValue; public class ContentKeyValueFactory { + private static final int DECODE_BUFFER_SIZE = 4096; private static final Logger log = Logger.getLogger(ContentKeyValueFactory.class); @@ -55,7 +58,7 @@ public static ContentKeyValue parse(Key key, Value value, Authorizations auths, public static byte[] decodeAndDecompressContent(byte[] contents) { try { - contents = decompress(Base64.getMimeDecoder().decode(contents)); + contents = decodeAndDecompressContent(contents, Integer.MAX_VALUE); } catch (IOException e) { log.error("Error decompressing Base64 encoded GZIPInputStream", e); } catch (Exception e) { @@ -69,6 +72,22 @@ public static byte[] decodeAndDecompressContent(byte[] contents) { return contents; } + public static String decodeAndDecompressContentAsString(byte[] contents, int maxDecodedSize) throws IOException { + return new String(decodeAndDecompressContent(contents, maxDecodedSize), StandardCharsets.UTF_8); + } + + public static byte[] decodeAndDecompressContent(byte[] contents, int maxDecodedSize) throws IOException { + byte[] decoded = Base64.getMimeDecoder().decode(contents); + try { + return decompress(decoded, maxDecodedSize); + } catch (IOException e) { + if (decoded.length > maxDecodedSize) { + throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecodedSize + " bytes", e); + } + return decoded; + } + } + private static boolean isCompressed(byte[] compressed) { return (compressed[0] == (byte) (GZIPInputStream.GZIP_MAGIC)) && (compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8)); } @@ -83,6 +102,26 @@ private static byte[] decompress(byte[] compressed) throws IOException { return decompressed; } + private static byte[] decompress(byte[] compressed, int maxDecompressedSize) throws IOException { + if (!isCompressed(compressed)) { + return compressed; + } + + try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressed)); ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + byte[] buffer = new byte[DECODE_BUFFER_SIZE]; + int read; + int totalRead = 0; + while ((read = gzip.read(buffer)) >= 0) { + totalRead += read; + if (totalRead > maxDecompressedSize) { + throw new IOException("Decoded d-column payload exceeded configured limit of " + maxDecompressedSize + " bytes"); + } + baos.write(buffer, 0, read); + } + return baos.toByteArray(); + } + } + public static class ContentKeyValue extends EventKeyValue { protected String viewName = null; diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java index 2a060393b8f..c2ef06fad4c 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java @@ -3447,6 +3447,30 @@ public void setTfAggregationThresholdMs(int tfAggregationThresholdMs) { getConfig().setTfAggregationThresholdMs(tfAggregationThresholdMs); } + public int getDocumentMatchMaxEncodedSize() { + return getConfig().getDocumentMatchMaxEncodedSize(); + } + + public void setDocumentMatchMaxEncodedSize(int documentMatchMaxEncodedSize) { + getConfig().setDocumentMatchMaxEncodedSize(documentMatchMaxEncodedSize); + } + + public int getDocumentMatchMaxDecodedSize() { + return getConfig().getDocumentMatchMaxDecodedSize(); + } + + public void setDocumentMatchMaxDecodedSize(int documentMatchMaxDecodedSize) { + getConfig().setDocumentMatchMaxDecodedSize(documentMatchMaxDecodedSize); + } + + public int getDocumentMatchMaxEncodedContextSize() { + return getConfig().getDocumentMatchMaxEncodedContextSize(); + } + + public void setDocumentMatchMaxEncodedContextSize(int documentMatchMaxEncodedContextSize) { + getConfig().setDocumentMatchMaxEncodedContextSize(documentMatchMaxEncodedContextSize); + } + public boolean getPruneQueryOptions() { return getConfig().getPruneQueryOptions(); } diff --git a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java index a6ecc78aebf..645d5ad60f6 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java @@ -27,6 +27,8 @@ import datawave.query.attributes.AttributeFactory; import datawave.query.attributes.Document; +import datawave.query.function.DocumentMatchConfig; +import datawave.query.function.DocumentMatchContextFunction; import datawave.query.function.Equality; import datawave.query.function.RangeProvider; import datawave.query.function.TLDEquality; @@ -266,6 +268,13 @@ protected Function,Tuple3>> return TFFactory.getFunction(tfConfig); } + @Override + protected Function>,Tuple3>> buildDocumentMatchFunction( + DocumentMatchConfig documentMatchConfig) { + documentMatchConfig.setTld(true); + return new DocumentMatchContextFunction(documentMatchConfig); + } + /** * Get a {@link TLDRangeProvider} * diff --git a/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java new file mode 100644 index 00000000000..e6c32f1c26a --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/DocumentMatchQueryTest.java @@ -0,0 +1,374 @@ +package datawave.query; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.net.URL; +import java.nio.file.Path; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.security.tokens.PasswordToken; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.accumulo.minicluster.MiniAccumuloCluster; +import org.apache.accumulo.minicluster.MiniAccumuloConfig; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.ComponentScan; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import com.google.common.base.Preconditions; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; + +import datawave.ingest.data.TypeRegistry; +import datawave.query.attributes.Attribute; +import datawave.query.attributes.Attributes; +import datawave.query.attributes.Content; +import datawave.query.attributes.Document; +import datawave.query.function.DocumentMatchContext; +import datawave.query.function.DocumentMatchResults; +import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.tables.ShardQueryLogic; +import datawave.query.util.AbstractQueryTest; +import datawave.query.util.WiseGuysIngest; + +/** + * MiniAccumulo-backed integration tests for {@code document:match(...)}. + *

+ * These tests exercise the full query path, including query parsing, planner wiring, shard-table document materialization, evaluation-phase document matching, + * and publication of the {@code DOCUMENT_MATCHES} attribute on returned documents. + */ +@ExtendWith(SpringExtension.class) +@ComponentScan(basePackages = "datawave.query") +// @formatter:off +@ContextConfiguration(locations = { + "classpath:datawave/query/QueryLogicFactory.xml", + "classpath:beanRefContext.xml", + "classpath:MarkingFunctionsContext.xml", + "classpath:MetadataHelperContext.xml", + "classpath:CacheContext.xml"}) +// @formatter:on +public class DocumentMatchQueryTest extends AbstractQueryTest { + + private static final Logger log = Logger.getLogger(DocumentMatchQueryTest.class); + private static final Authorizations auths = new Authorizations("ALL"); + private static final String PASSWORD = "password"; + + @TempDir + public static Path folder; + + protected static MiniAccumuloCluster mac; + protected static AccumuloClient client; + + @Autowired + @Qualifier("EventQuery") + protected ShardQueryLogic logic; + + private final Map>>> expectedDocumentMatches = new HashMap<>(); + private final Map> expectedDocumentMatchVisibilities = new HashMap<>(); + private Boolean expectedDocumentMatchContextRequired; + + @Override + public ShardQueryLogic getLogic() { + return logic; + } + + @Override + public Authorizations getAuths() { + return auths; + } + + @BeforeAll + public static void beforeAll() throws Exception { + System.setProperty("type.metadata.dir", folder.toFile().getAbsolutePath()); + + MiniAccumuloConfig cfg = new MiniAccumuloConfig(folder.toFile(), PASSWORD); + cfg.setNumTservers(1); + mac = new MiniAccumuloCluster(cfg); + mac.start(); + + client = mac.createAccumuloClient("root", new PasswordToken(PASSWORD)); + client.securityOperations().changeUserAuthorizations("root", auths); + new QueryTestTableHelper(client, log); + WiseGuysIngest.writeItAll(client, WiseGuysIngest.WhatKindaRange.DOCUMENT); + } + + @BeforeEach + public void beforeEach() { + TimeZone.setDefault(TimeZone.getTimeZone("GMT")); + setClientForTest(client); + + URL hadoopConfig = this.getClass().getResource("/testhadoop.config"); + Preconditions.checkNotNull(hadoopConfig); + logic.setHdfsSiteConfigURLs(hadoopConfig.toExternalForm()); + logic.setIvaratorCacheDirConfigs(Collections.singletonList(new IvaratorCacheDirConfig(folder.toUri().toString()))); + logic.setMaxFieldIndexRangeSplit(1); + logic.setCollapseUids(false); + logic.setFullTableScanEnabled(false); + logic.setDocumentMatchMaxDecodedSize(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE); + + givenParameter(QueryParameters.HIT_LIST, "true"); + logic.setHitList(true); + givenDate("20091231", "20150101"); + } + + @AfterEach + public void afterEach() { + super.afterEach(); + expectedDocumentMatches.clear(); + expectedDocumentMatchVisibilities.clear(); + expectedDocumentMatchContextRequired = null; + } + + @AfterAll + public static void afterAll() throws Exception { + if (mac != null) { + mac.stop(); + } + TypeRegistry.reset(); + } + + @Override + protected void extraConfigurations() { + // no-op + } + + /** + * Verifies that returned documents expose the expected {@code DOCUMENT_MATCHES} payload when the current test configured one. + */ + @Override + protected void extraAssertions() { + if (expectedDocumentMatchContextRequired != null) { + if (expectedDocumentMatchContextRequired) { + assertTrue(logic.getConfig().isDocumentMatchContextRequired(), "planned query did not require document-match context lookup"); + } else { + assertFalse(logic.getConfig().isDocumentMatchContextRequired(), "planned query unexpectedly required document-match context lookup"); + } + } + + for (Document result : results) { + Attribute uuid = result.get("UUID"); + assertNotNull(uuid, "result did not contain UUID"); + + String uuidValue = getUUID(uuid); + Map>> expected = expectedDocumentMatches.get(uuidValue); + if (expected != null) { + Attribute matches = result.get(DocumentFunctions.DOCUMENT_MATCHES); + assertNotNull(matches, "result did not contain DOCUMENT_MATCHES"); + assertEquals(expected, getDocumentMatchesByView(matches)); + } + + Map expectedVisibilities = expectedDocumentMatchVisibilities.get(uuidValue); + if (expectedVisibilities != null) { + Attribute matches = result.get(DocumentFunctions.DOCUMENT_MATCHES); + assertNotNull(matches, "result did not contain DOCUMENT_MATCHES"); + assertEquals(expectedVisibilities, getDocumentMatchVisibilities(matches)); + } + } + } + + /** + * Verifies that JEXL {@code document:match(STRING)} evaluates across all views and returns the expected offsets. + */ + @Test + public void testDocumentMatchJexlAllViews() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27)))); + planAndExecuteQuery(); + } + + /** + * Verifies that JEXL {@code document:match(VIEWNAME, STRING)} restricts evaluation to the named view. + */ + @Test + public void testDocumentMatchJexlSpecificView() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('CONTENT2', 'lawyer')"); + expectPlan("UUID == 'capone' && document:match('CONTENT2', documentMatchContext, 'lawyer')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT2", Map.of("lawyer", List.of(2)))); + planAndExecuteQuery(); + } + + /** + * Verifies that multiple JEXL {@code document:match(...)} calls contribute one {@code DOCUMENT_MATCHES} value per matched {@code d}-column. + */ + @Test + public void testDocumentMatchJexlAddsPerEntryMatchesAcrossCalls() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('CONTENT', 'can') && document:match('CONTENT2', 'lawyer')"); + expectPlan("UUID == 'capone' && document:match('CONTENT', documentMatchContext, 'can') && document:match('CONTENT2', documentMatchContext, 'lawyer')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("lawyer", List.of(2)))); + planAndExecuteQuery(); + } + + /** + * Verifies that end-to-end {@code DOCUMENT_MATCHES} values preserve the visibilities carried by their source {@code d}-column entries. + */ + @Test + public void testDocumentMatchJexlPreservesPerEntryVisibilities() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27)))); + expectedDocumentMatchVisibilities.put("CAPONE", Map.of("CONTENT", new ColumnVisibility("ALL"), "CONTENT2", new ColumnVisibility("ALL"))); + planAndExecuteQuery(); + } + + /** + * Verifies that a wildcard view match combined with a second targeted call accumulates per-entry matches without cross-entry merging. + */ + @Test + public void testDocumentMatchJexlWildcardThenSpecificViewAccumulatesPerEntry() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('CONTENT*', 'can') && document:match('CONTENT2', 'lawyer')"); + expectPlan("UUID == 'capone' && document:match('CONTENT*', documentMatchContext, 'can') && document:match('CONTENT2', documentMatchContext, 'lawyer')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27), "lawyer", List.of(2)))); + expectedDocumentMatchVisibilities.put("CAPONE", Map.of("CONTENT", new ColumnVisibility("ALL"), "CONTENT2", new ColumnVisibility("ALL"))); + planAndExecuteQuery(); + } + + /** + * Verifies Lucene {@code #DOCUMENT_MATCH(...)} translation and wildcard view-prefix behavior in the full query path. + */ + @Test + public void testDocumentMatchLuceneWildcardView() throws Exception { + givenParameter(QueryParameters.QUERY_SYNTAX, "LUCENE"); + givenQuery("UUID:CAPONE AND #DOCUMENT_MATCH(CONTENT*,can)"); + expectPlan("UUID == 'capone' && document:match('CONTENT*', documentMatchContext, 'can')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + expectedDocumentMatches.put("CAPONE", Map.of("CONTENT", Map.of("can", List.of(4, 61)), "CONTENT2", Map.of("can", List.of(27)))); + planAndExecuteQuery(); + } + + private Map>> getDocumentMatchesByView(Attribute attribute) { + Map>> values = new HashMap<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + addDocumentMatch(values, ((Content) child).getContent()); + } + } else { + addDocumentMatch(values, ((Content) attribute).getContent()); + } + return values; + } + + private Map getDocumentMatchVisibilities(Attribute attribute) { + Map visibilities = new HashMap<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + Content content = (Content) child; + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); + } + } else { + Content content = (Content) attribute; + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); + } + return visibilities; + } + + private void addDocumentMatch(Map>> values, String json) { + JsonObject payload = JsonParser.parseString(json).getAsJsonObject(); + String view = payload.get(DocumentMatchResults.VIEW_FIELD).getAsString(); + JsonObject matches = payload.getAsJsonObject(DocumentMatchResults.MATCHES_FIELD); + Map> offsetsBySearch = new HashMap<>(); + for (Map.Entry matchEntry : matches.entrySet()) { + List offsets = new java.util.ArrayList<>(); + for (JsonElement offset : matchEntry.getValue().getAsJsonArray()) { + offsets.add(offset.getAsInt()); + } + offsetsBySearch.put(matchEntry.getKey(), offsets); + } + values.put(view, offsetsBySearch); + } + + private String getDocumentMatchView(String json) { + return JsonParser.parseString(json).getAsJsonObject().get(DocumentMatchResults.VIEW_FIELD).getAsString(); + } + + /** + * Verifies that a non-matching document-match term filters the document out of the result set. + */ + @Test + public void testDocumentMatchNoMatchFiltersDocument() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('missing')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'missing')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(0); + planAndExecuteQuery(); + } + + /** + * Verifies that document-match is case-sensitive in the full query path. + */ + @Test + public void testDocumentMatchIsCaseSensitive() throws Exception { + givenQuery("UUID == 'CAPONE' && document:match('Can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'Can')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(0); + planAndExecuteQuery(); + } + + /** + * Verifies that decoded payloads larger than the configured limit are skipped as non-matching during end-to-end query execution. + */ + @Test + public void testDocumentMatchOversizedDecodedPayloadIsSkipped() throws Exception { + logic.setDocumentMatchMaxDecodedSize(8); + givenQuery("UUID == 'CAPONE' && document:match('can')"); + expectPlan("UUID == 'capone' && document:match(documentMatchContext, 'can')"); + expectedDocumentMatchContextRequired = true; + expectResultCount(0); + planAndExecuteQuery(); + } + + /** + * Verifies that queries without {@code document:match(...)} do not request document-match context lookup in the integration harness. + */ + @Test + public void testQueryWithoutDocumentMatchDoesNotRequireContext() throws Exception { + givenQuery("UUID == 'CAPONE'"); + expectPlan("UUID == 'capone'"); + expectedDocumentMatchContextRequired = false; + expectResultCount(1); + expectUUIDs(java.util.Set.of("CAPONE")); + planAndExecuteQuery(); + assertEquals(1, results.size()); + Document result = results.iterator().next(); + assertNull(result.get(DocumentFunctions.DOCUMENT_MATCHES), "query without document:match unexpectedly emitted DOCUMENT_MATCHES"); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java index 1499ce689c1..51028b27764 100644 --- a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java @@ -46,6 +46,7 @@ import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.config.annotation.AllHitsQueryConfig; +import datawave.query.function.DocumentMatchContext; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; import datawave.query.iterator.logic.ContentSummaryIterator; import datawave.query.iterator.logic.TermFrequencyExcerptIterator; @@ -665,6 +666,14 @@ public void setUp() throws Exception { updatedValues.put("allHitsQueryConfig", new AllHitsQueryConfig()); defaultValues.put("originalJexlQuery", null); updatedValues.put("originalJexlQuery", "FIELD == 'VALUE'"); + defaultValues.put("documentMatchMaxEncodedSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE); + updatedValues.put("documentMatchMaxEncodedSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE + 1); + defaultValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE); + updatedValues.put("documentMatchMaxDecodedSize", DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE + 1); + defaultValues.put("documentMatchMaxEncodedContextSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE); + updatedValues.put("documentMatchMaxEncodedContextSize", DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE + 1); + defaultValues.put("documentMatchContextRequired", false); + updatedValues.put("documentMatchContextRequired", true); } private Query createQuery(String query) { diff --git a/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java new file mode 100644 index 00000000000..1d98c6bbdcb --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/function/DocumentMatchContextFunctionTest.java @@ -0,0 +1,192 @@ +package datawave.query.function; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.junit.jupiter.api.Test; + +import com.google.common.collect.Lists; + +import datawave.query.attributes.Document; +import datawave.query.attributes.DocumentKey; +import datawave.query.jexl.functions.DocumentFunctions; +import datawave.query.util.Tuple3; +import datawave.query.util.Tuples; + +/** + * Focused tests for {@link DocumentMatchContextFunction}. + */ +public class DocumentMatchContextFunctionTest { + + /** + * Verifies that only matching {@code d}-column entries for the current document key are added to the evaluation side-channel. + */ + @Test + public void testCollectsOnlyCurrentDocumentColumns() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0uid\0META"), new Value("two".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0other\0BODY"), new Value("skip".getBytes())), + Map.entry(new Key("20240101_0", "tf", "datatype\0uid\0BODY"), new Value("skip".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(1234, 5678, 9012)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertEquals(2, context.getDocumentEntries().size()); + assertEquals(1234, context.getMaxEncodedValueSize()); + assertEquals(5678, context.getMaxDecodedValueSize()); + assertEquals(9012, context.getMaxEncodedContextSize()); + } + + /** + * Verifies that the function produces an empty context entry when a document has no retained {@code d}-column values. + */ + @Test + public void testCollectsEmptyContextWhenNoDocumentColumnsExist() { + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(Collections.emptyList())); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 30)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertTrue(context.getDocumentEntries().isEmpty()); + } + + /** + * Verifies that document-match context collection honors explicit {@code DOCKEY} attributes instead of assuming that the tuple key is the only event key. + */ + @Test + public void testCollectsColumnsForDocumentKeysFromDocument() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0child\0BODY"), new Value("two".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0other\0BODY"), new Value("skip".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 30)); + config.setTld(true); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Document document = new Document(); + document.put(Document.DOCKEY_FIELD_NAME, new DocumentKey(new Key("20240101_0", "datatype\0uid"), false)); + document.put(Document.DOCKEY_FIELD_NAME, new DocumentKey(new Key("20240101_0", "datatype\0child"), false)); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0root"), document, Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertEquals(2, context.getDocumentEntries().size()); + } + + /** + * Verifies that collection skips individually oversized payloads and stops once the retained encoded bytes would exceed the configured aggregate limit. + */ + @Test + public void testCollectsOnlyEntriesWithinAggregateEncodedContextLimit() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("1234".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0uid\0META"), new Value("12345".getBytes())), + Map.entry(new Key("20240101_0", "d", "datatype\0uid\0TAIL"), new Value("12".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 4)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> result = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext context = (DocumentMatchContext) result.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertEquals(1, context.getDocumentEntries().size()); + assertEquals("datatype\0uid\0BODY", context.getDocumentEntries().get(0).getKey().getColumnQualifier().toString()); + } + + /** + * Verifies that each application creates a fresh {@link DocumentMatchContext}, so mutable match state from a prior evaluation cannot leak into a later one. + */ + @Test + public void testApplyCreatesFreshContextEachTime() { + List> entries = Lists.newArrayList(Map.entry(new Key("20240101_0", "d", "datatype\0uid\0BODY"), new Value("one".getBytes()))); + + DocumentMatchConfig config = new DocumentMatchConfig(); + config.setSource(new ListBackedIterator(entries)); + config.setLimits(new DocumentMatchContext.Limits(10, 20, 30)); + DocumentMatchContextFunction function = new DocumentMatchContextFunction(config); + + Tuple3> first = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext firstContext = (DocumentMatchContext) first.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + firstContext.addMatches(entries.get(0).getKey(), "one", List.of(0)); + assertEquals(1, firstContext.getMatches().size()); + + Tuple3> second = function + .apply(Tuples.tuple(new Key("20240101_0", "datatype\0uid"), new Document(), Collections.emptyMap())); + DocumentMatchContext secondContext = (DocumentMatchContext) second.third().get(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME); + + assertNotSame(firstContext, secondContext); + assertTrue(secondContext.getMatches().isEmpty()); + assertEquals(1, secondContext.getDocumentEntries().size()); + } + + private static class ListBackedIterator implements SortedKeyValueIterator { + private final List> entries; + private int index = -1; + + private ListBackedIterator(List> entries) { + this.entries = entries; + } + + @Override + public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) {} + + @Override + public boolean hasTop() { + return index >= 0 && index < entries.size(); + } + + @Override + public void next() { + index++; + } + + @Override + public void seek(Range range, java.util.Collection columnFamilies, boolean inclusive) { + index = 0; + while (index < entries.size() && !range.contains(entries.get(index).getKey())) { + index++; + } + } + + @Override + public Key getTopKey() { + return entries.get(index).getKey(); + } + + @Override + public Value getTopValue() { + return entries.get(index).getValue(); + } + + @Override + public SortedKeyValueIterator deepCopy(IteratorEnvironment env) { + return new ListBackedIterator(entries); + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java index b4af760243a..393dd11f70b 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/JexlEvaluationTest.java @@ -1,19 +1,31 @@ package datawave.query.function; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static datawave.query.function.DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE; +import static datawave.query.function.DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.function.Consumer; +import java.util.function.Supplier; import org.apache.accumulo.core.data.Key; -import org.junit.Test; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.junit.jupiter.api.Test; import com.google.common.collect.Maps; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; import datawave.ingest.protobuf.TermWeightPosition; import datawave.query.Constants; @@ -24,22 +36,26 @@ import datawave.query.attributes.Numeric; import datawave.query.jexl.DatawaveJexlContext; import datawave.query.jexl.HitListArithmetic; +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.functions.DocumentFunctions; import datawave.query.jexl.functions.TermFrequencyList; +import datawave.query.jexl.visitors.DocumentMatchFunctionVisitor; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; import datawave.query.postprocessing.tf.TermOffsetMap; import datawave.query.util.Tuple3; public class JexlEvaluationTest { + public static final DocumentMatchContext.Limits TEST_DOCUMENT_MATCH_LIMITS = new DocumentMatchContext.Limits(1024, DEFAULT_MAX_DECODED_SIZE, + DEFAULT_MAX_ENCODED_CONTEXT_SIZE); + @Test public void testSimpleQuery() { String query = "FOO == 'bar'"; Document d = new Document(); d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); } @Test @@ -49,10 +65,7 @@ public void testRegexIntersection() { d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true)); d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); } @Test @@ -61,16 +74,15 @@ public void testRegexCaseIntersection() { d.put("FOO", new Content("Bar", new Key("shard", "datatype\0uid"), true)); d.put("FOO", new Numeric("123", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); + Supplier contextSupplier = contextFactory(d, Collections.singleton("FOO")); // match the original value String query = "FOO == 'bar' && FOO =~ '12.*'"; - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextSupplier); // match the normalized value query = "FOO == 'bar' && FOO =~ '\\+cE1\\.2.*'"; - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextSupplier); } @Test @@ -81,10 +93,7 @@ public void testRegexUnion() { d.put("FOO", new Content("bar", new Key("shard", "datatype\0uid"), true)); d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); } @Test @@ -96,10 +105,7 @@ public void testHitTermSource() { d.put("FOO", hitTermSource); d.put("FOO", new Content("bazaar", new Key("shard", "datatype\0uid2"), true)); - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Collections.singleton("FOO"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Collections.singleton("FOO"))); Attributes hitTerm = (Attributes) d.getDictionary().get("HIT_TERM"); assertEquals(1, hitTerm.getAttributes().size()); @@ -151,10 +157,7 @@ public void testSomeFilterFunctions() { // Assume fields are {ANCHOR, FOO, FOO2} and a constant doc key private void evaluate(String query, Document d) { - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Arrays.asList("ANCHOR", "FOO", "FOO2", "FOO3"), context); - - assertEvaluation(query, new Key("shard", "datatype\0uid"), d, context); + assertEvaluation(query, new Key("shard", "datatype\0uid"), d, contextFactory(d, Arrays.asList("ANCHOR", "FOO", "FOO2", "FOO3"))); } @Test @@ -166,9 +169,6 @@ public void testContentPhraseFunction() { map.put("red", buildTfList("TOKFIELD", 2)); map.put("dog", buildTfList("TOKFIELD", 3)); - DatawaveJexlContext context = new DatawaveJexlContext(); - context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, new TermOffsetMap(map)); - Key docKey = new Key("shard", "datatype\0uid"); Document d = new Document(); @@ -176,9 +176,8 @@ public void testContentPhraseFunction() { d.put("TOKFIELD", new Content("big", docKey, true)); d.put("TOKFIELD", new Content("red", docKey, true)); d.put("TOKFIELD", new Content("dog", docKey, true)); - d.visit(Arrays.asList("FOO", "TOKFIELD"), context); - - assertEvaluation(query, docKey, d, context); + assertEvaluation(query, docKey, d, contextFactory(d, Arrays.asList("FOO", "TOKFIELD"), + ctx -> ctx.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, new TermOffsetMap(map)))); // assert that "big red dog" came back in the hit terms boolean foundPhrase = false; @@ -192,6 +191,128 @@ public void testContentPhraseFunction() { assertTrue(foundPhrase); } + @Test + public void testDocumentMatchAddsDocumentAttribute() { + String query = "FOO == 'bar' && document:match('car')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + final List> entries = List + .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5))), getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + assertEquals(new ColumnVisibility("A"), d.get(DocumentFunctions.DOCUMENT_MATCHES).getColumnVisibility()); + } + + @Test + public void testDocumentMatchAddsPerEntryDocumentAttributesAcrossCalls() { + String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('CONTENT2', 'lawyer')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + final List> entries = List.of( + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car"))), + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "A"), new Value(buildEncodedValue("lawyer car")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5)), "CONTENT2", Map.of("lawyer", List.of(0))), + getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + } + + @Test + public void testDocumentMatchAccumulatesCallsWithinSameEntry() { + String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('BODY', 'lawyer')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + final List> entries = List + .of(Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car lawyer")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + assertEquals(Map.of("BODY", Map.of("car", List.of(1, 5), "lawyer", List.of(9))), getDocumentMatchesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + } + + @Test + public void testDocumentMatchPreservesPerEntryVisibilities() { + String query = "FOO == 'bar' && document:match('BODY', 'car') && document:match('CONTENT2', 'lawyer')"; + Key docKey = new Key("shard", "datatype\0uid"); + Document d = new Document(); + d.put("FOO", new Content("bar", docKey, true)); + + final List> entries = List.of( + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0BODY", "A"), new Value(buildEncodedValue("scar car"))), + Maps.immutableEntry(new Key("row", "d", "datatype\0uid\0CONTENT2", "B"), new Value(buildEncodedValue("lawyer car")))); + assertEvaluation(query, docKey, d, contextFactory(d, Collections.singleton("FOO"), ctx -> ctx + .set(DocumentFunctions.DOCUMENT_MATCH_CONTEXT_JEXL_VARIABLE_NAME, new DocumentMatchContext(entries, TEST_DOCUMENT_MATCH_LIMITS)))); + + assertEquals(Map.of("BODY", new ColumnVisibility("A"), "CONTENT2", new ColumnVisibility("B")), + getDocumentMatchVisibilitiesByView(d.get(DocumentFunctions.DOCUMENT_MATCHES))); + } + + private Map>> getDocumentMatchesByView(Attribute attribute) { + Map>> values = new HashMap<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + addDocumentMatch(values, ((Content) child).getContent()); + } + } else { + addDocumentMatch(values, ((Content) attribute).getContent()); + } + return values; + } + + private Map getDocumentMatchVisibilitiesByView(Attribute attribute) { + Map visibilities = new HashMap<>(); + if (attribute instanceof Attributes) { + for (Attribute> child : ((Attributes) attribute).getAttributes()) { + Content content = assertInstanceOf(Content.class, child); + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); + } + } else { + Content content = assertInstanceOf(Content.class, attribute); + visibilities.put(getDocumentMatchView(content.getContent()), content.getColumnVisibility()); + } + return visibilities; + } + + private void addDocumentMatch(Map>> values, String json) { + JsonObject payload = JsonParser.parseString(json).getAsJsonObject(); + String view = payload.get(DocumentMatchResults.VIEW_FIELD).getAsString(); + JsonObject matches = payload.getAsJsonObject(DocumentMatchResults.MATCHES_FIELD); + Map> offsetsBySearch = new HashMap<>(); + for (Map.Entry matchEntry : matches.entrySet()) { + List offsets = new ArrayList<>(); + for (JsonElement offset : matchEntry.getValue().getAsJsonArray()) { + offsets.add(offset.getAsInt()); + } + offsetsBySearch.put(matchEntry.getKey(), offsets); + } + values.put(view, offsetsBySearch); + } + + private String getDocumentMatchView(String json) { + return JsonParser.parseString(json).getAsJsonObject().get(DocumentMatchResults.VIEW_FIELD).getAsString(); + } + + private byte[] buildEncodedValue(String content) { + try { + java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); + java.io.OutputStream b64s = java.util.Base64.getEncoder().wrap(bos); + java.util.zip.GZIPOutputStream gzip = new java.util.zip.GZIPOutputStream(b64s); + gzip.write(content.getBytes()); + gzip.close(); + b64s.close(); + bos.close(); + return bos.toByteArray(); + } catch (java.io.IOException e) { + throw new RuntimeException(e); + } + } + @Test public void testCompareFunction() { // eq op @@ -286,27 +407,48 @@ private void testCompare(String query, boolean expected) { d.put("FIELD_C", new Content("zebra", docKey, true)); d.put("FIELD_C", new Content("zephyr", docKey, true)); - // populate context from doc - DatawaveJexlContext context = new DatawaveJexlContext(); - d.visit(Arrays.asList("FOO", "FIELD_A", "FIELD_B", "FIELD_C"), context); - - assertEvaluation(query, docKey, d, context, expected); + assertEvaluation(query, docKey, d, contextFactory(d, Arrays.asList("FOO", "FIELD_A", "FIELD_B", "FIELD_C")), expected); } - private void assertEvaluation(String query, Key key, Document d, DatawaveJexlContext context) { - assertEvaluation(query, key, d, context, true); + private void assertEvaluation(String query, Key key, Document d, Supplier contextSupplier) { + assertEvaluation(query, key, d, contextSupplier, true); } - private void assertEvaluation(String query, Key key, Document d, DatawaveJexlContext context, boolean expected) { - JexlEvaluation evaluation = new JexlEvaluation(query); - boolean result = evaluation.apply(new Tuple3<>(key, d, context)); + private void assertEvaluation(String query, Key key, Document d, Supplier contextSupplier, boolean expected) { + JexlEvaluation evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query)); + boolean result = evaluation.apply(new Tuple3<>(key, d, contextSupplier.get())); assertEquals(expected, result); - evaluation = new JexlEvaluation(query, new HitListArithmetic()); - result = evaluation.apply(new Tuple3<>(key, d, context)); + evaluation = new JexlEvaluation(rewriteDocumentMatchFunctions(query), new HitListArithmetic()); + result = evaluation.apply(new Tuple3<>(key, d, contextSupplier.get())); assertEquals(expected, result); } + private Supplier contextFactory(Document document, Collection fields) { + return contextFactory(document, fields, context -> {}); + } + + private Supplier contextFactory(Document document, Collection fields, Consumer customizer) { + return () -> { + DatawaveJexlContext context = new DatawaveJexlContext(); + document.visit(fields, context); + customizer.accept(context); + return context; + }; + } + + private String rewriteDocumentMatchFunctions(String query) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + if (!DocumentMatchFunctionVisitor.rewrite(script)) { + return query; + } + return JexlStringBuildingVisitor.buildQueryWithoutParse(script); + } catch (org.apache.commons.jexl3.parser.ParseException e) { + throw new RuntimeException(e); + } + } + private TermFrequencyList buildTfList(String field, int... offsets) { TermFrequencyList.Zone zone = buildZone(field); List position = buildTermWeightPositions(offsets); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java new file mode 100644 index 00000000000..6f8b9150bb3 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/DocumentFunctionsTest.java @@ -0,0 +1,319 @@ +package datawave.query.jexl.functions; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.OutputStream; +import java.util.AbstractMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.zip.GZIPOutputStream; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Value; +import org.junit.jupiter.api.Test; + +import datawave.query.function.DocumentMatchContext; +import datawave.query.function.DocumentMatchResults; + +/** + * Unit tests for {@link DocumentFunctions} covering view selection, matching semantics, payload limits, and per-{@code d}-column result accumulation. + */ +public class DocumentFunctionsTest { + /** + * Verifies that {@code document:match(STRING)} searches all available views and returns the matched search string when any view matches. + */ + @Test + public void testMatchAcrossAllViews() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0META", "carpet")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + String result = DocumentFunctions.match(context, "car"); + + assertEquals("car", result); + assertEquals(2, context.getMatches().size()); + } + + /** + * Verifies that a trailing {@code *} in the requested view name performs prefix matching across views. + */ + @Test + public void testWildcardViewMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext( + List.of(entry("test\0uid\0BODY", "car"), entry("test\0uid\0BODY_TEXT", "car car"), entry("test\0uid\0META", "car")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + String result = DocumentFunctions.match("BODY*", context, "car"); + + assertEquals("car", result); + } + + /** + * Verifies that overlapping substring matches are reported with all starting offsets. + */ + @Test + public void testOverlappingMatches() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "banana")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + String result = DocumentFunctions.match("BODY", context, "ana"); + + assertEquals("ana", result); + } + + /** + * Verifies that matching is case-sensitive. + */ + @Test + public void testCaseSensitiveMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, "Car").isEmpty()); + } + + /** + * Verifies that a null context is treated as a non-match. + */ + @Test + public void testNullContextIsNonMatch() { + assertTrue(DocumentFunctions.match(null, "car").isEmpty()); + } + + /** + * Verifies that a null search term is treated as a non-match. + */ + @Test + public void testNullSearchIsNonMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, null).isEmpty()); + assertTrue(DocumentFunctions.match("BODY", context, null).isEmpty()); + } + + /** + * Verifies that an empty search term is treated as a non-match. + */ + @Test + public void testEmptySearchIsNonMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, "").isEmpty()); + assertTrue(context.getMatches().isEmpty()); + } + + /** + * Verifies that encoded payloads larger than the configured limit are skipped as non-matching. + */ + @Test + public void testOversizedPayloadIsNonMatch() throws Exception { + Map.Entry entry = entry("test\0uid\0BODY", "scar car"); + DocumentMatchContext context = new DocumentMatchContext(List.of(entry), new DocumentMatchContext.Limits(entry.getValue().get().length - 1, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, "car").isEmpty()); + } + + /** + * Verifies that decoded payloads larger than the configured limit are skipped as non-matching. + */ + @Test + public void testOversizedDecodedPayloadIsNonMatch() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), + new DocumentMatchContext.Limits(1024, 3, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, "car").isEmpty()); + } + + /** + * Verifies that an empty {@code d}-entry set yields no match. + */ + @Test + public void testNoDocumentEntriesIsNonMatch() { + assertTrue(DocumentFunctions.match(new DocumentMatchContext(List.of(), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)), "car").isEmpty()); + } + + /** + * Verifies that undecodable payloads are treated as non-matching rather than failing evaluation. + */ + @Test + public void testDecodeFailureIsNonMatch() { + DocumentMatchContext context = new DocumentMatchContext(List.of(Map.entry(new Key("row", "d", "test\0uid\0BODY"), new Value("not-base64".getBytes()))), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertTrue(DocumentFunctions.match(context, "car").isEmpty()); + } + + /** + * Verifies that MIME-style base64 payloads with trailing CRLF line breaks still decode and match correctly. + */ + @Test + public void testMatchWithBase64LineBreaks() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entryWithEncodedSuffix("test\0uid\0BODY", "/* Origins */ Fix.", "\r\n")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + String result = DocumentFunctions.match("BODY", context, "Origins"); + + assertEquals("Origins", result); + } + + /** + * Verifies that payloads stored as plain base64-encoded UTF-8 text still decode and match when gzip expansion is not possible. + */ + @Test + public void testMatchWithBase64OnlyPayload() { + DocumentMatchContext context = new DocumentMatchContext(List.of(base64OnlyEntry("test\0uid\0BODY", "/* Origins */ Fix.")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + String result = DocumentFunctions.match("BODY", context, "Origins"); + + assertEquals("Origins", result); + } + + /** + * Verifies that multiple {@code document:match(...)} calls accumulate results on a per-{@code d}-column basis for document output. + */ + @Test + public void testMatchAccumulatesPerEntryResultsAcrossCalls() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car"), entry("test\0uid\0CONTENT2", "lawyer car")), + new DocumentMatchContext.Limits(1024, DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, + DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals("lawyer", DocumentFunctions.match("CONTENT2", context, "lawyer")); + assertEquals(2, context.getMatches().size()); + assertEquals(List.of("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", "{\"view\":\"CONTENT2\",\"matches\":{\"lawyer\":[0]}}"), + context.getMatches().stream().map(DocumentMatchResults::toJson).sorted().collect(Collectors.toList())); + } + + /** + * Verifies that repeated {@code document:match(...)} calls against the same {@code d}-column accumulate beneath that single entry payload. + */ + @Test + public void testMatchAccumulatesSameEntryAcrossCalls() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car lawyer")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals("lawyer", DocumentFunctions.match("BODY", context, "lawyer")); + assertEquals(1, context.getMatches().size()); + assertEquals("BODY", context.getMatches().get(0).getView()); + assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5],\"lawyer\":[9]}}", context.getMatches().get(0).toJson()); + } + + /** + * Verifies that repeated identical {@code document:match(...)} calls against the same {@code d}-column do not duplicate offsets for the same search term. + */ + @Test + public void testMatchRepeatsSameSearchWithinEntryAcrossCalls() throws Exception { + DocumentMatchContext context = new DocumentMatchContext(List.of(entry("test\0uid\0BODY", "scar car")), new DocumentMatchContext.Limits(1024, + DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE)); + + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals("car", DocumentFunctions.match("BODY", context, "car")); + assertEquals(1, context.getMatches().size()); + assertEquals("{\"view\":\"BODY\",\"matches\":{\"car\":[1,5]}}", context.getMatches().get(0).toJson()); + } + + /** + * Builds a test {@code d}-column entry with an empty visibility. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entry(String cq, String content) throws Exception { + return entry(cq, content, ""); + } + + /** + * Builds a test {@code d}-column entry with caller-supplied visibility and gzip+base64 encoded content. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @param visibility + * column visibility to attach to the key + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entry(String cq, String content, String visibility) throws Exception { + return entryWithEncodedSuffix(cq, content, visibility, ""); + } + + /** + * Builds a test {@code d}-column entry with caller-supplied visibility and an optional suffix appended to the encoded payload. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @param visibility + * column visibility to attach to the key + * @param encodedSuffix + * suffix bytes to append after base64 encoding, such as {@code \r\n} + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entryWithEncodedSuffix(String cq, String content, String visibility, String encodedSuffix) throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + OutputStream b64s = java.util.Base64.getEncoder().wrap(bos); + GZIPOutputStream gzip = new GZIPOutputStream(b64s); + gzip.write(content.getBytes()); + gzip.close(); + b64s.close(); + if (!encodedSuffix.isEmpty()) { + bos.write(encodedSuffix.getBytes()); + } + bos.close(); + return new AbstractMap.SimpleEntry<>(new Key("row", "d", cq, visibility), new Value(bos.toByteArray())); + } + + /** + * Builds a test {@code d}-column entry with an empty visibility and an optional suffix appended to the encoded payload. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @param encodedSuffix + * suffix bytes to append after base64 encoding, such as {@code \r\n} + * @return encoded test entry + * @throws Exception + * if test payload creation fails + */ + private Map.Entry entryWithEncodedSuffix(String cq, String content, String encodedSuffix) throws Exception { + return entryWithEncodedSuffix(cq, content, "", encodedSuffix); + } + + /** + * Builds a test {@code d}-column entry whose value is only base64-encoded UTF-8 text. + * + * @param cq + * column qualifier to use + * @param content + * decoded content to encode into the value + * @return encoded test entry + */ + private Map.Entry base64OnlyEntry(String cq, String content) { + byte[] encoded = java.util.Base64.getEncoder().encode(content.getBytes()); + return new AbstractMap.SimpleEntry<>(new Key("row", "d", cq), new Value(encoded)); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java new file mode 100644 index 00000000000..29e2816aefc --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/DocumentMatchFunctionVisitorTest.java @@ -0,0 +1,81 @@ +package datawave.query.jexl.visitors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ParseException; +import org.junit.Test; + +import datawave.query.jexl.JexlASTHelper; + +/** + * Focused tests for {@link DocumentMatchFunctionVisitor}. + */ +public class DocumentMatchFunctionVisitorTest { + + /** + * Verifies that the visitor reports when a query needs the reserved document-match context variable. + * + * @throws Exception + * if parsing fails + */ + @Test + public void testRewriteReportsWhetherDocumentMatchContextIsRequired() throws Exception { + assertFalse(DocumentMatchFunctionVisitor.rewrite(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'"))); + assertTrue(DocumentMatchFunctionVisitor.rewrite(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')"))); + } + + @Test + public void testRequiresDocumentMatchContext() throws Exception { + assertFalse(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar'"))); + assertTrue(DocumentMatchFunctionVisitor.requiresDocumentMatchContext(JexlASTHelper.parseAndFlattenJexlQuery("FOO == 'bar' && document:match('car')"))); + } + + /** + * Verifies that the one-argument form is rewritten to include the reserved context variable as the first argument. + * + * @throws ParseException + * if parsing fails + */ + @Test + public void testRewriteSingleArgumentFunction() throws ParseException { + assertRewrite("document:match(documentMatchContext, 'car')", "document:match('car')"); + } + + /** + * Verifies that the two-argument form keeps the view selector first and inserts the reserved context variable before the search string. + * + * @throws ParseException + * if parsing fails + */ + @Test + public void testRewriteTwoArgumentFunction() throws ParseException { + assertRewrite("document:match('BODY', documentMatchContext, 'car')", "document:match('BODY', 'car')"); + } + + @Test + public void testRewriteMutatesOriginalScript() throws ParseException { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("document:match('car')"); + assertTrue(DocumentMatchFunctionVisitor.rewrite(script)); + assertEquals("document:match(documentMatchContext, 'car')", JexlStringBuildingVisitor.buildQueryWithoutParse(script)); + } + + /** + * Verifies that the input form is rewritten the expected input. + * + * @param expected + * the expected re-written form + * @param input + * the input to rewrite + * @throws ParseException + * if parsing fails + */ + private static void assertRewrite(String expected, String input) throws ParseException { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(input); + DocumentMatchFunctionVisitor.rewrite(script); + String rewritten = JexlStringBuildingVisitor.buildQueryWithoutParse(script); + assertEquals(expected, rewritten); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java index d5482f403eb..7a102b316a5 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java @@ -62,6 +62,16 @@ public void testMatchesInGroupFunctionQuoting() throws ParseException { assertEquals("grouping:matchesInGroupLeft(FOO, 'foo', BAR, 'bar')", parseQuery("#MATCHES_IN_GROUP_LEFT(FOO, foo, BAR, bar)")); } + @Test + public void testDocumentMatchFunctionTranslation() throws ParseException { + assertEquals("document:match('car')", parseQuery("#DOCUMENT_MATCH('car')")); + assertEquals("document:match('car')", parseQuery("#DOCUMENT_MATCH(car)")); + assertEquals("document:match('BODY', 'car')", parseQuery("#DOCUMENT_MATCH('BODY', 'car')")); + assertEquals("document:match('BODY', 'car')", parseQuery("#DOCUMENT_MATCH(BODY, car)")); + assertEquals("document:match('BODY*', 'car')", parseQuery("#DOCUMENT_MATCH(BODY*, car)")); + assertEquals("BODY == 'capone' && document:match('car')", parseQuery("BODY:capone AND #DOCUMENT_MATCH(car)")); + } + @Test public void testComposableFunctions() throws ParseException { assertEquals("filter:includeRegex(foo,bar).size() > 0", parseQuery("#JEXL(\"filter:includeRegex(foo,bar).size() > 0\")")); diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java index 192af5c6235..613f1e2bade 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/lucene/TestLuceneQueryParser.java @@ -204,6 +204,12 @@ public void testFunctions() throws ParseException { luceneParser.parse("field:selector AND #include(field, testbade\\.scape)").getContents()); Assert.assertEquals("[AND,field:selector][posFilter: filter(true, AND, field, testbade\\.scape)]", luceneParser.parse("field:selector AND #text(field, testbade\\.scape)").getContents()); + Assert.assertEquals("[AND,field:selector][posFilter: document:match(car)]", + luceneParser.parse("field:selector AND #DOCUMENT_MATCH(car)").getContents()); + Assert.assertEquals("[AND,field:selector][posFilter: document:match(BODY, car)]", + luceneParser.parse("field:selector AND #DOCUMENT_MATCH(BODY, car)").getContents()); + Assert.assertEquals("[AND,field:selector][posFilter: document:match(BODY*, car)]", + luceneParser.parse("field:selector AND #DOCUMENT_MATCH(BODY*, car)").getContents()); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java index 203060ed03c..5c487ddb8c7 100644 --- a/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/planner/DefaultQueryPlannerTest.java @@ -1,12 +1,14 @@ package datawave.query.planner; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Set; +import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.commons.jexl3.parser.ASTJexlScript; import org.junit.jupiter.api.BeforeEach; @@ -25,6 +27,8 @@ import datawave.query.config.ShardQueryConfiguration; import datawave.query.exceptions.DatawaveFatalQueryException; import datawave.query.exceptions.DatawaveQueryException; +import datawave.query.iterator.QueryIterator; +import datawave.query.iterator.QueryOptions; import datawave.query.jexl.JexlASTHelper; import datawave.query.tables.ScannerFactory; import datawave.query.util.DateIndexHelper; @@ -36,6 +40,46 @@ class DefaultQueryPlannerTest { + @Nested + class DocumentMatchOptionTests { + + @Test + void testAddDocumentMatchOptionsWithoutContextRequired() { + DefaultQueryPlanner planner = new DefaultQueryPlanner(); + ShardQueryConfiguration config = new ShardQueryConfiguration(); + config.setDocumentMatchContextRequired(false); + config.setDocumentMatchMaxEncodedSize(111); + config.setDocumentMatchMaxDecodedSize(222); + config.setDocumentMatchMaxEncodedContextSize(333); + IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class); + + planner.configureDocumentMatchOptions(config, cfg); + + assertEquals("false", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); + assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + assertFalse(cfg.getOptions().containsKey(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)); + } + + @Test + void testAddDocumentMatchOptionsWithContextRequired() { + DefaultQueryPlanner planner = new DefaultQueryPlanner(); + ShardQueryConfiguration config = new ShardQueryConfiguration(); + config.setDocumentMatchContextRequired(true); + config.setDocumentMatchMaxEncodedSize(111); + config.setDocumentMatchMaxDecodedSize(222); + config.setDocumentMatchMaxEncodedContextSize(333); + IteratorSetting cfg = new IteratorSetting(10, "query", QueryIterator.class); + + planner.configureDocumentMatchOptions(config, cfg); + + assertEquals("true", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_CONTEXT_REQUIRED)); + assertEquals("111", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_SIZE)); + assertEquals("222", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_DECODED_SIZE)); + assertEquals("333", cfg.getOptions().get(QueryOptions.DOCUMENT_MATCH_MAX_ENCODED_CONTEXT_SIZE)); + } + } + /** * Contains tests for * {@link DefaultQueryPlanner#addDateFilters(ASTJexlScript, ScannerFactory, MetadataHelper, DateIndexHelper, ShardQueryConfiguration, Query)} diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java index 729c20a142e..ae169bea877 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java @@ -85,8 +85,10 @@ import datawave.query.QueryParameters; import datawave.query.QueryTestTableHelper; import datawave.query.RebuildingScannerTestHelper; +import datawave.query.config.ShardQueryConfiguration; import datawave.query.config.annotation.AllHitsQueryConfig; import datawave.query.config.annotation.AnnotationConfig; +import datawave.query.function.DocumentMatchContext; import datawave.query.function.deserializer.KryoDocumentDeserializer; import datawave.query.planner.DefaultQueryPlanner; import datawave.query.planner.TimedVisitorManager; @@ -257,6 +259,63 @@ public void tearDown() throws Exception { this.endDate = null; } + /** + * Verifies that the Spring-configured {@link ShardQueryLogic} bean exposes the document-match limits through its accessor surface. + */ + @Test + public void testDocumentMatchLimitsDefaultFromSpringConfig() { + assertEquals(DocumentMatchContext.DEFAULT_MAX_ENCODED_SIZE, logic.getDocumentMatchMaxEncodedSize()); + assertEquals(DocumentMatchContext.DEFAULT_MAX_DECODED_SIZE, logic.getDocumentMatchMaxDecodedSize()); + assertEquals(DocumentMatchContext.DEFAULT_MAX_ENCODED_CONTEXT_SIZE, logic.getDocumentMatchMaxEncodedContextSize()); + } + + /** + * Verifies that the bean-style setters update both the top-level logic and its backing configuration. + */ + @Test + public void testDocumentMatchLimitSettersUpdateLogicAndConfig() { + int encoded = 1024; + int decoded = 2048; + int encodedContext = 4096; + + logic.setDocumentMatchMaxEncodedSize(encoded); + logic.setDocumentMatchMaxDecodedSize(decoded); + logic.setDocumentMatchMaxEncodedContextSize(encodedContext); + + assertEquals(encoded, logic.getDocumentMatchMaxEncodedSize()); + assertEquals(decoded, logic.getDocumentMatchMaxDecodedSize()); + assertEquals(encodedContext, logic.getDocumentMatchMaxEncodedContextSize()); + assertEquals(encoded, logic.getConfig().getDocumentMatchMaxEncodedSize()); + assertEquals(decoded, logic.getConfig().getDocumentMatchMaxDecodedSize()); + assertEquals(encodedContext, logic.getConfig().getDocumentMatchMaxEncodedContextSize()); + } + + /** + * Verifies that document-match limit overrides survive {@link ShardQueryLogic#initialize(AccumuloClient, Query, Set)} and appear on the per-query config. + */ + @Test + public void testDocumentMatchLimitsPropagateThroughInitialize() throws Exception { + int encoded = 4096; + int decoded = 8192; + int encodedContext = 16384; + + logic.setDocumentMatchMaxEncodedSize(encoded); + logic.setDocumentMatchMaxDecodedSize(decoded); + logic.setDocumentMatchMaxEncodedContextSize(encodedContext); + + this.query = "UUID == '" + caponeUID + "'"; + this.startDate = dateFormat.parse("20091231"); + this.endDate = dateFormat.parse("20150101"); + + Query settings = createSettings(); + AccumuloClient client = createClient(); + ShardQueryConfiguration config = (ShardQueryConfiguration) logic.initialize(client, settings, authSet); + + assertEquals(encoded, config.getDocumentMatchMaxEncodedSize()); + assertEquals(decoded, config.getDocumentMatchMaxDecodedSize()); + assertEquals(encodedContext, config.getDocumentMatchMaxEncodedContextSize()); + } + private AccumuloClient createClient() throws Exception { AccumuloClient client = new QueryTestTableHelper(ShardRange.class.toString(), log, RebuildingScannerTestHelper.TEARDOWN.EVERY_OTHER_SANS_CONSISTENCY, RebuildingScannerTestHelper.INTERRUPT.EVERY_OTHER).client; diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml index 5035b33c879..ded86c957fd 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml @@ -28,6 +28,7 @@ + @@ -400,6 +401,8 @@ + + diff --git a/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml b/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml index 358193f223a..9ff297c3234 100644 --- a/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml +++ b/web-services/deploy/configuration/src/main/resources/JexlFunctionNamespaceRegistryContext.xml @@ -15,6 +15,7 @@ + diff --git a/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml b/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml index 960cbf64242..8251c66dcb7 100644 --- a/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml +++ b/web-services/deploy/configuration/src/main/resources/datawave/query/QueryLogicFactory.xml @@ -24,6 +24,7 @@ + @@ -410,6 +411,8 @@ + +