diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordQueryUtil.java b/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordQueryUtil.java index f494a4e1d79..f9bba4720c9 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordQueryUtil.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordQueryUtil.java @@ -4,6 +4,8 @@ import java.util.LinkedList; import java.util.List; import java.util.Queue; +import java.util.Set; +import java.util.stream.Collectors; import datawave.query.attributes.Attribute; import datawave.query.attributes.Attributes; @@ -28,17 +30,34 @@ public static String chooseBestIdentifier(List identifiers) { /** * Choose the best language from a non-null, non-empty list of languages, otherwise return null. + *

+ * If preferredLanguages is provided, find the intersection of available and preferred languages + *

+ * If YAKE cannot provide a language, take the first available language * * @param languages * a list to choose from + * @param preferredLanguages + * an optional set of preferred language in upper case * @return the best identifier or null. */ - public static String chooseBestLanguage(List languages) { + public static String chooseBestLanguage(List languages, Set preferredLanguages) { if (languages == null || languages.isEmpty()) { return null; } - for (String language : languages) { + List normalizedLanguages = languages.stream().map(String::toUpperCase).collect(Collectors.toList()); + List availablePreferredLanguages = normalizedLanguages; + if (preferredLanguages != null && !preferredLanguages.isEmpty()) { + availablePreferredLanguages = normalizedLanguages.stream().filter(preferredLanguages::contains).collect(Collectors.toList()); + + if (availablePreferredLanguages.isEmpty()) { + // no overlap so revert back to available languages + availablePreferredLanguages = normalizedLanguages; + } + } + + for (String language : availablePreferredLanguages) { // if the language can't be found in the language registry, the language // registry will return English. So, if the language name returned by the // registry and the input language name match - it confirms we have @@ -52,7 +71,7 @@ public static String chooseBestLanguage(List languages) { // if we get here, we couldn't find an ideal language, just return the first value, yake will default // to processing the data as if it were English. 
- return languages.get(0); + return availablePreferredLanguages.get(0); } /** diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordUUIDChainStrategy.java b/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordUUIDChainStrategy.java index 77ed3553939..984a2e8d682 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordUUIDChainStrategy.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/keyword/KeywordUUIDChainStrategy.java @@ -2,9 +2,11 @@ import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map.Entry; +import java.util.Objects; import java.util.Set; import org.apache.accumulo.core.client.AccumuloClient; @@ -42,8 +44,10 @@ protected Query buildLatterQuery(Query initialQuery, Iterator> public Iterator> runChainedQuery(AccumuloClient client, Query initialQuery, Set auths, Iterator> initialQueryResults, QueryLogic> latterQueryLogic) { - final List activeExtractors = getActiveExtractors(initialQuery); - final boolean buildKeywordCloud = isKeywordCloudRequested(initialQuery); + String[] categories = getCategories(initialQuery); + final List activeExtractors = getActiveExtractors(categories); + final boolean buildKeywordCloud = isKeywordCloudRequested(categories); + Set requiredLanguages = getRequiredLanguages(categories); Iterator> wrapped = new Iterator<>() { private Iterator> batchIterator; @@ -53,7 +57,7 @@ public boolean hasNext() { while (batchIterator == null || (!batchIterator.hasNext() && initialQueryResults.hasNext())) { try { StatefulKeywordUUIDChainStrategy statefulChainStrategy = new StatefulKeywordUUIDChainStrategy(initialQuery, latterQueryLogic, - activeExtractors, buildKeywordCloud); + activeExtractors, buildKeywordCloud, requiredLanguages); statefulChainStrategy.setBatchSize(batchSize); batchIterator = statefulChainStrategy.runChainedQuery(client, initialQuery, 
auths, initialQueryResults, latterQueryLogic); } catch (Exception e) { @@ -95,13 +99,12 @@ private String[] getCategories(Query settings) { /** * Check if the keyword cloud should be constructed * - * @param settings + * @param categories * @return */ - private boolean isKeywordCloudRequested(Query settings) { - String[] categories = getCategories(settings); + private boolean isKeywordCloudRequested(String[] categories) { for (String category : categories) { - if (category.equals(KEYWORD_CATEGORY)) { + if (category.equals(KEYWORD_CATEGORY) || category.startsWith(KEYWORD_CATEGORY + ".")) { return true; } } @@ -109,19 +112,49 @@ private boolean isKeywordCloudRequested(Query settings) { return false; } + /** + * + * @param categories + * the non-null parsed categories + * @return set of languages to restrict keyword clouds to in upper case + */ + private Set getRequiredLanguages(String[] categories) { + Set requiredLanguages = new HashSet<>(); + for (String category : categories) { + if (category.startsWith(KEYWORD_CATEGORY + ".") && category.length() > KEYWORD_CATEGORY.length() + 1) { + requiredLanguages.add(category.substring(KEYWORD_CATEGORY.length() + 1).toUpperCase()); + } + } + + return requiredLanguages; + } + /** * pull parameters to determine which type of tag cloud we are generating. All extraction is triggered beyond this point for the given ids * - * @param settings + * @param categories * @return */ - private List getActiveExtractors(Query settings) { + private List getActiveExtractors(String[] categories) { List activeExtractors = new ArrayList<>(); - for (String name : getCategories(settings)) { + for (String name : categories) { boolean found = false; + String subType = null; + if (name.indexOf(".") > 0) { + // split into category and subtype + String[] splits = name.split("\\."); + if (splits.length > 2) { + throw new IllegalArgumentException( + name + " is malformed. 
When specifying a subType with a category separate the category and subType with a single ."); + } else if (splits.length == 2) { + name = splits[0]; + subType = splits[1]; + } + } for (TagCloudInputExtractor extractor : extractors) { - if (extractor.getName().equals(name) && !activeExtractors.contains(extractor)) { + if (extractor.getName().equals(name) && (subType == null || Objects.equals(extractor.getSubType(), subType)) + && !activeExtractors.contains(extractor)) { activeExtractors.add(extractor); found = true; } diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/keyword/StatefulKeywordUUIDChainStrategy.java b/warehouse/query-core/src/main/java/datawave/query/tables/keyword/StatefulKeywordUUIDChainStrategy.java index e92f1e194d3..0ba31f9de17 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/keyword/StatefulKeywordUUIDChainStrategy.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/keyword/StatefulKeywordUUIDChainStrategy.java @@ -43,14 +43,17 @@ public class StatefulKeywordUUIDChainStrategy extends FullChainStrategy extractors; // will be true when a keyword query should be run, false otherwise private final boolean runKeywordQuery; + // used to filter keyword results that don't match the given language. 
Stored in uppercase + private final Set requiredLanguages; private boolean addedExtractedData = false; public StatefulKeywordUUIDChainStrategy(Query settings, QueryLogic> nextLogic, List extractors, - boolean runKeywordQuery) { + boolean runKeywordQuery, Set requiredLanguages) { this.deserializer = DocumentSerialization.getDocumentDeserializer(settings); this.nextLogic = nextLogic; this.extractors = extractors; this.runKeywordQuery = runKeywordQuery; + this.requiredLanguages = requiredLanguages; } public int getBatchSize() { @@ -137,12 +140,12 @@ public String captureResultsAndBuildQuery(Iterator> initialQuer } } - if (runKeywordQuery) { + if (runKeywordQuery && hasRequiredLanguage(documentData)) { // run query term extraction for next logic if needed queryTerms.add(extractKeywordQueryTerm(docId, documentData)); - } - count++; + count++; + } } if (nextLogic instanceof KeywordQueryLogic) { @@ -173,6 +176,30 @@ public String captureResultsAndBuildQuery(Iterator> initialQuer return queryTerms.isEmpty() ? null : StringUtils.join(queryTerms, " "); } + /** + * If requiredLanguages have been specified, return true if at least one language matches a required language + * + * @param documentData + * @return true if there are no required languages, or the document matches at least one required language + */ + private boolean hasRequiredLanguage(Map>> documentData) { + if (requiredLanguages.isEmpty()) { + return true; + } + + Attribute langaugeAttribute = documentData.get("LANGUAGE"); + if (langaugeAttribute != null) { + List languages = KeywordQueryUtil.getStringValuesFromAttribute(langaugeAttribute); + for (String language : languages) { + if (requiredLanguages.contains(language.toUpperCase())) { + return true; + } + } + } + + return false; + } + /** * Generates queries for the KeywordQueryLogic. 
Minimally they will include things like: * @@ -214,7 +241,7 @@ private String extractKeywordQueryTerm(String docId, Map + * Tag cloud extractors are responsible for transforming document field data into {@link TagCloudPartition} objects. Implementations extract relevant fields + * from document data, apply scoring logic, and accumulate the results into partitions for tag clouds. + *

+ * The extraction lifecycle follows this pattern:
+ * <ol>
+ * <li>Optional initialization via {@link #initialize(Query)} with query settings</li>
+ * <li>Repeated calls to {@link #extract(Key, Map)} for each document</li>
+ * <li>Retrieval of accumulated results via {@link #get()}</li>
+ * <li>Reset state via {@link #clear()} when moving to the next partition</li>
+ * </ol>
+ * + * @see TagCloudPartition + * @see TagCloudInputTransformer + * @see FieldedTagCloudInputExtractor + * @see ParameterFieldedTagCloudInputExtractor + */ public interface TagCloudInputExtractor { + /** + * Extracts a document identifier from an event Key in the shard table + *

+ * The document ID is constructed from the shard row key and column family, which follows the format: + *

+ * The resulting document id format is: {@code shardId/dataType/uid} + * + * @param source + * the shard event Key + * @return document identifier in the format "shardId/dataType/uid" + * @throws IllegalArgumentException + * if the cf does not contain the null byte separator + */ default String getDocId(Key source) { String row = source.getRow().toString(); String cf = source.getColumnFamily().toString(); @@ -24,15 +59,81 @@ default String getDocId(Key source) { return row + "/" + dataType + "/" + uid; } + /** + * Initialize the extractor with query-specific settings + *

+ * This optional hook allows implementations to configure themselves based on the query parameters before extraction begins. By default, does nothing + * + * @param settings + */ default void initialize(Query settings) {} + /** + * Returns the name or category identifier of this extractor + *

+ * The name is used to identify the partition category and group related tag cloud results + * + * @return the extractor name or category identifier + */ String getName(); + /** + * Returns the name or category subtype identifier + *

+ * This name is used to create subgroups of data within a given name or category when creating tag cloud results + *

+ * This is an optional parameter + * + * @return the optional subType name or category if set, otherwise null + */ + String getSubType(); + + /** + * Extracts tag cloud input data from a single document's fielded data + *

+ * This method processes the document's fields, applies scoring logic, and accumulates the results into an internal partition. The accumulated data can be + * retrieved via {@link #get()} + * + * @param source + * the Key for the document + * @param documentData + * the document's field data + * @throws TagCloudInputExtractorException + * if extraction fails due to malformed data or configuration issues + * @see #get() + * @see #clear() + */ void extract(Key source, Map>> documentData) throws TagCloudInputExtractorException; + /** + * Retrieves the accumulated tag cloud partition containing all extracted data + *

+ * This method returns the partition built up through repeated calls to {@link #extract(Key, Map)}. The partition contains aggregated term frequencies and + * scores across all processed documents + * + * @return the accumulated tag cloud partition, or null if no data has been extracted + * @see #extract(Key, Map) + * @see #clear() + */ TagCloudPartition get(); + /** + * Clears the internal state and resets the accumulated partition + *

+ * This method should be called after retrieving results via {@link #get()} to prepare the extractor for processing the next partition + * + * @see #get() + */ void clear(); + /** + * Returns the transformer used to convert the partition into the final output format + *

+ * The transformer is responsible for converting the accumulated {@link TagCloudPartition} into the appropriate response format + * + * @return the input transformer for this extractor's partition type + * @see TagCloudInputTransformer + * @see TagCloudPartition + */ TagCloudInputTransformer getInputTransformer(); } diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordQueryUtilTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordQueryUtilTest.java index 78642fa7a39..80d80e87bb2 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordQueryUtilTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordQueryUtilTest.java @@ -3,10 +3,12 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.util.ArrayList; import java.util.List; +import java.util.Set; import org.apache.accumulo.core.data.Key; import org.junit.Before; @@ -67,6 +69,36 @@ public void testGetMultipleValuesFromHetAttributes() { KeywordQueryUtil.getStringValuesFromAttribute(attributesHetTwo)); } + @Test + public void testPreferredLanguage() { + String best = KeywordQueryUtil.chooseBestLanguage(List.of("A", "B", "C"), Set.of()); + assertEquals("A", best); + + best = KeywordQueryUtil.chooseBestLanguage(null, null); + assertNull(best); + + best = KeywordQueryUtil.chooseBestLanguage(List.of(), null); + assertNull(best); + + best = KeywordQueryUtil.chooseBestLanguage(List.of("A", "B", "C"), null); + assertEquals("A", best); + + best = KeywordQueryUtil.chooseBestLanguage(List.of("A", "B", "C"), Set.of("C")); + assertEquals("C", best); + + best = KeywordQueryUtil.chooseBestLanguage(List.of("A", "B", "C"), Set.of("C", "D", "E")); + assertEquals("C", best); + + best = 
KeywordQueryUtil.chooseBestLanguage(List.of("ENGLISH", "B", "C"), Set.of("C", "D", "E")); + assertEquals("C", best); + + best = KeywordQueryUtil.chooseBestLanguage(List.of("ENGLISH", "B", "C"), Set.of()); + assertEquals("ENGLISH", best); + + best = KeywordQueryUtil.chooseBestLanguage(List.of("A", "ENGLISH", "C"), Set.of()); + assertEquals("ENGLISH", best); + } + public static void assertSingleValue(String expectedValue, List results) { assertNotNull("results should not have been null", results); assertFalse("results should not have been empty", results.isEmpty()); diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDChainStrategyTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDChainStrategyTest.java index 995ebb8560d..24845c187c7 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDChainStrategyTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDChainStrategyTest.java @@ -28,6 +28,9 @@ import org.junit.Before; import org.junit.Test; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.Multimap; + import datawave.data.type.NoOpType; import datawave.microservice.query.Query; import datawave.microservice.query.QueryImpl; @@ -64,7 +67,7 @@ public void setup() throws Exception { settings = new QueryImpl(); } - public Entry createDocument(String shard, String dt, String uid, String language, String identifier) { + public Entry createDocument(String shard, String dt, String uid, String language, String identifier, Multimap extra) { String colf = dt + "\0" + uid; Key documentKey = new Key(shard, colf, "", "ALL"); @@ -77,12 +80,20 @@ public Entry createDocument(String shard, String dt, String uid, Stri d.put("FOO2", new TypeAttribute<>(new NoOpType("xx"), documentKey, true)); d.put("BAR2", new TypeAttribute<>(new NoOpType("yy"), documentKey, true)); + for (Entry entry : extra.entries()) { + 
d.put(entry.getKey(), new TypeAttribute<>(new NoOpType(entry.getValue()), documentKey, true)); + } + Entry entry = Map.entry(documentKey, d); DocumentSerializer serializer = DocumentSerialization.getDocumentSerializer(DocumentSerialization.DEFAULT_RETURN_TYPE); return serializer.apply(entry); } + public Entry createDocument(String shard, String dt, String uid, String language, String identifier) { + return createDocument(shard, dt, uid, language, identifier, HashMultimap.create(0, 0)); + } + public Entry createKeywordResults(String shard, String dt, String uid, String language, String identifier, String view, String visibility, LinkedHashMap results) throws IOException { String colf = "d"; @@ -279,6 +290,231 @@ public void dualInputDualBatchTest() throws Exception { } } + @Test(expected = IllegalArgumentException.class) + public void languageFilterMalformedTest() { + settings.addParameter(CATEGORY_PARAMETER, "keyword."); + + List> input = List.of(createDocument("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345")); + + KeywordUUIDChainStrategy strategy = new KeywordUUIDChainStrategy(); + + replayAll(); + + strategy.runChainedQuery(mockAccumulo, settings, null, input.iterator(), mockLogic); + + verifyAll(); + } + + @Test + public void languageFilterMinimumTest() throws Exception { + settings.addParameter(CATEGORY_PARAMETER, "keyword.a"); + + List> input = List.of(createDocument("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "A", "PAGE_ID:12345")); + + LinkedHashMap results = new LinkedHashMap<>(); + results.put("cat", 0.2); + results.put("cat food", 0.3); + results.put("dog", 0.4); + + List> intermediateInput = List + .of(createKeywordResults("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "A", "PAGE_ID:12345", "CONTENT", "PUBLIC", results)); + + KeywordUUIDChainStrategy strategy = new KeywordUUIDChainStrategy(); + Capture intermediateSettings = Capture.newInstance(); + + expect(mockLogic.initialize(eq(mockAccumulo), capture(intermediateSettings), 
eq(null))).andReturn(mockConfig).once(); + mockLogic.setupQuery(eq(mockConfig)); + expect(mockLogic.iterator()).andReturn(intermediateInput.iterator()).once(); + + replayAll(); + + Iterator> result = strategy.runChainedQuery(mockAccumulo, settings, null, input.iterator(), mockLogic); + + verifyAll(); + + assertEquals("DOCUMENT:20250412/test/-cvy0gj.tlf59s.-duxzua!PAGE_ID:12345%LANGUAGE:A", intermediateSettings.getValue().getQuery()); + + assertTrue(result.hasNext()); + Entry next = result.next(); + + assertEquals("20250412 d:test%00;-cvy0gj.tlf59s.-duxzua%00;CONTENT [] 9223372036854775807 false", next.getKey().toString()); + + KeywordResults keywordResults = KeywordResults.deserialize(next.getValue().get()); + assertEquals("PAGE_ID:12345", keywordResults.getSource()); + assertEquals("CONTENT", keywordResults.getView()); + assertEquals("A", keywordResults.getLanguage()); + assertEquals("PUBLIC", keywordResults.getVisibility()); + + assertNotNull(keywordResults.getKeywords().get("cat")); + + assertFalse(result.hasNext()); + } + + @Test + public void languageFilterInclusiveTest() throws Exception { + settings.addParameter(CATEGORY_PARAMETER, "keyword.EnGlISh"); + + List> input = List.of(createDocument("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345")); + + LinkedHashMap results = new LinkedHashMap<>(); + results.put("cat", 0.2); + results.put("cat food", 0.3); + results.put("dog", 0.4); + + List> intermediateInput = List + .of(createKeywordResults("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345", "CONTENT", "PUBLIC", results)); + + KeywordUUIDChainStrategy strategy = new KeywordUUIDChainStrategy(); + Capture intermediateSettings = Capture.newInstance(); + + expect(mockLogic.initialize(eq(mockAccumulo), capture(intermediateSettings), eq(null))).andReturn(mockConfig).once(); + mockLogic.setupQuery(eq(mockConfig)); + expect(mockLogic.iterator()).andReturn(intermediateInput.iterator()).once(); + + replayAll(); + + Iterator> 
result = strategy.runChainedQuery(mockAccumulo, settings, null, input.iterator(), mockLogic); + + verifyAll(); + + assertEquals("DOCUMENT:20250412/test/-cvy0gj.tlf59s.-duxzua!PAGE_ID:12345%LANGUAGE:ENGLISH", intermediateSettings.getValue().getQuery()); + + assertTrue(result.hasNext()); + Entry next = result.next(); + + assertEquals("20250412 d:test%00;-cvy0gj.tlf59s.-duxzua%00;CONTENT [] 9223372036854775807 false", next.getKey().toString()); + + KeywordResults keywordResults = KeywordResults.deserialize(next.getValue().get()); + assertEquals("PAGE_ID:12345", keywordResults.getSource()); + assertEquals("CONTENT", keywordResults.getView()); + assertEquals("ENGLISH", keywordResults.getLanguage()); + assertEquals("PUBLIC", keywordResults.getVisibility()); + + assertNotNull(keywordResults.getKeywords().get("cat")); + + assertFalse(result.hasNext()); + } + + @Test + public void languageFilterExclusiveTest() throws Exception { + settings.addParameter(CATEGORY_PARAMETER, "keyword.nope"); + + List> input = List.of(createDocument("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345")); + + LinkedHashMap results = new LinkedHashMap<>(); + results.put("cat", 0.2); + results.put("cat food", 0.3); + results.put("dog", 0.4); + + List> intermediateInput = List + .of(createKeywordResults("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345", "CONTENT", "PUBLIC", results)); + + KeywordUUIDChainStrategy strategy = new KeywordUUIDChainStrategy(); + + replayAll(); + + Iterator> result = strategy.runChainedQuery(mockAccumulo, settings, null, input.iterator(), mockLogic); + + verifyAll(); + + assertFalse(result.hasNext()); + } + + @Test + public void languagePartialFilterTest() throws Exception { + settings.addParameter(CATEGORY_PARAMETER, "keyword.EnGlISh"); + + List> input = List.of(createDocument("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345"), + createDocument("20250412", "test", "123.345.456", "alien", 
"PAGE_ID:7654321")); + + LinkedHashMap results = new LinkedHashMap<>(); + results.put("cat", 0.2); + results.put("cat food", 0.3); + results.put("dog", 0.4); + + List> intermediateInput = List + .of(createKeywordResults("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345", "CONTENT", "PUBLIC", results)); + + KeywordUUIDChainStrategy strategy = new KeywordUUIDChainStrategy(); + Capture intermediateSettings = Capture.newInstance(); + + expect(mockLogic.initialize(eq(mockAccumulo), capture(intermediateSettings), eq(null))).andReturn(mockConfig).once(); + mockLogic.setupQuery(eq(mockConfig)); + expect(mockLogic.iterator()).andReturn(intermediateInput.iterator()).once(); + + replayAll(); + + Iterator> result = strategy.runChainedQuery(mockAccumulo, settings, null, input.iterator(), mockLogic); + + verifyAll(); + + assertEquals("DOCUMENT:20250412/test/-cvy0gj.tlf59s.-duxzua!PAGE_ID:12345%LANGUAGE:ENGLISH", intermediateSettings.getValue().getQuery()); + + assertTrue(result.hasNext()); + Entry next = result.next(); + + assertEquals("20250412 d:test%00;-cvy0gj.tlf59s.-duxzua%00;CONTENT [] 9223372036854775807 false", next.getKey().toString()); + + KeywordResults keywordResults = KeywordResults.deserialize(next.getValue().get()); + assertEquals("PAGE_ID:12345", keywordResults.getSource()); + assertEquals("CONTENT", keywordResults.getView()); + assertEquals("ENGLISH", keywordResults.getLanguage()); + assertEquals("PUBLIC", keywordResults.getVisibility()); + + assertNotNull(keywordResults.getKeywords().get("cat")); + + assertFalse(result.hasNext()); + } + + @Test + public void languageFilterMultiValuedTest() throws Exception { + settings.addParameter(CATEGORY_PARAMETER, "keyword.ABC"); + + Multimap extraLanguages = HashMultimap.create(); + extraLanguages.put("LANGUAGE", "abc"); + extraLanguages.put("LANGUAGE", "def"); + + List> input = List.of(createDocument("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ENGLISH", "PAGE_ID:12345", extraLanguages)); + + 
LinkedHashMap results = new LinkedHashMap<>(); + results.put("cat", 0.2); + results.put("cat food", 0.3); + results.put("dog", 0.4); + + List> intermediateInput = List + .of(createKeywordResults("20250412", "test", "-cvy0gj.tlf59s.-duxzua", "ABC", "PAGE_ID:12345", "CONTENT", "PUBLIC", results)); + + KeywordUUIDChainStrategy strategy = new KeywordUUIDChainStrategy(); + Capture intermediateSettings = Capture.newInstance(); + + expect(mockLogic.initialize(eq(mockAccumulo), capture(intermediateSettings), eq(null))).andReturn(mockConfig).once(); + mockLogic.setupQuery(eq(mockConfig)); + expect(mockLogic.iterator()).andReturn(intermediateInput.iterator()).once(); + + replayAll(); + + Iterator> result = strategy.runChainedQuery(mockAccumulo, settings, null, input.iterator(), mockLogic); + + verifyAll(); + + assertEquals("DOCUMENT:20250412/test/-cvy0gj.tlf59s.-duxzua!PAGE_ID:12345%LANGUAGE:ABC", intermediateSettings.getValue().getQuery()); + + assertTrue(result.hasNext()); + Entry next = result.next(); + + assertEquals("20250412 d:test%00;-cvy0gj.tlf59s.-duxzua%00;CONTENT [] 9223372036854775807 false", next.getKey().toString()); + + KeywordResults keywordResults = KeywordResults.deserialize(next.getValue().get()); + assertEquals("PAGE_ID:12345", keywordResults.getSource()); + assertEquals("CONTENT", keywordResults.getView()); + assertEquals("ABC", keywordResults.getLanguage()); + assertEquals("PUBLIC", keywordResults.getVisibility()); + + assertNotNull(keywordResults.getKeywords().get("cat")); + + assertFalse(result.hasNext()); + } + @Test public void singleFieldedTest() throws Exception { settings.addParameter(CATEGORY_PARAMETER, "external,keyword"); diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDQueryFunctionalTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDQueryFunctionalTest.java index cf4a169303d..1a1d85e3cc7 100644 --- 
a/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDQueryFunctionalTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/keyword/KeywordUUIDQueryFunctionalTest.java @@ -174,6 +174,55 @@ public void extractorTest() throws Exception { test(); } + @Test + public void extractorSubTypeTest() throws Exception { + // two clouds for gendered-age, subtype gender and subtype age. One cloud for name + withExtraParameter(CATEGORY_PARAMETER, "gendered-age.age"); + withExtraParameter(TAG_CLOUD_VERSION, "2"); + + // three tag clouds expected, one for each category/subtype + // @formatter:off + withExpectedResult(tagCloudTestUtil.getExpectedCloud("2", Map.of("type", "gendered-age", "subType", "age"), + List.of(tagCloudTestUtil.createTagCloudEntry("16", 1.0, 1, List.of(SOPRANO_UUID)), + tagCloudTestUtil.createTagCloudEntry("18", 1.0, 1, List.of(SOPRANO_UUID)), + tagCloudTestUtil.createTagCloudEntry("20", 1.0, 1, List.of(CAPONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("30", 1.0, 1, List.of(CAPONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("34", 1.0, 1, List.of(CAPONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("40", 1.0, 1, List.of(CAPONE_UUID))) + )); + // @formatter:on + + withQuery("UUID:CAPONE OR UUID:CORLEONE OR UUID: SOPRANO"); + + test(); + } + + @Test + public void keywordSubTypeTest() throws Exception { + withExtraParameter(CATEGORY_PARAMETER, "keyword.sicilian"); + withExtraParameter(TAG_CLOUD_VERSION, "2"); + + // set the threshold so something can come back (higher is worse) + KeywordQueryLogic kql = (KeywordQueryLogic) logic.getLogic2(); + kql.setMaxScore(1); + + // three tag clouds expected, one for each category/subtype + // @formatter:off + withExpectedResult(tagCloudTestUtil.getExpectedCloud("2", Map.of("type", "keyword", "language", "SICILIAN", "view", "CONTENT"), + List.of(tagCloudTestUtil.createTagCloudEntry("cant", 0.7494, 1, List.of(CORLEONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("gonna", 0.8807, 
1, List.of(CORLEONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("im", 0.6041, 1, List.of(CORLEONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("make", 0.7494, 1, List.of(CORLEONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("offer", 0.7494, 1, List.of(CORLEONE_UUID)), + tagCloudTestUtil.createTagCloudEntry("refuse", 0.7494, 1, List.of(CORLEONE_UUID))) + )); + // @formatter:on + + withQuery("UUID:CORLEONE"); + + test(); + } + private void test() throws Exception { QueryImpl settings = new QueryImpl(); settings.setPagesize(Integer.MAX_VALUE); diff --git a/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java b/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java index 1ba50f9cffc..b1f5f139201 100644 --- a/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java +++ b/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java @@ -149,6 +149,8 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(datatype + "\u0000" + corleoneUID, "NUMBER" + "\u0000" + "25", columnVisibility, timeStamp + corleoneTimeStampDelta, emptyValue); mutation.put(datatype + "\u0000" + corleoneUID, "GEO" + "\u0000" + "POINT(10 10)", columnVisibility, timeStamp + corleoneTimeStampDelta, emptyValue); + mutation.put(datatype + "\u0000" + corleoneUID, "LANGUAGE" + "\u0000" + "sicilian", columnVisibility, timeStamp + corleoneTimeStampDelta, + emptyValue); mutation.put(datatype + "\u0000" + corleoneChildUID, "UUID.0" + "\u0000" + "ANDOLINI", columnVisibility, timeStamp + corleoneTimeStampDelta, emptyValue);