Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;

import datawave.query.attributes.Attribute;
import datawave.query.attributes.Attributes;
Expand Down Expand Up @@ -31,14 +33,18 @@ public static String chooseBestIdentifier(List<String> identifiers) {
*
* @param languages
* a list to choose from
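* @param preferredLanguages
* an optional set of upper-case language names to restrict the choice to; null or empty means no restriction
* @return the best language, or null when languages is null or empty.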
*/
public static String chooseBestLanguage(List<String> languages) {
public static String chooseBestLanguage(List<String> languages, Set<String> preferredLanguages) {
if (languages == null || languages.isEmpty()) {
return null;
}

for (String language : languages) {
if (preferredLanguages != null && !preferredLanguages.isEmpty() && !preferredLanguages.contains(language.toUpperCase())) {
continue;
}
// if the language can't be found in the language registry, the language
// registry will return English. So, if the language name returned by the
// registry and the input language name match - it confirms we have
Expand All @@ -52,7 +58,12 @@ public static String chooseBestLanguage(List<String> languages) {

// if we get here, we couldn't find an ideal language, just return the first value, yake will default
// to processing the data as if it were English.
return languages.get(0);
if (preferredLanguages == null || preferredLanguages.isEmpty()) {
return languages.get(0);
} else {
Optional<String> first = languages.stream().filter(preferredLanguages::contains).findFirst();
Comment thread
FineAndDandy marked this conversation as resolved.
Outdated
return first.orElseGet(() -> preferredLanguages.iterator().next());
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;

import org.apache.accumulo.core.client.AccumuloClient;
Expand Down Expand Up @@ -42,8 +44,10 @@ protected Query buildLatterQuery(Query initialQuery, Iterator<Entry<Key,Value>>
public Iterator<Entry<Key,Value>> runChainedQuery(AccumuloClient client, Query initialQuery, Set<Authorizations> auths,
Iterator<Entry<Key,Value>> initialQueryResults, QueryLogic<Entry<Key,Value>> latterQueryLogic) {

final List<TagCloudInputExtractor> activeExtractors = getActiveExtractors(initialQuery);
final boolean buildKeywordCloud = isKeywordCloudRequested(initialQuery);
String[] categories = getCategories(initialQuery);
final List<TagCloudInputExtractor> activeExtractors = getActiveExtractors(categories);
final boolean buildKeywordCloud = isKeywordCloudRequested(categories);
Set<String> requiredLanguages = getRequiredLanguages(categories);

Iterator<Entry<Key,Value>> wrapped = new Iterator<>() {
private Iterator<Entry<Key,Value>> batchIterator;
Expand All @@ -53,7 +57,7 @@ public boolean hasNext() {
while (batchIterator == null || (!batchIterator.hasNext() && initialQueryResults.hasNext())) {
try {
StatefulKeywordUUIDChainStrategy statefulChainStrategy = new StatefulKeywordUUIDChainStrategy(initialQuery, latterQueryLogic,
activeExtractors, buildKeywordCloud);
activeExtractors, buildKeywordCloud, requiredLanguages);
statefulChainStrategy.setBatchSize(batchSize);
batchIterator = statefulChainStrategy.runChainedQuery(client, initialQuery, auths, initialQueryResults, latterQueryLogic);
} catch (Exception e) {
Expand Down Expand Up @@ -95,33 +99,62 @@ private String[] getCategories(Query settings) {
/**
* Check if the keyword cloud should be constructed
*
* @param settings
* @param categories
* @return
*/
private boolean isKeywordCloudRequested(Query settings) {
String[] categories = getCategories(settings);
private boolean isKeywordCloudRequested(String[] categories) {
for (String category : categories) {
if (category.equals(KEYWORD_CATEGORY)) {
if (category.equals(KEYWORD_CATEGORY) || category.startsWith(KEYWORD_CATEGORY + ".")) {
return true;
}
}

return false;
}

/**
 * Collect the languages that were requested as sub-types of the keyword category, i.e. every category of the form {@code KEYWORD_CATEGORY + "." + language}.
 *
 * @param categories
 *            the non-null parsed categories
 * @return set of languages to restrict keyword clouds to in upper case, empty when no category names a language
 */
private Set<String> getRequiredLanguages(String[] categories) {
    // a language restriction is spelled "<keyword category>.<language>"
    final String languagePrefix = KEYWORD_CATEGORY + ".";
    final Set<String> languages = new HashSet<>();

    for (String category : categories) {
        if (!category.startsWith(languagePrefix)) {
            continue;
        }

        // a bare "<keyword category>." names no language and contributes nothing
        final String language = category.substring(languagePrefix.length());
        if (!language.isEmpty()) {
            languages.add(language.toUpperCase());
        }
    }

    return languages;
}

/**
* pull parameters to determine which type of tag cloud we are generating. All extraction is triggered beyond this point for the given ids
*
* @param settings
* @param categories
* @return
*/
private List<TagCloudInputExtractor> getActiveExtractors(Query settings) {
private List<TagCloudInputExtractor> getActiveExtractors(String[] categories) {
List<TagCloudInputExtractor> activeExtractors = new ArrayList<>();

for (String name : getCategories(settings)) {
for (String name : categories) {
boolean found = false;
String subType = null;
if (name.indexOf(".") > 0) {
// split into category and subtype
String[] splits = name.split("\\.");
if (splits.length > 2) {
throw new IllegalArgumentException(
name + " is malformed. When specifying a subType with a category separate the category and subType with a single .");
} else if (splits.length == 2) {
name = splits[0];
subType = splits[1];
}
}
for (TagCloudInputExtractor extractor : extractors) {
if (extractor.getName().equals(name) && !activeExtractors.contains(extractor)) {
if (extractor.getName().equals(name) && (subType == null || Objects.equals(extractor.getSubType(), subType))
&& !activeExtractors.contains(extractor)) {
activeExtractors.add(extractor);
found = true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,17 @@ public class StatefulKeywordUUIDChainStrategy extends FullChainStrategy<Entry<Ke
private final List<TagCloudInputExtractor> extractors;
// will be true when a keyword query should be run, false otherwise
private final boolean runKeywordQuery;
// used to filter keyword results that don't match the given language. Stored in uppercase
private final Set<String> requiredLanguages;
private boolean addedExtractedData = false;

public StatefulKeywordUUIDChainStrategy(Query settings, QueryLogic<Entry<Key,Value>> nextLogic, List<TagCloudInputExtractor> extractors,
boolean runKeywordQuery) {
boolean runKeywordQuery, Set<String> requiredLanguages) {
this.deserializer = DocumentSerialization.getDocumentDeserializer(settings);
this.nextLogic = nextLogic;
this.extractors = extractors;
this.runKeywordQuery = runKeywordQuery;
this.requiredLanguages = requiredLanguages;
}

public int getBatchSize() {
Expand Down Expand Up @@ -137,12 +140,12 @@ public String captureResultsAndBuildQuery(Iterator<Entry<Key,Value>> initialQuer
}
}

if (runKeywordQuery) {
if (runKeywordQuery && hasRequiredLanguage(documentData)) {
// run query term extraction for next logic if needed
queryTerms.add(extractKeywordQueryTerm(docId, documentData));
}

count++;
count++;
}
}

if (nextLogic instanceof KeywordQueryLogic) {
Expand Down Expand Up @@ -173,6 +176,30 @@ public String captureResultsAndBuildQuery(Iterator<Entry<Key,Value>> initialQuer
return queryTerms.isEmpty() ? null : StringUtils.join(queryTerms, " ");
}

/**
 * Decide whether a document passes the required-language restriction. The document's languages are read from its {@code LANGUAGE} attribute and compared in
 * upper case against {@code requiredLanguages}.
 *
 * @param documentData
 *            the document's attributes keyed by field name
 * @return true if there are no required languages, or the document matches at least one required language; false if the document has no LANGUAGE attribute or
 *         none of its languages is required
 */
private boolean hasRequiredLanguage(Map<String,Attribute<? extends Comparable<?>>> documentData) {
    // nothing was required, so every document qualifies
    if (requiredLanguages.isEmpty()) {
        return true;
    }

    final Attribute<?> languageAttribute = documentData.get("LANGUAGE");
    if (languageAttribute == null) {
        // a restriction is in place but the document declares no language
        return false;
    }

    // stops at the first language that is in the required set
    return KeywordQueryUtil.getStringValuesFromAttribute(languageAttribute).stream()
                    .anyMatch(candidate -> requiredLanguages.contains(candidate.toUpperCase()));
}

/**
* Generates queries for the KeywordQueryLogic. Minimally they will include things like:
*
Expand Down Expand Up @@ -214,7 +241,7 @@ private String extractKeywordQueryTerm(String docId, Map<String,Attribute<? exte
log.trace("No identifier found for query " + queryTerm);
}

if (((language = KeywordQueryUtil.chooseBestLanguage(languages)) != null)) {
if (((language = KeywordQueryUtil.chooseBestLanguage(languages, requiredLanguages)) != null)) {
if (log.isTraceEnabled()) {
log.trace("Chose best language '" + languages + "' from '" + languages + "' for query " + queryTerm);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ public double getMinScore() {
return minScore;
}

/**
 * @return the value of this instance's {@code subType} field
 */
@Override
public String getSubType() {
    return this.subType;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ default void initialize(Query settings) {}

String getName();

/**
 * @return the sub type of this extractor. NOTE(review): presumably null when an extractor has no sub type - confirm against the implementations
 */
String getSubType();
Comment thread
FineAndDandy marked this conversation as resolved.

void extract(Key source, Map<String,Attribute<? extends Comparable<?>>> documentData) throws TagCloudInputExtractorException;

TagCloudPartition get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.accumulo.core.data.Key;
import org.junit.Before;
Expand Down Expand Up @@ -67,6 +69,36 @@ public void testGetMultipleValuesFromHetAttributes() {
KeywordQueryUtil.getStringValuesFromAttribute(attributesHetTwo));
}

/**
 * Exercises {@link KeywordQueryUtil#chooseBestLanguage(List, Set)} with missing input, with no preferred languages, and with preferred languages that do and do
 * not include the leading candidates.
 */
@Test
public void testPreferredLanguage() {
    final List<String> abc = List.of("A", "B", "C");
    final List<String> englishFirst = List.of("ENGLISH", "B", "C");
    final List<String> englishSecond = List.of("A", "ENGLISH", "C");

    // an empty preferred set places no restriction; presumably none of A/B/C is a known language, so the first value is the fallback
    assertEquals("A", KeywordQueryUtil.chooseBestLanguage(abc, Set.of()));

    // no candidate languages at all yields null, whatever the preference
    assertNull(KeywordQueryUtil.chooseBestLanguage(null, null));
    assertNull(KeywordQueryUtil.chooseBestLanguage(List.of(), null));

    // a null preferred set is treated like an empty one
    assertEquals("A", KeywordQueryUtil.chooseBestLanguage(abc, null));

    // only candidates contained in the preferred set are eligible
    assertEquals("C", KeywordQueryUtil.chooseBestLanguage(abc, Set.of("C")));
    assertEquals("C", KeywordQueryUtil.chooseBestLanguage(abc, Set.of("C", "D", "E")));

    // ENGLISH is skipped when it is not among the preferred languages
    assertEquals("C", KeywordQueryUtil.chooseBestLanguage(englishFirst, Set.of("C", "D", "E")));

    // without a preference ENGLISH is chosen regardless of its position in the list
    assertEquals("ENGLISH", KeywordQueryUtil.chooseBestLanguage(englishFirst, Set.of()));
    assertEquals("ENGLISH", KeywordQueryUtil.chooseBestLanguage(englishSecond, Set.of()));
}

public static void assertSingleValue(String expectedValue, List<String> results) {
assertNotNull("results should not have been null", results);
assertFalse("results should not have been empty", results.isEmpty());
Expand Down
Loading
Loading