Commit 08c4e4ed authored by Jean-Baptiste Nizet
Browse files

feat: add suggestion service

parent ed8344c1
package fr.inra.urgi.rare.dao;
import java.util.Collection;
import java.util.List;
import fr.inra.urgi.rare.domain.GeneticResource;
import fr.inra.urgi.rare.domain.IndexedGeneticResource;
import org.springframework.data.domain.Pageable;
import org.springframework.data.elasticsearch.core.aggregation.AggregatedPage;
......@@ -20,4 +24,21 @@ public interface GeneticResourceDaoCustom {
boolean aggregate,
SearchRefinements refinements,
Pageable page);
/**
 * Suggests completions for the given term. The completions come from the textual fields of the genetic
 * resources, except the identifier and the URLs; numeric fields are not used either. The description is not
 * suggested as a whole: only the individual words extracted from it are.
 * @param term the text for which completions are requested
 * @return The N first distinct suggested completions (N being chosen by the implementation)
 */
List<String> suggest(String term);
/**
 * Saves all the genetic resources given as argument. Since {@link IndexedGeneticResource} is in fact the
 * same document as {@link GeneticResource}, but with an additional computed field used only to enable the
 * implementation of suggestions, and used only when saving the entities, this method has been added to the
 * {@link GeneticResourceDao} as a custom method instead of creating a whole DAO only for this "fake" document:
 * we don't want to encourage doing anything other than saving {@link IndexedGeneticResource} instances, which
 * a specific DAO would do.
 * @param indexedGeneticResources the genetic resources, wrapped with their suggestions, to save
 */
void saveAll(Collection<IndexedGeneticResource> indexedGeneticResources);
}
......@@ -4,7 +4,9 @@ import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.elasticsearch.index.query.QueryBuilders.termsQuery;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
......@@ -12,9 +14,16 @@ import java.util.stream.Stream;
import fr.inra.urgi.rare.domain.GeneticResource;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import fr.inra.urgi.rare.domain.IndexedGeneticResource;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.search.suggest.SuggestBuilder;
import org.elasticsearch.search.suggest.SuggestBuilders;
import org.springframework.data.domain.Pageable;
import org.springframework.data.elasticsearch.core.ElasticsearchTemplate;
import org.springframework.data.elasticsearch.core.aggregation.AggregatedPage;
import org.springframework.data.elasticsearch.core.query.IndexQuery;
import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilder;
/**
......@@ -23,6 +32,8 @@ import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilde
*/
public class GeneticResourceDaoImpl implements GeneticResourceDaoCustom {
private static final String COMPLETION = "completion";
/**
* Contains the fields searchable on a {@link GeneticResource}.
* This is basically all fields at the exception of a few ones like `identifier`,
......@@ -75,4 +86,45 @@ public class GeneticResourceDaoImpl implements GeneticResourceDaoCustom {
return elasticsearchTemplate.queryForPage(builder.build(), GeneticResource.class);
}
/**
 * Runs a completion suggestion, registered under the {@code COMPLETION} name, on the {@code suggestions}
 * field of the index holding the {@link GeneticResource} documents. At most 8 options are requested,
 * duplicates being skipped.
 * @param term the text to complete
 * @return the text of every option returned by the suggester
 */
@Override
public List<String> suggest(String term) {
    String indexName = elasticsearchTemplate.getPersistentEntityFor(GeneticResource.class).getIndexName();

    SuggestBuilder suggestBuilder = new SuggestBuilder();
    suggestBuilder.addSuggestion(COMPLETION,
                                 SuggestBuilders.completionSuggestion("suggestions")
                                                .text(term)
                                                .size(8)
                                                .skipDuplicates(true));

    // the source documents are useless here: only the suggested texts are read from the response
    SearchResponse searchResponse = elasticsearchTemplate.getClient()
                                                         .prepareSearch(indexName)
                                                         .suggest(suggestBuilder)
                                                         .setFetchSource(false)
                                                         .get();

    return searchResponse.getSuggest()
                         .getSuggestion(COMPLETION)
                         .getEntries()
                         .stream()
                         .flatMap(suggestionEntry -> suggestionEntry.getOptions().stream())
                         .map(suggestedOption -> suggestedOption.getText().string())
                         .collect(Collectors.toList());
}
/**
 * Indexes all the given resources with a single bulk request, using the identifier of the wrapped
 * {@link GeneticResource} as document ID, then refreshes the index holding the {@link GeneticResource}
 * documents.
 * <p>
 * Nothing is sent to Elasticsearch when the collection is empty.
 * @param indexedGeneticResources the resources to index
 */
@Override
public void saveAll(Collection<IndexedGeneticResource> indexedGeneticResources) {
    if (indexedGeneticResources.isEmpty()) {
        // a bulk request without any action is rejected by Elasticsearch ("no requests added"),
        // and there is nothing to index or refresh anyway
        return;
    }

    List<IndexQuery> queries =
        indexedGeneticResources.stream().map(this::createIndexQuery).collect(Collectors.toList());
    elasticsearchTemplate.bulkIndex(queries);
    elasticsearchTemplate.refresh(elasticsearchTemplate.getPersistentEntityFor(GeneticResource.class).getIndexName());
}
/**
 * Builds the index query used to save the given entity: the whole entity is the indexed object, and the
 * identifier of the wrapped genetic resource is the ID of the query.
 */
private IndexQuery createIndexQuery(IndexedGeneticResource entity) {
    IndexQuery indexQuery = new IndexQuery();
    indexQuery.setId(entity.getGeneticResource().getId());
    indexQuery.setObject(entity);
    return indexQuery;
}
}
package fr.inra.urgi.rare.domain;
import static fr.inra.urgi.rare.util.Utils.nullSafeUnmodifiableCopy;
import java.util.List;
import java.util.Objects;
......@@ -18,7 +20,7 @@ import org.springframework.data.elasticsearch.annotations.Mapping;
type = "#{@rareProperties.getElasticsearchPrefix()}resource"
)
@Mapping(mappingPath = "fr/inra/urgi/rare/domain/GeneticResource.mapping.json")
public final class GeneticResource {
public class GeneticResource {
@Id
@JsonProperty("identifier")
private final String id;
......@@ -72,12 +74,12 @@ public final class GeneticResource {
this.portalURL = portalURL;
this.dataURL = dataURL;
this.domain = domain;
this.taxon = taxon;
this.family = family;
this.genus = genus;
this.species = species;
this.materialType = materialType;
this.biotopeType = biotopeType;
this.taxon = nullSafeUnmodifiableCopy(taxon);
this.family = nullSafeUnmodifiableCopy(family);
this.genus = nullSafeUnmodifiableCopy(genus);
this.species = nullSafeUnmodifiableCopy(species);
this.materialType = nullSafeUnmodifiableCopy(materialType);
this.biotopeType = nullSafeUnmodifiableCopy(biotopeType);
this.countryOfOrigin = countryOfOrigin;
this.originLatitude = originLatitude;
this.originLongitude = originLongitude;
......
package fr.inra.urgi.rare.domain;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import com.fasterxml.jackson.annotation.JsonUnwrapped;
import org.springframework.data.elasticsearch.annotations.Document;
/**
 * A class containing all the fields of a GeneticResource, and an additional computed field used only for
 * indexing, which makes it possible or easier to implement completion suggestions.
 * <p>
 * The suggestions are computed once, at construction time, from the textual fields of the wrapped resource
 * and from the words of its description.
 * @author JB Nizet
 */
@Document(
    indexName = "#{@rareProperties.getElasticsearchPrefix()}resource-index",
    type = "#{@rareProperties.getElasticsearchPrefix()}resource",
    createIndex = false
)
public final class IndexedGeneticResource {

    /**
     * Separator used to split a description into words: any punctuation or whitespace character.
     */
    private static final Pattern WORD_SPLIT_PATTERN = Pattern.compile("\\p{Punct}|\\p{Space}");

    /**
     * Words of the description that are shorter than this length are not suggested.
     */
    private static final int MIN_DESCRIPTION_TOKEN_LENGTH = 3;

    @JsonUnwrapped
    private final GeneticResource geneticResource;

    /**
     * The list of completion suggestions that are valid for this genetic resource.
     */
    private final List<String> suggestions;

    /**
     * Wraps the given genetic resource and computes its suggestions: the non-blank values of its name,
     * pillar name, database source, domain, taxon, family, genus, species, material type, biotope type,
     * country of origin and country of collect, followed by the words of at least 3 characters of its
     * description. Each suggestion is kept only once, in order of first appearance.
     * @param geneticResource the genetic resource to wrap
     * @throws NullPointerException if the genetic resource is null
     */
    public IndexedGeneticResource(GeneticResource geneticResource) {
        this.geneticResource = Objects.requireNonNull(geneticResource, "geneticResource must not be null");

        List<String> candidates = new ArrayList<>();
        addIfNotBlank(candidates, geneticResource.getName());
        addIfNotBlank(candidates, geneticResource.getPillarName());
        addIfNotBlank(candidates, geneticResource.getDatabaseSource());
        addIfNotBlank(candidates, geneticResource.getDomain());
        addAllIfNotBlank(candidates, geneticResource.getTaxon());
        addAllIfNotBlank(candidates, geneticResource.getFamily());
        addAllIfNotBlank(candidates, geneticResource.getGenus());
        addAllIfNotBlank(candidates, geneticResource.getSpecies());
        addAllIfNotBlank(candidates, geneticResource.getMaterialType());
        addAllIfNotBlank(candidates, geneticResource.getBiotopeType());
        addIfNotBlank(candidates, geneticResource.getCountryOfOrigin());
        addIfNotBlank(candidates, geneticResource.getCountryOfCollect());
        extractTokensOutOfDescription(geneticResource.getDescription()).forEach(candidates::add);

        // the same value can appear in several fields, and the same word several times in the description:
        // indexing it more than once is useless
        List<String> distinctCandidates = new ArrayList<>(candidates.size());
        candidates.stream().distinct().forEachOrdered(distinctCandidates::add);
        this.suggestions = Collections.unmodifiableList(distinctCandidates);
    }

    public GeneticResource getGeneticResource() {
        return geneticResource;
    }

    /**
     * @return the unmodifiable list of suggestions computed at construction time
     */
    public List<String> getSuggestions() {
        return suggestions;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        IndexedGeneticResource that = (IndexedGeneticResource) o;
        return Objects.equals(geneticResource, that.geneticResource) &&
            Objects.equals(suggestions, that.suggestions);
    }

    @Override
    public int hashCode() {
        return Objects.hash(geneticResource, suggestions);
    }

    @Override
    public String toString() {
        return "IndexedGeneticResource{" +
            "geneticResource=" + geneticResource +
            ", suggestions=" + suggestions +
            '}';
    }

    /**
     * Adds the given string, unchanged, to the list, unless it is null, empty or only made of whitespace.
     */
    private static void addIfNotBlank(List<String> list, String s) {
        if (s != null && !s.trim().isEmpty()) {
            list.add(s);
        }
    }

    /**
     * Adds every non-blank element of the given collection to the list. A null collection is ignored.
     */
    private static void addAllIfNotBlank(List<String> list, Collection<String> toAdd) {
        if (toAdd != null) {
            toAdd.forEach(s -> addIfNotBlank(list, s));
        }
    }

    /**
     * Splits the description on punctuation and whitespace, and keeps the words of at least 3 characters.
     * @return the kept words, in order, or an empty stream if the description is null
     */
    private static Stream<String> extractTokensOutOfDescription(String description) {
        if (description == null) {
            return Stream.empty();
        }
        return WORD_SPLIT_PATTERN.splitAsStream(description)
                                 .filter(s -> s.length() >= MIN_DESCRIPTION_TOKEN_LENGTH);
    }
}
package fr.inra.urgi.rare.harvest;
import static fr.inra.urgi.rare.util.Utils.nullSafeUnmodifiableCopy;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
......@@ -53,8 +54,8 @@ public final class HarvestResult {
this.id = id;
this.startInstant = startInstant;
this.endInstant = endInstant;
this.globalErrors = globalErrors == null ? Collections.emptyList() : Collections.unmodifiableList(new ArrayList<>(globalErrors));
this.files = files == null ? Collections.emptyList() : Collections.unmodifiableList(new ArrayList<>(files));
this.globalErrors = nullSafeUnmodifiableCopy(globalErrors);
this.files = nullSafeUnmodifiableCopy(files);
}
public String getId() {
......
......@@ -16,6 +16,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import fr.inra.urgi.rare.config.RareProperties;
import fr.inra.urgi.rare.dao.GeneticResourceDao;
import fr.inra.urgi.rare.domain.GeneticResource;
import fr.inra.urgi.rare.domain.IndexedGeneticResource;
import fr.inra.urgi.rare.harvest.HarvestResult.HarvestResultBuilder;
import fr.inra.urgi.rare.harvest.HarvestedFile.HarvestedFileBuilder;
import org.springframework.stereotype.Component;
......@@ -93,7 +94,7 @@ public class Harvester {
HarvestedFileBuilder fileBuilder = HarvestedFile.builder(harvestedStream.getFileName());
int index = 0;
List<GeneticResource> batch = new ArrayList<>(BATCH_SIZE);
List<IndexedGeneticResource> batch = new ArrayList<>(BATCH_SIZE);
try (BufferedInputStream bis = new BufferedInputStream(harvestedStream.getInputStream());
JsonParser parser = objectMapper.getFactory().createParser(bis)) {
......@@ -118,7 +119,7 @@ public class Harvester {
// necessary to avoid failing in the middle of an object
TreeNode treeNode = objectMapper.readTree(parser);
GeneticResource geneticResource = objectMapper.treeToValue(treeNode, GeneticResource.class);
batch.add(geneticResource);
batch.add(new IndexedGeneticResource(geneticResource));
if (batch.size() == BATCH_SIZE) {
geneticResourceDao.saveAll(batch);
fileBuilder.addSuccesses(batch.size());
......
package fr.inra.urgi.rare.search;
import java.util.List;
import fr.inra.urgi.rare.dao.GeneticResourceDao;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
/**
 * A REST controller for the suggestion API. It exposes the completions computed by
 * {@link GeneticResourceDao#suggest(String)} on {@code GET /api/genetic-resources-suggestions}.
 */
@RestController
@RequestMapping("/api/genetic-resources-suggestions")
public class SuggestionController {

    // injected once through the constructor and never reassigned
    private final GeneticResourceDao geneticResourceDao;

    public SuggestionController(GeneticResourceDao geneticResourceDao) {
        this.geneticResourceDao = geneticResourceDao;
    }

    /**
     * Returns the completions suggested for the given query.
     * @param query the text to complete, read from the mandatory {@code query} request parameter
     * @return the suggested completions, as returned by the DAO
     */
    @GetMapping
    public List<String> suggest(@RequestParam("query") String query) {
        return geneticResourceDao.suggest(query);
    }
}
package fr.inra.urgi.rare.util;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
/**
 * Utility methods
 * @author JB Nizet
 */
public final class Utils {
    private Utils() {
    }

    /**
     * Creates an unmodifiable snapshot of the given elements: later changes made to the given collection are
     * not reflected in the returned list, and the returned list rejects any modification.
     * @param elements the elements to copy, in iteration order; may be null
     * @param <T> the type of the elements of the returned list
     * @return an unmodifiable copy of the elements, or an empty list if the argument is null
     */
    public static <T> List<T> nullSafeUnmodifiableCopy(Collection<? extends T> elements) {
        if (elements == null) {
            return Collections.emptyList();
        }
        List<T> copy = new ArrayList<>(elements);
        return Collections.unmodifiableList(copy);
    }
}
......@@ -141,6 +141,9 @@
},
"collectLongitude": {
"type": "double"
},
"suggestions": {
"type" : "completion"
}
}
}
......@@ -4,11 +4,14 @@ import static org.assertj.core.api.Assertions.assertThat;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
import java.util.function.BiConsumer;
import fr.inra.urgi.rare.config.ElasticSearchConfig;
import fr.inra.urgi.rare.domain.GeneticResource;
import fr.inra.urgi.rare.domain.GeneticResourceBuilder;
import fr.inra.urgi.rare.domain.IndexedGeneticResource;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
import org.junit.jupiter.api.BeforeEach;
......@@ -69,7 +72,7 @@ class GeneticResourceDaoTest {
37.5,
15.099722);
geneticResourceDao.save(geneticResource);
geneticResourceDao.saveAll(Collections.singleton(new IndexedGeneticResource(geneticResource)));
assertThat(geneticResourceDao.findById(geneticResource.getId()).get()).isEqualTo(geneticResource);
}
......@@ -141,7 +144,7 @@ class GeneticResourceDaoTest {
@Test
public void shouldNotSearchOnIdentifier() {
GeneticResource geneticResource = new GeneticResourceBuilder().build();
GeneticResource geneticResource = new GeneticResourceBuilder().withId("foo-bar").build();
geneticResourceDao.save(geneticResource);
assertThat(geneticResourceDao.search(geneticResource.getId(),
......@@ -162,12 +165,125 @@ class GeneticResourceDaoTest {
firstPage).getContent()).isEmpty();
}
/**
 * Checks, through {@code shouldSuggest}, that the name is a source of completions.
 */
@Test
public void shouldSuggestOnName() {
    shouldSuggest((builder, name) -> builder.withName(name));
}
/**
 * Checks, through {@code shouldSuggest}, that the pillar name is a source of completions.
 */
@Test
public void shouldSuggestOnPillarName() {
    shouldSuggest((builder, pillarName) -> builder.withPillarName(pillarName));
}
/**
 * Checks, through {@code shouldSuggest}, that the database source is a source of completions.
 */
@Test
public void shouldSuggestOnDatabaseSource() {
    shouldSuggest((builder, databaseSource) -> builder.withDatabaseSource(databaseSource));
}
/**
 * Checks, through {@code shouldSuggest}, that the domain is a source of completions.
 */
@Test
public void shouldSuggestOnDomain() {
    shouldSuggest((builder, domain) -> builder.withDomain(domain));
}
/**
 * Checks, through {@code shouldSuggest}, that a taxon is a source of completions.
 */
@Test
public void shouldSuggestOnTaxon() {
    shouldSuggest((builder, taxon) -> builder.withTaxon(Collections.singletonList(taxon)));
}
/**
 * Checks, through {@code shouldSuggest}, that a family is a source of completions.
 */
@Test
public void shouldSuggestOnFamily() {
    shouldSuggest((builder, family) -> builder.withFamily(Collections.singletonList(family)));
}
/**
 * Checks, through {@code shouldSuggest}, that a genus is a source of completions.
 */
@Test
public void shouldSuggestOnGenus() {
    shouldSuggest((builder, genus) -> builder.withGenus(Collections.singletonList(genus)));
}
/**
 * Checks, through {@code shouldSuggest}, that a species is a source of completions.
 */
@Test
public void shouldSuggestOnSpecies() {
    shouldSuggest((builder, species) -> builder.withSpecies(Collections.singletonList(species)));
}
/**
 * Checks, through {@code shouldSuggest}, that a material type is a source of completions.
 */
@Test
public void shouldSuggestOnMaterialType() {
    shouldSuggest((builder, materialType) -> builder.withMaterialType(Collections.singletonList(materialType)));
}
/**
 * Checks, through {@code shouldSuggest}, that a biotope type is a source of completions.
 */
@Test
public void shouldSuggestOnBiotopeType() {
    shouldSuggest((builder, biotopeType) -> builder.withBiotopeType(Collections.singletonList(biotopeType)));
}
/**
 * Checks, through {@code shouldSuggest}, that the country of origin is a source of completions.
 */
@Test
public void shouldSuggestOnCountryOfOrigin() {
    shouldSuggest((builder, country) -> builder.withCountryOfOrigin(country));
}
/**
 * Checks, through {@code shouldSuggest}, that the country of collect is a source of completions.
 */
@Test
public void shouldSuggestOnCountryOfCollect() {
    shouldSuggest((builder, country) -> builder.withCountryOfCollect(country));
}
/**
 * Saves a resource whose identifier is "foo-bar" and checks that "foo" gets no completion.
 */
@Test
public void shouldNotSuggestOnIdentifier() {
    IndexedGeneticResource indexedResource =
        new IndexedGeneticResource(new GeneticResourceBuilder().withId("foo-bar").build());
    geneticResourceDao.saveAll(Collections.singleton(indexedResource));

    List<String> completions = geneticResourceDao.suggest("foo");

    assertThat(completions).isEmpty();
}
/**
 * Saves a resource whose data URL and portal URL are both "foo bar baz" and checks that "foo" gets no
 * completion.
 */
@Test
public void shouldNotSuggestOnUrls() {
    GeneticResourceBuilder builder = new GeneticResourceBuilder();
    builder.withDataURL("foo bar baz").withPortalURL("foo bar baz");
    IndexedGeneticResource indexedResource = new IndexedGeneticResource(builder.build());
    geneticResourceDao.saveAll(Collections.singleton(indexedResource));

    List<String> completions = geneticResourceDao.suggest("foo");

    assertThat(completions).isEmpty();
}
/**
 * Saves a resource described by "Hello world" and checks that each word of the description is suggested
 * separately, whatever the case of the requested term.
 */
@Test
public void shouldSuggestOnDescription() {
    IndexedGeneticResource indexedResource =
        new IndexedGeneticResource(new GeneticResourceBuilder().withDescription("Hello world").build());
    geneticResourceDao.saveAll(Collections.singleton(indexedResource));

    List<String> completionsOfHel = geneticResourceDao.suggest("hel");
    assertThat(completionsOfHel).containsOnly("Hello");

    List<String> completionsOfWor = geneticResourceDao.suggest("wor");
    assertThat(completionsOfWor).containsOnly("world");
}
/**
 * Saves two resources sharing the same database source, one named "vita e bella" and the other having
 * "vitis vinifera" as taxon and "vitis" as family, then checks the completions of "vit" and of "vitis v".
 */
@Test
public void shouldSuggestSeveralResults() {
    GeneticResourceBuilder namedBuilder = new GeneticResourceBuilder();
    namedBuilder.withId(UUID.randomUUID().toString())
                .withName("vita e bella")
                .withDatabaseSource("Florilege");
    GeneticResource namedResource = namedBuilder.build();

    GeneticResourceBuilder classifiedBuilder = new GeneticResourceBuilder();
    classifiedBuilder.withId(UUID.randomUUID().toString())
                     .withTaxon(Collections.singletonList("vitis vinifera"))
                     .withFamily(Collections.singletonList("vitis"))
                     .withDatabaseSource("Florilege");
    GeneticResource classifiedResource = classifiedBuilder.build();

    geneticResourceDao.saveAll(Arrays.asList(new IndexedGeneticResource(namedResource),
                                             new IndexedGeneticResource(classifiedResource)));

    assertThat(geneticResourceDao.suggest("vit")).containsOnly("vitis", "vita e bella");
    assertThat(geneticResourceDao.suggest("vitis v")).containsOnly("vitis vinifera");
}
private void shouldSearch(BiConsumer<GeneticResourceBuilder, String> config) {
GeneticResourceBuilder geneticResourceBuilder = new GeneticResourceBuilder();
config.accept(geneticResourceBuilder, "foo bar baz");
GeneticResource geneticResource = geneticResourceBuilder.build();
geneticResourceDao.save(geneticResource);
geneticResourceDao.saveAll(Collections.singleton(new IndexedGeneticResource(geneticResource)));
AggregatedPage<GeneticResource> result =
geneticResourceDao.search("bar", false, SearchRefinements.EMPTY, firstPage);
......@@ -310,5 +426,16 @@ class GeneticResourceDaoTest {
assertThat(result.getContent()).isEmpty();
}
}
/**
 * Saves a resource on which the given configuration has set the value "foo bar baz", then checks that this
 * exact value is the only completion of "FOO", and that "bing" gets no completion.
 * @param config sets the given string on the property under test of the given builder
 */
private void shouldSuggest(BiConsumer<GeneticResourceBuilder, String> config) {
    GeneticResourceBuilder builder = new GeneticResourceBuilder();
    config.accept(builder, "foo bar baz");
    IndexedGeneticResource indexedResource = new IndexedGeneticResource(builder.build());
    geneticResourceDao.saveAll(Collections.singleton(indexedResource));

    List<String> completionsOfFoo = geneticResourceDao.suggest("FOO");
    assertThat(completionsOfFoo).containsExactly("foo bar baz");

    List<String> completionsOfBing = geneticResourceDao.suggest("bing");
    assertThat(completionsOfBing).isEmpty();
}
}
package fr.inra.urgi.rare.domain;
import static org.assertj.core.api.Assertions.assertThat;
import java.util.Arrays;
import org.junit.jupiter.api.Test;
/**
* Unit test for {@link IndexedGeneticResource}
* @author JB Nizet
*/
class IndexedGeneticResourceTest {
@Test
public void shouldStoreSuggestions() {
GeneticResource resource = new GeneticResourceBuilder()
.withDatabaseSource("databaseResource")
.withFamily(Arrays.asList("family"))
.withName("name")
.withTaxon(Arrays.asList("taxon"))
.withBiotopeType(Arrays.asList("biotopeType"))
.withCountryOfCollect("countryOfCollect")
.withCountryOfOrigin("countryOfOrigin")
.withDomain("domain")
.withGenus(Arrays.asList("genus"))
.withMaterialType(Arrays.asList("materialType"))
.withPillarName("pillarName")
.withSpecies(Arrays.asList("species"))
.withDescription("Hello world! How\n is he/she doing? Très bien.")
.build();
IndexedGeneticResource result = new IndexedGeneticResource(resource);
assertThat(result.getGeneticResource()).isSameAs(resource);
assertThat(result.getSuggestions()).containsOnly(
"databaseResource",