Commit 07e8ad09 authored by Jean-Baptiste Nizet's avatar Jean-Baptiste Nizet
Browse files

fix: use the standard tokenizer of Elasticsearch to split the description in tokens

fix #11
parent 019f99a6
package fr.inra.urgi.rare.domain;
import java.io.IOException;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import com.fasterxml.jackson.annotation.JsonUnwrapped;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.springframework.data.elasticsearch.annotations.Document;
/**
......@@ -22,8 +26,6 @@ import org.springframework.data.elasticsearch.annotations.Document;
createIndex = false
)
public final class IndexedGeneticResource {
private static final Pattern WORD_SPLIT_PATTERN = Pattern.compile("\\p{Punct}|\\p{Space}");
@JsonUnwrapped
private final GeneticResource geneticResource;
......@@ -97,11 +99,38 @@ public final class IndexedGeneticResource {
toAdd.forEach(s -> addIfNotBlank(list, s));
}
/**
 * Uses the standard tokenizer of Lucene (which is itself used by ElasticSearch) to tokenize the description.
 * This makes sure that words in the index used by the full-text search are the same as the ones in the suggestions,
 * used to autocomplete terms. Otherwise, we could have suggestions that lead to no search result.
 * Note that words that are less than 3 characters-long are excluded from the suggestions, since it doesn't make
 * much sense to suggest those words, and since the UI only starts suggesting after 2 characters anyway.
 *
 * @param description the description to tokenize; may be null
 * @return the stream of tokens of at least 3 characters, or an empty stream if the description is null
 * @throws UncheckedIOException if the tokenizer signals an I/O error (not expected for an in-memory reader)
 */
private Stream<String> extractTokensOutOfDescription(String description) {
    if (description == null) {
        return Stream.empty();
    }
    // try-with-resources guarantees the tokenizer is closed even if tokenization throws
    try (StandardTokenizer tokenizer = new StandardTokenizer()) {
        tokenizer.setReader(new StringReader(description));
        CharTermAttribute termAttribute = tokenizer.addAttribute(CharTermAttribute.class);
        // TokenStream contract: reset() before the first incrementToken(), end() after the last one
        tokenizer.reset();
        List<String> terms = new ArrayList<>();
        while (tokenizer.incrementToken()) {
            String word = termAttribute.toString();
            if (word.length() > 2) {
                terms.add(word);
            }
        }
        tokenizer.end();
        return terms.stream();
    }
    catch (IOException e) {
        // reading from a StringReader should never fail, but the Lucene API declares IOException
        throw new UncheckedIOException(e);
    }
}
}
......@@ -26,7 +26,7 @@ class IndexedGeneticResourceTest {
.withMaterialType(Arrays.asList("materialType"))
.withPillarName("pillarName")
.withSpecies(Arrays.asList("species"))
.withDescription("Hello world! How\n is he/she doing? Très bien.")
.withDescription("Hello world! How\n is he/she doing? Très bien. GrapeReSeq_Illumina_20K_experiment?")
.build();
IndexedGeneticResource result = new IndexedGeneticResource(resource);
......@@ -51,7 +51,8 @@ class IndexedGeneticResourceTest {
"she",
"doing",
"Très",
"bien"
"bien",
"GrapeReSeq_Illumina_20K_experiment"
);
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment