Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
urgi-is
data-discovery
Commits
07e8ad09
Commit
07e8ad09
authored
Aug 09, 2018
by
Jean-Baptiste Nizet
Browse files
fix: use the standard tokenizer of Elasticsearch to split the description in tokens
fix
#11
parent
019f99a6
Changes
2
Hide whitespace changes
Inline
Side-by-side
backend/src/main/java/fr/inra/urgi/rare/domain/IndexedGeneticResource.java
View file @
07e8ad09
package
fr.inra.urgi.rare.domain
;
import
java.io.IOException
;
import
java.io.StringReader
;
import
java.io.UncheckedIOException
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Objects
;
import
java.util.regex.Pattern
;
import
java.util.stream.Stream
;
import
com.fasterxml.jackson.annotation.JsonUnwrapped
;
import
org.apache.lucene.analysis.standard.StandardTokenizer
;
import
org.apache.lucene.analysis.tokenattributes.CharTermAttribute
;
import
org.springframework.data.elasticsearch.annotations.Document
;
/**
...
...
@@ -22,8 +26,6 @@ import org.springframework.data.elasticsearch.annotations.Document;
createIndex
=
false
)
public
final
class
IndexedGeneticResource
{
private
static
final
Pattern
WORD_SPLIT_PATTERN
=
Pattern
.
compile
(
"\\p{Punct}|\\p{Space}"
);
@JsonUnwrapped
private
final
GeneticResource
geneticResource
;
...
...
@@ -97,11 +99,38 @@ public final class IndexedGeneticResource {
toAdd
.
forEach
(
s
->
addIfNotBlank
(
list
,
s
));
}
/**
* Uses the standard tokenizer of Lucene (which is itself used by ElasticSearch) to tokenize the description.
* This makes sure that words in the index used by the full-text search are the same as the ones in the suggestions,
* used to autocomplete terms. Otherwise, we could have suggestions that lead to no search result.
* Note that words shorter than 3 characters are excluded from the suggestions, since it doesn't make
* much sense to suggest those words, and since the UI only starts suggesting after 2 characters anyway.
*/
private
Stream
<
String
>
extractTokensOutOfDescription
(
String
description
)
{
if
(
description
==
null
)
{
return
Stream
.
empty
();
}
return
WORD_SPLIT_PATTERN
.
splitAsStream
(
description
)
.
filter
(
s
->
s
.
length
()
>=
3
);
try
(
StandardTokenizer
tokenizer
=
new
StandardTokenizer
())
{
tokenizer
.
setReader
(
new
StringReader
(
description
));
CharTermAttribute
termAttribute
=
tokenizer
.
addAttribute
(
CharTermAttribute
.
class
);
tokenizer
.
reset
();
List
<
String
>
terms
=
new
ArrayList
<>();
while
(
tokenizer
.
incrementToken
())
{
String
word
=
termAttribute
.
toString
();
if
(
word
.
length
()
>
2
)
{
terms
.
add
(
word
);
}
}
tokenizer
.
end
();
return
terms
.
stream
();
}
catch
(
IOException
e
)
{
throw
new
UncheckedIOException
(
e
);
}
}
}
backend/src/test/java/fr/inra/urgi/rare/domain/IndexedGeneticResourceTest.java
View file @
07e8ad09
...
...
@@ -26,7 +26,7 @@ class IndexedGeneticResourceTest {
.
withMaterialType
(
Arrays
.
asList
(
"materialType"
))
.
withPillarName
(
"pillarName"
)
.
withSpecies
(
Arrays
.
asList
(
"species"
))
.
withDescription
(
"Hello world! How\n is he/she doing? Très bien."
)
.
withDescription
(
"Hello world! How\n is he/she doing? Très bien.
GrapeReSeq_Illumina_20K_experiment?
"
)
.
build
();
IndexedGeneticResource
result
=
new
IndexedGeneticResource
(
resource
);
...
...
@@ -51,7 +51,8 @@ class IndexedGeneticResourceTest {
"she"
,
"doing"
,
"Très"
,
"bien"
"bien"
,
"GrapeReSeq_Illumina_20K_experiment"
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment