Commit f68fb1f8 authored by Jean-Baptiste Nizet
Browse files

feat: implement sitemaps

parent eba06a28
package fr.inra.urgi.faidare.domain.data;
import java.util.List;
import fr.inra.urgi.faidare.domain.brapi.v1.data.BrapiAdditionalInfo;
import fr.inra.urgi.faidare.domain.brapi.v1.data.BrapiLocation;
import fr.inra.urgi.faidare.domain.jsonld.data.HasURI;
import fr.inra.urgi.faidare.domain.jsonld.data.HasURL;
import fr.inra.urgi.faidare.domain.jsonld.data.IncludedInDataCatalog;
import fr.inra.urgi.faidare.elasticsearch.document.annotation.Document;
import fr.inra.urgi.faidare.elasticsearch.document.annotation.Id;
/**
 * Minimal projection of a location: it carries the location identifier and
 * nothing else. Used when generating sitemaps, where only the ID of each
 * location is needed.
 */
@Document(type = "location", includedFields = "locationDbId")
public class LocationSitemapVO {

    /** Identifier of the location, flagged as the document ID field. */
    @Id
    private String locationDbId;

    /** Creates an instance whose identifier has not been set yet. */
    public LocationSitemapVO() {
        // nothing to initialize
    }

    /**
     * Creates an instance for the given location identifier.
     *
     * @param locationDbId the location identifier
     */
    public LocationSitemapVO(String locationDbId) {
        this.locationDbId = locationDbId;
    }

    /**
     * @return the location identifier
     */
    public String getLocationDbId() {
        return this.locationDbId;
    }

    /**
     * @param locationDbId the location identifier to store
     */
    public void setLocationDbId(String locationDbId) {
        this.locationDbId = locationDbId;
    }
}
package fr.inra.urgi.faidare.domain.data.germplasm;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonSetter;
import com.fasterxml.jackson.annotation.Nulls;
import fr.inra.urgi.faidare.domain.brapi.v1.data.BrapiGermplasm;
import fr.inra.urgi.faidare.domain.jsonld.data.HasURI;
import fr.inra.urgi.faidare.domain.jsonld.data.HasURL;
import fr.inra.urgi.faidare.domain.jsonld.data.IncludedInDataCatalog;
import fr.inra.urgi.faidare.elasticsearch.document.annotation.Document;
import fr.inra.urgi.faidare.elasticsearch.document.annotation.Id;
/**
 * Minimal projection of a germplasm: it carries the germplasm identifier and
 * nothing else. Used when generating sitemaps, where only the ID of each
 * germplasm is needed.
 */
@Document(type = "germplasm", includedFields = "germplasmDbId")
public class GermplasmSitemapVO {

    /** Identifier of the germplasm, flagged as the document ID field. */
    @Id
    private String germplasmDbId;

    /** Creates an instance whose identifier has not been set yet. */
    public GermplasmSitemapVO() {
        // nothing to initialize
    }

    /**
     * Creates an instance for the given germplasm identifier.
     *
     * @param germplasmDbId the germplasm identifier
     */
    public GermplasmSitemapVO(String germplasmDbId) {
        this.germplasmDbId = germplasmDbId;
    }

    /**
     * @return the germplasm identifier
     */
    public String getGermplasmDbId() {
        return this.germplasmDbId;
    }

    /**
     * @param germplasmDbId the germplasm identifier to store
     */
    public void setGermplasmDbId(String germplasmDbId) {
        this.germplasmDbId = germplasmDbId;
    }
}
package fr.inra.urgi.faidare.domain.data.study;
import java.util.Date;
import java.util.List;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
import fr.inra.urgi.faidare.domain.brapi.v1.data.BrapiAdditionalInfo;
import fr.inra.urgi.faidare.domain.brapi.v1.data.BrapiStudySummary;
import fr.inra.urgi.faidare.domain.data.GnpISInternal;
import fr.inra.urgi.faidare.domain.jsonld.data.HasURI;
import fr.inra.urgi.faidare.domain.jsonld.data.HasURL;
import fr.inra.urgi.faidare.domain.jsonld.data.IncludedInDataCatalog;
import fr.inra.urgi.faidare.elasticsearch.document.annotation.Document;
import fr.inra.urgi.faidare.elasticsearch.document.annotation.Id;
/**
 * Minimal projection of a study: it carries the study identifier and nothing
 * else. Used when generating sitemaps, where only the ID of each study is
 * needed.
 */
@Document(type = "study", includedFields = "studyDbId")
public class StudySitemapVO {

    /** Identifier of the study, flagged as the document ID field. */
    @Id
    private String studyDbId;

    /** Creates an instance whose identifier has not been set yet. */
    public StudySitemapVO() {
        // nothing to initialize
    }

    /**
     * Creates an instance for the given study identifier.
     *
     * @param studyDbId the study identifier
     */
    public StudySitemapVO(String studyDbId) {
        this.studyDbId = studyDbId;
    }

    /**
     * @return the study identifier
     */
    public String getStudyDbId() {
        return this.studyDbId;
    }

    /**
     * @param studyDbId the study identifier to store
     */
    public void setStudyDbId(String studyDbId) {
        this.studyDbId = studyDbId;
    }
}
......@@ -68,6 +68,13 @@ public class ESScrollIterator<T> implements Iterator<T> {
.size(fetchSize)
.sort(FieldSortBuilder.DOC_FIELD_NAME, SortOrder.ASC);
// Add included and excluded fields if requested
String[] includedFields = documentMetadata.getIncludedFields();
String[] excludedFields = documentMetadata.getExcludedFields();
if ((includedFields != null && includedFields.length >= 1) || (excludedFields != null && excludedFields.length >= 1)) {
request.source().fetchSource(includedFields, excludedFields);
}
SearchResponse response = null;
try {
response = client.search(request, RequestOptions.DEFAULT);
......
......@@ -54,8 +54,9 @@ public class DocumentAnnotationUtil {
Map<String, DocumentMetadata.Field> fields = findDocumentFields(ImmutableList.<String>of(),
valueObjectClass);
String[] includedFields = document.includedFields();
String[] excludedFields = document.excludedFields();
metadata = new DocumentMetadata<>(documentType, idFieldName, valueObjectClass, excludedFields, fields);
metadata = new DocumentMetadata<>(documentType, idFieldName, valueObjectClass, includedFields, excludedFields, fields);
metadataCache.put(valueObjectClass, metadata);
}
return metadata;
......
......@@ -17,15 +17,21 @@ public class DocumentMetadata<VO> {
private final String documentType;
private final String idField;
private final Class<VO> documentClass;
private final String[] includedFields;
private final String[] excludedFields;
private final Map<String, Field> fieldsByName;
private final Map<List<String>, Field> fieldByPath;
public DocumentMetadata(String documentType, String idField, Class<VO> documentClass, String[] excludedFields,
public DocumentMetadata(String documentType,
String idField,
Class<VO> documentClass,
String[] includedFields,
String[] excludedFields,
Map<String, Field> fieldsByName) {
this.documentType = documentType;
this.idField = idField;
this.documentClass = documentClass;
this.includedFields = includedFields;
this.excludedFields = excludedFields;
this.fieldsByName = fieldsByName;
this.fieldByPath = flattenDocumentFieldTree(ImmutableList.<String>of(), fieldsByName);
......@@ -57,6 +63,10 @@ public class DocumentMetadata<VO> {
return idField;
}
/**
 * Gives access to the included fields supplied when this metadata was built.
 * The stored array itself is returned, not a copy.
 *
 * @return the included fields array
 */
public String[] getIncludedFields() {
    return this.includedFields;
}
/**
 * Gives access to the excluded fields supplied when this metadata was built.
 * The stored array itself is returned, not a copy.
 *
 * @return the excluded fields array
 */
public String[] getExcludedFields() {
    return this.excludedFields;
}
......
......@@ -13,5 +13,6 @@ import java.lang.annotation.Target;
public @interface Document {
/**
 * Document type of the annotated value object, e.g. {@code "location"},
 * {@code "germplasm"} or {@code "study"}. Mandatory: it has no default.
 */
String type();
/**
 * Fields to include when fetching the document source. Defaults to an empty
 * array, in which case no inclusion filter is requested.
 */
String[] includedFields() default {};
/**
 * Fields to exclude when fetching the document source. Defaults to an empty
 * array, in which case no exclusion filter is requested.
 */
String[] excludedFields() default {};
}
......@@ -90,10 +90,11 @@ public class ESGenericFindRepository<C extends PaginationCriteria, VO> implement
request.source().sort(field, order);
}
// Add excluded fields if requested
// Add included and excluded fields if requested
String[] includedFields = documentMetadata.getIncludedFields();
String[] excludedFields = documentMetadata.getExcludedFields();
if (excludedFields != null && excludedFields.length >= 1) {
request.source().fetchSource(null, excludedFields);
if ((includedFields != null && includedFields.length >= 1) || (excludedFields != null && excludedFields.length >= 1)) {
request.source().fetchSource(includedFields, excludedFields);
}
Logger logger = LoggerFactory.getLogger(ESGenericFindRepository.class);
......
......@@ -3,6 +3,7 @@ package fr.inra.urgi.faidare.repository.es;
import fr.inra.urgi.faidare.domain.criteria.FaidareGermplasmPOSTShearchCriteria;
import fr.inra.urgi.faidare.domain.criteria.GermplasmSearchCriteria;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmMcpdVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmSitemapVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmVO;
import fr.inra.urgi.faidare.domain.data.germplasm.PedigreeVO;
import fr.inra.urgi.faidare.domain.data.germplasm.ProgenyVO;
......@@ -31,6 +32,11 @@ public interface GermplasmRepository
*/
GermplasmVO getById(String germplasmDbId);
/**
 * Scrolls through all germplasms using the given fetch size, exposing each
 * one as a {@link GermplasmSitemapVO} (a minimal view holding only the
 * germplasm ID) for sitemap generation.
 *
 * @param fetchSize the fetch size to use while scrolling
 * @return an iterator over the sitemap views of the germplasms
 */
Iterator<GermplasmSitemapVO> scrollAllForSitemap(int fetchSize);
/**
* Scroll through all germplasm matching the given criteria.
*/
......
......@@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import fr.inra.urgi.faidare.domain.criteria.FaidareGermplasmPOSTShearchCriteria;
import fr.inra.urgi.faidare.domain.criteria.GermplasmSearchCriteria;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmMcpdVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmSitemapVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmVO;
import fr.inra.urgi.faidare.domain.data.germplasm.PedigreeVO;
import fr.inra.urgi.faidare.domain.data.germplasm.ProgenyVO;
......@@ -90,6 +91,12 @@ public class GermplasmRepositoryImpl implements GermplasmRepository {
this.criteriaMapping = AnnotatedCriteriaMapper.getMapping(criteriaClass);
}
/**
 * Builds a scroll iterator over the germplasms selected by a match-all query,
 * mapped to {@link GermplasmSitemapVO}.
 *
 * @param fetchSize the fetch size handed to the scroll iterator
 * @return the scroll iterator
 */
@Override
public Iterator<GermplasmSitemapVO> scrollAllForSitemap(int fetchSize) {
    // No criteria here: the sitemap needs every germplasm.
    final QueryBuilder everything = QueryBuilders.matchAllQuery();
    return new ESScrollIterator<>(
        client,
        requestFactory,
        parser,
        GermplasmSitemapVO.class,
        everything,
        fetchSize
    );
}
@Override
public Iterator<GermplasmVO> scrollAll(GermplasmSearchCriteria criteria) {
QueryBuilder query = queryFactory.createQuery(criteria);
......
package fr.inra.urgi.faidare.repository.es;
import java.util.Iterator;
import fr.inra.urgi.faidare.domain.criteria.LocationCriteria;
import fr.inra.urgi.faidare.domain.data.LocationSitemapVO;
import fr.inra.urgi.faidare.domain.data.LocationVO;
import fr.inra.urgi.faidare.domain.response.PaginatedList;
import fr.inra.urgi.faidare.elasticsearch.repository.ESFindRepository;
......@@ -21,4 +24,5 @@ public interface LocationRepository
@Override
PaginatedList<LocationVO> find(LocationCriteria criteria);
/**
 * Scrolls through all locations using the given fetch size, exposing each
 * one as a {@link LocationSitemapVO} (a minimal view holding only the
 * location ID) for sitemap generation.
 *
 * @param fetchSize the fetch size to use while scrolling
 * @return an iterator over the sitemap views of the locations
 */
Iterator<LocationSitemapVO> scrollAllForSitemap(int fetchSize);
}
package fr.inra.urgi.faidare.repository.es;
import java.util.Iterator;
import fr.inra.urgi.faidare.domain.criteria.LocationCriteria;
import fr.inra.urgi.faidare.domain.data.LocationSitemapVO;
import fr.inra.urgi.faidare.domain.data.LocationVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmSitemapVO;
import fr.inra.urgi.faidare.elasticsearch.ESRequestFactory;
import fr.inra.urgi.faidare.elasticsearch.ESResponseParser;
import fr.inra.urgi.faidare.elasticsearch.ESScrollIterator;
import fr.inra.urgi.faidare.elasticsearch.repository.impl.BaseESRepository;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
......@@ -17,6 +24,10 @@ public class LocationRepositoryImpl
extends BaseESRepository<LocationCriteria, LocationVO>
implements LocationRepository {
private final RestHighLevelClient client;
private final ESRequestFactory requestFactory;
private final ESResponseParser parser;
@Autowired
public LocationRepositoryImpl(
RestHighLevelClient client,
......@@ -24,6 +35,14 @@ public class LocationRepositoryImpl
ESResponseParser parser
) {
super(client, requestFactory, LocationVO.class, parser);
this.client = client;
this.requestFactory = requestFactory;
this.parser = parser;
}
/**
 * Builds a scroll iterator over the locations selected by a match-all query,
 * mapped to {@link LocationSitemapVO}.
 *
 * @param fetchSize the fetch size handed to the scroll iterator
 * @return the scroll iterator
 */
@Override
public Iterator<LocationSitemapVO> scrollAllForSitemap(int fetchSize) {
    // No criteria here: the sitemap needs every location.
    final QueryBuilder everything = QueryBuilders.matchAllQuery();
    return new ESScrollIterator<>(
        client,
        requestFactory,
        parser,
        LocationSitemapVO.class,
        everything,
        fetchSize
    );
}
}
package fr.inra.urgi.faidare.repository.es;
import fr.inra.urgi.faidare.domain.criteria.StudyCriteria;
import fr.inra.urgi.faidare.domain.data.LocationSitemapVO;
import fr.inra.urgi.faidare.domain.data.study.StudyDetailVO;
import fr.inra.urgi.faidare.domain.data.study.StudySitemapVO;
import fr.inra.urgi.faidare.domain.data.study.StudySummaryVO;
import fr.inra.urgi.faidare.domain.response.PaginatedList;
import fr.inra.urgi.faidare.elasticsearch.repository.ESFindRepository;
import fr.inra.urgi.faidare.elasticsearch.repository.ESGetByIdRepository;
import java.util.Iterator;
import java.util.Set;
/**
......@@ -29,4 +32,5 @@ public interface StudyRepository
*/
Set<String> getVariableIds(String studyDbId);
/**
 * Scrolls through all studies using the given fetch size, exposing each one
 * as a {@link StudySitemapVO} (a minimal view holding only the study ID) for
 * sitemap generation.
 *
 * @param fetchSize the fetch size to use while scrolling
 * @return an iterator over the sitemap views of the studies
 */
Iterator<StudySitemapVO> scrollAllForSitemap(int fetchSize);
}
......@@ -2,12 +2,15 @@ package fr.inra.urgi.faidare.repository.es;
import fr.inra.urgi.faidare.domain.brapi.v1.data.BrapiLocation;
import fr.inra.urgi.faidare.domain.criteria.StudyCriteria;
import fr.inra.urgi.faidare.domain.data.LocationSitemapVO;
import fr.inra.urgi.faidare.domain.data.LocationVO;
import fr.inra.urgi.faidare.domain.data.study.StudyDetailVO;
import fr.inra.urgi.faidare.domain.data.study.StudySitemapVO;
import fr.inra.urgi.faidare.domain.data.study.StudySummaryVO;
import fr.inra.urgi.faidare.domain.response.PaginatedList;
import fr.inra.urgi.faidare.elasticsearch.ESRequestFactory;
import fr.inra.urgi.faidare.elasticsearch.ESResponseParser;
import fr.inra.urgi.faidare.elasticsearch.ESScrollIterator;
import fr.inra.urgi.faidare.elasticsearch.document.DocumentAnnotationUtil;
import fr.inra.urgi.faidare.elasticsearch.document.DocumentMetadata;
import fr.inra.urgi.faidare.elasticsearch.query.impl.ESGenericQueryFactory;
......@@ -19,6 +22,8 @@ import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.bucket.filter.FilterAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.slf4j.Logger;
......@@ -28,6 +33,7 @@ import org.springframework.stereotype.Repository;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
......@@ -129,4 +135,9 @@ public class StudyRepositoryImpl
return new LinkedHashSet<>(ids);
}
/**
 * Builds a scroll iterator over the studies selected by a match-all query,
 * mapped to {@link StudySitemapVO}.
 *
 * @param fetchSize the fetch size handed to the scroll iterator
 * @return the scroll iterator
 */
@Override
public Iterator<StudySitemapVO> scrollAllForSitemap(int fetchSize) {
    // No criteria here: the sitemap needs every study.
    final QueryBuilder everything = QueryBuilders.matchAllQuery();
    return new ESScrollIterator<>(
        client,
        requestFactory,
        parser,
        StudySitemapVO.class,
        everything,
        fetchSize
    );
}
}
package fr.inra.urgi.faidare.utils;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Spliterators;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.springframework.web.servlet.support.ServletUriComponentsBuilder;
/**
 * A generator of site maps.
 *
 * <p>A sitemap is written as plain text, one URL per line. While writing, the
 * number of entries and the number of bytes are tracked so that a sitemap
 * exceeding (or approaching) the limits of 50,000 entries or 50 MiB is
 * reported in the logs.
 *
 * @author JB Nizet
 */
@Component
public class Sitemaps {

    /** Number of buckets that sitemap entries are distributed over. */
    public static final int BUCKET_COUNT = 11;

    /**
     * Writes a sitemap to the given stream, one URL per line, encoded in UTF-8.
     *
     * <p>Entries are pulled lazily from the iterator, filtered by the predicate,
     * converted to a path and then to a URL by {@link #generateSitemapUrl(String)}.
     * The stream is flushed but deliberately not closed: it belongs to the caller.
     * Once everything is written, the size of the sitemap is checked and a
     * message is logged if it is too large or close to being too large.
     *
     * @param sitemapPath    path of the sitemap being generated, only used in log messages
     * @param out            the stream to write the sitemap to
     * @param entryIterator  source of the candidate entries
     * @param entryPredicate selects the entries that belong to this sitemap
     * @param entryToPath    converts a selected entry to the path of its page
     * @param <T>            the type of the entries
     * @throws UncheckedIOException if writing to or flushing the stream fails
     */
    public static <T> void generateSitemap(String sitemapPath,
                                           OutputStream out,
                                           Iterator<T> entryIterator,
                                           Predicate<T> entryPredicate,
                                           Function<T, String> entryToPath) {
        SanityChecker sanityChecker = new SanityChecker(sitemapPath);
        Writer writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
        Stream<T> entries =
            StreamSupport.stream(Spliterators.spliteratorUnknownSize(entryIterator, 0), false);
        entries.filter(entryPredicate)
               .map(entryToPath)
               .map(entryPath -> Sitemaps.generateSitemapUrl(entryPath) + '\n')
               .forEach(entry -> {
                   try {
                       writer.write(entry);
                       sanityChecker.addEntry(entry);
                   } catch (IOException e) {
                       throw new UncheckedIOException(e);
                   }
               });
        try {
            writer.flush();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        sanityChecker.check();
    }

    /**
     * Builds the URL of a page from its path, starting from the context path of
     * the current request.
     *
     * @param path the path of the page, appended to the current context path
     * @return the resulting URL, as a string
     */
    public static String generateSitemapUrl(String path) {
        return ServletUriComponentsBuilder
            .fromCurrentContextPath()
            .path(path)
            .toUriString();
    }

    /**
     * Accumulates the number of entries and bytes written to a sitemap, and logs
     * an error when a hard limit is exceeded, or a warning when the sitemap is
     * getting close to it.
     */
    private static class SanityChecker {
        private static final Logger LOGGER = LoggerFactory.getLogger(SanityChecker.class);

        private static final int MAX_ENTRY_COUNT = 50_000;
        private static final int MAX_BYTE_COUNT = 50 * 1024 * 1024;
        private static final int DANGER_ENTRY_COUNT = 40_000;
        private static final int DANGER_BYTE_COUNT = 40 * 1024 * 1024;

        private final String sitemapPath;
        private int entryCount = 0;
        private long byteCount = 0;

        public SanityChecker(String sitemapPath) {
            this.sitemapPath = sitemapPath;
        }

        /**
         * Records one written entry.
         *
         * @param entry the entry exactly as written, line terminator included
         */
        public void addEntry(String entry) {
            entryCount++;
            // The sitemap is written in UTF-8, so measure the encoded size rather
            // than the number of UTF-16 code units of the string.
            byteCount += entry.getBytes(StandardCharsets.UTF_8).length;
        }

        /**
         * Logs an error or a warning if the recorded entry count or byte count is
         * over, or close to, the corresponding limit. Both dimensions are checked
         * independently.
         */
        public void check() {
            if (entryCount > MAX_ENTRY_COUNT) {
                LOGGER.error(
                    "The generated sitemap at path {} has more than {} entries and will thus be rejected by search engines. Increase Sitemaps.BUCKET_COUNT for a better distribution of sitemap entries.",
                    sitemapPath,
                    MAX_ENTRY_COUNT);
            } else if (entryCount > DANGER_ENTRY_COUNT) {
                LOGGER.warn(
                    "The generated sitemap at path {} has more than {} entries and is thus approaching the max of {}. Increase Sitemaps.BUCKET_COUNT for a better distribution of sitemap entries.",
                    sitemapPath,
                    DANGER_ENTRY_COUNT,
                    MAX_ENTRY_COUNT);
            }

            if (byteCount > MAX_BYTE_COUNT) {
                LOGGER.error(
                    "The generated sitemap at path {} has more than {} bytes and will thus be rejected by search engines. Increase Sitemaps.BUCKET_COUNT for a better distribution of sitemap entries.",
                    sitemapPath,
                    MAX_BYTE_COUNT);
            } else if (byteCount > DANGER_BYTE_COUNT) {
                // This branch must look at the byte count: testing the entry count
                // here would emit a byte-size warning for sitemaps that are small in
                // bytes, and stay silent for sitemaps that are large in bytes.
                LOGGER.warn(
                    "The generated sitemap at path {} has more than {} bytes and is thus approaching the max of {}. Increase Sitemaps.BUCKET_COUNT for a better distribution of sitemap entries.",
                    sitemapPath,
                    DANGER_BYTE_COUNT,
                    MAX_BYTE_COUNT);
            }
        }
    }
}
......@@ -4,9 +4,16 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.servlet.http.HttpServletRequest;
import com.google.common.collect.Streams;
import fr.inra.urgi.faidare.api.NotFoundException;
import fr.inra.urgi.faidare.config.FaidareProperties;
import fr.inra.urgi.faidare.domain.brapi.v1.data.BrapiGermplasmAttributeValue;
......@@ -18,25 +25,30 @@ import fr.inra.urgi.faidare.domain.data.germplasm.DonorVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GenealogyVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmAttributeValueVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmInstituteVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmSitemapVO;
import fr.inra.urgi.faidare.domain.data.germplasm.GermplasmVO;
import fr.inra.urgi.faidare.domain.data.germplasm.InstituteVO;
import fr.inra.urgi.faidare.domain.data.germplasm.PedigreeVO;
import fr.inra.urgi.faidare.domain.data.germplasm.PhotoVO;
import fr.inra.urgi.faidare.domain.data.germplasm.PuiNameValueVO;
import fr.inra.urgi.faidare.domain.data.germplasm.SiblingVO;
import fr.inra.urgi.faidare.domain.data.germplasm.SimpleVO;
import fr.inra.urgi.faidare.domain.data.germplasm.SiteVO;
import fr.inra.urgi.faidare.domain.data.germplasm.TaxonSourceVO;
import fr.inra.urgi.faidare.domain.xref.XRefDocumentVO;
import fr.inra.urgi.faidare.repository.es.GermplasmAttributeRepository;
import fr.inra.urgi.faidare.repository.es.GermplasmRepository;
import fr.inra.urgi.faidare.repository.es.XRefDocumentRepository;
import fr.inra.urgi.faidare.utils.Sitemaps;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.servlet.ModelAndView;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
/**
* Controller used to display a germplasm card based on its ID.
......@@ -49,7 +61,7 @@ public class GermplasmController {
private final GermplasmRepository germplasmRepository;
private final FaidareProperties faidareProperties;
private final XRefDocumentRepository xRefDocumentRepository;
private GermplasmAttributeRepository germplasmAttributeRepository;
private final GermplasmAttributeRepository germplasmAttributeRepository;
public GermplasmController(GermplasmRepository germplasmRepository,
FaidareProperties faidareProperties,
......@@ -87,6 +99,26 @@ public class GermplasmController {
return toModelAndView(germplasms.get(0));
}
/**
 * Serves the germplasm sitemap of the given index as streamed plain text.
 *
 * <p>A germplasm belongs to the sitemap whose index equals the floor modulus of
 * the hash code of its ID by {@link Sitemaps#BUCKET_COUNT}.
 *
 * @param index the index of the sitemap, from 0 (inclusive) to {@link Sitemaps#BUCKET_COUNT} (exclusive)
 * @return a plain-text response whose body is generated while it is being written
 * @throws NotFoundException if the index is out of range
 */
@GetMapping(value = "/sitemap-{index}.txt")
@ResponseBody
public ResponseEntity<StreamingResponseBody> sitemap(@PathVariable("index") int index) {
    boolean knownIndex = index >= 0 && index < Sitemaps.BUCKET_COUNT;
    if (!knownIndex) {
        throw new NotFoundException("no sitemap for this index");
    }

    String sitemapPath = "/germplasms/sitemap-" + index + ".txt";
    StreamingResponseBody body = out ->
        Sitemaps.generateSitemap(
            sitemapPath,
            out,
            // the scroll is only started once the response body is actually written
            germplasmRepository.scrollAllForSitemap(1000),
            vo -> Math.floorMod(vo.getGermplasmDbId().hashCode(), Sitemaps.BUCKET_COUNT) == index,
            vo -> "/germplasms/" + vo.getGermplasmDbId()
        );

    return ResponseEntity.ok().contentType(MediaType.TEXT_PLAIN).body(body);
}
private ModelAndView toModelAndView(GermplasmVO germplasm) {
// List<BrapiGermplasmAttributeValue> attributes = getAttributes(germplasm);