Commit ecb15d79 authored by Jean-Baptiste Nizet's avatar Jean-Baptiste Nizet
Browse files

feat: implement harvest

parent 1b67ebf7
......@@ -58,3 +58,83 @@ to avoid symbolic links issues on Docker.
You can approximate what runs on CI by executing:
docker run --rm -v "$PWD":/home/rare -w /home/rare ninjasquad/docker-rare ./gradlew build
## Harvest
Harvesting (i.e. importing genetic resources stored in JSON files into ElasticSearch) consists of
placing the JSON files into a directory where the server can find them.
This directory is, by default, `/tmp/rare/resources`. But it's externalized into the Spring Boot property
`rare.resource-dir`, so it can be easily changed by modifying the value of this property (using an
environment variable for example).
The files must have the extension `.json`, and must be stored in that directory (not in a sub-directory).
Once the files are ready and the server is started, the harvest is triggered by sending a POST request
to the endpoint `/api/harvests`, without any request body.
Example with the `http` command ([HTTPie](https://httpie.org/)):
http POST http://localhost:8080/api/harvests
Example with the `curl` command:
curl -i -X POST http://localhost:8080/api/harvests
The harvest job is executed asynchronously, and a response is immediately sent back, with the URL allowing
to get the result of the job. For example:
HTTP/1.1 201
Content-Length: 0
Date: Tue, 24 Jul 2018 12:58:04 GMT
Location: http://localhost:8080/api/harvests/abb5784d-3006-48fb-b5db-d3ff9583e8b9
To get the result of the job, you can then send a GET request to the returned URL:
http GET http://localhost:8080/api/harvests/abb5784d-3006-48fb-b5db-d3ff9583e8b9
or
curl http://localhost:8080/api/harvests/abb5784d-3006-48fb-b5db-d3ff9583e8b9
`http` has the advantage of nicely formatting the returned JSON.
The response contains a detailed report containing the start instant, and the list of files
that have been processed, with the number of successfully imported resources, and the errors
that occurred, if any.
It's only when the property `endInstant` of the returned JSON is non-null that the job is complete.
```
{
"endInstant": "2018-07-24T12:56:28.077Z",
"files": [
{
"errorCount": 0,
"errors": [],
"fileName": "rare_pilier_microbial.json",
"successCount": 10
},
{
"errorCount": 2,
"errors": [
{
"column": 4,
"error": "Error while parsing object: com.fasterxml.jackson.databind.exc.MismatchedInputException: Cannot deserialize instance of `java.lang.String` out of START_ARRAY token\n at [Source: UNKNOWN; line: -1, column: -1] (through reference chain: fr.inra.urgi.rare.domain.GeneticResource[\"name\"])",
"index": 4790,
"line": 105594
},
{
"column": 4,
"error": "Error while parsing object: com.fasterxml.jackson.databind.exc.MismatchedInputException: Cannot deserialize instance of `java.lang.String` out of START_ARRAY token\n at [Source: UNKNOWN; line: -1, column: -1] (through reference chain: fr.inra.urgi.rare.domain.GeneticResource[\"countryOfCollect\"])",
"index": 5905,
"line": 130127
}
],
"fileName": "rare_pilier_plant.json",
"successCount": 14522
}
],
"globalErrors": [],
"id": "55e70557-79e8-4e40-a44b-2ef4b3df076a",
"startInstant": "2018-07-24T12:56:27.322Z"
}
```
......@@ -77,6 +77,7 @@ dependencies {
}
testImplementation("org.junit.jupiter:junit-jupiter-api")
testImplementation("org.mockito:mockito-junit-jupiter:2.19.1")
testImplementation("org.junit-pioneer:junit-pioneer:0.1.2")
testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine")
}
package fr.inra.urgi.rare;
import fr.inra.urgi.rare.config.RareProperties;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.scheduling.annotation.EnableAsync;
/**
* The main Rare Application
* @author JB Nizet
*/
@SpringBootApplication
@EnableAsync
@EnableConfigurationProperties(RareProperties.class)
public class Application {
public static void main(String[] args) {
SpringApplication.run(Application.class, args);
......
package fr.inra.urgi.rare.config;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import org.springframework.beans.factory.annotation.Qualifier;
/**
 * Qualifier annotation for beans used by the harvesting process.
 * It is itself meta-annotated with {@code @Qualifier("harvest")}, and is retained at runtime
 * so that it stays visible through reflection.
 * @author JB Nizet
 */
@Retention(RetentionPolicy.RUNTIME)
@Qualifier("harvest")
public @interface Harvest {
}
package fr.inra.urgi.rare.config;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.http.converter.json.Jackson2ObjectMapperBuilder;
/**
 * Spring configuration declaring the beans that are specific to the harvesting process
 * @author JB Nizet
 */
@Configuration
public class HarvestConfig {

    /**
     * Creates the {@link ObjectMapper} dedicated to harvesting, qualified with {@link Harvest}.
     * @param objectMapperBuilder the builder used to create the base mapper
     * @return a mapper which additionally accepts a single value where an array is expected
     */
    @Harvest
    @Bean
    public ObjectMapper harvestObjectMapper(Jackson2ObjectMapperBuilder objectMapperBuilder) {
        // start from an ObjectMapper configured the same way as the one used by Spring MVC
        ObjectMapper harvestMapper = objectMapperBuilder.build();

        // the harvested JSON files often contain a single string for multi-valued properties:
        // ACCEPT_SINGLE_VALUE_AS_ARRAY lets such values be deserialized anyway
        return harvestMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
    }
}
package fr.inra.urgi.rare.config;
import java.nio.file.Path;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * Holder of the rare-specific properties of the application, i.e. the properties prefixed with
 * {@code rare} (typically stored in application.yml)
 * @author JB Nizet
 */
@ConfigurationProperties(prefix = "rare")
public class RareProperties {

    /**
     * The directory where the JSON files that are harvested are located.
     */
    private Path resourceDir;

    /**
     * @return the directory containing the JSON files to harvest
     */
    public Path getResourceDir() {
        return this.resourceDir;
    }

    /**
     * @param resourceDir the directory containing the JSON files to harvest
     */
    public void setResourceDir(Path resourceDir) {
        this.resourceDir = resourceDir;
    }

    @Override
    public String toString() {
        return "RareProperties{resourceDir=" + this.resourceDir + '}';
    }
}
package fr.inra.urgi.rare.dao;
import fr.inra.urgi.rare.domain.GeneticResource;
import org.springframework.stereotype.Component;
/**
 * Repository for genetic resources.
 * For now, it's only a placeholder which doesn't store anything.
 * @author JB Nizet
 */
@Component
public class GeneticResourceRepository {

    /**
     * Saves the given genetic resource. The current implementation only prints it to the standard output.
     * @param geneticResource the genetic resource to save
     */
    public void save(GeneticResource geneticResource) {
        // TODO really store
        String description = "geneticResource = " + geneticResource;
        System.out.println(description);
    }
}
package fr.inra.urgi.rare.dao;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import fr.inra.urgi.rare.harvest.HarvestResult;
import org.springframework.stereotype.Component;
/**
 * Repository for {@link fr.inra.urgi.rare.harvest.HarvestResult}.
 * The results are currently kept in an in-memory concurrent map, indexed by their ID.
 * @author JB Nizet
 */
@Component
public class HarvestResultRepository {

    private final Map<String, HarvestResult> resultsById = new ConcurrentHashMap<>();

    /**
     * Stores the given result, replacing any result previously stored with the same ID.
     * @param harvestResult the result to store
     */
    public void save(HarvestResult harvestResult) {
        String key = harvestResult.getId();
        resultsById.put(key, harvestResult);
    }

    /**
     * Finds the result stored with the given ID.
     * @param id the ID of the result to find
     * @return the result, or an empty Optional if no result is stored with this ID
     */
    public Optional<HarvestResult> findById(String id) {
        HarvestResult stored = resultsById.get(id);
        return Optional.ofNullable(stored);
    }
}
package fr.inra.urgi.rare.domain;
import java.util.List;
import java.util.Objects;
import com.fasterxml.jackson.annotation.JsonCreator;
......@@ -21,12 +22,12 @@ public final class GeneticResource {
private final String portalURL;
private final String dataURL;
private final String domain;
private final String taxon;
private final String family;
private final String genus;
private final String species;
private final String materialType;
private final String biotopeType;
private final List<String> taxon;
private final List<String> family;
private final List<String> genus;
private final List<String> species;
private final List<String> materialType;
private final List<String> biotopeType;
private final String countryOfOrigin;
private final Double originLatitude;
private final Double originLongitude;
......@@ -43,12 +44,12 @@ public final class GeneticResource {
String portalURL,
String dataURL,
String domain,
String taxon,
String family,
String genus,
String species,
String materialType,
String biotopeType,
List<String> taxon,
List<String> family,
List<String> genus,
List<String> species,
List<String> materialType,
List<String> biotopeType,
String countryOfOrigin,
Double originLatitude,
Double originLongitude,
......@@ -109,27 +110,27 @@ public final class GeneticResource {
return domain;
}
public String getTaxon() {
public List<String> getTaxon() {
return taxon;
}
public String getFamily() {
public List<String> getFamily() {
return family;
}
public String getGenus() {
public List<String> getGenus() {
return genus;
}
public String getSpecies() {
public List<String> getSpecies() {
return species;
}
public String getMaterialType() {
public List<String> getMaterialType() {
return materialType;
}
public String getBiotopeType() {
public List<String> getBiotopeType() {
return biotopeType;
}
......
package fr.inra.urgi.rare.exception;
import org.springframework.http.HttpStatus;
import org.springframework.web.bind.annotation.ResponseStatus;
/**
 * Exception used to signal that an entity couldn't be found.
 * It is annotated with {@code @ResponseStatus(HttpStatus.NOT_FOUND)}, which associates the
 * 404 (Not Found) HTTP status with this exception.
 * @author JB Nizet
 */
@ResponseStatus(HttpStatus.NOT_FOUND)
public class NotFoundException extends RuntimeException {
}
package fr.inra.urgi.rare.harvest;
import java.util.stream.Stream;
import fr.inra.urgi.rare.dao.HarvestResultRepository;
import fr.inra.urgi.rare.harvest.HarvestResult.HarvestResultBuilder;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component;
/**
 * Bean used to start an asynchronous harvesting job
 * @author JB Nizet
 */
@Component
public class AsyncHarvester {

    private final Harvester harvester;
    private final HarvestResultRepository harvestResultRepository;

    public AsyncHarvester(Harvester harvester, HarvestResultRepository harvestResultRepository) {
        this.harvester = harvester;
        this.harvestResultRepository = harvestResultRepository;
    }

    /**
     * Harvests all the JSON files provided by the harvester, asynchronously.
     * An intermediate result is saved after each file has been harvested, and the final result (with its
     * end instant) is always saved at the end, even if an unexpected exception interrupts the harvest.
     * Without that, a failed job would keep a null end instant forever and would never appear as complete.
     *
     * @param resultBuilder the builder used to record what happens during the harvest
     */
    @Async
    public void harvest(HarvestResultBuilder resultBuilder) {
        try (Stream<HarvestedStream> jsonFiles = this.harvester.jsonFiles(resultBuilder)) {
            jsonFiles.forEach(harvestedStream -> {
                this.harvester.harvest(harvestedStream, resultBuilder);
                // save the intermediate state, to allow following the progress of the job
                harvestResultRepository.save(resultBuilder.build());
            });
        } catch (RuntimeException e) {
            // record the failure in the report, then let the exception propagate as before
            resultBuilder.addGlobalError("Unexpected error while harvesting: " + e);
            throw e;
        } finally {
            // whatever happened, mark the job as ended so that it doesn't look like it's still running
            harvestResultRepository.save(resultBuilder.end());
        }
    }
}
package fr.inra.urgi.rare.harvest;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import com.fasterxml.jackson.annotation.JsonCreator;
import fr.inra.urgi.rare.harvest.HarvestedFile.HarvestedFileBuilder;
/**
 * The result of a harvesting operation, stored in ElasticSearch in order to allow diagnosing what happened
 * during the harvesting (which files were harvested, which errors occurred, etc.).
 * Instances are immutable: the lists passed at construction time are copied and exposed as unmodifiable lists.
 * @author JB Nizet
 */
public final class HarvestResult {

    private final String id;
    private final Instant startInstant;
    private final Instant endInstant;
    private final List<String> globalErrors;
    private final List<HarvestedFile> files;

    private HarvestResult(HarvestResultBuilder builder) {
        this(builder.id,
             builder.startInstant,
             builder.endInstant,
             builder.globalErrors,
             builder.files);
    }

    /**
     * Creates a result.
     * @param id the identifier of the harvesting job
     * @param startInstant the instant when the job started
     * @param endInstant the instant when the job ended, or null if it hasn't ended yet
     * @param globalErrors the errors which are not specific to a file. A null list is treated as an empty one.
     * @param files the files harvested so far. A null list is treated as an empty one.
     */
    @JsonCreator
    public HarvestResult(String id,
                         Instant startInstant,
                         Instant endInstant,
                         List<String> globalErrors,
                         List<HarvestedFile> files) {
        this.id = id;
        this.startInstant = startInstant;
        this.endInstant = endInstant;
        this.globalErrors = immutableCopy(globalErrors);
        this.files = immutableCopy(files);
    }

    /**
     * Returns an unmodifiable snapshot of the given list, or an empty list if it's null (which can happen
     * when the property is absent from the deserialized JSON).
     */
    private static <T> List<T> immutableCopy(List<T> source) {
        if (source == null) {
            return Collections.emptyList();
        }
        return Collections.unmodifiableList(new ArrayList<>(source));
    }

    public String getId() {
        return id;
    }

    public Instant getStartInstant() {
        return startInstant;
    }

    public Instant getEndInstant() {
        return endInstant;
    }

    public List<HarvestedFile> getFiles() {
        return files;
    }

    public List<String> getGlobalErrors() {
        return globalErrors;
    }

    /**
     * Creates a new builder, which has a random ID and the current instant as start instant.
     */
    public static HarvestResultBuilder builder() {
        return new HarvestResultBuilder();
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        HarvestResult that = (HarvestResult) o;
        return Objects.equals(id, that.id) &&
            Objects.equals(startInstant, that.startInstant) &&
            Objects.equals(endInstant, that.endInstant) &&
            Objects.equals(globalErrors, that.globalErrors) &&
            Objects.equals(files, that.files);
    }

    @Override
    public int hashCode() {
        return Objects.hash(id, startInstant, endInstant, globalErrors, files);
    }

    @Override
    public String toString() {
        return "HarvestResult{" +
            "id='" + id + '\'' +
            ", startInstant=" + startInstant +
            ", endInstant=" + endInstant +
            ", globalErrors=" + globalErrors +
            ", files=" + files +
            '}';
    }

    /**
     * A mutable builder allowing to record the operations, and to create an instance of {@link HarvestResult}
     */
    public static final class HarvestResultBuilder {
        private final String id;
        private final Instant startInstant;
        private Instant endInstant;
        private final List<HarvestedFile> files = new ArrayList<>();
        private final List<String> globalErrors = new ArrayList<>();

        private HarvestResultBuilder() {
            this.id = UUID.randomUUID().toString();
            this.startInstant = Instant.now();
        }

        /**
         * Adds a global error (i.e. not specific to any given file)
         * @return this builder
         */
        public HarvestResultBuilder addGlobalError(String error) {
            this.globalErrors.add(error);
            return this;
        }

        /**
         * Records a harvested file (which can be created using a {@link HarvestedFileBuilder}), containing
         * the successes and errors of the harvesting of that file.
         * @return this builder
         */
        public HarvestResultBuilder withFile(HarvestedFile harvestedFile) {
            this.files.add(harvestedFile);
            return this;
        }

        /**
         * Builds the {@link HarvestResult} based on the current errors and harvested files in this builder.
         */
        public HarvestResult build() {
            return new HarvestResult(this);
        }

        /**
         * Builds the final {@link HarvestResult} based on the current errors and harvested files in this builder, recording
         * the end instant.
         */
        public HarvestResult end() {
            this.endInstant = Instant.now();
            return new HarvestResult(this);
        }
    }
}
package fr.inra.urgi.rare.harvest;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import com.fasterxml.jackson.annotation.JsonCreator;
/**
* The information about a harvested file
* @author JB Nizet
*/
public final class HarvestedFile {
/**
* The name of the file that has been harvested
*/
private final String fileName;
/**
* The number of genetic resources in the file that have been successfully stored in ElasticSearch
*/
private final int successCount;
/**
* The number of errors that have occurred while harvesting the file
*/
private final int errorCount;
/**
* The errors that have occurred while harvesting the file
*/
private final List<HarvestedFileError> errors;
private HarvestedFile(HarvestedFileBuilder builder) {
this(builder.fileName, builder.successCount, builder.errorCount, builder.errors);
}
@JsonCreator
public HarvestedFile(String fileName,
int successCount,
int errorCount,
List<HarvestedFileError> errors) {
this.fileName = fileName;
this.successCount = successCount;
this.errorCount = errorCount;
this.errors = Collections.unmodifiableList(new ArrayList<>(errors));
}
public String getFileName() {
return fileName;
}
public int getSuccessCount() {
return successCount;
}
public int getErrorCount() {
return errorCount;
}
public List<HarvestedFileError> getErrors() {
return errors;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
HarvestedFile that = (HarvestedFile) o;
return successCount == that.successCount &&