Commit a52a4405 authored by Penom Nom's avatar Penom Nom
Browse files

Add sampling parameters.

parent 6d09d08d
......@@ -24,45 +24,53 @@ from ng6.utils import Utils
# Workflow that chains the components registered in process(), from the raw
# paired reads down to OTU classification, normalisation and sampling.
# NOTE(review): this excerpt is a rendered diff, so superseded and replacement
# lines appear side by side and the original indentation is lost.
class GeneDiversity (NG6Workflow):
# Register every component of the pipeline; each step consumes outputs of
# components added before it.
def process(self):
# Add raw files
# NOTE(review): presumably the pre-change call; the same component is added
# again below with reads_1 + reads_2.
addrawfiles = self.add_component( "AddRawFiles", [self.runobj, self.args['read_1'] + self.args['read_2'], "none"] )
# # Manage samples
# reads_1 = [elt['read_1'] for elt in self.args['sample']]
# reads_2 = [elt['read_2'] for elt in self.args['sample']]
# samples_names = [elt['name'] for elt in self.args['sample'] if elt.has_key('name')]
# Read files come straight from the read_1 / read_2 arguments; they are
# concatenated with + below, so both are expected to be lists.
reads_1 = self.args['read_1']
reads_2 = self.args['read_2']
# Add raw files
addrawfiles = self.add_component( "AddRawFiles", [self.runobj, reads_1 + reads_2, "none"] )
# Trim sequences
# NOTE(review): the next two calls presumably are the pre-change versions of
# the reads_1 / reads_2 calls that follow them.
trim_R1 = self.add_component("Trimmer", [self.args['read_1'], 1, self.args['trim_read_1']], component_prefix="R1")
trim_R2 = self.add_component("Trimmer", [self.args['read_2'], 1, self.args['trim_read_2']], component_prefix="R2")
trim_R1 = self.add_component("Trimmer", [reads_1, 1, self.args['trim_read_1']], component_prefix="R1")
trim_R2 = self.add_component("Trimmer", [reads_2, 1, self.args['trim_read_2']], component_prefix="R2")
# Make some statistics on the trimmed reads (FastQC receives the Trimmer outputs)
fastqc = self.add_component("FastQC", [trim_R1.output_files + trim_R2.output_files, False, True])
# Merge overlapping pair
join_pairs = self.add_component("Flash", [trim_R1.output_files, trim_R2.output_files, self.args["mismatch_ratio"], self.args["min_overlap"], self.args["max_overlap"]])
# Fastq to fasta
fastq2fasta = self.add_component("Fastq2fasta", [join_pairs.extended_frags])
# Dereplicates sequences
dereplicate = self.add_component("UsearchDereplication", [fastq2fasta.output_files])
# Remove chimeric sequences
# NOTE(review): the first call presumably is the pre-change version; the second
# one passes 6 instead of 8 and adds the ';size=' literal that is also given to
# AddSamplesNames and Cdhit below.
chimera = self.add_component("UsearchChimera", [dereplicate.output_files, 8], parent=join_pairs)
chimera = self.add_component("UsearchChimera", [dereplicate.output_files, 6, ';size='], parent=join_pairs)
# Sequence translation
split = self.add_component("SplitSeq", [chimera.nonchimeras, 7000])
framebot = self.add_component("Framebot", [split.output_files, self.args["database"], False])
# Rename the pre-clusters to provide traceback after merge and cd-hit
rename_clusters = self.add_component("AddSamplesNames", [framebot.corrected_proteins, '|', ';size='])
# Merge sequences
merge = self.add_component("ConcatenateFiles", [rename_clusters.output_files, "all_trad_sequences.fasta"])
# Create OTU
cdhit = self.add_component("Cdhit", [merge.output_file, self.args["otu_identity_threshold"], self.args["otu_length_diff_cutoff"],
self.args["otu_cluster_most_similar"], 5, 'euclidean', 'average', ';size=', '|'], parent=chimera)
# Stat on OTU
blast_index = self.add_component("BlastIndex", [self.args["database"], "prot"])
otu_classify = self.add_component("GeneOTUClassify", [cdhit.biom_files, cdhit.output_files, self.args["taxonomy"], blast_index.databank], parent=cdhit)
# Normalisation
# NOTE(review): 1000, 3000, 100 and 1 are hard-coded here, whereas the sampling
# step below reads its settings from self.args.
normalisation = self.add_component("BiomNormalisation", [cdhit.biom_files, 1000, 3000, 100, 1], parent=cdhit)
\ No newline at end of file
# Sampling
# Settings come from the discard / select / round / obs_min arguments.
sampling = self.add_component("BiomSampling", [cdhit.biom_files, self.args["discard"], self.args["select"], self.args["round"],
self.args["obs_min"], cdhit.output_files], parent=cdhit)
\ No newline at end of file
......@@ -34,18 +34,38 @@ description = [DEV] Analysis the composition and function of a microbial communi
# .exclude [None]: will make sure that there is only one arguments provided
#
[parameters]
## Samples
#sample.name = Sample
#sample.flag = --sample
#sample.help = Sample.
#sample.type = multiple
#sample.action = append
## Parameter name
#sample.sample_name.name = Sample name
#sample.sample_name.flag = name
#sample.sample_name.help = Name.
## Parameter read_1
#sample.read_1.name = Read 1
#sample.read_1.flag = read-1
#sample.read_1.help = Read 1.
#sample.read_1.type = localfile
#sample.read_1.required = True
## Parameter read_2
#sample.read_2.name = Read 2
#sample.read_2.flag = read-2
#sample.read_2.help = Read 2.
#sample.read_2.type = localfile
#sample.read_2.required = True
# Parameter read_1
# The workflow concatenates the read_1 and read_2 values with +, so both are
# handled as lists.
# NOTE(review): action = append presumably lets --read-1 be repeated, one file
# per occurrence - confirm against the workflow framework.
read_1.name = read_1
read_1.flag = --read-1
read_1.help = Read1
read_1.action = append
read_1.type = localfile
read_1.required = True
# Parameter read_2
# NOTE(review): presumably the files must be given in the same order as for
# --read-1 so that pairs line up - confirm.
read_2.name = read_2
read_2.flag = --read-2
read_2.help = Read2
read_2.action = append
read_2.type = localfile
read_2.required = True
database.name = Gene database
......@@ -60,6 +80,33 @@ taxonomy.help = The gene taxonomy. Format : 'GENE_ID<tab>TAX; TAX; TAX;'.
taxonomy.type = localfile
taxonomy.required = True
# Sampling
# Each sampling parameter is attached to the SAMPLING group. The group line
# previously targeted trim_read_2, which duplicated the trim_read_2.group key
# set in the TRIM section and left the four parameters below ungrouped.
discard.group = SAMPLING section
select.group = SAMPLING section
round.group = SAMPLING section
obs_min.group = SAMPLING section
# Parameter discard
discard.name = Discarded sequences
discard.flag = --discard
discard.help = The number of discarded sequences before each round of random sampling.
discard.type = int
discard.default = 0
# Parameter select
select.name = Selected sequences
select.flag = --select
select.help = The number of selected sequences with replacement in each round of random sampling.
select.type = int
select.default = 10
# Parameter round
round.name = Rounds
round.flag = --round
round.help = The number of round for the random sampling.
round.type = int
round.default = 100
# Parameter obs_min
obs_min.name = Observation minimum depth
obs_min.flag = --obs-min
obs_min.help = The minimum number of sequences to keep an observation. This filter is applied before sampling.
obs_min.type = int
obs_min.default = 2
# Trim sequences
trim_read_1.group = TRIM section
trim_read_2.group = TRIM section
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment