Commit b9ad769c authored by Penom Nom's avatar Penom Nom
Browse files

- Add Illumina diversity QC pipeline.

- Fix bug with exclude parameters.
parent dafdadbf
......@@ -40,8 +40,6 @@ casava_directory.exclude = read_1
lane_number.name = lane_number
lane_number.flag = --lane-number
lane_number.help = Which lane should be processed (mandatory with casava-directory)
lane_number.required = True
lane_number.exclude = read_1
read_1.name = read_1
read_1.flag = --read-1
......
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sys
import re
from ng6.ng6workflow import NG6Workflow
from ng6.project import Project
from ng6.run import Run
from ng6.utils import Utils
class IlluminaDiversityQC(NG6Workflow):
    """Illumina diversity quality check workflow.

    Reads come either from a CASAVA directory (with a lane number) or from
    explicit read-1/read-2 file lists. The workflow then optionally applies
    the Illumina filter, archives the reads, runs FastQC and a contamination
    search, merges the pairs with Flash and, when an assignation databank is
    given, runs a subset assignation on the merged fragments.
    """

    @staticmethod
    def _existing_files(paths):
        """Return ``paths`` as a new list after checking each is a regular file.

        Args:
            paths: iterable of file paths given on the command line.

        Returns:
            A list holding the same paths, in the same order.

        Raises:
            IOError: for the first path that is not an existing file.
        """
        checked = []
        for path in paths:
            if not os.path.isfile(path):
                raise IOError(path + " file does not exists.")
            checked.append(path)
        return checked

    def process(self):
        """Declare every component of the workflow from ``self.args``.

        Raises:
            ValueError: if casava-directory is given without lane-number, or
                if neither casava-directory nor read-1 is provided.
            IOError: if a read-1/read-2 path is not an existing file, or if
                read-1 is given without read-2.
        """
        # handle if run name have spaces
        run_name = "_".join(self.runobj.name.split())

        # manage the sequences files
        # group_prefix stays None when files are given explicitly; later
        # steps use that to tell the CASAVA case from the explicit case.
        group_prefix = None
        if self.args['casava_directory'] is not None:
            if self.args['lane_number'] is None:
                raise ValueError("lane-number must be specified with casava-directory.")
            mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava(
                self.args['casava_directory'],
                self.project.get_name(),
                self.args['lane_number'])
            group_prefix = (Utils.get_group_basenames(self.read1_files + self.read2_files, "read")).keys()
            self.runobj.add_mids_description(mids_desc_array)
        elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0):
            self.read1_files = []
            self.read2_files = []
            self.read1_files = self._existing_files(self.args["read_1"])
            if not self.args["read_2"]:
                raise IOError("read-2 must be specified with read-1.")
            self.read2_files = self._existing_files(self.args["read_2"])
        else:
            raise ValueError("[casava-directory and lane-number] OR [read-1 and read-2] must be specified.")

        if self.args["keep_reads"] != "all":
            # fastq illumina filter
            fastqilluminafilter = self.add_component(
                "FastqIlluminaFilter",
                [self.read1_files + self.read2_files, self.args["keep_reads"], group_prefix,
                 run_name + "_fastqilluminafilter.tar.gz"])
            # list filtered files
            [filtered_read1_files, filtered_read2_files] = Utils.split_pair(
                fastqilluminafilter.fastq_files_filtered, (group_prefix is not None))
            filtered_read1_files = sorted(filtered_read1_files)
            filtered_read2_files = sorted(filtered_read2_files)
        else:
            # No filtering requested: downstream components get no parent.
            fastqilluminafilter = None
            filtered_read1_files = self.read1_files
            filtered_read2_files = self.read2_files

        # archive the files
        saved_files = filtered_read1_files + filtered_read2_files
        reads_prefixes = None
        if group_prefix is not None:
            # concatenate fastq
            reads_prefixes = (Utils.get_group_basenames(saved_files, "read")).keys()
            concatenatefastq = self.add_component("ConcatenateFilesGroups", [saved_files, reads_prefixes])
            saved_files = concatenatefastq.concat_files
        self.add_component("AddRawFiles", [self.runobj, saved_files, self.args["compression"]])

        # make some statistics on raw file
        self.add_component(
            "FastQC",
            [filtered_read1_files + filtered_read2_files, (group_prefix is not None), True,
             run_name + "_fastqc.tar.gz"],
            parent=fastqilluminafilter)

        # contamination_search
        # The default banks are only appended to a user-supplied list; with
        # no --databank the list is None and the search below is skipped.
        # NOTE(review): confirm that skipping the search in that case is
        # intended rather than a side effect of the former bare except.
        databanks = self.args["databank"]
        if databanks is not None:
            try:
                # Build the full list first so a missing resource adds nothing.
                databanks.extend([self.get_resource("phix_bwa"),
                                  self.get_resource("ecoli_bwa"),
                                  self.get_resource("yeast_bwa")])
            except Exception as error:
                # Best effort: keep going with the user-supplied banks only,
                # but do not hide the reason (and let KeyboardInterrupt /
                # SystemExit propagate, unlike a bare except).
                sys.stderr.write("Warning: default contamination databanks not added: " + str(error) + "\n")
        if self.args["databank"]:
            self.add_component(
                "ContaminationSearch",
                [filtered_read1_files + filtered_read2_files, self.args["databank"], reads_prefixes],
                parent=fastqilluminafilter)

        # merge overlapping pair
        join_pairs = self.add_component(
            "Flash",
            [filtered_read1_files, filtered_read2_files, self.args["mismatch_ratio"],
             self.args["min_overlap"], self.args["max_overlap"], self.args["avg_reads_length"],
             self.args["avg_fragment_length"]],
            parent=fastqilluminafilter)

        if self.args["assignation_databank"] is not None:
            # subset assignation
            self.add_component(
                "SubsetAssignation",
                [join_pairs.extended_frags, self.args["assignation_databank"]],
                parent=join_pairs)
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
[global]
name = illumina_diversity_qc
description = Illumina diversity quality check pipeline. This pipeline needs to use paired-end reads with overlap.
#
# Parameter section
# param.name: the parameter display name
# .flag: the command line flag to use the argument
# .help: a brief description of what the parameter does
# .default [None]: the value produced if the parameter is not provided
# .type [str]: the parameter type that should be tested (str|int|date|file|bool)
# .choices [None]: a container of the allowable values for the parameter
# .required [False]: whether or not the command-line option may be omitted
# .action [store]: the basic type of action to be taken (store|append)
#
[parameters]
casava_directory.name = casava_directory
casava_directory.flag = --casava-directory
casava_directory.help = Where casava results are stored (see also lane-number)
casava_directory.required = True
casava_directory.exclude = read_1
lane_number.name = lane_number
lane_number.flag = --lane-number
lane_number.help = Which lane should be processed (mandatory with casava-directory)
read_1.name = read_1
read_1.flag = --read-1
read_1.help = Which read1 files should be used
read_1.action = append
read_1.required = True
read_1.exclude = casava_directory
read_2.name = read_2
read_2.flag = --read-2
read_2.help = Which read2 files should be used (mandatory with read-1: this pipeline needs paired-end reads)
read_2.action = append
compression.name = compression
compression.flag = --compression
compression.help = How should data be compressed once archived (none|gz|bz2)
compression.default = none
compression.choices = none|gz|bz2
reference_genome.name = reference_genome
reference_genome.flag = --reference-genome
reference_genome.help = Which genome the reads should be aligned on
databank.name = databank
databank.flag = --databank
databank.help = Which databank should be used to seek contamination (has to be a phiX databank indexed for bwa)
databank.action = append
assignation_databank.name = assignation_databank
assignation_databank.flag = --assignation-databank
assignation_databank.help = Blast databank to classify a subset of sequences.
keep_reads.name = keep_reads
keep_reads.flag = --keep
keep_reads.help = Keep reads which pass the Illumina filters or keep reads which do not pass the Illumina filters (pass_illumina_filters|not_pass_illumina_filters|all). With values other than "all" the headers of reads must be '@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>'
keep_reads.default = pass_illumina_filters
keep_reads.choices = pass_illumina_filters|not_pass_illumina_filters|all
mismatch_ratio.name = mismatch_ratio
mismatch_ratio.flag = --mismatch-ratio
mismatch_ratio.help = Maximum allowed ratio between the number of mismatched base pairs and the overlap length
mismatch_ratio.default = 0.1
min_overlap.name = min_overlap
min_overlap.flag = --min-overlap
min_overlap.help = The minimum required overlap length between two reads to provide a confident overlap.
min_overlap.default = 20
max_overlap.name = max_overlap
max_overlap.flag = --max-overlap
max_overlap.help = Maximum overlap length expected in approximately 90% of read pairs.
max_overlap.default = 55
avg_reads_length.name = avg_reads_length
avg_reads_length.flag = --avg-reads-length
avg_reads_length.help = Average read length
avg_reads_length.default = 250
avg_fragment_length.name = avg_fragment_length
avg_fragment_length.flag = --avg-fragment-length
avg_fragment_length.help = Average fragment length (after join)
avg_fragment_length.default = 460
\ No newline at end of file
......@@ -40,8 +40,6 @@ casava_directory.exclude = read_1
lane_number.name = lane_number
lane_number.flag = --lane-number
lane_number.help = Which lane should be processed (mandatory with casava-directory)
lane_number.required = True
lane_number.exclude = read_1
read_1.name = read_1
read_1.flag = --read-1
......
......@@ -40,8 +40,6 @@ casava_directory.exclude = read_1
lane_number.name = lane_number
lane_number.flag = --lane-number
lane_number.help = Which lane should be processed (mandatory with casava-directory)
lane_number.required = True
lane_number.exclude = read_1
read_1.name = read_1
read_1.flag = --read-1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment