Commit db9b4f96 authored by Claire Kuchly's avatar Claire Kuchly
Browse files

new pipeline - mate pair data

parent 33fcaf67
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sys
import re
from ng6.ng6workflow import NG6Workflow
from ng6.project import Project
from ng6.run import Run
from ng6.utils import Utils
class IlluminaMatePair (NG6Workflow):
def process(self):
# handle if run name have spaces
run_name = "_".join(self.runobj.name.split())
# manage the sequences files
group_prefix = None
print self.args['files_read']
print "---------------------"
if self.args['sample_description']['casava_directory'] is not None :
if self.args['sample_description']['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['sample_description']['casava_directory'], self.project.get_name(), self.args['sample_description']['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
elif (self.args['files_read'][0]['read_1'] is not None) and (len(self.args['files_read'][0]['read_1']) > 0) :
print self.args['files_read'][0]
self.read1_files = []
self.read2_files = []
for pair in self.args["files_read"]:
R1_file = pair['read_1']
R2_file = pair['read_2']
if os.path.isfile(R1_file):
self.read1_files.append(R1_file)
else:
raise IOError, R1_file + " file does not exists."
if os.path.isfile(R2_file):
self.read2_files.append(R2_file)
else:
raise IOError, R2_file + " file does not exists."
else:
raise ValueError, "[casava-directory and lane-number] OR [read(s)] must be specified."
is_paired_end = len(self.read2_files) > 0
if self.args["keep_reads"] != "all" :
# fastq illumina filter
fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.read1_files+self.read2_files, self.args["keep_reads"], group_prefix, run_name+"_fastqilluminafilter.tar.gz"])
# list filtered files
if is_paired_end :
# split read 1 and read 2 from filtered files list
[filtered_read1_files, filtered_read2_files] = Utils.split_pair(fastqilluminafilter.fastq_files_filtered, (group_prefix is not None))
else:
filtered_read1_files = fastqilluminafilter.fastq_files_filtered
filtered_read2_files = []
filtered_read1_files = sorted(filtered_read1_files)
filtered_read2_files = sorted(filtered_read2_files)
else:
fastqilluminafilter = None
filtered_read1_files = self.read1_files
filtered_read2_files = self.read2_files
# archive the files
saved_files = filtered_read1_files + filtered_read2_files
reads_prefixes = None
if group_prefix is not None :
# concatenate fastq
reads_prefixes = (Utils.get_group_basenames(saved_files, "read")).keys()
concatenatefastq = self.add_component("ConcatenateFilesGroups", [saved_files, reads_prefixes])
saved_files = concatenatefastq.concat_files
addrawfiles = self.add_component("AddRawFiles", [self.runobj, saved_files, self.args["compression"]])
# make some statistics on raw file
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = fastqilluminafilter)
# contamination_search
try: self.args["databank"].extend([self.get_resource("phix_bwa"), self.get_resource("ecoli_bwa"), self.get_resource("yeast_bwa")])
except: pass
if self.args["databank"]:
contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files+filtered_read2_files, self.args["databank"], reads_prefixes], parent = fastqilluminafilter)
# mate_pair analyse
cutadapt = self.add_component("CutAdapt",[filtered_read1_files,filtered_read2_files,{"a":["CTGTCTCTTATACACATCT","AGATCTATAAGAGACAG"]},{"a":["CTGTCTCTTATACACATCT","AGATCTATAAGAGACAG"]},is_paired_end,0.1,20 ],parent= fastqilluminafilter)
#reverse_complement
revcom1 = self.add_component("FastxReverseComplement",cutadapt.output_files_R1,component_prefix="read1")
revcom2 = self.add_component("FastxReverseComplement",cutadapt.output_files_R2,component_prefix="read2")
# make some statistics on filtered file
fastqc = self.add_component("FastQC", [revcom1.output_files+revcom2.output_files, (group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = cutadapt, component_prefix="Trimmed_read")
#gunzip = self.add_component("GunZipFiles",revcom1.output_files, revcom2.output_files)
if self.args["reference_genome"]:
# index the reference genome if not already indexed
indexed_ref = self.args["reference_genome"]
if not os.path.exists( self.args["reference_genome"] + ".bwt" ):
bwaindex = self.add_component("BWAIndex", [self.args["reference_genome"]])
indexed_ref = bwaindex.databank
# align reads against indexed genome
sample_lane_prefixes = None
if group_prefix is not None :
sample_lane_prefixes = (Utils.get_group_basenames(revcom_read1_files+revcom_read2_files, "lane")).keys()
#bwa = self.add_component("BWA", [indexed_ref, gunzip.fastq_R1 , gunzip.fastq_R2, sample_lane_prefixes], parent = cutadapt)
bwa = self.add_component("BWA", [indexed_ref, revcom1.output_files , revcom2.output_files, sample_lane_prefixes, 'aln', False, True], parent = cutadapt)
# make some statistic on the alignement
alignmentstats = self.add_component("AlignmentStats", [bwa.bam_files, is_paired_end, False], parent = bwa)
if is_paired_end:
# process insert sizes
insertssizes = self.add_component("InsertsSizes", [bwa.bam_files, 10000, self.args["min_pct"], "LENIENT", "inserts_sizes.tar.gz"], parent = bwa)
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
[global]
name = illumina_matepair
description = illumina matepair quality check pipeline
#
# Parameter section
# param.name: the parameter display name
# .flag: the command line flag to use the argument
# .help: a brief description of what the parameter does
# .default [None]: the value produced if the parameter is not provided
# .type [str]: the parameter type that should be tested (str|int|date|file|bool)
# .choices [None]: a container of the allowable values for the parameter
# .required [False]: whether or not the command-line option may be omitted
# .action [store]: the basic type of action to be taken (store|append)
#
[parameters]
#Casava directory description
sample_description.name = description
sample_description.flag = --casava-description
sample_description.help = where are stored casava results and the lane processed
sample_description.type = multiple
sample_description.required = True
sample_description.exclude = files_read
sample_description.casava_directory.name = casava directory
sample_description.casava_directory.flag = casava-directory
sample_description.casava_directory.help = Where are stored casava results (see also lane-number)
sample_description.casava_directory.type = casava_dir
sample_description.lane_number.name = lane number
sample_description.lane_number.flag = lane-number
sample_description.lane_number.help = Which lane should be processed (mandatory with casava-directory)
#Files read description
files_read.name = reads
files_read.flag = --reads
files_read.help = Define which read1 files and/or read2 should be used
files_read.type = multiple
files_read.action = append
files_read.exclude = sample_description
files_read.read_1.name = read 1
files_read.read_1.flag = R1
files_read.read_1.help = Which read1 files should be used
files_read.read_1.required = True
files_read.read_1.type = localfile
files_read.read_2.name = read 2
files_read.read_2.flag = R2
files_read.read_2.help = Which read2 files should be used (if single end, leave empty)
files_read.read_2.type = localfile
compression.name = compression
compression.flag = --compression
compression.help = How should data be compressed once archived (none|gz|bz2)
compression.default = none
compression.choices = none|gz|bz2
reference_genome.name = reference_genome
reference_genome.flag = --reference-genome
reference_genome.help = Which genome should the read being align on
reference_genome.type = localfile
databank.name = databank
databank.flag = --databank
databank.help = Which databank should be used to seek contamination (as to be phiX databank indexed for bwa)
databank.action = append
databank.type = localfile
keep_reads.name = keep_reads
keep_reads.flag = --keep
keep_reads.help = Keep reads which pass the Illumina filters or keep reads which not pass the Illumina filters (pass_illumina_filters|not_pass_illumina_filters|all). With other values that "all" the headers of reads must be '@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>'
keep_reads.default = pass_illumina_filters
keep_reads.choices = pass_illumina_filters|not_pass_illumina_filters|all
#Insert size section
histogram_width.group = INSERTSIZE section
min_pct.group = INSERTSIZE section
histogram_width.name = histogram_width
histogram_width.flag = --histogram_width
histogram_width.help = Explicitly sets the histogram width, overriding automatic truncation of histogram tail.
histogram_width.default = 10000
min_pct.name = min_pct
min_pct.flag = --min_pct
min_pct.help = When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads.
min_pct.default = 0.01
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment