Commit e7f3e34f authored by Celine Noirot's avatar Celine Noirot
Browse files

Add workflow directory to store jflow workflows

parent ae37a2b8
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from lib.utils import *
from jflow.workflow import Workflow
from jflow.iotypes import Formats,OutputFile
from jflow.seqio import SequenceReader
from jflow.featureio import VCFReader
import os
import sys
import StringIO
from ConfigParser import ConfigParser
class RNAseqDenovo (Workflow):
def process(self):
# Alignment
# ##############################
all_bams = []
all_bais = []
single_fastq, single_librairies_names, pair1_fastq, pair2_fastq, pair_librairies_names = [], [], [], [], []
for lib_arg in self.args["library"] :
if lib_arg["read2"] is not None and lib_arg["read2"] != "" :
pair_librairies_names.append( lib_arg["lib_name"] )
pair1_fastq.append(lib_arg["read1"])
pair2_fastq.append(lib_arg["read2"])
else :
single_librairies_names.append( lib_arg["lib_name"] )
single_fastq.append(lib_arg["read1"])
# Index contigs
index_fai_dict = self.add_component( "IndexFaiDict", [self.args["reference"]] )
index_bwa = self.add_component( "BWAIndex", [self.args["reference"]] )
#outputs of variantpreproccess
bam_variant_preprocess=[]
bai_variant_preprocess=[]
# Align paired-end reads
if len(pair2_fastq)>0 and len(pair1_fastq)>0 :
align_paired = self.add_component( "BWA", [index_bwa.databank, pair1_fastq, pair2_fastq, pair_librairies_names, "mem"], component_prefix="paired" )
index_bam_paired = self.add_component( "SamtoolsIndex", [align_paired.bam_files, 2, 2], component_prefix = "paired" )
all_bams = all_bams + index_bam_paired.sorted_bams
all_bais = all_bais + index_bam_paired.output_files
variant_preprocess_paired = self.add_component("VariantPreprocess", [index_bam_paired.sorted_bams,index_fai_dict.databank, False], component_prefix="paired" )
gatk_preprocess_paired = self.add_component("GatkPreprocess", [variant_preprocess_paired.output_files,variant_preprocess_paired.output_index_files,index_fai_dict.databank,22,True,None], component_prefix="paired" )
bam_variant_preprocess = gatk_preprocess_paired.output_files
# Align single reads
if len(single_fastq)>0 :
align_single = self.add_component( "BWA", [index_bwa.databank,single_fastq, None,single_librairies_names], component_prefix="single" )
index_bam_single = self.add_component( "SamtoolsIndex", [align_single.bam_files, 2, 2], component_prefix = "single" )
all_bams = all_bams + index_bam_single.sorted_bams
all_bais = all_bais + index_bam_single.output_files
variant_preprocess_single = self.add_component("VariantPreprocess", [index_bam_single.sorted_bams,index_fai_dict.databank, True], component_prefix="single" )
gatk_preprocess_single = self.add_component("GatkPreprocess", [variant_preprocess_single.output_files,variant_preprocess_single.output_index_files,index_fai_dict.databank,22,False,None], component_prefix="single" )
bam_variant_preprocess = bam_variant_preprocess + gatk_preprocess_single.output_files
# Variant calling
# ##############################
snpfile=self.args["snp"]
if ((snpfile is None) or not (os.path.exists(snpfile))):
gatk_recalibration_step1 = self.add_component("GatkRecalibration", [bam_variant_preprocess, index_fai_dict.databank, None,22,self.args["interval"]], component_prefix="step1")
gatk_haplotype_caller_step1 = self.add_component("GatkHaplotypeCaller", [gatk_recalibration_step1.output_files, index_fai_dict.databank, 35,35,10,self.args["interval"],40], component_prefix="step1")
gatk_filter_step1 = self.add_component("GatkVariantFilter", [gatk_haplotype_caller_step1.output_file, index_fai_dict.databank, True, None, 22], component_prefix="step1")
snpfile=gatk_filter_step1.output_file
gatk_recalibration_step2 = self.add_component("GatkRecalibration", [bam_variant_preprocess, index_fai_dict.databank, snpfile,22,self.args["interval"]], component_prefix="step2")
gatk_haplotype_caller_step2 = self.add_component("GatkHaplotypeCaller", [gatk_recalibration_step2.output_files, index_fai_dict.databank,30,30,10,self.args["interval"],40], component_prefix="step2")
gatk_filter_step2 = self.add_component("GatkVariantFilter", [gatk_haplotype_caller_step2.output_file, index_fai_dict.databank,False,None,22], component_prefix="step2")
def check_libraries (self, alignment_files):
# Do not check fastq file, done in Library
if alignment_files :
for bam in alignment_files :
if not os.path.exists(bam):
sys.stderr.write(bam + " file does not exist. Please provide a valid bam file!\n")
sys.exit(1)
lib_name = os.path.splitext(os.path.basename(bam))[0]
if self.get_library_from_name(lib_name) == None:
sys.stderr.write(bam + ' file should be linked to a library named '+ lib_name + '.\n')
sys.exit(1)
if self.args["count_matrix"] :
fh = open(self.args["count_matrix"])
line = fh.readline()
matrix_libs = []
parts = line.rstrip().split("\t")
matrix_libs = parts[1:]
for lib_name in matrix_libs :
if self.get_library_from_name(lib_name) == None:
sys.stderr.write('Library '+lib_name+' defined at the first line of the matrix file has not been defined (see --library option to define a library)\n')
sys.exit(1)
if self.args["variant"]["file"] :
reader = VCFReader(self.args["variant"]["file"])
for fullpath, samplename in reader.samples_name :
if self.get_library_from_name(lib_name) == None:
sys.stderr.write('Library '+samplename+' defined in the vcf file has not been defined (see --library option to define a library)\n')
sys.exit(1)
def check_load_format(self,contig_names):
for annot_arg in self.args["annotation"]:
err_contigs=get_gff3_unmatch_contigs(contig_names,annot_arg["file"])
if len (err_contigs) > 0 :
sys.stderr.write('Unknown sequence name >' + ",".join(err_contigs)[:-1] + '< in GFF3 ' + annot_arg["file"] + ' \n')
sys.exit(1)
for predict_arg in self.args["prediction"]:
err_contigs = get_gff3_unmatch_contigs( contig_names, predict_arg["file"] )
if len(err_contigs) > 0 :
sys.stderr.write('Unknown sequence name >' + ",".join(err_contigs)[:-1] + '< in GFF3 ' + predict_arg["file"] + ' \n')
sys.exit(1)
if self.args["variant"]["file"] :
err_contigs=get_vcf_unmatch_contigs(contig_names,self.args["variant"]["file"])
if len (err_contigs) > 0 :
sys.stderr.write('Unknown sequence name >' + ",".join(err_contigs)[:-1] + '< in VCF file ' + self.args["variant"]["file"] + '\n')
sys.exit(1)
if self.args["go"] :
err_contigs=get_GO_unmatch_contigs(contig_names,self.args["go"])
if len (err_contigs) > 0 :
sys.stderr.write('Unknown sequence name >' + ",".join(err_contigs)[:-1] + '< in GO file ' + self.args["go"] + '\n')
sys.exit(1)
if self.args["keyword"] :
err_contigs=get_keyword_unmatch_contigs(contig_names,self.args["keyword"])
if len (err_contigs) > 0 :
sys.stderr.write('Unknown sequence name >' + ",".join(err_contigs)[:-1] + '< in keyword file ' + self.args["keyword"] + '\n')
sys.exit(1)
def _nb_seq( self, filepath ):
"""
@summary : Return the number of sequences.
@param filepath : [str] the sequence file to process.
@return : [int] The number of sequences.
"""
nb_seq = 0
seq_fh = SequenceReader( filepath )
for seq_record in seq_fh :
nb_seq += 1
return nb_seq
def _unstackMatrix(self, matrix):
"""
@summary : Transform a matrix (list of list) into array.
@param matrix : [list] the list of lists.
@return : [list] The simple list and the stack rules.
The stack rules indicates the number of elements by matrix row. It is necessary for reverse transformation.
@note : Example
Matrix : [ return : [
[ "a", "b" ], [ "a", "b", "c", "d", "e", "f" ],
[ "c" ], [ 2, 1, 3 ]
[ "d", "e", "f" ] ]
]
"""
simple_list = list()
stack_rules = list()
for i in matrix:
simple_list = simple_list + i
stack_rules.append( len(i) )
return simple_list, stack_rules
def _stackMatrix(self, simple_list, stack_rules):
"""
@summary : Transform a list in matrix (list of list).
@param simple_list : [list] the list to transform.
@param stack_rules : [list] the rules to transform.
@return : [list] The matrix.
@note : Example
simple_list : return : [
[ "a", "b", "c", "d", "e", "f" ] [ "a", "b" ],
[ "c" ],
stack_rules : [ "d", "e", "f" ]
[ 2, 1, 3 ] ]
"""
matrix = list()
start_idx = 0
end_idx = 0
for size in stack_rules:
end_idx += size
matrix.append( simple_list[start_idx:end_idx] )
start_idx = end_idx
return matrix
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
[global]
name = variationcalling
description = variation calling
#
# Parameter section
# param.name: the parameter display name
# .name: the parameter argument
# .flag: the command line flag to use the argument
# .help: a brief description of what the parameter does
# .default [None]: the value produced if the parameter is not provided
# .type [str]: the parameter type that should be tested (str|int|file|long|bool|...)
# .choices [None]: a container of the allowable values for the parameter
# .required [False]: whether or not the command-line option may be omitted
# .action [store]: the basic type of action to be taken when this argument is encountered at the command line.
#
#####################################################################################################
# Parameters for process workflow
#####################################################################################################
[parameters]
# reference genome
reference.name = Reference genome
reference.help = Reference genome.
reference.flag = --reference
reference.type = localfile
reference.required = True
# sequences
library.name = Libraries
library.help = Library.
library.flag = --library
library.type = multiple
library.action = append
library.required = True
# Parameter name
library.lib_name.name = Name for library
library.lib_name.flag = name
library.lib_name.help = Name for library.
library.lib_name.required = True
# Parameter R1
library.read1.name = R1 file
library.read1.flag = R1
library.read1.help = Path to R1 file.
library.read1.type = localfile
library.read1.required = True
# Parameter R2
library.read2.name = R2 file
library.read2.flag = R2
library.read2.help = Path to R2 file.
library.read2.type = localfile
# Parameter SNP
snp.name = Snp file
snp.help = Known snp, skip the first calling and filter step
snp.flag = --snp
snp.type = localfile
# Parameter SNP
interval.name = Bed interval file (for exome)
interval.help = Bed file wich describe wich region are sequenced by exome reduction.
interval.flag = --interval
interval.type = localfile
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment