Commit 66af0576 authored by Claire Kuchly
Browse files

Modification pour le passage à la nouvelle version de ng6-jflow

parent ae2f88ff
......@@ -19,62 +19,42 @@ import os
import sys
import re
from ng6.ng6workflow import NG6Workflow
from ng6.project import Project
from ng6.run import Run
from ng6.ng6workflow import CasavaNG6Workflow
from ng6.utils import Utils
class IlluminaMatePair (NG6Workflow):
class IlluminaMatePair (CasavaNG6Workflow):
def get_name(self):
    """Return the identifier of this workflow.

    Returns:
        The constant string 'illumina_matepair'.
    """
    return 'illumina_matepair'
def get_description(self):
    """Return the one-line, human-readable description of this workflow.

    Returns:
        A constant description string.
    """
    return "illumina quality check pipeline for matepair analyse"
def define_parameters(self, function="process"):
    """Declare the command-line parameters and input files of the workflow.

    Every declaration is delegated to the ``add_parameter``,
    ``add_input_file`` and ``add_input_file_list`` methods of the instance
    (presumably inherited from the workflow base class -- confirm against
    the ng6/jflow version in use).

    Args:
        function: Not read by this implementation; defaults to "process".

    Returns:
        None.
    """
    # Archiving option.
    self.add_parameter(
        "compression",
        "How should the data be compressed once archived",
        choices=["none", "gz", "bz2"],
        default="none",
    )

    # Alignment reference and contamination databanks.
    self.add_input_file(
        "reference_genome",
        "Which genome should the read being align on",
    )
    self.add_input_file_list(
        "databank",
        "Which databank should be used to seek contamination (as to be phiX databank indexed for bwa)",
    )

    # Illumina filter handling; exposed on the command line as "--keep".
    self.add_parameter(
        "keep_reads",
        "Keep or discard reads which pass the illumina filter. all option will keep all reads",
        flag="--keep",
        choices=["pass_illumina_filters", "not_pass_illumina_filters", "all"],
        default="pass_illumina_filters",
    )
    self.add_parameter(
        "delete_bam",
        "The BAM are not stored",
        type=bool,
        default=False,
    )

    # Insert-size histogram options, grouped under "INSERTSIZE section".
    self.add_parameter(
        "histogram_width",
        "Explicitly sets the histogram width, overriding automatic truncation of histogram tail",
        type=int,
        default=10000,
        group="INSERTSIZE section",
    )
    self.add_parameter(
        "min_pct",
        "When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have"
        " fewer than this percentage of overall reads",
        type=float,
        default=0.01,
        group="INSERTSIZE section",
    )

    self.add_parameter(
        "no_group",
        "Disables grouping of bases for reads >50bp",
        type=bool,
        default=True,
    )
def process(self):
# handle if run name have spaces
run_name = "_".join(self.runobj.name.split())
if len(self.undetermined_reads1) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1])
# manage the sequences files
group_prefix = None
if self.args['sample_description']['casava_directory'] is not None :
if self.args['sample_description']['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava( self.args['sample_description']['casava_directory'], self.project.get_name(), self.args['sample_description']['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
# statistics about demultiplexing
if len(undetermined_read1_files) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
elif (self.args['files_read'][0]['read_1'] is not None) and (len(self.args['files_read'][0]['read_1']) > 0) :
print self.args['files_read'][0]
self.read1_files = []
self.read2_files = []
for pair in self.args["files_read"]:
R1_file = pair['read_1']
R2_file = pair['read_2']
if os.path.isfile(R1_file):
self.read1_files.append(R1_file)
else:
raise IOError, R1_file + " file does not exists."
if os.path.isfile(R2_file):
self.read2_files.append(R2_file)
else:
raise IOError, R2_file + " file does not exists."
else:
raise ValueError, "[casava-directory and lane-number] OR [read(s)] must be specified."
is_paired_end = len(self.read2_files) > 0
print is_paired_end
if self.args["keep_reads"] != "all" :
# fastq illumina filter
fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.read1_files+self.read2_files, self.args["keep_reads"], group_prefix, run_name+"_fastqilluminafilter.tar.gz"])
if self.keep_reads != "all" :
# fastq illumina filter
fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.get_all_reads(), self.keep_reads, self.group_prefix])
# list filtered files
if is_paired_end :
if self.is_paired_end() :
# split read 1 and read 2 from filtered files list
[filtered_read1_files, filtered_read2_files] = Utils.split_pair(fastqilluminafilter.fastq_files_filtered, (group_prefix is not None))
[filtered_read1_files, filtered_read2_files] = Utils.split_pair(fastqilluminafilter.fastq_files_filtered, (self.group_prefix is not None))
else:
filtered_read1_files = fastqilluminafilter.fastq_files_filtered
filtered_read2_files = []
......@@ -82,13 +62,14 @@ class IlluminaMatePair (NG6Workflow):
filtered_read2_files = sorted(filtered_read2_files)
else:
fastqilluminafilter = None
filtered_read1_files = self.read1_files
filtered_read2_files = self.read2_files
filtered_read1_files = self.get_all_reads("read1")
filtered_read2_files = self.get_all_reads("read2")
# archive the files
saved_files = filtered_read1_files + filtered_read2_files
reads_prefixes = None
if group_prefix is not None :
if self.group_prefix is not None :
# concatenate fastq
reads_prefixes = (Utils.get_group_basenames(saved_files, "read")).keys()
concatenatefastq = self.add_component("ConcatenateFilesGroups", [saved_files, reads_prefixes])
......@@ -96,7 +77,7 @@ class IlluminaMatePair (NG6Workflow):
addrawfiles = self.add_component("AddRawFiles", [self.runobj, saved_files, self.args["compression"]])
# make some statistics on raw file
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = fastqilluminafilter)
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (self.group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = fastqilluminafilter)
# contamination_search
try: self.args["databank"].extend([self.get_resource("phix_bwa"), self.get_resource("ecoli_bwa"), self.get_resource("yeast_bwa")])
......@@ -110,7 +91,7 @@ class IlluminaMatePair (NG6Workflow):
concatenate1.concat_files = sorted(concatenate1.concat_files)
concatenate2.concat_files = sorted(concatenate2.concat_files)
cutadapt = self.add_component("CutAdapt",[concatenate1.concat_files, concatenate2.concat_files,{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},is_paired_end,0.1,4,20 ],parent= fastqilluminafilter)
cutadapt = self.add_component("CutAdapt",[concatenate1.concat_files, concatenate2.concat_files,{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},self.is_paired_end,0.1,4,20 ],parent= fastqilluminafilter)
#reverse_complement
......@@ -123,11 +104,11 @@ class IlluminaMatePair (NG6Workflow):
fastqc = self.add_component("FastQC", [revcom1.output_files+revcom2.output_files, (group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = cutadapt, component_prefix="Trimmed_read")
if self.args["reference_genome"]:
if self.reference_genome:
# index the reference genome if not already indexed
indexed_ref = self.args["reference_genome"]
if not os.path.exists( self.args["reference_genome"] + ".bwt" ):
bwaindex = self.add_component("BWAIndex", [self.args["reference_genome"]])
indexed_ref = self.reference_genome
if not os.path.exists( self.reference_genome + ".bwt" ):
bwaindex = self.add_component("BWAIndex", [self.reference_genome])
indexed_ref = bwaindex.databank
# align reads against indexed genome
......@@ -135,10 +116,10 @@ class IlluminaMatePair (NG6Workflow):
if group_prefix is not None :
sample_lane_prefixes = (Utils.get_group_basenames(revcom1.output_files+revcom2.output_files, "lane")).keys()
#bwa = self.add_component("BWA", [indexed_ref, gunzip.fastq_R1 , gunzip.fastq_R2, sample_lane_prefixes], parent = cutadapt)
bwa = self.add_component("BWA", [indexed_ref, revcom1.output_files , revcom2.output_files, sample_lane_prefixes, 'aln', not self.args["delete_bam"]], parent = cutadapt)
bwa = self.add_component("BWA", [indexed_ref, revcom1.output_files , revcom2.output_files, sample_lane_prefixes, 'aln', not self.delete_bam], parent = cutadapt)
# make some statistic on the alignement
alignmentstats = self.add_component("AlignmentStats", [bwa.bam_files, is_paired_end, False], parent = bwa)
alignmentstats = self.add_component("AlignmentStats", [bwa.bam_files, self.is_paired_end(), False], parent = bwa)
if is_paired_end:
if self.is_paired_end():
# process insert sizes
insertssizes = self.add_component("InsertsSizes", [bwa.bam_files, 10000, self.args["min_pct"], "LENIENT", "inserts_sizes.tar.gz"], parent = bwa)
insertssizes = self.add_component("InsertsSizes", [bwa.bam_files, 10000, self.min_pct, "LENIENT", "inserts_sizes.tar.gz"], parent = bwa)
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment