Commit f98bcc47 authored by Gerald Salin

include align-subset-reads parameter in the illumina_rnaseq pipeline

parent 99f4eebf
@@ -151,6 +151,7 @@ class NG6Workflow (BasicNG6Workflow):
self.add_parameter("sequencer", "Which sequencer produced the data", required = True, display_name="Sequencer", group="Run information")
self.add_parameter("species", "Which species has been sequenced", required = True, display_name="Species", group="Run information")
self.add_parameter("run_type", "What type of data is it (1 lane, 1 region)", flag = "--type", required = True, display_name="Type", group="Run information")
self.add_parameter("align_subset_reads", "Align only on subset reads", type=bool, default = False)
def __add_sample_parameters__(self):
self.add_multiple_parameter_list("input_sample", "Definition of a sample", flag="--sample", group="Sample description") # required = True, # TO CHECK casavaWorkflow required not if casava dir
......
{*
Copyright (C) 2009 INRA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*}
{extends file='AnalysisTemplate.tpl'}
{block name=description_update}
<br/>
<div style="float:right;">
{if $is_project_admin }
<button id="add_file" type="button" class="btn btn-sm btn-primary"><i class="glyphicon glyphicon-plus"></i> add files</button>
{/if}
</div>
{/block}
{* No result tab *}
{block name=nav_menu_results}{/block}
{block name=tab_content_results}{/block}
\ No newline at end of file
@@ -170,14 +170,14 @@ class AddRawFiles (Component):
inputs = file, outputs = files_to_sync[idx], map=False)
elif self.compression=="gz":
for file in self.files_to_save:
files_to_sync_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file)) + ".gz")
files_to_sync_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file)))
files_to_sync = files_to_sync_ori
for idx, file in enumerate(self.files_to_save):
self.add_python_execution(zip_file,cmd_format="{EXE} {IN} {OUT}",
inputs = file, outputs = files_to_sync[idx], map=False)
elif self.compression=="bz2":
for file in self.files_to_save:
files_to_sync_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file)) + ".bz2")
files_to_sync_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file)))
files_to_sync = files_to_sync_ori
for idx, file in enumerate(self.files_to_save):
self.add_python_execution(zip_file,cmd_format="{EXE} {IN} {OUT}",
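The gz/bz2 branches above stage a synced copy of each raw file through the zip_file helper. zip_file itself is defined elsewhere in the ng6 sources and is not shown in this diff; purely as an assumption for readability, a minimal sketch of what such a gzip-style helper could look like:

    import gzip
    import shutil

    def zip_file_sketch(input_path, output_path):
        # hypothetical stand-in for the real zip_file helper: stream-copy the
        # input into a gzip-compressed output file
        with open(input_path, "rb") as fin, gzip.open(output_path, "wb") as fout:
            shutil.copyfileobj(fin, fout)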
@@ -199,6 +199,4 @@ class AddRawFiles (Component):
inputs = files_to_sync, outputs = total_stat_file, map=False)
logging.getLogger("AddRawFiles").debug("process. after concatenate_stats_file. does the work dir ("+self.runobj.get_work_directory()+") exist? "+ str(os.path.isdir(self.runobj.get_work_directory())))
def post_process(self):
logging.getLogger("AddRawFiles").debug("post_process. does the work dir ("+self.runobj.get_work_directory()+") exist? ")
\ No newline at end of file
\ No newline at end of file
@@ -29,7 +29,7 @@ from ng6.utils import Utils
def get_number_of_reads_to_extract(extract_rate,min_nb_seq,max_nb_seq, filename, *files):
import jflow.seqio as seqio
import logging
import os
import os
# Check parameters
extract_rate = float(extract_rate)
min_nb_seq = int(min_nb_seq)
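Only the parameter checks of get_number_of_reads_to_extract are visible in this hunk. Judging from its arguments, the intent is to extract a fixed fraction of the reads bounded by a minimum and a maximum; a minimal sketch of that clamping, under the assumption that the real function also counts the reads in *files and writes the result to filename:

    def clamp_nb_seq(total_nb_seq, extract_rate, min_nb_seq, max_nb_seq):
        # take extract_rate of all reads, but never fewer than min_nb_seq
        # nor more than max_nb_seq (illustrative assumption only)
        nb_seq = int(total_nb_seq * extract_rate)
        return max(min_nb_seq, min(nb_seq, max_nb_seq))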
@@ -91,7 +91,6 @@ def extract_random_seq(input_file,info_file,output_file):
count += 1
if count % step == 0 and nb_seq_to_extract > countExtractedReads :
logging.getLogger("SubsetSeqFiles").debug("Add a read in subset")
countExtractedReads += 1
seqio.writefastq(output, [[id, desc, seq, qual]])
if nb_seq_to_extract <= countExtractedReads:
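The loop above picks every step-th record until the target count is reached. The same stride-based subsampling, written against a generic record iterator (assuming step is derived from the total read count and the target computed by get_number_of_reads_to_extract):

    def stride_subsample(records, nb_seq_to_extract, step):
        # yield every step-th record until nb_seq_to_extract records are taken
        extracted = 0
        for count, record in enumerate(records, start=1):
            if count % step == 0 and extracted < nb_seq_to_extract:
                extracted += 1
                yield record
            if extracted >= nb_seq_to_extract:
                break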
@@ -110,12 +109,13 @@ class SubsetSeqFiles (Analysis):
self.add_parameter("extract_rate", "extract_rate", type='float', default=extract_rate)
self.add_parameter("min_nb_seq", "min_nb_seq", type='int', default=min_nb_seq)
self.add_parameter("max_nb_seq", "max_nb_seq", type='int', default=max_nb_seq)
self.add_parameter("number_of_reads_in_subset", "number_of_reads_in_subset", type='int')
# Files
self.add_input_file_list( "read1", "Which read1 files should be used.", default=read1, required=True )
self.add_input_file_list( "read2", "Which read2 files should be used.", default=read2 )
self.add_output_file_list("subset_read1", "Subset of the read1 file", pattern='{basename_woext}_subset.fastq.gz', items=self.read1)
self.add_output_file_list("subset_read2", "Subset of the read1 file", pattern='{basename_woext}_subset.fastq.gz', items=self.read2)
self.add_output_file("output_file", "output_file", filename="gg.txt" )
self.add_output_file("output_file", "output_file", filename="reference_number_of_reads_to_extract.txt" )
def get_version(self):
return '-'
@@ -126,10 +126,7 @@ class SubsetSeqFiles (Analysis):
self.options = ""
def post_process(self):
self.options = "gg"
#if self.keep_bam:
# Finally create and add the archive to the analysis
#self._save_files(self.bam_files)
self.options = "Number of reads in the subset files = "
def process(self):
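The new post_process above leaves self.options as a prefix string. If the intent is to report the actual subset size, a hypothetical completion could read the count back from the reference file written by get_number_of_reads_to_extract (the single-value content of that file is an assumption):

    def post_process(self):
        # hypothetical: read the computed read count back from the reference file
        with open(self.output_file) as handle:
            nb_reads = handle.read().strip()
        self.options = "Number of reads in the subset files = " + nb_reads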
@@ -138,29 +135,16 @@ class SubsetSeqFiles (Analysis):
logging.getLogger("SubsetSeqFiles").debug("before subset of reads")
self.add_python_execution(get_number_of_reads_to_extract, cmd_format='{EXE} {ARG} {OUT} {IN}',
map=False, outputs = self.output_file, inputs=self.read1, arguments=[self.extract_rate,self.min_nb_seq, self.max_nb_seq])
# subset_reads = PythonFunction(extract_random_seq, cmd_format="{EXE} {ARG} {IN} {OUT}")
# for i,o in zip(self.read1,subset_read1 ):
# subset_reads(outputs=o, arguments=[self.output_file], inputs=i)
for i,o in zip(self.read1,subset_read1 ):
self.add_python_execution(extract_random_seq,cmd_format="{EXE} {IN} {OUT}",
inputs = [i,self.output_file], outputs = o, map=False)
if self.read2:
subset_read2 = self.get_outputs( '{basename_woext}_subset.fastq', self.read2 )
subset_read2_gz = self.get_outputs( '{basename_woext}_subset.fastq.gz', self.read2 )
#MultiMap(subset_reads( inputs = [self.read1,self.read2], outputs = [subset_read1,subset_read2]) )
# for i,o in zip(self.read2,subset_read2 ):
# subset_reads(outputs=o, arguments=self.output_file, inputs=i)
for i,o in zip(self.read1,subset_read1 ):
self.add_python_execution(extract_random_seq,cmd_format="{EXE} {IN} {OUT}",
inputs = [i,self.output_file], outputs = o, map=False)
for i,o in zip(self.read2,subset_read2 ):
self.add_python_execution(extract_random_seq,cmd_format="{EXE} {IN} {OUT}",
inputs = [i,self.output_file], outputs = o, map=False)
# self.add_python_execution(extract_random_seq,cmd_format="{EXE} " + self.output_file + " {IN} {OUT}",
# inputs = [self.read1,self.read2], outputs = [subset_read1,subset_read2], map=False)
# else:
# MultiMap(subset_reads( inputs = [self.read1], outputs = [subset_read1],includes=self.output_file) )
# self.add_python_execution(extract_random_seq,cmd_format="{EXE} " + self.output_file + " {IN} {OUT}",
# inputs = [self.read1], outputs = [subset_read1], map=False)
gzip = ShellFunction( 'gzip $1', cmd_format='{EXE} {IN} {OUT}')
MultiMap(gzip, inputs=[subset_read1], outputs=[subset_read1_gz])
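The final step runs 'gzip $1' once per subset file through MultiMap, compressing each FASTQ in place so that the declared *.fastq.gz outputs appear. A roughly equivalent plain-Python view, assuming MultiMap applies the ShellFunction to each input/output pair:

    import subprocess

    for fastq in subset_read1:
        # 'gzip <file>' replaces <file> with '<file>.gz', matching subset_read1_gz
        subprocess.check_call(["gzip", fastq])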
......
@@ -35,7 +35,6 @@ class IlluminaQualityCheck (CasavaNG6Workflow):
def define_parameters(self, function="process"):
self.add_input_file("reference_genome", "Which genome should the read being align on")
self.add_parameter("delete_bam", "The BAM are not stored", type=bool, default = False)
self.add_parameter("align_subset_reads", "Align only on subset reads", type=bool, default = False)
self.add_parameter("histogram_width", "Explicitly sets the histogram width, overriding automatic truncation of histogram tail", type=int, default = 800, group="INSERTSIZE section")
self.add_parameter("min_pct", "When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have"+
" fewer than this percentage of overall reads", type=float, default = 0.01, group="INSERTSIZE section")
......
@@ -18,6 +18,7 @@
import os
import glob
import sys
import logging
from ng6.ng6workflow import CasavaNG6Workflow
from ng6.utils import Utils
@@ -40,7 +41,10 @@ class RnaSeqQualityCheck (CasavaNG6Workflow):
def process(self):
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files = self.illumina_process()
logging.getLogger("RnaSeqQualityCheck").debug("process. filtered_read1_files = "+",".join(filtered_read1_files))
logging.getLogger("RnaSeqQualityCheck").debug("process. filtered_read2_files = "+",".join(filtered_read2_files))
logging.getLogger("RnaSeqQualityCheck").debug("process. concat_files = "+",".join(concat_files))
if self.reference_transcriptome != None :
# index the reference transcriptome if not already indexed
indexed_ref = self.reference_transcriptome
@@ -50,9 +54,17 @@ class RnaSeqQualityCheck (CasavaNG6Workflow):
# align reads against indexed genome
sample_lane_prefixes = None
if self.group_prefix != None :
sample_lane_prefixes = list((Utils.get_group_basenames(filtered_read1_files+filtered_read2_files, "lane")).keys())
bwa = self.add_component("BWA", [indexed_ref, filtered_read1_files, filtered_read2_files, sample_lane_prefixes, "mem", not self.delete_bam], parent = fastqilluminafilter)
if self.align_subset_reads:
subset = self.add_component("SubsetSeqFiles", [filtered_read1_files, filtered_read2_files], parent = fastqilluminafilter)
filtered_read1_files = subset.subset_read1
filtered_read2_files = subset.subset_read2
bwa = self.add_component("BWA", [indexed_ref, filtered_read1_files, filtered_read2_files, sample_lane_prefixes, "mem", not self.delete_bam], parent = subset)
else:
bwa = self.add_component("BWA", [indexed_ref, filtered_read1_files, filtered_read2_files, sample_lane_prefixes, "mem", not self.delete_bam], parent = fastqilluminafilter)
# make some statistics on the alignment
alignmentstats = self.add_component("AlignmentStats", [bwa.bam_files, self.is_paired_end()], parent = bwa, component_prefix="bwa")
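Both arms of the new branch issue the same BWA call and differ only in the parent component; a slightly tighter equivalent, using only calls already present in this diff, would be:

    parent = fastqilluminafilter
    if self.align_subset_reads:
        subset = self.add_component("SubsetSeqFiles", [filtered_read1_files, filtered_read2_files], parent = fastqilluminafilter)
        filtered_read1_files = subset.subset_read1
        filtered_read2_files = subset.subset_read2
        parent = subset
    bwa = self.add_component("BWA", [indexed_ref, filtered_read1_files, filtered_read2_files, sample_lane_prefixes, "mem", not self.delete_bam], parent = parent)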
@@ -67,14 +79,29 @@ class RnaSeqQualityCheck (CasavaNG6Workflow):
# spliced alignment of reads against indexed genome
if self.is_paired_end() and (self.group_prefix != None):
logging.getLogger("RnaSeqQualityCheck").debug("process. Dans self.reference_genome != None > if self.is_paired_end() and (self.group_prefix != None):")
# split read 1 and read 2 from filtered files list
[concat_read1_files, concat_read2_files] = Utils.split_pair(concat_files, (self.group_prefix != None))
if self.align_subset_reads:
subset = self.add_component("SubsetSeqFiles", [concat_read1_files, concat_read2_files], component_prefix="star", parent = fastqilluminafilter)
concat_read1_files = subset.subset_read1
concat_read2_files = subset.subset_read2
elif self.group_prefix != None:
logging.getLogger("RnaSeqQualityCheck").debug("process. Dans self.reference_genome != None > elif self.group_prefix != None:")
concat_read1_files = concat_files
concat_read2_files = []
if self.align_subset_reads:
subset = self.add_component("SubsetSeqFiles", [concat_files],component_prefix="star", parent = fastqilluminafilter)
concat_read1_files = subset.subset_read1
concat_read2_files = subset.subset_read2
else:
logging.getLogger("RnaSeqQualityCheck").debug("process. Dans self.reference_genome != None > else")
concat_read1_files = filtered_read1_files
concat_read2_files = filtered_read2_files
if self.align_subset_reads:
subset = self.add_component("SubsetSeqFiles", [concat_read1_files,concat_read2_files], component_prefix="star", parent = fastqilluminafilter)
concat_read1_files = subset.subset_read1
concat_read2_files = subset.subset_read2
concat_read1_files = sorted(concat_read1_files)
concat_read2_files = sorted(concat_read2_files)
@@ -82,9 +109,10 @@ class RnaSeqQualityCheck (CasavaNG6Workflow):
if self.group_prefix != None :
sample_lane_prefixes = sorted(list((Utils.get_group_basenames(concat_read1_files+concat_read2_files, "lane")).keys()))
star = self.add_component("STAR", [reference_genome, index_dir, concat_read1_files, concat_read2_files,sample_lane_prefixes, not self.delete_bam, self.n_threads], parent = fastqilluminafilter)
if self.align_subset_reads:
star = self.add_component("STAR", [reference_genome, index_dir, concat_read1_files, concat_read2_files,sample_lane_prefixes, not self.delete_bam, self.n_threads], parent = subset)
else:
star = self.add_component("STAR", [reference_genome, index_dir, concat_read1_files, concat_read2_files,sample_lane_prefixes, not self.delete_bam, self.n_threads], parent = fastqilluminafilter)
# make some statistics on the alignment
alignmentstats = self.add_component("AlignmentStats", [star.output_bams, self.is_paired_end()], parent = star, component_prefix="star")
......