Commit c9609cc3 authored by Penom Nom's avatar Penom Nom
Browse files

Correction of illumina_rnaseq (tophat component) and add test data

parent 94d8e153
......@@ -62,7 +62,7 @@ tmp_directory = <path>/tmp
#CollectInsertSizeMetrics = /usr/bin/CollectInsertSizeMetrics.jar
#MarkDuplicates = /usr/bin/MarkDuplicates.jar
#mothur = /usr/bin/mothur
#bwt2_index = /usr/bin/bowtie2-build
#bowtie_build = /usr/bin/bowtie2-build
#tophat = /usr/bin/tophat
#gene_body_cov = /usr/bin/geneBody_coverage.py
#infer_experiment = /usr/bin/infer_experiment.py
......
......@@ -18,18 +18,17 @@
import os
from jflow.component import Component
from jflow.iotypes import OutputFile, OutputFileList, InputFileList, Formats
from jflow.iotypes import OutputFile, OutputFileList, InputFileList, InputFile, Formats
from weaver.function import PythonFunction, ShellFunction
from weaver.function import PythonFunction
def bwt2_index(exec_path, index, input_fasta, databank_fasta, stdout_path, stderr_path):
def bowtie_build(exec_path, databank, input_fasta, stdout_path, stderr_path, output_fasta):
from subprocess import Popen, PIPE
# first make the symbolic link
os.symlink(input_fasta, databank_fasta)
# execute bwt2 index
cmd = [exec_path, databank_fasta, index]
# symlink of fasta
os.symlink(input_fasta, output_fasta)
cmd = [exec_path, input_fasta, databank]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
# write down the stdout
......@@ -41,19 +40,18 @@ def bwt2_index(exec_path, index, input_fasta, databank_fasta, stdout_path, stder
stdeh.write(stderr)
stdeh.close()
class BWT2Index (Component):
class BowtieBuild (Component):
def define_parameters(self, input_fasta):
self.input_fasta = InputFileList(input_fasta, Formats.FASTA)
self.stdout = OutputFileList(os.path.join(self.output_directory, "bwt2index.stdout"))
self.stderr = OutputFileList(os.path.join(self.output_directory, "bwt2index.stderr"))
self.databank_fasta = OutputFileList(self.get_outputs('{basename}', self.input_fasta), Formats.FASTA)
self.index = self.get_outputs('{basename_woext}', self.input_fasta)[0]
self.input_fasta = InputFile(input_fasta, Formats.FASTA)
self.databank = os.path.join(self.output_directory, os.path.splitext(os.path.basename(input_fasta))[0] )
self.output_fasta = OutputFile(self.databank + '.fa')
self.stdout = OutputFile(os.path.join(self.output_directory, "bowtie_build.stdout"))
self.stderr = OutputFile(os.path.join(self.output_directory, "bowtie_build.stderr"))
def process(self):
buildindex = PythonFunction(bowtie_build, cmd_format="{EXE} {ARG} {IN} {OUT}")
buildindex(inputs=self.input_fasta, outputs=[ self.stdout, self.stderr, self.output_fasta], arguments=[self.get_exec_path("bowtie_build"), self.databank])
bwt2index = PythonFunction(bwt2_index, cmd_format="{EXE} {ARG} {IN} {OUT}")
bwt2index(inputs=self.input_fasta, outputs=[self.databank_fasta, self.stdout, self.stderr], arguments=[self.get_exec_path("bwt2_index"), self.index])
\ No newline at end of file
......@@ -16,73 +16,57 @@
#
import os
import re
from subprocess import Popen, PIPE
from jflow.component import Component
from jflow.iotypes import OutputDirectoryList, OutputFileList, InputFileList, Formats
from jflow.abstraction import MultiMap
from jflow.iotypes import OutputFile, OutputFileList, InputFileList, InputFile, Formats
from weaver.function import ShellFunction
from weaver.abstraction import Map
from ng6.analysis import Analysis
from ng6.utils import Utils
class TopHat (Analysis):
def define_parameters(self, index_ref_genome, read1, read2, max_intron_length="25000", nb_threads="6", keep_bam=True, archive_name=None):
def define_parameters(self, index_basename, read1, read2, keep_bam=True, mate_inner_dist = 50):
self.index_basename = index_basename
self.input_fasta = InputFile(index_basename + '.fa')
self.read1 = InputFileList(read1, Formats.FASTQ)
self.read2 = None
if read2:
self.read2 = InputFileList(read2, Formats.FASTQ)
self.output_folder = OutputDirectoryList(self.get_outputs('{basename_woext}', [self.read1, self.read2]))
self.stderr = OutputFileList(self.get_outputs('{basename_woext}.stderr', [self.read1, self.read2]))
else:
self.output_folder = OutputDirectoryList(self.get_outputs('{basename_woext}', self.read1))
self.stderr = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.read1))
self.ref_genome = InputFileList(index_ref_genome + ".fa", Formats.FASTA)
self.index_ref_genome = index_ref_genome
self.pre_bam_files = OutputFileList(self.get_outputs('{basename}/accepted_hits.bam', self.output_folder), Formats.BAM)
self.bam_files=[]
for output_folder in self.output_folder:
self.bam_files.append(output_folder+'/' + os.path.basename(output_folder) + '.bam')
self.bam_files = OutputFileList(self.bam_files, Formats.BAM)
self.max_intron_length = max_intron_length
self.nb_threads = nb_threads
self.mate_inner_dist = mate_inner_dist
self.keep_bam = keep_bam
self.stderr = OutputFileList(os.path.join(self.output_directory, "tophat2.stderr"))
if read2 :
self.read2 = InputFileList(read2, Formats.FASTQ)
self.accepted_hits = OutputFileList(os.path.join(self.output_directory, "accepted_hits.bam"), Formats.BAM)
def define_analysis(self):
self.name = "TopHat Alignment"
self.description = "Spliced transcripts alignment to whole genomes."
self.software = "tophat2"
self.options = self.max_intron_length + " " + self.nb_threads
self.software = "tophat"
def post_process(self):
if self.keep_bam:
# Finally create and add the archive to the analysis
self._save_files(self.bam_files)
def get_version(self):
cmd = [self.get_exec_path("tophat2"), "-v"]
cmd = [self.get_exec_path("tophat"), "-v"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stdout.split()[1]
def process(self):
command = [self.get_exec_path("tophat"), '-r', str(self.mate_inner_dist), "-o",self.output_directory, self.index_basename ]
if self.read2 :
tophat = ShellFunction( ' '.join(command + ["$1 $2 2> $4"]), cmd_format='{EXE} {IN} {OUT}')
tophat(inputs = [self.read1,self.read2, self.input_fasta], outputs = [self.stderr, self.accepted_hits])
else :
tophat = ShellFunction(' '.join(command + ["$1 2> $3"]), cmd_format='{EXE} {IN} {OUT}')
tophat(inputs = [self.read1, self.input_fasta],outputs = [self.stderr, self.accepted_hits])
if self.read2:
tophat = ShellFunction(self.get_exec_path("tophat2") + " -p " + self.nb_threads + " -I " + self.max_intron_length + " -o $3 " + self.index_ref_genome + " $1 $2 2> $4", cmd_format='{EXE} {IN} {OUT}')
tophat= MultiMap(tophat, inputs=[self.read1,self.read2], outputs=[self.output_folder, self.stderr, self.pre_bam_files], includes=self.ref_genome)
else:
tophat = ShellFunction(self.get_exec_path("tophat2") + " -p " + self.nb_threads + " -I " + self.max_intron_length + " -o $2 " + self.index_ref_genome + " $1 2> $3 ", cmd_format='{EXE} {IN} {OUT}')
tophat= MultiMap(tophat, inputs=self.read1, outputs=[self.output_folder, self.stderr, self.pre_bam_files], includes=self.ref_genome)
mv = ShellFunction("/bin/mv $1 $2", cmd_format='{EXE} {IN} {OUT}')
mv = MultiMap(mv, inputs=self.pre_bam_files, outputs=self.bam_files)
\ No newline at end of file
......@@ -115,10 +115,10 @@ class RnaSeqQualityCheck (NG6Workflow):
if self.args["reference_genome"] is not None :
# index the reference genome
indexed_ref = os.path.splitext(self.args["reference_genome"])[0]
if not os.path.exists( indexed_ref + ".1.bt2" ):
bwt2index = self.add_component("BWT2Index", [self.args["reference_genome"]])
indexed_ref = bwt2index.index
bowtie_index = os.path.splitext(self.args["reference_genome"])[0]
if not os.path.exists( bowtie_index + ".1.bt2" ):
bowtie2build = self.add_component("BowtieBuild", [self.args["reference_genome"]])
bowtie_index = bowtie2build.databank
# spliced alignment of reads against indexed genome
if is_paired_end and (group_prefix is not None):
......@@ -132,11 +132,11 @@ class RnaSeqQualityCheck (NG6Workflow):
concat_read2_files = filtered_read2_files
concat_read1_files = sorted(concat_read1_files)
concat_read2_files = sorted(concat_read2_files)
tophat = self.add_component("TopHat", [indexed_ref, concat_read1_files, concat_read2_files, "25000", "6", not self.args["delete_bam"]], parent = fastqilluminafilter)
tophat = self.add_component("TopHat", [bowtie_index, concat_read1_files, concat_read2_files, not self.args["delete_bam"]], parent = fastqilluminafilter)
# make some statistic on the alignement
alignmentstats = self.add_component("AlignmentStats", [tophat.bam_files, is_paired_end], parent = tophat, component_prefix="tophat")
alignmentstats = self.add_component("AlignmentStats", [tophat.accepted_hits, is_paired_end], parent = tophat, component_prefix="tophat")
#Quality RNA Seq analysis
if self.args["annotation"] is not None:
rseqc = self.add_component("RSeQC", [tophat.bam_files, self.args["annotation"], is_paired_end], parent = tophat)
\ No newline at end of file
rseqc = self.add_component("RSeQC", [tophat.accepted_hits, self.args["annotation"], is_paired_end], parent = tophat)
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
--keep
all
--reference-transcriptome
workflows/illumina_rnaseq/data/Danio_cdna_sample.fa
--reference-genome
workflows/illumina_rnaseq/data/Danio_chr10_sample.fa
--read-1
workflows/illumina_rnaseq/data/ERR003990_sample_1.fastq.gz
--read-2
workflows/illumina_rnaseq/data/ERR003990_sample_2.fastq.gz
--compression
gz
--delete-bam
# for rseqc analysis
--annotation
workflows/illumina_rnaseq/data/Danio_rerio.Zv9.75.sample.bed
#run info
--project-id
1
--name
test_illumina_rnaseq
--description
Test illumina rnaseq
--date
22/08/2014
--data-nature
RNA
--sequencer
HiSeq 2000
--species
Danio rerio
--type
test type
......@@ -45,12 +45,14 @@ read_1.name = read 1
read_1.flag = --read-1
read_1.help = Which read1 files should be used
read_1.action = append
read_1.type = localfile
read_1.required = True
read_1.exclude = casava_directory
read_2.name = read 2
read_2.flag = --read-2
read_2.help = Which read2 files should be used (if single end, leave empty)
read_2.type = localfile
read_2.action = append
keep_reads.name = keep reads
......@@ -73,16 +75,19 @@ compression.choices = none|gz|bz2
reference_genome.name = reference genome
reference_genome.flag = --reference-genome
reference_genome.type = localfile
reference_genome.help = Which genome should the read being align on
reference_transcriptome.name = reference transcriptome
reference_transcriptome.flag = --reference-transcriptome
reference_transcriptome.type = localfile
reference_transcriptome.help = Which transcriptome should the read being align on
annotation.name = annotation
annotation.flag = --annotation
annotation.help = Which annotation file should be used for processing RNA Seq quality
annotation.action = append
annotation.type = localfile
databank.name = databank
databank.flag = --databank
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment