Commit 232124ed authored by Jerome Mariette's avatar Jerome Mariette
Browse files

No commit message

No commit message
parent b06af6cd
......@@ -50,7 +50,7 @@ runAssembly = /usr/bin/runAssembly
bwa = /usr/bin/bwa
samtools = /usr/bin/samtools
kronaImportBLAST = /usr/local/bin/ktImportBLAST
blastx = /usr/bin/blastx
blastn = /usr/bin/blastn
fastq_illumina_filter = /usr/bin/fastq_illumina_filter
CollectInsertSizeMetrics = /usr/bin/CollectInsertSizeMetrics.jar
......
......@@ -18,56 +18,52 @@
import os
from subprocess import Popen, PIPE
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
from jflow.abstraction import MultiMap
from ng6.analysis import Analysis
from weaver.function import ShellFunction,PythonFunction
from weaver.abstraction import Map
from ng6.analysis import Analysis
def extract_random_seq(input_fastq, output_fasta, extract_rate, min_threshold, max_threshold):
import ng6.seqio as seqio
from subprocess import Popen, PIPE
if float(extract_rate) > 1 or float(extract_rate) <= 0:
os.sys.stderr.write("[ERROR] : the extract_rate value is not correct ! (Should be between 0 and 1)")
os.sys.exit()
def extract_random_seq(sequence_files, output_fasta, extract_rate, min_threshold, max_threshold):
import ng6.seqio as seqio
extract_rate = float(extract_rate)
min_threshold = int(min_threshold)
max_threshold = int(max_threshold)
if extract_rate > 1 or extract_rate <= 0:
raise Exception("[ERROR] : the extract_rate value is not correct! (Should be between 0 and 1)")
if int(min_threshold) > int(max_threshold):
os.sys.stderr.write("[ERROR] : the threshold values are not correct ! (Minimum threshold is bigger than Maximum threshold)")
os.sys.exit()
#Compute min_threshold, max_threshold, nbsubseq, totalseq
cmd=["wc","-l",input_fastq]
p=Popen(cmd,stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
totalseq=int(int(stdout.split()[0])/4)
nbsubseq=int(totalseq * float(extract_rate))
#Setting the thresholds if necessary
if nbsubseq > max_threshold:
nbsubseq = max_threshold
elif nbsubseq < min_threshold :
nbsubseq = min_threshold
if totalseq < min_threshold:
nbsubseq = totalseq
step=int(totalseq / nbsubseq)
count=0
#Writting into the fasta file from fastq file
reader = seqio.SequenceReader(input_fastq)
with open(output_fasta,"w") as output:
raise Exception("[ERROR] : the threshold values are not correct ! (Minimum threshold is bigger than Maximum threshold)")
# Compute min_threshold, max_threshold, nbsubseq, totalseq
reader = seqio.SequenceReader(sequence_files)
nb_seq, nb_seq_to_extract = 0, 0
for id, desc, seq, qual in reader :
nb_seq += 1
nb_seq_to_extract = int(nb_seq * float(extract_rate))
# Setting the thresholds if necessary
if nb_seq_to_extract > max_threshold:
nb_seq_to_extract = max_threshold
elif nb_seq_to_extract < min_threshold:
nb_seq_to_extract = min_threshold
if nb_seq < nb_seq_to_extract :
nb_seq_to_extract = nb_seq
step = nb_seq / nb_seq_to_extract
count = 0
# Writes result in fasta file
reader = seqio.SequenceReader(sequence_files)
with open(output_fasta, "w") as output:
for id, desc, seq, qual in reader :
count+=1
if count==step:
count=0
count += 1
if count == step:
count = 0
seqio.writefasta(output, [[id, desc, seq, qual]])
def blast_filter (blast_file, blast_filter_file, pci_default, pcq_default):
import ng6.NCBIXML as NCBIXML
blast_records = NCBIXML.parse(open(blast_file))
blast_output = open(blast_filter_file,"w")
......@@ -142,7 +138,6 @@ def blast_filter (blast_file, blast_filter_file, pci_default, pcq_default):
evalue = hsp.expect
info = query_id+"$"+subject_id+"$"+str(perc_identity)+"$"+str(alignment_length)+"$"+str(mismatches)+"$"+str(gap_opens)+"$"+str(q_start)+"$"+str(q_end)+"$"+str(s_start)+"$"+str(s_end)+"$"+str(evalue)
key = blast_record.query +"$"+alignment.hit_id
if not query_subject.has_key(key):
......@@ -151,21 +146,16 @@ def blast_filter (blast_file, blast_filter_file, pci_default, pcq_default):
hole_sum=0
for [hole_start,hole_end] in no_annot_regions_work :
hole_sum += hole_end - hole_start + 1
pcq = (blast_record.query_length - hole_sum ) *100 / blast_record.query_length
pci_mean = round(float(sum(pci_current_list))/float(len(pci_current_list)),2) if len(pci_current_list) > 0 else float('nan')
bits_mean = round(float(sum(bits_current_list))/float(len(bits_current_list)),2) if len(bits_current_list) > 0 else float('nan')
pci_current_list=[]
query_subject_info[key]=info
if pci_mean >= pci_default and pcq >= pcq_default :
query_subject[key]= bits_mean
max = [0,""]
max = [0,""]
for key in query_subject :
if query_subject[key] != []:
if query_subject[key] > max[0]:
......@@ -176,43 +166,41 @@ def blast_filter (blast_file, blast_filter_file, pci_default, pcq_default):
list = query_subject_info[max[1]].split("$")
ligne = str("\t".join(list)+"\t"+str(max[0]))
blast_output.write(ligne+"\n")
#print "END "+max[1], max[0]
blast_output.close()
class ContaminationSearch (Analysis):
def define_parameters(self, fastq_files, databank, e_value=1e-5 ,extract_rate=0.05, min_threshold=20000, max_threshold=1e6, pci_default=90, pcq_default=90, archive_name=None):
self.fastq_files=InputFileList(fastq_files, Formats.FASTQ)
self.sub_fasta_files=OutputFileList(self.get_outputs('{basename_woext}_sub.fasta', self.fastq_files))
def define_parameters(self, sequence_files, databank, e_value=1e-5 ,extract_rate=0.05, min_threshold=20000, max_threshold=1e6,
pci_default=90, pcq_default=90, archive_name=None):
self.sequence_files=InputFileList(sequence_files, Formats.FASTQ)
self.databank = databank
self.e_value=int(e_value)
self.e_value = int(e_value)
self.extract_rate = float(extract_rate)
self.min_threshold = int(min_threshold)
self.max_threshold = int(max_threshold)
self.pci_default = int(pci_default)
self.pcq_default = int(pcq_default)
self.blast_files=OutputFileList(self.get_outputs('{basename_woext}.blast', self.fastq_files))
self.blast_filter_files=OutputFileList(self.get_outputs('{basename_woext}_filter.blast', self.fastq_files))
self.html_files=OutputFileList(self.get_outputs('{basename_woext}.html', self.fastq_files))
self.krona_stdout=OutputFileList(self.get_outputs('{basename_woext}.stdout', self.fastq_files))
self.archive_name=archive_name
self.sub_fasta_files = OutputFileList(self.get_outputs('{basename_woext}_sub.fasta', self.sequence_files))
self.blast_files = OutputFileList(self.get_outputs('{basename_woext}.blast', self.sequence_files))
self.blast_filter_files = OutputFileList(self.get_outputs('{basename_woext}_filter.blast', self.sequence_files))
self.html_files = OutputFileList(self.get_outputs('{basename_woext}.html', self.sequence_files))
self.krona_stdout = OutputFileList(self.get_outputs('{basename_woext}.stdout', self.sequence_files))
self.archive_name = archive_name
def define_analysis(self):
self.name = "Contamination"
self.description = "Contamination Distribution"
self.name = "ContaminationSearch"
self.description = "Contamination seeked on a subsample."
self.software = "blastn"
self.options = str(self.extract_rate)+" "+str(self.min_threshold)+" "+str(self.max_threshold)+" "+self.databank+" "+str(self.pci_default)+" "+str(self.pcq_default)
self.options = str(self.extract_rate)+" "+str(self.min_threshold)+" "+str(self.max_threshold)+" "+self.databank+" "+ \
str(self.pci_default)+" "+str(self.pcq_default)
def post_process(self):
for html_file in self.html_files:
sample = os.path.splitext(os.path.basename(html_file))[0]
self._add_result_element(sample, "html", self._save_file(html_file))
def get_version(self):
cmd = [self.get_exec_path("blastn"),"-version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
......@@ -222,14 +210,14 @@ class ContaminationSearch (Analysis):
def process(self):
file_extraction_options= " ".join(self.options.split()[0:3])
file_extraction = PythonFunction(extract_random_seq, cmd_format="{EXE} {IN} {OUT} " + file_extraction_options)
file_extraction = Map(file_extraction, inputs=self.fastq_files, outputs=self.sub_fasta_files)
blastn = ShellFunction(self.get_exec_path("blastn") + " -task megablast -best_hit_overhang 0.25 -best_hit_score_edge 0.25 -evalue 1e-5 -perc_identity 90 -outfmt 5 -db " + self.databank + " -query $1 -out $2", cmd_format='{EXE} {IN} {OUT}')
blastn = Map(blastn, self.sub_fasta_files, self.blast_files)
file_extraction = Map(file_extraction, inputs=self.sequence_files, outputs=self.sub_fasta_files)
blastn = ShellFunction(self.get_exec_path("blastn") + " -task megablast -best_hit_overhang 0.25 -best_hit_score_edge " + \
"0.25 -evalue 1e-5 -perc_identity 90 -outfmt 5 -db " + self.databank + " -query $1 -out $2",
cmd_format='{EXE} {IN} {OUT}')
blastn = Map(blastn, inputs=self.sub_fasta_files, outputs=self.blast_files)
blast_filter_options = " ".join(self.options.split()[-2:])
blastfilter = PythonFunction(blast_filter, cmd_format= "{EXE} {IN} {OUT} " + blast_filter_options)
blastfilter = Map(blastfilter, self.blast_files, self.blast_filter_files)
kronaImportBLAST = ShellFunction(self.get_exec_path("kronaImportBLAST") + " $1 -o $2 > $3", cmd_format='{EXE} {IN} {OUT}')
kronaImportBLAST = MultiMap(kronaImportBLAST, inputs = self.blast_filter_files, outputs = [self.html_files, self.krona_stdout])
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment