Commit cb2c6b6b authored by Jerome Mariette's avatar Jerome Mariette
Browse files

new conta

parent 105242ff
......@@ -28,26 +28,26 @@ from weaver.abstraction import Map
from ng6.analysis import Analysis
def extract_random_seq(sequence_files, output_fasta, extract_rate, min_threshold, max_threshold):
def extract_random_seq(sequence_files, output_fasta, extract_rate, min_nb_seq, max_nb_seq):
import ng6.seqio as seqio
extract_rate = float(extract_rate)
min_threshold = int(min_threshold)
max_threshold = int(max_threshold)
min_nb_seq = int(min_nb_seq)
max_nb_seq = int(max_nb_seq)
if extract_rate > 1 or extract_rate <= 0:
raise Exception("[ERROR] : the extract_rate value is not correct! (Should be between 0 and 1)")
if int(min_threshold) > int(max_threshold):
if int(min_nb_seq) > int(max_nb_seq):
raise Exception("[ERROR] : the threshold values are not correct ! (Minimum threshold is bigger than Maximum threshold)")
# Compute min_threshold, max_threshold, nbsubseq, totalseq
# Compute min_nb_seq, max_nb_seq, nbsubseq, totalseq
reader = seqio.SequenceReader(sequence_files)
nb_seq, nb_seq_to_extract = 0, 0
for id, desc, seq, qual in reader :
nb_seq += 1
nb_seq_to_extract = int(nb_seq * float(extract_rate))
# Setting the thresholds if necessary
if nb_seq_to_extract > max_threshold:
nb_seq_to_extract = max_threshold
elif nb_seq_to_extract < min_threshold:
nb_seq_to_extract = min_threshold
if nb_seq_to_extract > max_nb_seq:
nb_seq_to_extract = max_nb_seq
elif nb_seq_to_extract < min_nb_seq:
nb_seq_to_extract = min_nb_seq
if nb_seq < nb_seq_to_extract :
nb_seq_to_extract = nb_seq
step = nb_seq / nb_seq_to_extract
......@@ -61,145 +61,46 @@ def extract_random_seq(sequence_files, output_fasta, extract_rate, min_threshold
count = 0
seqio.writefasta(output, [[id, desc, seq, qual]])
def blast_filter (blast_file, blast_filter_file, pci_default, pcq_default):
import ng6.NCBIXML as NCBIXML
blast_records = NCBIXML.parse(open(blast_file))
blast_output = open(blast_filter_file,"w")
for blast_record in blast_records:
query_subject_info={}
query_subject={}
keep_record = 0
pci_current_list=[]
bits_current_list=[]
for alignment in blast_record.alignments:
no_annot_regions_saved=[]
no_annot_regions_work=[]
nb_no_annot_region=0
for hsp in alignment.hsps:
keep_hsp = 0
if nb_no_annot_region == 0 :
no_annot_regions_saved.append([0,hsp.query_start])
no_annot_regions_saved.append([hsp.query_end,blast_record.query_letters])
nb_no_annot_region=nb_no_annot_region+1
keep_hsp = 1
else :
for [hole_start,hole_end] in no_annot_regions_work :
#for each hole
delete_no_annot_region = 0
# hsp include a no annot region
if hsp.query_start < hole_start and hsp.query_end > hole_end :
delete_no_annot_region = 1
keep_hsp = 1
#hsp start is in a no annot region
if hsp.query_start > hole_start and hsp.query_start < hole_end:
#hsp is in the none annotated region
if (hsp.query_end < hole_end) :
keep_hsp = 1
# adding the 2 new none annotated region to region table
no_annot_regions_saved.append([hole_start,hsp.query_start])
no_annot_regions_saved.append([hsp.query_end,hole_end])
else :
keep_hsp = 1;
# update end of the existing none annotated region
no_annot_regions_saved.append([hole_start,hsp.query_start])
else :
# no annot region include query end
if hsp.query_end > hole_start and hsp.query_end < hole_end :
if hsp.query_start <= hole_start :
keep_hsp = 1;
no_annot_regions_saved.append([hsp.query_end,hole_end])
else :
if delete_no_annot_region == 0 :
no_annot_regions_saved.append([hole_start,hole_end])
if keep_hsp:
perc_identity=float(hsp.identities*100/hsp.align_length)
pci_current_list.append(perc_identity)
bit_score = hsp.bits
bits_current_list.append(bit_score)
no_annot_regions_work=no_annot_regions_saved[:]
no_annot_regions_saved=[]
if nb_no_annot_region > 0 :
#info blast hit recovering
# query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
query_id = blast_record.query
subject_id = alignment.hit_id
alignment_length = hsp.align_length
gap_opens = hsp.gaps
mismatches = blast_record.query_letters-gap_opens-perc_identity
q_start = hsp.query_start
q_end = hsp.query_end
s_start = hsp.sbjct_start
s_end = hsp.sbjct_end
evalue = hsp.expect
info = query_id+"$"+subject_id+"$"+str(perc_identity)+"$"+str(alignment_length)+"$"+str(mismatches)+"$"+str(gap_opens)+"$"+str(q_start)+"$"+str(q_end)+"$"+str(s_start)+"$"+str(s_end)+"$"+str(evalue)
key = blast_record.query +"$"+alignment.hit_id
if not query_subject.has_key(key):
query_subject_info[key]=[]
query_subject[key]=[]
hole_sum=0
for [hole_start,hole_end] in no_annot_regions_work :
hole_sum += hole_end - hole_start + 1
pcq = (blast_record.query_length - hole_sum ) *100 / blast_record.query_length
pci_mean = round(float(sum(pci_current_list))/float(len(pci_current_list)),2) if len(pci_current_list) > 0 else float('nan')
bits_mean = round(float(sum(bits_current_list))/float(len(bits_current_list)),2) if len(bits_current_list) > 0 else float('nan')
pci_current_list=[]
query_subject_info[key]=info
if pci_mean >= pci_default and pcq >= pcq_default :
query_subject[key]= bits_mean
max = [0,""]
for key in query_subject :
if query_subject[key] != []:
if query_subject[key] > max[0]:
max[0] = query_subject[key]
max[1] = key
if max[0] != 0 :
list = query_subject_info[max[1]].split("$")
ligne = str("\t".join(list)+"\t"+str(max[0]))
blast_output.write(ligne+"\n")
blast_output.close()
class ContaminationSearch (Analysis):
def define_parameters(self, sequence_files, databank, e_value=1e-5 ,extract_rate=0.05, min_threshold=20000, max_threshold=1e6,
pci_default=90, pcq_default=90, archive_name=None):
def define_parameters(self, sequence_files, databank, extract_rate=0.05, min_nb_seq=20000, max_nb_seq=1e6,
archive_name=None):
self.sequence_files=InputFileList(sequence_files, Formats.FASTQ)
self.databank = databank
self.e_value = int(e_value)
self.extract_rate = float(extract_rate)
self.min_threshold = int(min_threshold)
self.max_threshold = int(max_threshold)
self.pci_default = int(pci_default)
self.pcq_default = int(pcq_default)
self.min_nb_seq = int(min_nb_seq)
self.max_nb_seq = int(max_nb_seq)
self.sequence_files = InputFileList(sequence_files, Formats.FASTQ)
self.sub_fasta_files = OutputFileList(self.get_outputs('{basename_woext}_sub.fasta', self.sequence_files))
self.blast_files = OutputFileList(self.get_outputs('{basename_woext}.blast', self.sequence_files))
self.blast_filter_files = OutputFileList(self.get_outputs('{basename_woext}_filter.blast', self.sequence_files))
self.html_files = OutputFileList(self.get_outputs('{basename_woext}.html', self.sequence_files))
self.krona_stdout = OutputFileList(self.get_outputs('{basename_woext}.stdout', self.sequence_files))
self.archive_name = archive_name
def define_analysis(self):
self.name = "ContaminationSearch"
self.description = "Contamination seeked on a subsample."
self.description = "Contamination sought on a subset of sequences."
self.software = "blastn"
self.options = str(self.extract_rate)+" "+str(self.min_threshold)+" "+str(self.max_threshold)+" "+self.databank+" "+ \
str(self.pci_default)+" "+str(self.pcq_default)
self.options = str(self.extract_rate)+" "+str(self.min_nb_seq)+" "+str(self.max_nb_seq)+" "+self.databank
def post_process(self):
import ng6.seqio as seqio
# Add the number of sequences in the subset
for sub_fasta_file in self.sub_fasta_files:
reader = seqio.SequenceReader(sub_fasta_file)
nb_seq = 0
for id, desc, seq, qual in reader :
nb_seq += 1
sample = os.path.splitext(os.path.basename(sub_fasta_file))[0][:-4]
self._add_result_element(sample, "nb_seq_extracted", nb_seq)
# Add the krona file
for html_file in self.html_files:
sample = os.path.splitext(os.path.basename(html_file))[0]
self._add_result_element(sample, "html", self._save_file(html_file))
if os.path.isdir(html_file+".files"):
self._save_directory(html_file+".files")
def get_version(self):
cmd = [self.get_exec_path("blastn"),"-version"]
......@@ -208,16 +109,12 @@ class ContaminationSearch (Analysis):
return stdout.split()[1]
def process(self):
file_extraction_options= " ".join(self.options.split()[0:3])
file_extraction = PythonFunction(extract_random_seq, cmd_format="{EXE} {IN} {OUT} " + file_extraction_options)
file_extraction = PythonFunction(extract_random_seq, cmd_format="{EXE} {IN} {OUT} " + " ".join([str(self.extract_rate),
str(self.min_nb_seq), str(self.max_nb_seq)]))
file_extraction = Map(file_extraction, inputs=self.sequence_files, outputs=self.sub_fasta_files)
blastn = ShellFunction(self.get_exec_path("blastn") + " -task megablast -best_hit_overhang 0.25 -best_hit_score_edge " + \
"0.25 -evalue 1e-5 -perc_identity 90 -outfmt 5 -db " + self.databank + " -query $1 -out $2",
blastn = ShellFunction(self.get_exec_path("blastn") + " -max_target_seqs 50 -outfmt 7 -db " + self.databank + " -query $1 -out $2",
cmd_format='{EXE} {IN} {OUT}')
blastn = Map(blastn, inputs=self.sub_fasta_files, outputs=self.blast_files)
blast_filter_options = " ".join(self.options.split()[-2:])
blastfilter = PythonFunction(blast_filter, cmd_format= "{EXE} {IN} {OUT} " + blast_filter_options)
blastfilter = Map(blastfilter, self.blast_files, self.blast_filter_files)
kronaImportBLAST = ShellFunction(self.get_exec_path("kronaImportBLAST") + " $1 -o $2 > $3", cmd_format='{EXE} {IN} {OUT}')
kronaImportBLAST = MultiMap(kronaImportBLAST, inputs = self.blast_filter_files, outputs = [self.html_files, self.krona_stdout])
kronaImportBLAST = ShellFunction(self.get_exec_path("kronaImportBLAST") + " -i -b $1 -o $2 > $3", cmd_format='{EXE} {IN} {OUT}')
kronaImportBLAST = MultiMap(kronaImportBLAST, inputs = self.blast_files, outputs = [self.html_files, self.krona_stdout])
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment