Commit e6793e5e authored by Jerome Mariette

use Smarty instead of XSL to generate the analyse view

parent a4ae8437
......@@ -31,10 +31,11 @@ from Bio.SeqRecord import SeqRecord
def version_string ():
"""
Return the pyrocleaner version
Return the adaptorcleaner version
"""
return "adaptorcleaner " + __version__
def mask_sequences (fafile, tab_adapt, len_adapt, options, log, minscore,minmatch):
"""
Search for and mask adaptors in the sequences
......@@ -43,7 +44,6 @@ def mask_sequences (fafile, tab_adapt, len_adapt, options, log, minscore,minmatc
@param adaptators_found : hash table, keyed by sequence id, recording the adaptor hits found
@param options : the options provided by the user
"""
# 30 0.00 3.23 0.00 F7QVD2L01CVLN4 15 45 (804) primer2 1 32 (0)
# Cross_match primer2 matches pattern
rev_regex = re.compile("(\s+)?(\d+)\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
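# Capture groups above: leading whitespace, score, %subst, then the read id, read start/end, and (after the 'C' complement flag) the oligo name and its coordinates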
......@@ -65,7 +65,7 @@ def mask_sequences (fafile, tab_adapt, len_adapt, options, log, minscore,minmatc
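# Run cross_match with uniform -1 penalties; the -screen flag makes it write a masked copy of the input fasta as <fafile>.screen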
cmd = "cross_match " + fafile + " " + os.path.join(options.output, os.path.basename(options.fasta) + "."+ adapt_ids + ".oligo") + " -minmatch "+str(minmatch)+" -minscore "+str(minscore)+" -penalty -1 -gap_init -1 -gap_ext -1 -ins_gap_ext -1 -del_gap_ext -1 -raw -screen > " + os.path.join(options.output, os.path.basename(options.fasta)) + "."+ adapt_ids + ".cross_match.res"
os.system(cmd)
log.write("###Adapt : "+ adapt_ids + "\n"+cmd+"\n")
log.write("###Adaptor : "+ adapt_ids + "\n"+cmd+"\n")
os.system ("mv "+ fafile + ".screen " + os.path.join(options.output, os.path.basename(options.fasta)+ "."+ adapt_ids + ".screen"))
log.write("Read\tStrand\tOligo\tReadStart\tReadEnd\tOligoStart\tOligoEnd\tScore\n")
......@@ -88,24 +88,26 @@ def mask_sequences (fafile, tab_adapt, len_adapt, options, log, minscore,minmatc
break
if save :
log.write("%"+primary_match+"\t"+strand+"\t"+secondary_match+"\t"+str(startFirstMatch)+"\t"+str(endFirstMatch)+"\t"+str(startSecondMatch)+"\t"+str(endSecondMatch)+"\t"+str(score)+"\n")
try :
adaptators_found[primary_match].append([strand, secondary_match, startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch, score])
except KeyError :
adaptators_found[primary_match] = [[strand, secondary_match, startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch, score]]
log.write("###Nombre de sequences avec l'adaptateur "+ adapt_ids + " " +str(len(adaptators_found.keys())) +" \n")
log.write("%"+primary_match+"\t"+strand+"\t"+secondary_match+"\t"+str(startFirstMatch)+"\t"+str(endFirstMatch)+"\t"+str(startSecondMatch)+"\t"+str(endSecondMatch)+"\t"+str(score)+"\n")
log.write("###Number of sequences with adaptor "+ adapt_ids + " " +str(len(adaptators_found.keys())) +" \n")
#Clean up temp files
# Clean up temp files
os.remove(os.path.join(options.output, os.path.basename(options.fasta) + "."+ adapt_ids + ".oligo"))
os.remove(os.path.join(options.output, os.path.basename(options.fasta)) + "."+ adapt_ids + ".cross_match.res")
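# Return the masked .screen file so the caller can feed it back in as the fasta for the next adaptor batch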
return(os.path.join(options.output, os.path.basename(options.fasta) + "."+ adapt_ids + ".screen"))
def get_longest_subsequence (sequence):
"""
Extract the longest subsequence that is not masked with X
@param sequence : string of the sequence
@return : coords [X,Y] of the sub sequence
@param sequence : string of the sequence
@return : coords [X,Y] of the sub sequence
"""
coords = []
inc = 0
......@@ -124,7 +126,7 @@ def get_longest_subsequence (sequence):
indic = 0
if indic == 0:
coords.append([start,inc+1])
#stop.append(inc)
maxi = 0
inc = 0
select = -1
......@@ -139,8 +141,7 @@ def get_longest_subsequence (sequence):
else :
return [0,0]
def get_adaptators (file, log):
def get_adaptators (file, log):
"""
Return the adaptors in a BioPython seq table
@param file : the fasta file of the adaptors
......@@ -163,14 +164,13 @@ def get_adaptators (file, log):
log.write (">"+seqrecord.id+"\n")
log.write (seqrecord.seq.tostring()+"\n")
seqs_len[len(seqrecord.seq)].append(seqrecord)
# sort according to adaptor length.
return (seqs_len, seqs_parameters)
def which (program):
"""
Return whether the requested program exists in the user PATH
@param options : the options asked by the user
@param program : the program to test
"""
import os
def is_exe(fpath):
......@@ -186,6 +186,7 @@ def which (program):
return exe_file
return None
def depts_error(options):
"""
Return the list of dependencies missing to run the program
......@@ -197,6 +198,7 @@ def depts_error(options):
error = "Cannot execute %prog, following dependencies are missing :\n" + error + "Please install them before to run!"
return error
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog -f FILE -q qual -a adaptor -o DIRECTORY", description = "Delete adaptors in the fasta file.", version = version_string())
......@@ -225,8 +227,7 @@ if __name__ == '__main__':
igroup.add_option("-l", "--minlen", dest="minlen",
help="The minimum length after trimming to keep the sequence ", default=150 )
parser.add_option_group(tgroup)
(options, args) = parser.parse_args()
if depts_error(options):
......@@ -242,7 +243,7 @@ if __name__ == '__main__':
elif os.path.exists(os.path.splitext(options.fasta)[0]+".qual"):
options.qual = os.path.splitext(options.fasta)[0]+".qual"
else :
print "Qual file is require\n"
print "A quality file is required\n"
parser.print_help()
sys.exit(1)
global log
......@@ -254,10 +255,10 @@ if __name__ == '__main__':
log.write("## with the following options: \n")
opts = ""
if options.adaptator :
opts += " - Minscore : " + str(options.minscore) +" % de la longueur de l'adaptateur\n"
opts += " - Minmatch : " + str(options.minmatch) +" % de la longueur de l'adaptateur\n"
opts += " - Minlen : " + str(options.minlen) +" longueur minimum de la sequence apres nettoyage\n"
opts += " - Fichier d'adaptateurs : "+ options.adaptator+" \n"
opts += " - Minscore = " + str(options.minscore) +"% of the adaptor length\n"
opts += " - Minmatch = " + str(options.minmatch) +"% de la longueur de l'adaptateur\n"
opts += " - Min seqeunce length after adaptor removal = " + str(options.minlen) +"\n"
opts += " - Adaptor file : "+ options.adaptator+" \n"
log.write(opts)
......@@ -268,7 +269,8 @@ if __name__ == '__main__':
start_file =os.path.join(options.output,os.path.basename(options.fasta))
screen_file=start_file
previous_file=start_file
# creation dun lien sym car cross_match cree le .screen au meme endroit que le fasta
# Create a symlink because cross_match writes the .screen file next to the input fasta
os.system ("ln -s "+options.fasta+ " "+ screen_file)
for adapt_len in sorted(iadapt.keys(),reverse=True):
(minscore,minmatch)=adapt_parameters[adapt_len]
......@@ -279,7 +281,7 @@ if __name__ == '__main__':
if os.path.exists(previous_file+ ".log"):
os.system ("rm " + previous_file+ ".log")
previous_file=screen_file
# read the screen and quality files.
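# Index the masked reads by id so each quality record can be paired with its sequence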
record_dict = SeqIO.to_dict(SeqIO.parse(open(screen_file), "fasta"))
new_fasta = []
for rec in SeqIO.parse(open(options.qual,"r"), "qual") :
......@@ -310,15 +312,15 @@ if __name__ == '__main__':
output_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.fasta))[0] + ".adaptorcleaner.fasta.qual")
qual = open(output_qual, "w")
SeqIO.write(new_fasta, qual, "qual")
#clean output directory
# clean output directory
os.system ("unlink "+start_file)
os.system ("rm "+screen_file)
# 4 - Display the result summary
log.write("## Nombre de sequences analysees :"+ str(len (record_dict)) + "\n")
log.write("## Number of sequences analysed :"+ str(len (record_dict)) + "\n")
log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n")
log.close()
sys.exit(0)
\ No newline at end of file
......@@ -46,58 +46,37 @@ class AdaptorcleanerAnalyse (Analyse):
@param log_file : the log file path
@return : a log hash map
"""
# minscore parameter regexp
minscore_regex = re.compile(" - Minscore : (\d+) % de la longueur de l'adaptateur")
# minmatch parameter regexp
minmatch_regex = re.compile(" - Minmatch : (\d+) % de la longueur de l'adaptateur")
# minlen parameter regexp
minlen_regex = re.compile(" - Minlen : (\d+) longueur minimum de la sequence apres nettoyage")
# adaptors parameter regexp
adaptors_regex = re.compile(" - Fichier d'adaptateurs : (.+)\s$")
# info needed
adapt_name_nb_regex = re.compile("###Nombre de sequences avec l'adaptateur\s(\w+)\s(\d+).*")
discard_seq_tooshort_regex = re.compile("Discard .+ too short .*")
discard_seq_tooadapt_regex = re.compile("Discard .+ contains only adaptor.*")
nb_sequences_regexp = re.compile("## Nombre de sequences analysees :(\d+)")
nb_sequences_regexp = re.compile("## Number of sequences analysed :(\d+)")
logs = {}
logs["adaptors_names_param"] = []
stats=[]
names={}
discard_only_adapt=0
discard_too_short=0
nb_sequences=0
for line in open(log_file, 'r').readlines():
minscorer = minscore_regex.match(line)
minmatchr = minmatch_regex.match(line)
minlenr = minlen_regex.match(line)
adaptorsr = adaptors_regex.match(line)
adapt_name_nb_r=adapt_name_nb_regex.match(line)
discard_seq_tooshort_r=discard_seq_tooshort_regex.match(line)
discard_seq_tooadapt_r=discard_seq_tooadapt_regex.match(line)
nb_sequences_r= nb_sequences_regexp.match(line)
if minscorer != None :
logs["minscore_param"] = minscorer.group(1)
if minmatchr != None :
logs["minmatch_param"] = minmatchr.group(1)
if minlenr != None :
logs["minlen_param"] = minlenr.group(1)
if adaptorsr != None :
logs["adaptors_param"] = adaptorsr.group(1).rstrip()
if adapt_name_nb_r != None :
logs["adaptors_names_param"].append(adapt_name_nb_r.group(1))
stats.append(adapt_name_nb_r.group(2))
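# '%'-prefixed lines are the per-hit details written by mask_sequences; the third field is the oligo name, count one hit per oligo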
if line.startswith("%"):
parts = line.split()
if names.has_key(parts[2]):
names[parts[2]] += 1
else : names[parts[2]] = 1
if discard_seq_tooshort_r != None :
discard_too_short=discard_too_short+1
discard_too_short = discard_too_short + 1
if discard_seq_tooadapt_r != None :
discard_only_adapt=discard_only_adapt+1
discard_only_adapt = discard_only_adapt + 1
if nb_sequences_r != None :
nb_sequences=nb_sequences_r.group(1)
nb_sequences_after=int(nb_sequences) - int(discard_too_short) - int(discard_only_adapt)
logs["stats"]=[nb_sequences, nb_sequences_after,discard_only_adapt, discard_too_short]
logs["adaptors"]=stats
nb_sequences = nb_sequences_r.group(1)
nb_sequences_after = int(nb_sequences) - int(discard_too_short) - int(discard_only_adapt)
logs["stats"] = [nb_sequences, nb_sequences_after, discard_only_adapt, discard_too_short]
logs["adaptors"] = names
return logs
def process(self, cleaner_dirs, archive_name):
......@@ -120,24 +99,10 @@ class AdaptorcleanerAnalyse (Analyse):
elif file.endswith(".qual") :
qual_file = os.path.join(cleaner_dir, file)
result_files.append(qual_file)
sample_name = self._get_file_description(os.path.basename(result_file))
sample_name = os.path.splitext(os.path.splitext(os.path.basename(result_file))[0])[0]
files[sample_name] = {}
files[sample_name]["log"] = log_file
files[sample_name]["result"] = result_file
logs = self.parse_log_file(log_file)
if logs.has_key("minscore_param"):
self._add_param("score", logs["minscore_param"])
if logs.has_key("minmatch_param"):
self._add_param("match", logs["minmatch_param"])
if logs.has_key("minlen_param"):
self._add_param("minlen", logs["minlen_param"])
if logs.has_key("adaptors_param"):
adaptators=""
handle = open(logs["adaptors_param"], "rU")
for record in SeqIO.parse(handle, "fasta") :
self._add_param(record.id, record.seq, "adaptor")
handle.close()
# Then add the analyse results
for sample in files :
......@@ -148,15 +113,10 @@ class AdaptorcleanerAnalyse (Analyse):
self._add_result_element(sample, "del_only_adaptator", logs['stats'][2])
self._add_result_element(sample, "del_length", logs['stats'][3])
for i in range(len (logs["adaptors_names_param"])):
self._add_result_element(sample, logs["adaptors_names_param"][i], logs['adaptors'][i], "adaptor")
# add log and copy it
#do not create a link on it
#self._add_result_element(sample, "log", self._save_file(log_file,os.path.basename(files[sample]["result"]) + ".log","log_file"))
self._add_file(self._create_and_archive(result_files, archive_name))
for adapt in logs["adaptors"].keys():
self._add_result_element(sample, adapt, logs['adaptors'][adapt], "adaptor")
self._create_and_archive(result_files, archive_name)
if __name__ == '__main__':
......@@ -190,7 +150,7 @@ if __name__ == '__main__':
# Build an AdaptorcleanerAnalyse
my_analyse = AdaptorcleanerAnalyse(options.analyse_name, options.analyse_description, options.analyse_software,
options.analyse_software_parameters, options.analyse_software_version)
options.analyse_software_parameters, options.analyse_software_version)
# Build either a project or a run depending on the parameters
if options.cfg :
......
......@@ -95,7 +95,7 @@ class BlastContaminationSearchAnalyse (Analyse):
if file.endswith(".names"):
result_files.append(os.path.join(contamination_dir, file))
[n, db] = self.parse_m8_file(m8f)
sample = self._get_file_description(os.path.splitext(os.path.basename(m8f))[0])
sample = os.path.splitext(os.path.splitext(os.path.basename(m8f))[0])[0]
if sample not in samples:
samples.append(sample)
if db not in databases :
......@@ -111,11 +111,6 @@ class BlastContaminationSearchAnalyse (Analyse):
contamination_info[db][sample] = {}
contamination_info[db][sample]["file"] = m8f
contamination_info[db][sample]["nb"] = n
# First add the params used
for conta_info in contamination_info.keys() :
self._add_param("blastall", contamination_info[conta_info]["params"]["blastall"], conta_info)
self._add_param("filter", contamination_info[conta_info]["params"]["filter"], conta_info)
# Then add the analyse results
for sample in samples:
......@@ -123,10 +118,10 @@ class BlastContaminationSearchAnalyse (Analyse):
for db in databases :
total += contamination_info[db][sample]["nb"]
self._add_result_element(sample, "nb_conta", contamination_info[db][sample]["nb"], db)
self._add_result_element(sample, "total", str(total))
self._add_result_element(sample, "nb_conta", str(total), "total")
# Finally create and add the archive to the analyse
self._add_file(self._create_and_archive(result_files, archive_name))
self._create_and_archive(result_files, archive_name)
if __name__ == '__main__':
......
......@@ -69,7 +69,7 @@ class MidsContaminationSearchAnalyse (Analyse):
stdout_desc = []
mids_desc = []
for stdout_file in stdout_files :
desc = self._get_file_description(os.path.basename(stdout_file))
desc = os.path.splitext(os.path.basename(stdout_file))[0]
[mids_values, mids_desc] = self.parse_stdout_file(stdout_file)
stdout_desc.append(desc)
contamination_info[desc] = mids_values
......@@ -81,7 +81,7 @@ class MidsContaminationSearchAnalyse (Analyse):
self._add_result_element(desc, mid, contamination_info[desc][mid])
# Finally create and add the archive to the analyse
self._add_file(self._create_and_archive(stdout_files, archive_name))
self._create_and_archive(stdout_files, archive_name)
if __name__ == '__main__':
......
......@@ -46,54 +46,8 @@ class PyrocleanerAnalyse (Analyse):
@param log_file : the log file path
@return : a log hash map
"""
# complexity_win parameter regexp
complexity_win_regex = re.compile(" - Clean complexity based on a sliding window with a size of (\d+), a step of (\d+) and (\d+) as.*")
# complexity_full parameter regexp
complexity_full_regex = re.compile(" - Clean complexity based on the whole sequence with (\d+) as .*")
# duplicated parameter regexp
duplicated_regex = re.compile(" - Clean duplicated reads using (\d+) as limit for the difference between reads ends to consider them as duplicat.")
# length_std parameter regexp
length_std_regex = re.compile(" - Clean reads shorter than reads length mean minus (\d+) standard deviation and reads longer than reads length mean plus (\d+) standard deviation.")
# length_win parameter regexp
length_win_regex = re.compile(" - Clean reads with a length not in between \[(\d+);(\d+)\].")
# Ns parameters regexp
ns_regex = re.compile(" - Clean reads with a percentage of Ns higher than (\d+)%.")
# pairends parameters regexp
pairends_regex = re.compile(" - Clean pairends reads if the sequence size between the spacer and the read begning/end is higher than (\d+) nucleaotides or if (\d+) nucleotides missmatch with the spacer.\n")
# aggressive parameters regexp
aggressive_regex = re.compile(" - Clean duplicated reads using (\d+) as limit for the difference between reads ends to consider them as duplicat and keep only one read per cluster.")
# qual parameters regexp
qual_regex = re.compile(" - Clean reads if no bases quality has a score higher than (\d+).")
logs = {}
for line in open(log_file, 'r').readlines():
cwr = complexity_win_regex.match(line)
cfr = complexity_full_regex.match(line)
dr = duplicated_regex.match(line)
lsr = length_std_regex.match(line)
lwr = length_win_regex.match(line)
nr = ns_regex.match(line)
pr = pairends_regex.match(line)
ar = aggressive_regex.match(line)
qr = qual_regex.match(line)
if cwr != None :
logs["complexity_win_param"] = [cwr.group(1), cwr.group(2), cwr.group(3)]
if cfr != None :
logs["complexity_full_param"] = cfr.group(1)
if dr != None :
logs["duplicated_param"] = dr.group(1)
if lsr != None :
logs["length_std_param"] = lsr.group(1)
if lwr != None :
logs["length_win_param"] = [lwr.group(1), lwr.group(2)]
if nr != None :
logs["ns_param"] = nr.group(1)
if pr != None :
logs["pairends_param"] = [pr.group(1), pr.group(2)]
if ar != None :
logs["aggressive_param"] = ar.group(1)
if qr != None :
logs["qual_param"] = qr.group(1)
if line.startswith("## summary (global)"):
logs["stats"] = line.rstrip().split()[4:]
if line.startswith("## header (duplicated)"):
......@@ -116,41 +70,14 @@ class PyrocleanerAnalyse (Analyse):
for file in os.listdir(cleaner_dir):
if file.endswith(".log"):
log_file = os.path.join(cleaner_dir, file)
sample_name = os.path.splitext(os.path.basename(file))[0]
result_files.append(log_file)
elif not file.endswith(".stdout") :
result_file = os.path.join(cleaner_dir, file)
result_files.append(result_file)
sample_name = self._get_file_description(os.path.basename(result_file))
files[sample_name] = {}
files[sample_name]["log"] = log_file
files[sample_name]["result"] = result_file
logs = self.parse_log_file(log_file)
# First add the params used
if logs.has_key("complexity_win_param"):
self._add_param("complexity", logs["complexity_win_param"][2])
self._add_param("window", logs["complexity_win_param"][0])
self._add_param("step", logs["complexity_win_param"][1])
if logs.has_key("complexity_full_param"):
self._add_param("complexity", logs["complexity_full_param"])
if logs.has_key("duplicated_param"):
self._add_param("duplication_limit", logs["duplicated_param"])
if logs.has_key("length_win_param"):
self._add_param("min", logs["length_win_param"][0])
self._add_param("max", logs["length_win_param"][1])
if logs.has_key("length_std_param"):
self._add_param("std", logs["length_std_param"])
if logs.has_key("ns_param"):
self._add_param("ns_percent", logs["ns_param"])
if logs.has_key("pairends_param"):
self._add_param("border-limit", logs["pairends_param"][0])
self._add_param("missmatch", logs["pairends_param"][1])
if logs.has_key("qual_param"):
self._add_param("qual_min", logs["qual_param"])
if logs.has_key("aggressive_param"):
self._add_param("aggressive")
self._add_param("duplication_limit", logs["aggressive_param"])
# Then add the analyse results
for sample in files :
......@@ -174,7 +101,7 @@ class PyrocleanerAnalyse (Analyse):
pass
# Finally create and add the archive to the analyse
self._add_file(self._create_and_archive(result_files, archive_name))
self._create_and_archive(result_files, archive_name)
if __name__ == '__main__':
......
......@@ -167,7 +167,7 @@ class ReadStatAnalyse (Analyse):
seqslengthstat = os.path.join(readstats_dir, file)
if not file.endswith(".stdout"):
results_files.append(os.path.join(readstats_dir, file))
sample = self._get_file_description(os.path.splitext(os.path.splitext(os.path.basename(seqssummary))[0])[0])
sample = os.path.splitext(os.path.splitext(os.path.splitext(os.path.basename(seqssummary))[0])[0])[0]
all_samples[sample] = self.parse_length_stat_file(seqslengthstat)
samples.append(sample)
files[sample] = {}
......@@ -202,10 +202,10 @@ class ReadStatAnalyse (Analyse):
self._add_result_element(sample, "qual_png", self._save_file(files[sample]["qualpng"], os.path.splitext(os.path.basename(files[sample]["seqssummary"]))[0] + ".qual.png"))
if len(readstats_dirs) > 1:
self._add_result_element(sample, "lenght_png", self._save_file(Matplot.create_multiple_length_histogram(all_samples)))
self._add_result_element("all", "length_png", self._save_file(Matplot.create_multiple_length_histogram(all_samples)))
# Finally create and add the archive to the analyse
self._add_file(self._create_and_archive(results_files, archive_name))
self._create_and_archive(results_files, archive_name)
if __name__ == '__main__':
......
......@@ -276,7 +276,7 @@ class RunAssemblyAnalyse (Analyse):
if file.endswith(".txt") or file.endswith(".tsv") or file.endswith(".fna") or file.endswith(".qual") or file.endswith(".ace") :
result_files.append(os.path.join(run_assembly_dir, file))
if file.endswith("Metrics.txt") :
sample_name = self._get_file_description(file)
sample_name = os.path.splitext(os.path.splitext(file)[0])[0]
metrics_file = os.path.join(run_assembly_dir, file)
if file.endswith(".ace") :
ace_file = os.path.join(run_assembly_dir, file)
......@@ -356,7 +356,7 @@ class RunAssemblyAnalyse (Analyse):
pass
# Finally create and add the archive to the analyse
self._add_file(self._create_and_archive(result_files, archive_name))
self._create_and_archive(result_files, archive_name)
if __name__ == '__main__':
......
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
from optparse import *
import os, sys, re
from Bio import SeqIO
from ng6.Analyse import Analyse
from ng6.Project import Project
from ng6.Run import Run
from ng6.Matplot import Matplot
class SeqcleanAnalyse (Analyse):
"""
Class SeqcleanAnalyse: define a nG6 Analyse
"""
def __init__(self, name, description, software, options, version):
Analyse.__init__(self, name, description, software, options, version)
def parse_log_file(self, log_file):
"""
Parse the seqclean log file
@param log_file : the log file path
@return : a log hash map
"""
param_get_regexp = re.compile("seqclean (.+)")
# info needed
input_seq_regexp = re.compile("Sequences analyzed:\s+(\d+)")
valid_regexp =re.compile("\s+valid:\s+(\d+)\s+\((\d+) trimmed\)")
trashed_regexp =re.compile("\s+trashed:\s+(\d+)")
trashed_short_regexp =re.compile("\s+by 'shortq':\s+(\d+)")
trashed_dust_regexp =re.compile("\s+by 'dust':\s+(\d+)")
trashed_low_qual_regexp =re.compile("\s+by 'low_qual':\s+(\d+)")
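# the trashed_*_regexp patterns match the per-reason breakdown lines (by 'shortq', 'dust', 'low_qual') under the trashed total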
logs = {}
stats=[]
trashed_short=0
trashed_dust=0
trashed_low_qual=0
nb_sequences=0
nb_sequences_after=0
trashed=0
trimmed=0
for line in open(log_file, 'r').readlines():
if (line == "") :
next
param_get = param_get_regexp.match(line)
if param_get != None and not line.startswith("seqclean running options:") :
options = param_get.group(1)
param = options.split(" ")
i=0
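# Walk the whitespace-split option string: a token starting with '-' is an option name; if the next token does not start with '-' it is that option's value, otherwise the option is a flag with an empty value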
while i < len(param) :
if param[i].startswith("-") :
key=param[i].replace("-","")
if i + 1 < len(param) and not param[i+1].startswith("-") :
value = param[i+1]
i=i+2
else :
value=""
i=i+1
logs[key]=value
else:
i=i+1
valid_seq_exp=valid_regexp.match(line)
if input_seq_regexp.match(line) != None :
nb_sequences = input_seq_regexp.match(line).group(1)
if valid_seq_exp != None :
(nb_sequences_after,trimmed) = valid_seq_exp.group(1),valid_seq_exp.group(2)