Commit 300c79d2 authored by Celine Noirot's avatar Celine Noirot

Useless work of Pavel

parent dc181a07
Showing with 0 additions and 3513 deletions
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from lib.utils import *
from lib.transcript import Contig, DNA, Annotation, Expression, Clusters
import ngspipelines
from ngspipelines.application import ApplicationWithoutReferenceGenome
from ngspipelines.analysis import Analysis
from ngspipelines.library import Library
from jflow.seqio import SequenceReader
import os
import sys
import StringIO
from ConfigParser import ConfigParser
class Transcript (ApplicationWithoutReferenceGenome):
def get_description(self):
return "RNAseq pipeline without reference genome"
def define_parameters(self, function="process"):
# Assembly parameter
self.add_multiple_parameter("assembly", "Define the assembly process", required=True, group="ASSEMBLY section")
self.add_input_file("file", "Path to the contig file in fasta format", file_format="fasta", required=True, add_to="assembly")
self.add_parameter("software_name", "Which software was used to perform the assembly", default="", required=True, add_to="assembly")
self.add_parameter("software_parameters", "Which parameters were used when performing the assembly", default="", add_to="assembly")
self.add_parameter("software_version", "Which version of the assembly software was used", default="", required=True, add_to="assembly")
self.add_parameter("comments", "Add some comments on this analysis", default="", add_to="assembly")
# Annotation parameter
self.add_multiple_parameter_list("assembly_annot", "Define the annotation process", required=True, group="ASSEMBLY ANNOTATION section")
self.add_input_file("file", "Path to the annotation file in gff format", required=True, add_to="assembly_annot")
self.add_parameter("software_name", "Which software was used to perform the annotation", add_to="assembly_annot")
self.add_parameter("software_parameters", "Which parameters were used when performing the annotation", required=True, add_to="assembly_annot")
self.add_parameter("software_version", "Which version of the annotation software was used", required=True, add_to="assembly_annot")
self.add_parameter("database", "Which database was used", add_to="assembly_annot")
self.add_parameter("database_version", "Which database version was used", add_to="assembly_annot")
self.add_parameter("comments", "Add some comments on this analysis", add_to="assembly_annot")
self.add_multiple_parameter("annotation_filter", "Define the annotation filter", required=True, group="ANNOTATION FILTER section")
self.add_parameter("min_identity", "The minimum identity value to keep an HSP", required=True, add_to="annotation_filter")
self.add_parameter("min_coverage", "The minimum coverage value to keep an HSP", required=True, add_to="annotation_filter")
# Count matrix parameter
self.add_input_file("count_matrix", "File of mapped reads counts per contig (line) per library (column)", group="ALIGNMENT section")
# Cluster members for centroids
self.add_input_file("clusters", "File of cluster members for centroids", group="ASSEMBLY ANNOTATION section")
def process(self):
# first check all inputs
contig_names = Contig.get_contigs(self.assembly["file"])
#self.check_load_format(contig_names)
# set variables
annotation_files=self.assembly_annot["file"]
# convert blast xml files
blastxml2gff = self.add_component( "Blast2Annot", [ annotation_files, self.annotation_filter['min_identity'], self.annotation_filter['min_coverage'] ,
self.get_resource("refseq_gi_from_accession"), self.get_resource("refseq_gene_from_gi") ] )
# concatenate annotations
merge_annot = self.add_component( "ConcatenateFiles", [blastxml2gff.output_files, "annotations.gff", True], component_prefix="bestAnnot" )
#search best annotation
best_annot_search = self.add_component( "BestAnnotSearch", [merge_annot.output_file] )
# load the contig table
contig_parameters = [self.assembly["file"], [best_annot_search.best_annotations]]
contig = self.add_biomart_load(Contig,
contig_parameters,
"Assembly",
"assembly",
"0",
self.assembly["software_name"],
self.assembly["software_parameters"],
self.assembly["software_version"],
self.assembly["comments"], [self.assembly["file"]])
dna = self.add_biomart_load(DNA, [self.assembly["file"]])
self.add_blast_search( Contig, self.assembly["file"] )
self.add_venn()
# load the annotation table
for annotation_arg in self.assembly_annot:
annot = self.add_biomart_load(Annotation,
[best_annot_search.std_annotations,contig.new_table],
"Annotation",
"annotation",
"0",
annotation_arg["software_name"],
annotation_arg["software_parameters"],
annotation_arg["software_version"],
annotation_arg["comments"], [annotation_arg["file"]])
counts = self.add_biomart_load(Expression, [self.count_matrix, self.project.libraries])
clusters = self.add_biomart_load(Clusters, [self.clusters])
def check_load_format(self,contig_names):
for annot_arg in self.assembly_annot:
err_contigs=get_gff3_unmatch_contigs(contig_names,annot_arg["file"])
if len(err_contigs) > 0:
sys.stderr.write('Unknown sequence name(s) >' + ",".join(err_contigs) + '< in GFF3 ' + annot_arg["file"] + '\n')
sys.exit(1)
def check_libraries (self, alignment_files):
# Do not check fastq file, done in Library
if self.count_matrix:
fh = open(self.count_matrix)
line = fh.readline()
matrix_libs = []
parts = line.rstrip().split("\t")
matrix_libs = parts[1:]
for lib_name in matrix_libs :
if self.get_library_from_name(lib_name) == None:
sys.stderr.write('Library '+lib_name+' defined at the first line of the matrix file has not been defined (see --library option to define a library)\n')
sys.exit(1)
@staticmethod
def config_parser(arg_lines):
config = ConfigParser()
config.readfp(StringIO.StringIO('\n'.join(arg_lines)))
arguments = []
add_analysis_options = ['software_name', 'software_parameters', 'software_version', 'comments']
section = 'project'
if config.has_section(section):
for option in config.options(section):
if config.has_option(section, option):
arguments.extend( [ '--'+option.replace('_','-'), config.get(section, option) ] )
section = 'assembly'
if config.has_section(section):
arguments.append('--assembly')
if config.has_option(section, 'file'):
arguments.append( 'file=' + config.get(section, 'file') )
for option in add_analysis_options:
if config.has_option(section, option):
arguments.append( option.replace('_', '-') + '=' + config.get(section, option) )
section = 'annotation_filter'
if config.has_section(section):
arguments.append('--annotation-filter')
for option in ['min_identity','min_coverage']:
if config.has_option(section, option):
arguments.append( option.replace('_', '-') + '=' + config.get(section, option) )
section = 'expression'
if config.has_section(section):
if config.has_option(section, 'file'):
arguments.extend( ['--count-matrix', config.get(section, 'file')] )
section = 'clusters'
if config.has_section(section):
if config.has_option(section, 'file'):
arguments.extend( ['--clusters', config.get(section, 'file')] )
section = 'assembly_annotations'
if config.has_section(section):
lhash = {}
for option in config.options(section):
if config.get(section, option):
if option == 'best_annotation_source':
arguments.extend( ['--best-annotation-source', config.get(section, option) ])
else:
name, prop = option.split('.', 1)
if not lhash.has_key(name):
lhash[name] = []
if prop == 'file':
lhash[name].append( 'file=' + config.get(section, option) )
else:
lhash[name].append( prop.replace('_', '-') +'=' + config.get(section, option) )
for x in lhash.values():
arguments.extend( [ '--assembly-annot'] + x )
section = 'libraries'
if config.has_section(section):
lhash = {}
for option in config.options(section):
if config.get(section, option):
name, prop = option.split('.', 1)
if not lhash.has_key(name):
lhash[name] = []
if prop == "files":
for file in config.get(section, option).split(","):
lhash[name].append( prop.replace('_','-') +'=' + file.strip() )
else:
lhash[name].append( prop.replace('_','-') +'=' + config.get(section, option) )
for x in lhash.values():
arguments.extend( [ '--library'] + x)
return arguments
def _nb_seq( self, filepath ):
"""
@summary : Return the number of sequences.
@param filepath : [str] the sequence file to process.
@return : [int] The number of sequences.
"""
nb_seq = 0
seq_fh = SequenceReader( filepath )
for seq_record in seq_fh:
nb_seq += 1
return nb_seq
\ No newline at end of file
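To make the section-to-argument mapping in config_parser() above concrete, here is a minimal sketch of an INI file it would translate into command-line arguments. Paths and values are hypothetical; only the section and option names come from the code.

from ConfigParser import ConfigParser
import StringIO

# Hypothetical configuration; section/option names follow config_parser() above.
example_ini = """
[assembly]
file=/path/to/contigs.fasta
software_name=trinity
software_version=r2013-02-25

[annotation_filter]
min_identity=80
min_coverage=50

[assembly_annotations]
swissprot.file=/path/to/contigs_vs_swissprot.xml
swissprot.software_name=blastx
"""

config = ConfigParser()
config.readfp(StringIO.StringIO(example_ini))
# config_parser() would turn the [assembly] section into:
#   ['--assembly', 'file=/path/to/contigs.fasta', 'software-name=trinity', 'software-version=r2013-02-25']
# and each 'name.prop' option of [assembly_annotations] into one '--assembly-annot' group.
print config.options('assembly')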
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'alpha'
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'lib'))
from optparse import *
import NCBIXML
def version_string ():
"""
Return the blast_filter version
"""
return "blast_filter " + __version__
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog -i FILE|- -o FILE", description = "Filter an xml blast file", version = version_string())
igroup = OptionGroup(parser, "Input file options","")
igroup.add_option("-i", "--input", dest="input",
help="The input file, if set to - read STDIN, REQUIRE")
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output",
help="The output file REQUIRE")
parser.add_option_group(ogroup)
fgroup = OptionGroup(parser, "Filters options","")
fgroup.add_option("-q", "--pcq", dest="pcq",
help="Minimun poucentage of query length per alignment REQUIRE ", default=50 )
fgroup.add_option("-p", "--pci", dest="pci",
help="Minimun poucentage of identity per alignment REQUIRE ", default=50 )
parser.add_option_group(fgroup)
(options, args) = parser.parse_args()
# NCBIXML.parse expects a file handle (as in Biopython); honor the '-' STDIN convention
in_fh = sys.stdin if options.input == '-' else open(options.input)
blast_records = NCBIXML.parse(in_fh)
for blast_record in blast_records:
no_annot_regions_saved=[]
no_annot_regions_work=[]
nb_no_annot_region=0
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
keep_hsp = 0
if nb_no_annot_region == 0 :
no_annot_regions_saved.append([0,hsp.query_start])
no_annot_regions_saved.append([hsp.query_end,blast_record.query_letters])
nb_no_annot_region=nb_no_annot_region+1
keep_hsp = 1
else :
for [hole_start,hole_end] in no_annot_regions_work :
# for each hole
delete_no_annot_region = 0
# HSP includes a whole non-annotated region
if hsp.query_start < hole_start and hsp.query_end > hole_end :
delete_no_annot_region = 1
keep_hsp = 1
# HSP start is in a non-annotated region
if hsp.query_start > hole_start and hsp.query_start < hole_end:
# HSP is inside the non-annotated region
if (hsp.query_end < hole_end) :
keep_hsp = 1
# add the 2 new non-annotated regions to the region table
no_annot_regions_saved.append([hole_start,hsp.query_start])
no_annot_regions_saved.append([hsp.query_end,hole_end])
else :
keep_hsp = 1
# update the end of the existing non-annotated region
no_annot_regions_saved.append([hole_start,hsp.query_start])
else :
# the non-annotated region includes the query end
if hsp.query_end > hole_start and hsp.query_end < hole_end :
if hsp.query_start <= hole_start :
keep_hsp = 1
no_annot_regions_saved.append([hsp.query_end,hole_end])
else :
if delete_no_annot_region == 0 :
no_annot_regions_saved.append([hole_start,hole_end])
if keep_hsp :
print blast_record.query +" "+str(hsp.query_start)+" "+str(hsp.query_end) + " " +alignment.hit_def + " " +str(hsp.sbjct_start) + " " +str(hsp.sbjct_end)
no_annot_regions_work=no_annot_regions_saved[:]
no_annot_regions_saved=[]
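The loop above maintains, for each query, a list of not-yet-annotated intervals ("holes"): an HSP is kept only if it still covers or overlaps a hole, and the hole is then split or trimmed. A standalone sketch of that interval bookkeeping on toy coordinates:

# Toy re-implementation of the hole-splitting logic used above (hypothetical data).
def split_holes(holes, start, end):
    """Return the holes remaining after the interval [start, end] is annotated."""
    remaining = []
    for hole_start, hole_end in holes:
        if start > hole_start and start < hole_end:
            remaining.append([hole_start, start])    # left part of the hole survives
        if end > hole_start and end < hole_end:
            remaining.append([end, hole_end])        # right part of the hole survives
        if start >= hole_end or end <= hole_start:
            remaining.append([hole_start, hole_end]) # no overlap: hole unchanged
    return remaining

holes = [[0, 100]]                  # whole query un-annotated
holes = split_holes(holes, 10, 40)  # -> [[0, 10], [40, 100]]
holes = split_holes(holes, 60, 90)  # -> [[0, 10], [40, 60], [90, 100]]
print holes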
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
from optparse import *
import sys,re,datetime
from os.path import join, splitext, basename, exists, split
def read_count_file (file,libname,dict_count):
"""
Extract count from the count file
@param file : the input count file (output of sam2count)
@param libname : the library name
@param dict_count : the dict of contigs to update
"""
fin=open (file, "r")
re_parsed = re.compile("# total read parsed : (\d+)")
re_mapped = re.compile("# total read mapped : (\d+)")
re_parsed_paired = re.compile("# total paired read parsed : (\d+)")
re_mapped_paired = re.compile("# total paired read mapped : (\d+)")
re_mapped_paired_isize = re.compile("# total paired read mapped with good insert size : (\d+)")
total_parsed_read = 0
total_mapped_read = 0
total_parsed_pair = -1
total_mapped_pair = -1
total_mapped_pair_isize = -1
# parse headers
line = fin.readline()
while not line.startswith("#id") :
if line.startswith("# total read parsed") :
m = re_parsed.match(line)
if m != None :
total_parsed_read = int(m.group(1))
if line.startswith("# total read mapped") :
m = re_mapped.match(line)
if m != None :
total_mapped_read = int(m.group(1))
if line.startswith("# total paired read parsed"):
m = re_parsed_paired.match(line)
if m != None :
total_parsed_pair = int(m.group(1))
if line.startswith("# total paired read mapped") :
m = re_mapped_paired.match(line)
if m != None :
total_mapped_pair = int(m.group(1))
if line.startswith("# total paired read mapped with good insert size") :
m = re_mapped_paired_isize.match(line)
if m != None :
total_mapped_pair_isize = int(m.group(1))
line = fin.readline()
#split header
tab = line.split()
headers = tab[2:]
#parse data
line = fin.readline()
while line != "":
tab = line.split("\t") # contig count line
if len(tab) > 0 :
if not dict_count.has_key(tab[0]) : # if new contig
dict_count[tab[0]] = {}
if not dict_count[tab[0]].has_key(libname) : # if new library for this contig
dict_count[tab[0]][libname] = {}
index = 2 # headers/ values begin at 3rd column
for type_val in headers :
dict_count[tab[0]][libname][type_val] = str(int(tab[index]))
index = index+1
line = fin.readline()
fin.close()
return total_parsed_read,total_mapped_read,total_parsed_pair,total_mapped_pair,total_mapped_pair_isize
def version_string ():
"""
Return the merge_count version
"""
return "merge_count.py " + __version__
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog", description = "Merge count files into a global counting file:\
for single-end files use the nb_single column, for paired files use the nb_read_paired_isize column.")
igroup = OptionGroup(parser, "Input options","")
igroup.add_option("-f", "--fof", dest="fof", help="file of count file")
igroup.add_option("-c", "--correct_pair_only", dest="correct_pair_only", help="If paired count file take column of correct pair only (default use column with good insert size count if available)",action="store_true",default=False)
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output", help="the output count file")
parser.add_option_group(ogroup)
(options, args) = parser.parse_args()
if options.fof == None :
sys.stderr.write("Need a file of file\n")
parser.print_help()
sys.exit(1)
if options.output == None:
sys.stderr.write("Output file is missing\n")
parser.print_help()
sys.exit(1)
output_single = join(split(options.output)[0], "single."+ basename(options.output))
output_paired = join(split(options.output)[0], "paired."+ basename(options.output))
dict_count={}
indexlib=0
# iterate over the count files
if options.fof != None :
fin=open (options.fof, "r")
line=fin.readline()
while line != "":
file = line.rstrip()
if exists(file) :
current_libname = splitext(basename(file))[0]
(total_parsed_read,total_mapped_read,total_parsed_pair,total_mapped_pair,total_mapped_pair_isize)=read_count_file(file,current_libname,dict_count)
if not dict_count.has_key("total_parsed") :
dict_count["total_parsed"] = {}
dict_count["total_mapped"] = {}
if not dict_count["total_parsed"].has_key(current_libname) :
dict_count["total_parsed"][current_libname] = {}
dict_count["total_mapped"][current_libname] = {}
dict_count["total_parsed"][current_libname]["nb_single"] = total_parsed_read
dict_count["total_parsed"][current_libname]["nb_read_paired"] = total_parsed_pair
dict_count["total_parsed"][current_libname]["nb_read_paired_isize"] = None
dict_count["total_mapped"][current_libname]["nb_single"] = total_mapped_read
dict_count["total_mapped"][current_libname]["nb_read_paired"] = total_mapped_pair
dict_count["total_mapped"][current_libname]["nb_read_paired_isize"] = total_mapped_pair_isize
else :
sys.stderr.write("File : " + file + " doesn't exist")
sys.exit(1)
line = fin.readline()
indexlib = indexlib+1
fin.close()
fout = open (options.output,"w")
write_paired_as_single = 0
header_total_parsed = [ "#TotalParsed" ]
header_total_mapped = [ "#TotalMapped" ]
header_contig = [ "#contig_name" ]
libs_write_paired_as_single=[]
for lib in sorted(dict_count["total_parsed"].keys()) :
if dict_count["total_parsed"][lib]["nb_read_paired"] > -1 :
write_paired_as_single=1
libs_write_paired_as_single.append(lib)
header_total_parsed.append( str(dict_count["total_parsed"][lib]["nb_read_paired"]) )
if ( options.correct_pair_only or dict_count["total_mapped"][lib]["nb_read_paired_isize"] == -1) :
header_total_mapped.append( str(dict_count["total_mapped"][lib]["nb_read_paired"]) )
else :
header_total_mapped.append( str(dict_count["total_mapped"][lib]["nb_read_paired_isize"]) )
else :
header_total_parsed.append( str(dict_count["total_parsed"][lib]["nb_single"]) )
header_total_mapped.append( str(dict_count["total_mapped"][lib]["nb_single"]) )
header_contig.append( lib )
fout.write ( '\t'.join( header_contig ) +"\n")
fout.write ( '\t'.join( header_total_parsed ) +"\n")
fout.write ( '\t'.join( header_total_mapped ) +"\n")
# Create single output only if paired data are in merged count
f_single_out = None
if write_paired_as_single :
f_single_out = open (output_single,"w")
header_total_single_parsed = [ "#TotalParsed" ]
header_total_single_mapped = [ "#TotalMapped" ]
for lib in sorted(libs_write_paired_as_single) :
header_total_single_parsed.append( str(dict_count["total_parsed"][lib]["nb_single"]) )
header_total_single_mapped.append( str(dict_count["total_mapped"][lib]["nb_single"]) )
f_single_out.write( '\t'.join( ["#contig_name"] + sorted(libs_write_paired_as_single) ) + "\n")
f_single_out.write( '\t'.join(header_total_single_parsed) + "\n")
f_single_out.write( '\t'.join(header_total_single_mapped) + "\n")
for contig_name in sorted(dict_count.keys()) :
line = []
if (contig_name == "total_parsed" or contig_name == "total_mapped") : continue
line.append( contig_name )
line_single = line[:]
for lib in sorted(dict_count[contig_name].keys()) :
if dict_count["total_parsed"][lib]["nb_read_paired"] >- 1 : # If lib is paired
if (options.correct_pair_only or dict_count["total_mapped"][lib]["nb_read_paired_isize"] == -1 ) :
line.append(str(dict_count[contig_name][lib]["nb_read_paired"]) )
else :
line.append(str(dict_count[contig_name][lib]["nb_read_paired_isize"]))
else :
line.append( str(dict_count[contig_name][lib]["nb_single"]) )
for lib in sorted (libs_write_paired_as_single):
line_single.append( str(dict_count[contig_name][lib]["nb_single"]) )
fout.write ( '\t'.join(line) +"\n")
if write_paired_as_single :
f_single_out.write( '\t'.join(line_single) +"\n")
if write_paired_as_single :
f_single_out.close()
fout.close()
sys.exit(0)
\ No newline at end of file
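For reference, read_count_file() above consumes the header-plus-table layout written by sam2count.py (the next file in this commit) and fills a nested dictionary. A minimal sketch of that structure, with made-up numbers:

# dict_count[contig][library][column] = count, as filled by read_count_file().
dict_count = {}
libname = "ERR348568"                          # hypothetical library name
headers = ["nb_single", "nb_read_paired"]      # from the '#id len ...' header line
row = "contig_1\t822\t406\t106".split("\t")    # id, length, then one count per header
dict_count.setdefault(row[0], {}).setdefault(libname, {})
for index, type_val in enumerate(headers):
    dict_count[row[0]][libname][type_val] = str(int(row[index + 2]))
print dict_count  # {'contig_1': {'ERR348568': {'nb_single': '406', 'nb_read_paired': '106'}}} (key order may vary)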
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
from optparse import *
import os,sys,re,datetime
def version_string ():
"""
Return the sam2count version
"""
return "sam2count.py " + __version__
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog", description = "Read a non-sorted SAM file from stdin and count aligned reads per reference; with --paired, also count properly paired reads, optionally filtered on insert size.")
igroup = OptionGroup(parser, "Input options","")
igroup.add_option("-s", "--size-min", dest="min_insert_size", help="minimal insert size for paired data")
igroup.add_option("-m", "--size-max", dest="max_insert_size", help="maximal insert size for paired data")
igroup.add_option("-p", "--paired", dest="paired", help="input is a paired file",action="store_true",default=False)
igroup.add_option("-r", "--rewrite",dest="rewrite" , help="write stdin on stdout",action="store_true",default=False)
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output", help="the output file")
parser.add_option_group(ogroup)
(options, args) = parser.parse_args()
if options.output == None:
sys.stderr.write("Output file is missing")
parser.print_help()
sys.exit(1)
test_insert=False
option_sumary=""
if options.paired :
if options.min_insert_size != None and options.max_insert_size != None :
option_sumary="##Options :\n##\tmin insert size : "+options.min_insert_size+"\n##\tmax insert size: "+options.max_insert_size
test_insert=True
else :
option_sumary="##Options :\n##\tMin and Max insert size not specified"
sys.stderr.write("WARNING : Insert size will not be tested, set parameters --size-min --size-max if needed\n")
total_parsed_read = 0.0
total_mapped_read = 0
total_mapped_pair = 0
total_mapped_pair_plus = 0
total_parsed_pair = 0
#ref -> [len, count single, count seq paired, count seq well paired]
reference={}
#header regexp : @SQ SN:SC_00000001.1.1 LN:4030
header_regexp=re.compile("@SQ\s+SN:(.+)\s+LN:(\d+)")
# read the header
line=sys.stdin.readline()
if options.rewrite : sys.stdout.write(line)
while line.startswith("@") :
m=header_regexp.match(line)
if m!=None :
reference[m.group(1)]=[int(m.group(2)),0,0,0]
line=sys.stdin.readline()
if options.rewrite : sys.stdout.write(line)
if len(reference.keys())==0 :
sys.stderr.write("ERROR : Header in sam format is required\n")
sys.exit(1)
# first line / pair 1
ref=line
# this is the first alignment line, it must be counted
total_parsed_read = 1
while 1 :
data = sys.stdin.readline()
if options.rewrite : sys.stdout.write(data)
if data == '' : #handle last line
if ref != '' :
array_ref = ref.split("\t")
if int(array_ref[1]) & 4 == 0 : # FLAG 0x4 (read unmapped) not set
total_mapped_read=total_mapped_read+1
reference[array_ref[2]][1] = reference[array_ref[2]][1]+1
break
else :
array_ref = ref.split("\t")
array_data = data.split("\t")
# read mapped [FLAG 0x4 (read unmapped) not set]
if int(array_ref[1]) & 4 == 0:
total_mapped_read=total_mapped_read+1
reference[array_ref[2]][1] = reference[array_ref[2]][1]+1
# two lines of the same fragment on the same reference
if array_data[0] == array_ref[0] :
total_parsed_pair=total_parsed_pair+2
if array_data[2] == array_ref[2] and options.paired:
# FLAG & 3 == 3 : the last 2 flag bits, read paired (0x1) & properly paired (0x2)
# pair 1
if int(array_ref[1]) & 3 == 3 :
reference[array_data[2]][2] = reference[array_data[2]][2]+1
total_mapped_pair=total_mapped_pair+1
# and test the insert length
if test_insert and abs(int(array_data[8])) > int(options.min_insert_size) and abs(int(array_ref[8])) > int(options.min_insert_size) \
and abs(int(array_data[8])) < int(options.max_insert_size) and abs(int(array_ref[8])) < int(options.max_insert_size) :
reference[array_data[2]][3] = reference[array_data[2]][3]+1
total_mapped_pair_plus=total_mapped_pair_plus+1
# pair 2
if int(array_data[1]) & 3 == 3 :
reference[array_data[2]][2] = reference[array_data[2]][2]+1
total_mapped_pair=total_mapped_pair+1
# and test the insert length
if test_insert and abs(int(array_data[8])) > int(options.min_insert_size) and abs(int(array_ref[8])) > int(options.min_insert_size) \
and abs(int(array_data[8])) < int(options.max_insert_size) and abs(int(array_ref[8])) < int(options.max_insert_size) :
reference[array_data[2]][3] = reference[array_data[2]][3]+1
total_mapped_pair_plus=total_mapped_pair_plus+1
ref = data
total_parsed_read= total_parsed_read+1
fout=open (options.output, "w")
fout.write ("#sam2count :" + str(datetime.datetime.now()) + "\n")
fout.write (option_summary+"\n")
fout.write ("# total read parsed : "+str(int(total_parsed_read))+"\n")
fout.write ("# total read mapped : "+str(int(total_mapped_read))+"\n")
header_table="#id\tlen\tnb_single"
if int(total_parsed_pair) > 0 and options.paired :
fout.write ("# total paired read parsed : "+str(int(total_parsed_pair))+"\n")
fout.write ("# total paired read mapped : "+str(int(total_mapped_pair))+" (properly pair on same ref)\n")
header_table=header_table+"\tnb_read_paired"
if test_insert :
fout.write ("# total paired read mapped with good insert size : "+str(int(total_mapped_pair_plus))+"\n")
header_table=header_table+"\tnb_read_paired_isize"
fout.write ( header_table+"\n")
outline=""
for refid in sorted(reference.keys()) :
outline = refid+"\t"+str(reference[refid][0])+"\t"+str(reference[refid][1])
if total_parsed_pair > 0 and options.paired :
outline = outline + "\t"+str(reference[refid][2])
if test_insert :
outline = outline +"\t"+ str(reference[refid][3])
fout.write (outline+"\n")
fout.close()
sys.exit(0)
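The FLAG tests above use the standard SAM bit definitions: 0x1 read paired, 0x2 properly paired, 0x4 read unmapped. A two-line illustration (the flag value is only an example):

flag = 99            # e.g. a mapped, properly paired, first-in-pair read
print flag & 4 == 0  # True : bit 0x4 (read unmapped) is not set
print flag & 3 == 3  # True : bits 0x1 and 0x2 (paired, properly paired) are both set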
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
\ No newline at end of file
#
# Copyright (C) 2014 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import pickle
from jflow.component import Component
from weaver.function import PythonFunction
from weaver.abstraction import Map
from ngspipelines.utils import Utils
def best_hit( input , best_output, output ):
"""
@summary : Find the best hit for each query in annotation files. The comparison is based on annotation.innerscore().
"""
import re
from workflows.rnaseqdenovo.lib.annotationfile import AnnotationFile, AnnotationRecord
best_annot = dict()
# Process
out_annot_fh = list()
annotations = AnnotationFile(input, "r")
output = AnnotationFile(output, "w")
for current_annot in annotations:
# The query already has a best HSP
if best_annot.has_key( current_annot.seq_id ):
if float(best_annot[current_annot.seq_id]["annot"].score) < float(current_annot.score):
# Write old best HSP
output.write( best_annot[current_annot.seq_id]["annot"] )
# Replace old best HSP
best_annot[current_annot.seq_id]["annot"] = current_annot
else:
output.write( current_annot )
# The query does not have a best HSP
else :
best_annot[current_annot.seq_id] = dict()
best_annot[current_annot.seq_id]["annot"] = current_annot
annotations.close()
# Write best annotation file
best_annot_fh = AnnotationFile( best_output, "w" )
for seq_id in best_annot.keys():
best_annot_fh.write( best_annot[seq_id]["annot"] )
best_annot_fh.close()
class BestAnnotSearch (Component):
"""
@summary : Extract into separate file(s) the best annotation and the other annotations for each sequence.
The comparison is based on annotation.innerscore(databank_weight).
"""
def define_parameters(self, input_files):
"""
@param input_files : The annotations files
"""
# Files
self.add_input_file( "input_file", "The annotations file.",
default=input_files, required=True )
self.add_output_file( "std_annotations", "The annotations files without the best annotation.",
filename="annotations.std_annot.gff", file_format="gff" )
# get best output
self.add_output_file( "best_annotations", "the annotations file with only the best annotation of each sequences.",
filename="annotations.best.gff", file_format="gff" )
def process(self):
self.add_python_execution(best_hit, inputs=[self.input_file], outputs=[self.best_annotations, self.std_annotations])
\ No newline at end of file
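A standalone sketch of the same best-hit selection with plain (seq_id, score) tuples instead of AnnotationFile records: the highest-scoring record per query is retained, every superseded or weaker record goes to the standard output.

# Toy data; in best_hit() above the records come from an AnnotationFile.
records = [("contig_1", 50.0), ("contig_1", 80.0), ("contig_2", 30.0)]
best = {}
others = []
for seq_id, score in records:
    if seq_id not in best:
        best[seq_id] = (seq_id, score)
    elif score > best[seq_id][1]:
        others.append(best[seq_id])    # demote the previous best
        best[seq_id] = (seq_id, score)
    else:
        others.append((seq_id, score))
print best    # {'contig_1': ('contig_1', 80.0), 'contig_2': ('contig_2', 30.0)}
print others  # [('contig_1', 50.0)]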
#
# Copyright (C) 2014 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from jflow.component import Component
class Blast2Annot (Component):
"""
@summary : Filter blast XML results and convert to annotation file.
"""
def define_parameters( self, blast_files, min_identity=0.1, min_coverage=0.1, accession_to_gi_file=None,
gi_to_gene_file=None):
"""
@param blast_files : [list] Blast files to filter and convert.
@param accession_to_gi_file : [str] Path to the NCBI file that maps accession numbers to GI numbers.
@param gi_to_gene_file : [str] Path to the NCBI file that maps GI numbers to gene names.
"""
self.conversion_options=""
# Options
if accession_to_gi_file != None:
self.conversion_options += " -a " + accession_to_gi_file
if gi_to_gene_file != None:
self.conversion_options += " -g " + gi_to_gene_file
self.add_parameter( "min_identity", "The minimum identity value to keep an HSP.", default=min_identity, type="float" )
self.add_parameter( "min_coverage", "The minimum coverage value to keep an HSP.", default=min_coverage, type="float" )
self.filter_options = " --min-identity " + str(min_identity) + " --min-coverage " + str(min_coverage)
# Files
self.add_input_file_list( "blast_files", "The blast files to align.", default=blast_files, file_format="xml", required=True )
self.add_input_file( "accession_to_gi_file", "NCBI's file who contains link between accession number and gene ID.", default=accession_to_gi_file )
self.add_input_file( "gi_to_gene_file", "NCBI's file who contains link between gene name and gene ID.", default=gi_to_gene_file )
self.add_output_file_list( "output_files", "Annotation files.", pattern='{basename_woext}.gff', items=blast_files, file_format="gff" )
self.add_output_file( "stderr", "The stderr output file.", filename="blast.stderr" )
def get_version(self):
cmd = [self.get_exec_path("blast2annot2.py"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr
def process(self):
# Filter and conversion
self.add_shell_execution(self.get_exec_path("blast2annot2.py") + " " + self.filter_options + " " + self.conversion_options + " -i $1 -o $2 ", cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr,
inputs=self.blast_files, outputs=self.output_files,
map=True)
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os, types
from jflow.component import Component
class ConcatenateFiles (Component):
"""
@summary : Concatenates files.
"""
def define_parameters(self, input_files, output_name, wext=False):
"""
@param input_files : [list] Paths to the files that will be concatenated.
@param output_name : [str] Filename for the output file (with extension : see param wext).
@param wext : [bool] True if the parameter output_name contains all extensions for the final file.
Otherwise the extensions are automatically retrieved from the input files.
"""
# Parameters
self.add_parameter( "output_name", "Filename for the output file (with extension : see param wext).", default=output_name, required=True )
self.add_parameter( "wext", "True if the parameter output_name contains all extensions for the final file. Otherwise th extensions are automatically retrieved from the input files.", default=wext, type="bool")
final_name = output_name
if not wext:
extensions = os.path.basename(input_files[0]).split(".")[1:]
final_name = output_name + "." + ".".join( extensions )
# Files
self.add_input_file_list( "input_files", "Files that will be concatenated.", default=input_files, required=True )
out_format = input_files.file_format if hasattr(input_files, 'file_format') else "any"
self.add_output_file( "output_file", "The concatenated file.", filename=final_name, file_format=out_format )
def process(self):
files_list_str = " ".join( self.input_files )
# If the files are not gzipped
if not self.output_file.endswith(".gz"):
self.add_shell_execution('cat ' + files_list_str + ' > $1', cmd_format='{EXE} {OUT}', outputs=self.output_file, includes=self.input_files)
# If the files are gzipped
else:
self.add_shell_execution('zcat ' + files_list_str + ' | gzip - > $1', cmd_format='{EXE} {OUT}', outputs=self.output_file, includes=self.input_files)
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from jflow.component import Component
from weaver.abstraction import Map
from weaver.function import ShellFunction, PythonFunction
def filter_orf( max_orf_nb, orf_file, filtered_file ):
"""
@summary : Limits the number of ORFs kept per sequence in an ORF file. It keeps the longest ones; if several ORFs have the same length, they are all kept.
@param max_orf_nb : [int] the maximum number of ORFs per sequence.
@param orf_file : [str] path to the processed file.
@param filtered_file : [str] path to the output.
"""
def write_ORF( orfs, max_length_orf_nb ):
sorted_orfs = sorted( orfs, key=lambda orf: -len(orf[2]) ) # sort ORFs by decreasing length
nb_length = 0
i = 0
# Count a new distinct length before writing, so ties on the last kept length are all written
while i < len(sorted_orfs):
# If the length of the current ORF differs from the previous ORF, a new distinct length starts
if i == 0 or len(sorted_orfs[i][2]) != len(sorted_orfs[i-1][2]):
nb_length += 1
if nb_length > max_length_orf_nb:
break
seqio.writefasta(out_fh, [sorted_orfs[i]])
i += 1
import jflow.seqio as seqio
# Main
previous_seq_id = None
orfs = list()
out_fh = open(filtered_file, "w")
reader = seqio.FastaReader( orf_file )
for id, desc, seq, qual in reader:
current_seq_id = id.split("_")[:-1]
if previous_seq_id is None:
previous_seq_id = current_seq_id
# If ORF target has changed
if current_seq_id != previous_seq_id:
write_ORF( orfs, int(max_orf_nb) )
# Next ORF
orfs = list()
previous_seq_id = current_seq_id
orfs.append( [id, desc, seq, qual] )
# Write last sequence
if len(orfs) != 0:
write_ORF( orfs, int(max_orf_nb) )
class GetORF (Component):
"""
@summary : Find and extract open reading frames (ORFs).
@see : http://emboss.sourceforge.net/apps/cvs/emboss/apps/getorf.html
"""
def define_parameters(self, fasta_files, max_orf_nb=None, min_size=30, max_size=1000000, find=1, table=0):
"""
@param fasta_files : [list] sequences files to process.
@param max_orf_nb : [int] the maximum number of ORFs per sequence (if several ORFs have the same length, they are all kept).
@param min_size : [int] minimum nucleotide size of ORF to report.
@param max_size : [int] maximum nucleotide size of ORF to report.
@param find : [int] There are two possible definitions of an open reading frame: it can either be a region that is free of STOP codons or a region that begins with a START codon and ends with a STOP codon. The last three options are probably only of interest to people who wish to investigate the statistical properties of the regions around potential START or STOP codons. The last option assumes that ORF lengths are calculated between two STOP codons.
0 (Translation of regions between STOP codons)
1 (Translation of regions between START and STOP codons)
2 (Nucleic sequences between STOP codons)
3 (Nucleic sequences between START and STOP codons)
4 (Nucleotides flanking START codons)
5 (Nucleotides flanking initial STOP codons)
6 (Nucleotides flanking ending STOP codons)
@param table : [int] Code to use.
0 Standard
1 Standard (with alternative initiation codons)
2 Vertebrate Mitochondrial
3 Yeast Mitochondrial
4 Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma
5 Invertebrate Mitochondrial
6 Ciliate Macronuclear and Dasycladacean
9 Echinoderm Mitochondrial
10 Euplotid Nuclear
11 Bacterial
12 Alternative Yeast Nuclear
13 Ascidian Mitochondrial
14 Flatworm Mitochondrial
15 Blepharisma Macronuclear
16 Chlorophycean Mitochondrial
21 Trematode Mitochondrial
22 Scenedesmus obliquus
23 Thraustochytrium Mitochondrial
"""
# Parameters
self.add_parameter( "min_size", "Minimum nucleotide size of ORF to report.", default=min_size, type=int )
self.add_parameter( "max_size", "Maximum nucleotide size of ORF to report.", default=max_size, type=int )
self.add_parameter( "find",
"ORF definition : 0 (Translation of regions between STOP codons) " +\
"1 (Translation of regions between START and STOP codons) " +\
"2 (Nucleic sequences between STOP codons) " +\
"3 (Nucleic sequences between START and STOP codons) " +\
"4 (Nucleotides flanking START codons) " +\
"5 (Nucleotides flanking initial STOP codons) " +\
"6 (Nucleotides flanking ending STOP codons).",
default=find, choices=[0, 1, 2, 3, 4, 5, 6], type=int )
self.add_parameter( "table",
"Code to use : 0 Standard " +\
"1 Standard (with alternative initiation codons) " +\
"2 Vertebrate Mitochondrial " +\
"3 Yeast Mitochondrial " +\
"4 Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma " +\
"5 Invertebrate Mitochondrial " +\
"6 Ciliate Macronuclear and Dasycladacean " +\
"9 Echinoderm Mitochondrial " +\
"10 Euplotid Nuclear " +\
"11 Bacterial " +\
"12 Alternative Yeast Nuclear " +\
"13 Ascidian Mitochondrial " +\
"14 Flatworm Mitochondrial " +\
"15 Blepharisma Macronuclear " +\
"16 Chlorophycean Mitochondrial " +\
"21 Trematode Mitochondrial " +\
"22 Scenedesmus obliquus " +\
"23 Thraustochytrium Mitochondrial ",
default=table, choices=[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23], type=int )
self.add_parameter( "max_orf_nb", "The maximum number of ORF by sequence (if several ORF have the same length they are all kept).", default=max_orf_nb, type=int )
# Files
self.add_input_file_list( "fasta_files", "Sequences files to process.", default=fasta_files, file_format="fasta", required=True )
self.add_output_file_list( "output_files", "The ORFs fasta files.", pattern='{basename_woext}_orf.fasta', items=fasta_files, file_format="fasta" )
self.add_output_file_list( "orf_stderr_files", "The ORFs stderr files.", pattern='{basename_woext}_orf.stderr', items=fasta_files, file_format="any" )
self.add_output_file( "stderr", "The stderr output file.", filename="filter_getorf.stderr" )
def process(self):
# Temp files
get_orf_outputs = self.output_files
if self.max_orf_nb != None:
get_orf_outputs= self.get_outputs('{basename_woext}_before_filter.tmp', self.fasta_files)
# Find ORF
self.add_shell_execution(self.get_exec_path("getorf") + " -sequence $1 -outseq $2 --minsize " + str(self.min_size) + " --maxsize " + str(self.max_size) +
" --find " + str(self.find) + " --table " + str(self.table) + " 2>> $3",
cmd_format='{EXE} {IN} {OUT}',inputs=self.fasta_files, outputs=[get_orf_outputs,self.orf_stderr_files],map=True)
# Filter
if self.max_orf_nb != None:
filter = PythonFunction( filter_orf, cmd_format='{EXE} ' + str(self.max_orf_nb) + ' {IN} {OUT} 2>> ' + self.stderr )
Map( filter, inputs=get_orf_outputs, outputs=self.output_files )
\ No newline at end of file
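A toy run of the length filter in write_ORF() above: with max_orf_nb = 2, every ORF whose length is one of the two longest distinct lengths is kept, ties included.

# Hypothetical ORFs as (id, description, sequence, quality) records.
orfs = [("orf_1", "", "ATGAAATGA", None),     # length 9
        ("orf_2", "", "ATGAAAAAATGA", None),  # length 12
        ("orf_3", "", "ATGCCCCCCTGA", None),  # length 12 (tie)
        ("orf_4", "", "ATGTGA", None)]        # length 6
sorted_orfs = sorted(orfs, key=lambda orf: -len(orf[2]))
kept, nb_length = [], 0
for i in range(len(sorted_orfs)):
    if i == 0 or len(sorted_orfs[i][2]) != len(sorted_orfs[i - 1][2]):
        nb_length += 1
    if nb_length > 2:  # max_orf_nb = 2
        break
    kept.append(sorted_orfs[i][0])
print kept  # ['orf_2', 'orf_3', 'orf_1']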
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.component import Component
def cleanDesc( in_path, out_path ):
"""
@summary : Replace invalid characters in sequence description ('>' => '%3E').
@param in_path : [str] path to the file to convert.
@param out_path : [str] path to the output file.
"""
import jflow.seqio as seqio
reader = seqio.SequenceReader( in_path )
out_fh = open( out_path, "w" )
for id, desc, seq, qual in reader:
desc = desc.replace('>', '%3E')
# If sequences file is a FASTQ
if reader.__class__.__name__ == "FastqReader":
seqio.writefastq(out_fh, [[id, desc, seq, qual]])
# If sequences file is a FASTA
elif reader.__class__.__name__ == "FastaReader":
seqio.writefasta(out_fh, [[id, desc, seq, qual]])
out_fh.close()
def toGFF( in_path, out_path ):
"""
@summary : Write a GFF file from the RepeatMasker output file.
@param in_path : [str] path to the file to convert.
@param out_path : [str] path to the output file.
"""
from jflow.featureio import GFF3Record, GFF3IO
header = True
fh_output = GFF3IO( out_path , "w" )
fh_rm = open( in_path )
for line in fh_rm:
if line.strip() == "":
header = False
elif not header:
# Line example : ' 11 31.6 1.7 1.6 CHMP7_CRIGR.1.1 1238 1297 (2723) + (GATGAA)n Simple_repeat 1 61 (0) 1'
fields = line.strip().split()
record = GFF3Record()
record.seq_id = fields[4]
record.source = "RepeatMasker"
if fields[10] == "Low_complexity" :
record.type = "low_complexity_region"
elif fields[10] == "Simple_repeat" :
record.type = "repeat_region"
else:
record.type = "similarity"
record.start = fields[5]
record.end = fields[6]
record.score = fields[0]
record.strand = fields[8] if fields[8] == "+" else "-"
record.phase = "."
record.attributes = dict()
record.addToAttribute( "sw_score", fields[0] )
record.addToAttribute( "substitution_perc", fields[1] )
record.addToAttribute( "deletion_perc", fields[2] )
record.addToAttribute( "insertion_perc", fields[3] )
record.addToAttribute( "nucleotid_after_match", fields[7][1:-1] )
record.addToAttribute( "sbjct_start", fields[11] )
record.addToAttribute( "sbjct_end", fields[12] )
record.addToAttribute( "desc", fields[10] + " " + fields[9] )
fh_output.write( record )
fh_rm.close()
class RepeatMasker (Component):
"""
@summary : Annotates and masks highly repetitive DNA sequences on fasta files.
@see : http://www.repeatmasker.org/
"""
def define_parameters(self, input_files, species, engine="crossmatch", speed="default", gccalc=True, custom_library=None):
"""
@param input_files : [list] fasta files to annotate and mask.
@param species : [str] query species for repeatmasker.
@param engine : [str] engine used to search the sequences.
- 'crossmatch' is slower but often more sensitive than the other engines.
- 'abblast' (formerly known as WU-BLAST) is very fast with a slight cost in sensitivity.
- 'rmblast' is a RepeatMasker compatible version of the NCBI Blast tool suite.
- 'hmmer' uses the new nhmmer program to search sequences against the new Dfam database ( human only ).
@param speed : [str] sensitivity of your search. The more sensitive the longer the processing time.
- 'slow' is 0-5% more sensitive, 2-3 times slower than default.
- 'default'.
- 'quick' is 5-10% less sensitive, 2-5 times faster than default.
- 'rush' is about 10% less sensitive, 4->10 times faster than default.
@param gccalc : [bool] force RepeatMasker to calculate the GC content even for batch files/small seqs.
@param custom_library : [str] path to a custom library of lineage-specific repeats.
"""
# Parameters
self.add_parameter( "species", "The species of queries.", default=species, required=True )
self.add_parameter( "engine", "Engine use when searching the sequence", default=engine, choices=["crossmatch", "abblast", "rmblast", "hmmer"] )
self.add_parameter( "speed", "Sensitivity of your search. The more sensitive the longer the processing time.", default=speed, choices=["slow", "default", "quick", "rush"] )
self.add_parameter( "gccalc", "Force RepeatMasker to calculate the GC content even for batch files/small seqs.", default=gccalc, type="bool" )
self.add_parameter( "custom_library", "The custom annotate lineage specific repeats.", default=custom_library )
# Options
self.options = "-engine " + self.engine
speed_flags = { "slow":"-s", "default":"", "quick":"-q", "rush":" -qq" }
self.options += " " + speed_flags[self.speed]
if self.gccalc:
self.options += " -gccalc"
if self.custom_library != None:
self.options += " -lib " + self.custom_library
else:
self.options += ' -species "' + self.species + '"'
# Files
self.add_input_file_list( "input_files", "Fasta file to annot and mask.", default=input_files, file_format="fasta", required=True )
self.add_output_file_list( "cleaned_fasta", "Fasta file to annot and mask without descriptions.", pattern="{basename_woext}.fasta", items=input_files, file_format="fasta" )
self.add_output_file_list( "output_files", "The sequences annotations : location of repeats and interspersed.", pattern="{basename_woext}.gff", items=input_files, file_format="gff" )
self.add_output_file_list( "masked", "The submitted file in which all recognized interspersed or simple repeats have been masked.", pattern="{basename_woext}.fasta.masked", items=input_files, file_format="fasta" )
self.add_output_file_list( "summary", "A table summarizing the repeat content of the queries sequences.", pattern="{basename_woext}.fasta.tbl", items=input_files )
self.add_output_file_list( "aln", "Alignments of the queries with the matching repeats.", pattern="{basename_woext}.fasta.cat", items=input_files )
self.add_output_file( "stderr", "The stderr output file.", filename="repeatmasker.stderr" )
def get_version(self):
cmd = self.get_exec_path("repeatmasker") + " -v"
p = os.popen(cmd)
return p.readline().strip().split()[2]
def process(self):
# Clean description
#cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr ,
self.add_python_execution(cleanDesc,
inputs=self.input_files, outputs=self.cleaned_fasta , map=True)
# Process
RM_output = self.get_outputs( '{basename}.out', self.cleaned_fasta )
self.add_shell_execution(self.get_exec_path("repeatmasker") + " -dir " + self.output_directory + " " + self.options + " $1", cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr,
inputs=self.cleaned_fasta, outputs=[RM_output, self.masked, self.summary, self.aln], map=True)
# To GFF3
#cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr ,
self.add_python_execution(toGFF,
inputs=RM_output, outputs=self.output_files, map=True)
\ No newline at end of file
File deleted
File deleted
workflows/transcript/data/GGallus.png (131 KiB)
File deleted
grep -f 1.txt /work2/project/sigenae/Project_RNA_Refinery/runDrapOut/ERR348568/e-RMBT/ERR348568_R12_to_all_contigs.4633590.idxstat | awk '{print "ERR348568_" $1 "\t" $2 "\t" $3 "\t" $4}' >contigs.idxstat
grep -f 2.txt /work2/project/sigenae/Project_RNA_Refinery/runDrapOut/ERR348580/e-RMBT/ERR348580_R12_to_all_contigs.4633611.idxstat | awk '{print "ERR348580_" $1 "\t" $2 "\t" $3 "\t" $4}' >>contigs.idxstat
grep -f 3.txt /work2/project/sigenae/Project_RNA_Refinery/runDrapOut/SRR924585/e-RMBT/SRR924585_R12_to_all_contigs.107829.idxstat | awk '{print "SRR924585_" $1 "\t" $2 "\t" $3 "\t" $4}' >>contigs.idxstat
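The same selection expressed in Python for readability: keep the idxstat rows whose contig name appears in the per-run list and prefix the name with the run accession. This sketch assumes the *.txt lists hold exact contig names (grep -f does pattern matching) and uses shortened, illustrative paths.

def prefix_idxstat(list_file, idxstat_file, accession, out_fh):
    # contig names to keep, one per line
    wanted = set(line.strip() for line in open(list_file) if line.strip())
    for line in open(idxstat_file):
        fields = line.split()
        if fields and fields[0] in wanted:
            # idxstat columns: name, length, mapped, unmapped
            out_fh.write(accession + "_" + "\t".join(fields[:4]) + "\n")

out = open("contigs.idxstat", "w")
prefix_idxstat("1.txt", "ERR348568_R12_to_all_contigs.idxstat", "ERR348568", out)
prefix_idxstat("2.txt", "ERR348580_R12_to_all_contigs.idxstat", "ERR348580", out)
prefix_idxstat("3.txt", "SRR924585_R12_to_all_contigs.idxstat", "SRR924585", out)
out.close()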
ERR348568_CL611Contig1_1 822 406 106
ERR348568_CL8315Contig1_1 252 91 47
ERR348568_CL13894Contig1_1 478 174 53
ERR348568_CL31035Contig1_1 263 33 8
ERR348568_CL31587Contig1_1 283 121 12
ERR348568_CL34628Contig1_1 335 169 28
ERR348568_CL42562Contig1_1 507 318 109
ERR348568_CL32575Contig1_1 438 133 52
ERR348568_CL44950Contig1_1 429 275 53
ERR348568_CL45462Contig1_1 243 50 7
ERR348568_CL15868Contig1_1 309 174 31
ERR348568_CL21513Contig1_1 386 134 9
ERR348568_CL32558Contig1_1 800 679 167
ERR348568_CL52570Contig1_1 339 162 46
ERR348568_CL3054Contig1_1 690 723 182
ERR348568_CL8727Contig1_1 290 87 20
ERR348568_CL21771Contig1_1 264 60 14
ERR348568_CL40879Contig1_1 421 146 72
ERR348568_CL41778Contig1_1 439 88 8
ERR348568_CL51547Contig1_1 570 567 93
ERR348568_k25_Locus_696_Transcript_1_1 336 71 25
ERR348568_k25_Locus_23836_Transcript_1_1 298 147 88
ERR348568_k25_Locus_28135_Transcript_1_1 400 80 12
ERR348568_k25_Locus_32834_Transcript_1_1 455 71 11
ERR348568_k25_Locus_34410_Transcript_1_1 290 256 83
ERR348568_k25_Locus_65250_Transcript_1_1 317 186 67
ERR348568_k25_Locus_69608_Transcript_1_1 322 183 29
ERR348568_k25_Locus_70988_Transcript_1_1 642 117 1
ERR348568_k25_Locus_87199_Transcript_1_1 372 180 46
ERR348568_k25_Locus_95224_Transcript_1_1 381 251 57
ERR348568_k25_Locus_99727_Transcript_1_1 428 229 97
ERR348568_k25_Locus_116667_Transcript_1_1 258 189 141
ERR348568_k25_Locus_127120_Transcript_1_1 333 111 41
ERR348568_k25_Locus_143441_Transcript_1_1 426 403 120
ERR348568_k25_Locus_145789_Transcript_1_1 287 136 58
ERR348568_k25_Locus_162634_Transcript_1_1 461 226 92
ERR348568_k25_Locus_169812_Transcript_1_1 304 78 19
ERR348568_k25_Locus_187584_Transcript_1_1 324 192 62
ERR348568_k25_Locus_239307_Transcript_1_1 423 78 33
ERR348568_k25_Locus_246769_Transcript_1_1 288 150 36
ERR348568_k25_Locus_263122_Transcript_1_1 421 227 147
ERR348568_k31_Locus_25050_Transcript_1_1 315 439 55
ERR348568_k31_Locus_55450_Transcript_1_1 330 217 37
ERR348568_k31_Locus_77275_Transcript_1_1 499 333 102
ERR348568_k31_Locus_112994_Transcript_1_1 347 138 33
ERR348568_k31_Locus_137768_Transcript_1_1 397 254 82
ERR348568_k31_Locus_216922_Transcript_1_1 420 93 54
ERR348568_k37_Locus_32424_Transcript_1_1 342 68 28
ERR348568_k37_Locus_38530_Transcript_1_1 379 359 112
ERR348568_k37_Locus_47061_Transcript_1_1 414 113 53
ERR348568_k37_Locus_49711_Transcript_1_1 515 586 39
ERR348568_k37_Locus_62110_Transcript_1_1 716 284 72
ERR348568_k37_Locus_62770_Transcript_1_1 505 320 84
ERR348568_k43_Locus_29526_Transcript_1_1 306 212 210
ERR348568_k43_Locus_53591_Transcript_1_1 365 61 42
ERR348568_k43_Locus_82264_Transcript_1_1 573 817 100
ERR348568_k43_Locus_173325_Transcript_1_1 328 203 69
ERR348568_k49_Locus_93256_Transcript_1_1 343 183 194
ERR348568_k49_Locus_134698_Transcript_1_1 322 226 188
ERR348568_k49_Locus_170475_Transcript_1_1 453 417 43
ERR348568_k49_Locus_174124_Transcript_1_1 344 175 44
ERR348568_k55_Locus_128685_Transcript_1_1 257 51 20
ERR348580_CL32568Contig1_1 341 39 16
ERR348580_CL6357Contig1_1 617 374 85
ERR348580_CL10472Contig1_1 375 150 41
ERR348580_CL23023Contig1_1 528 274 49
ERR348580_k25_Locus_17508_Transcript_1_1 357 69 27
ERR348580_k25_Locus_55720_Transcript_1_1 364 43 35
ERR348580_k25_Locus_57470_Transcript_1_1 385 99 23
ERR348580_k25_Locus_73252_Transcript_1_1 298 50 16
ERR348580_k25_Locus_92103_Transcript_1_1 315 32 9
ERR348580_k25_Locus_93808_Transcript_1_1 340 24 19
ERR348580_k25_Locus_97710_Transcript_1_1 348 35 14
ERR348580_k25_Locus_120502_Transcript_1_1 335 342 59
ERR348580_k25_Locus_157841_Transcript_1_1 270 24 8
ERR348580_k31_Locus_526_Transcript_3_1 722 85 20
ERR348580_k31_Locus_96163_Transcript_1_1 400 55 32
ERR348580_k37_Locus_91980_Transcript_1_1 293 62 18
ERR348580_k43_Locus_2327_Transcript_1_1 388 258 52
ERR348580_k43_Locus_41277_Transcript_1_1 282 97 24
ERR348580_k43_Locus_82247_Transcript_1_1 297 23 9
ERR348580_k49_Locus_72132_Transcript_1_1 280 97 31
SRR924585_CL120Contig1_1 1385 1587 32
SRR924585_CL8784Contig1_1 372 359 100
SRR924585_CL15501Contig1_1 723 668 158
SRR924585_CL45199Contig1_1 337 119 59
SRR924585_CL47810Contig1_1 344 111 27
SRR924585_CL48754Contig1_1 364 72 26
SRR924585_CL47409Contig1_1 279 172 50
SRR924585_CL31954Contig1_1 358 125 119
SRR924585_CL3692Contig1_1 566 306 70
SRR924585_CL10604Contig1_1 462 387 17
SRR924585_CL17559Contig1_1 223 284 181
SRR924585_CL19990Contig1_1 281 143 39
SRR924585_k25_Locus_30129_Transcript_1_1 425 257 239
SRR924585_k25_Locus_89365_Transcript_1_1 615 506 61
SRR924585_k25_Locus_115467_Transcript_1_1 321 133 49
SRR924585_k25_Locus_136345_Transcript_1_1 389 278 95
SRR924585_k25_Locus_157099_Transcript_1_1 392 323 36
SRR924585_k25_Locus_185389_Transcript_1_1 370 134 18
SRR924585_k25_Locus_191871_Transcript_1_1 432 322 271
SRR924585_k25_Locus_232686_Transcript_1_1 434 194 88
SRR924585_k25_Locus_239026_Transcript_1_1 354 143 57
SRR924585_k25_Locus_271328_Transcript_1_1 276 117 22
SRR924585_k31_Locus_4861_Transcript_1_1 721 532 95
SRR924585_k31_Locus_48394_Transcript_1_1 553 331 88
SRR924585_k31_Locus_164810_Transcript_1_1 426 311 112
SRR924585_k37_Locus_22437_Transcript_1_1 609 720 97
SRR924585_k37_Locus_49276_Transcript_1_1 282 93 43
SRR924585_k37_Locus_80571_Transcript_1_1 317 122 40
SRR924585_k37_Locus_134248_Transcript_1_1 395 111 16
SRR924585_k37_Locus_173336_Transcript_1_1 476 219 20
SRR924585_k37_Locus_222027_Transcript_1_1 312 236 105
SRR924585_k37_Locus_247262_Transcript_1_1 383 92 64
SRR924585_k43_Locus_41242_Transcript_1_1 613 711 111
SRR924585_k43_Locus_107404_Transcript_1_1 529 257 143
SRR924585_k43_Locus_238949_Transcript_1_1 364 186 68
SRR924585_k49_Locus_160347_Transcript_1_1 387 130 36
ERR348580_k43_Locus_82247_Transcript_1_1
SRR924585_k25_Locus_89365_Transcript_1_1
ERR348568_k37_Locus_32424_Transcript_1_1
SRR924585_CL47409Contig1_1
ERR348568_k55_Locus_128685_Transcript_1_1
ERR348568_k25_Locus_87199_Transcript_1_1
ERR348568_CL51547Contig1_1
ERR348580_k25_Locus_17508_Transcript_1_1
ERR348580_k49_Locus_72132_Transcript_1_1
ERR348568_CL45462Contig1_1
ERR348568_CL21771Contig1_1
ERR348568_CL34628Contig1_1
ERR348568_CL52570Contig1_1
ERR348568_CL32575Contig1_1
ERR348580_CL32568Contig1_1
ERR348568_k43_Locus_173325_Transcript_1_1
SRR924585_k43_Locus_238949_Transcript_1_1
ERR348580_k25_Locus_57470_Transcript_1_1
ERR348568_k25_Locus_187584_Transcript_1_1
ERR348568_k25_Locus_239307_Transcript_1_1
SRR924585_CL15501Contig1_1
SRR924585_k25_Locus_157099_Transcript_1_1
ERR348568_k25_Locus_127120_Transcript_1_1
ERR348568_CL13894Contig1_1
ERR348580_CL6357Contig1_1
SRR924585_CL45199Contig1_1
ERR348580_k25_Locus_92103_Transcript_1_1
ERR348568_k25_Locus_696_Transcript_1_1
ERR348580_CL23023Contig1_1
ERR348568_CL8727Contig1_1
ERR348568_k25_Locus_70988_Transcript_1_1
ERR348568_k37_Locus_47061_Transcript_1_1
ERR348568_k31_Locus_216922_Transcript_1_1
ERR348568_k31_Locus_77275_Transcript_1_1
ERR348580_CL10472Contig1_1
ERR348568_k25_Locus_169812_Transcript_1_1
ERR348568_k25_Locus_116667_Transcript_1_1
SRR924585_k31_Locus_164810_Transcript_1_1
ERR348568_k43_Locus_53591_Transcript_1_1
SRR924585_CL48754Contig1_1
ERR348568_CL40879Contig1_1
ERR348568_k25_Locus_32834_Transcript_1_1
ERR348568_k25_Locus_69608_Transcript_1_1
ERR348568_k49_Locus_170475_Transcript_1_1
ERR348568_CL31035Contig1_1
SRR924585_CL10604Contig1_1
SRR924585_k31_Locus_48394_Transcript_1_1
ERR348568_k37_Locus_38530_Transcript_1_1
ERR348580_k25_Locus_73252_Transcript_1_1
ERR348568_CL41778Contig1_1
ERR348568_k25_Locus_65250_Transcript_1_1
ERR348568_CL44950Contig1_1
ERR348568_k37_Locus_49711_Transcript_1_1
ERR348568_k25_Locus_99727_Transcript_1_1
ERR348568_CL611Contig1_1
SRR924585_k37_Locus_247262_Transcript_1_1
ERR348568_k31_Locus_55450_Transcript_1_1
ERR348580_k25_Locus_93808_Transcript_1_1
ERR348568_CL8315Contig1_1
ERR348568_k31_Locus_25050_Transcript_1_1
ERR348568_k43_Locus_82264_Transcript_1_1
ERR348568_k43_Locus_29526_Transcript_1_1
ERR348580_k43_Locus_2327_Transcript_1_1
ERR348580_k31_Locus_526_Transcript_3_1
SRR924585_k25_Locus_136345_Transcript_1_1
SRR924585_CL31954Contig1_1
SRR924585_k37_Locus_80571_Transcript_1_1
SRR924585_k37_Locus_22437_Transcript_1_1
SRR924585_k37_Locus_173336_Transcript_1_1
SRR924585_k25_Locus_30129_Transcript_1_1
SRR924585_k43_Locus_41242_Transcript_1_1
SRR924585_k25_Locus_239026_Transcript_1_1
SRR924585_CL8784Contig1_1
ERR348568_k25_Locus_162634_Transcript_1_1
ERR348568_k25_Locus_145789_Transcript_1_1
SRR924585_k25_Locus_191871_Transcript_1_1
SRR924585_k25_Locus_185389_Transcript_1_1
ERR348568_k31_Locus_112994_Transcript_1_1
ERR348580_k25_Locus_120502_Transcript_1_1
ERR348568_k25_Locus_246769_Transcript_1_1
ERR348568_k25_Locus_23836_Transcript_1_1
ERR348568_CL31587Contig1_1
ERR348568_k25_Locus_263122_Transcript_1_1
ERR348580_k25_Locus_55720_Transcript_1_1
SRR924585_k43_Locus_107404_Transcript_1_1
ERR348568_k25_Locus_95224_Transcript_1_1
SRR924585_k37_Locus_49276_Transcript_1_1
ERR348568_k49_Locus_134698_Transcript_1_1
SRR924585_k37_Locus_134248_Transcript_1_1
ERR348580_k31_Locus_96163_Transcript_1_1
ERR348580_k43_Locus_41277_Transcript_1_1
ERR348580_k25_Locus_97710_Transcript_1_1
ERR348568_k49_Locus_174124_Transcript_1_1
ERR348568_CL32558Contig1_1
ERR348568_CL42562Contig1_1
ERR348580_k25_Locus_157841_Transcript_1_1
SRR924585_k25_Locus_232686_Transcript_1_1
SRR924585_k31_Locus_4861_Transcript_1_1
SRR924585_k25_Locus_115467_Transcript_1_1
SRR924585_k25_Locus_271328_Transcript_1_1
ERR348568_k25_Locus_34410_Transcript_1_1
ERR348568_k25_Locus_143441_Transcript_1_1
ERR348568_k31_Locus_137768_Transcript_1_1
SRR924585_CL120Contig1_1
SRR924585_CL19990Contig1_1
ERR348580_k37_Locus_91980_Transcript_1_1
ERR348568_k37_Locus_62770_Transcript_1_1
SRR924585_k37_Locus_222027_Transcript_1_1
ERR348568_k25_Locus_28135_Transcript_1_1
SRR924585_k49_Locus_160347_Transcript_1_1
SRR924585_CL3692Contig1_1
SRR924585_CL47810Contig1_1
ERR348568_CL3054Contig1_1
SRR924585_CL17559Contig1_1
ERR348568_CL21513Contig1_1
ERR348568_k49_Locus_93256_Transcript_1_1
ERR348568_CL15868Contig1_1
ERR348568_k37_Locus_62110_Transcript_1_1