Commit 300c79d2 authored by Celine Noirot's avatar Celine Noirot

Useless work of Pavel

parent dc181a07
Showing with 0 additions and 3513 deletions
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from lib.utils import *
from lib.transcript import Contig, DNA, Annotation, Expression, Clusters
import ngspipelines
from ngspipelines.application import ApplicationWithoutReferenceGenome
from ngspipelines.analysis import Analysis
from ngspipelines.library import Library
from jflow.seqio import SequenceReader
import os
import sys
import StringIO
from ConfigParser import ConfigParser
class Transcript (ApplicationWithoutReferenceGenome):
def get_description(self):
return "RNAseq pipeline without reference genome"
def define_parameters(self, function="process"):
# Assembly parameter
self.add_multiple_parameter("assembly", "Define the assembly process", required=True, group="ASSEMBLY section")
self.add_input_file("file", "Path to the contig file in fasta format", file_format="fasta", required=True, add_to="assembly")
self.add_parameter("software_name", "Which software was used to perform the assembly", default="", required=True, add_to="assembly")
self.add_parameter("software_parameters", "Which parameters were used when performing the assembly", default="", add_to="assembly")
self.add_parameter("software_version", "Which version of the assembly software was used", default="", required=True, add_to="assembly")
self.add_parameter("comments", "Add some comments on this analysis", default="", add_to="assembly")
# Annotation parameter
self.add_multiple_parameter_list("assembly_annot", "Define the annotation process", required=True, group="ASSEMBLY ANNOTATION section")
self.add_input_file("file", "Path to the annotation file in gff format", required=True, add_to="assembly_annot")
self.add_parameter("software_name", "Which software was used to perform the annotation", add_to="assembly_annot")
self.add_parameter("software_parameters", "Which parameters were used when performing the annotation", required=True, add_to="assembly_annot")
self.add_parameter("software_version", "Which version of the annotation software was used", required=True, add_to="assembly_annot")
self.add_parameter("database", "Which database was used", add_to="assembly_annot")
self.add_parameter("database_version", "Which database version was used", add_to="assembly_annot")
self.add_parameter("comments", "Add some comments on this analysis", add_to="assembly_annot")
self.add_multiple_parameter("annotation_filter", "Define the annotation filter", required=True, group="ANNOTATION FILTER section")
self.add_parameter("min_identity", "The minimum identity value to keep an HSP", required=True, add_to="annotation_filter")
self.add_parameter("min_coverage", "The minimum coverage value to keep an HSP", required=True, add_to="annotation_filter")
# Count matrix parameter
self.add_input_file("count_matrix", "File of mapped reads counts per contig (line) per library (column)", group="ALIGNMENT section")
# Cluster members for centroids
self.add_input_file("clusters", "File of cluster members for centroids", group="ASSEMBLY ANNOTATION section")
def process(self):
# first check all inputs
contig_names = Contig.get_contigs(self.assembly["file"])
#self.check_load_format(contig_names)
# set variables
annotation_files=self.assembly_annot["file"]
# convert blast xml files
blastxml2gff = self.add_component( "Blast2Annot", [ annotation_files, self.annotation_filter['min_identity'], self.annotation_filter['min_coverage'] ,
self.get_resource("refseq_gi_from_accession"), self.get_resource("refseq_gene_from_gi") ] )
# concatenate annotations
merge_annot = self.add_component( "ConcatenateFiles", [blastxml2gff.output_files, "annotations.gff", True], component_prefix="bestAnnot" )
#search best annotation
best_annot_search = self.add_component( "BestAnnotSearch", [merge_annot.output_file] )
# load the contig table
contig_parameters = [self.assembly["file"], [best_annot_search.best_annotations]]
contig = self.add_biomart_load(Contig,
contig_parameters,
"Assembly",
"assembly",
"0",
self.assembly["software_name"],
self.assembly["software_parameters"],
self.assembly["software_version"],
self.assembly["comments"], [self.assembly["file"]])
dna = self.add_biomart_load(DNA, [self.assembly["file"]])
self.add_blast_search( Contig, self.assembly["file"] )
self.add_venn()
# load the annotation table
for annotation_arg in self.assembly_annot:
annot = self.add_biomart_load(Annotation,
[best_annot_search.std_annotations,contig.new_table],
"Annotation",
"annotation",
"0",
annotation_arg["software_name"],
annotation_arg["software_parameters"],
annotation_arg["software_version"],
annotation_arg["comments"], [annotation_arg["file"]])
counts = self.add_biomart_load(Expression, [self.count_matrix, self.project.libraries])
clusters = self.add_biomart_load(Clusters, [self.clusters])
def check_load_format(self,contig_names):
for annot_arg in self.assembly_annot:
err_contigs=get_gff3_unmatch_contigs(contig_names,annot_arg["file"])
if len(err_contigs) > 0:
sys.stderr.write('Unknown sequence name(s) >' + ",".join(err_contigs) + '< in GFF3 ' + annot_arg["file"] + '\n')
sys.exit(1)
def check_libraries (self, alignment_files):
# Do not check fastq file, done in Library
if self.count_matrix:
fh = open(self.count_matrix)
line = fh.readline()
matrix_libs = []
parts = line.rstrip().split("\t")
matrix_libs = parts[1:]
for lib_name in matrix_libs :
if self.get_library_from_name(lib_name) == None:
sys.stderr.write('Library '+lib_name+' defined at the first line of the matrix file has not been defined (see --library option to define a library)\n')
sys.exit(1)
@staticmethod
def config_parser(arg_lines):
config = ConfigParser()
config.readfp(StringIO.StringIO('\n'.join(arg_lines)))
arguments = []
add_analysis_options = ['software_name', 'software_parameters', 'software_version', 'comments']
section = 'project'
if config.has_section(section):
for option in config.options(section):
if config.has_option(section, option):
arguments.extend( [ '--'+option.replace('_','-'), config.get(section, option) ] )
section = 'assembly'
if config.has_section(section):
arguments.append('--assembly')
if config.has_option(section, 'file'):
arguments.append( 'file=' + config.get(section, 'file') )
for option in add_analysis_options:
if config.has_option(section, option):
arguments.append( option.replace('_', '-') + '=' + config.get(section, option) )
section = 'annotation_filter'
if config.has_section(section):
arguments.append('--annotation-filter')
for option in ['min_identity','min_coverage']:
if config.has_option(section, option):
arguments.append( option.replace('_', '-') + '=' + config.get(section, option) )
section = 'expression'
if config.has_section(section):
if config.has_option(section, 'file'):
arguments.extend( ['--count-matrix', config.get(section, 'file')] )
section = 'clusters'
if config.has_section(section):
if config.has_option(section, 'file'):
arguments.extend( ['--clusters', config.get(section, 'file')] )
section = 'assembly_annotations'
if config.has_section(section):
lhash = {}
for option in config.options(section):
if config.get(section, option):
if option == 'best_annotation_source':
arguments.extend( ['--best-annotation-source', config.get(section, option) ])
else:
name, prop = option.split('.', 1)
if not lhash.has_key(name):
lhash[name] = []
if prop == 'file':
lhash[name].append( 'file=' + config.get(section, option) )
else:
lhash[name].append( prop.replace('_', '-') +'=' + config.get(section, option) )
for x in lhash.values():
arguments.extend( [ '--assembly-annot'] + x )
section = 'libraries'
if config.has_section(section):
lhash = {}
for option in config.options(section):
if config.get(section, option):
name, prop = option.split('.', 1)
if not lhash.has_key(name):
lhash[name] = []
if prop == "files":
for file in config.get(section, option).split(","):
lhash[name].append( prop.replace('_','-') +'=' + file.strip() )
else:
lhash[name].append( prop.replace('_','-') +'=' + config.get(section, option) )
for x in lhash.values():
arguments.extend( [ '--library'] + x)
return arguments
def _nb_seq( self, filepath ):
"""
@summary : Return the number of sequences.
@param filepath : [str] the sequence file to process.
@return : [int] The number of sequences.
"""
nb_seq = 0
seq_fh = SequenceReader( filepath )
for seq_record in seq_fh:
nb_seq += 1
return nb_seq
\ No newline at end of file
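To make the section-to-argument mapping in config_parser() above concrete, here is a minimal sketch of an INI file it would translate into command-line arguments. Paths and values are hypothetical; only the section and option names come from the code.

from ConfigParser import ConfigParser
import StringIO

# Hypothetical configuration; section/option names follow config_parser() above.
example_ini = """
[assembly]
file=/path/to/contigs.fasta
software_name=trinity
software_version=r2013-02-25

[annotation_filter]
min_identity=80
min_coverage=50

[assembly_annotations]
swissprot.file=/path/to/contigs_vs_swissprot.xml
swissprot.software_name=blastx
"""

config = ConfigParser()
config.readfp(StringIO.StringIO(example_ini))
# config_parser() would turn the [assembly] section into:
#   ['--assembly', 'file=/path/to/contigs.fasta', 'software-name=trinity', 'software-version=r2013-02-25']
# and each 'name.prop' option of [assembly_annotations] into one '--assembly-annot' group.
print config.options('assembly')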
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'alpha'
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'lib'))
from optparse import *
import NCBIXML
def version_string ():
"""
Return the blast_filter version
"""
return "blast_filter " + __version__
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog -i FILE|- -o FILE", description = "Filter an xml blast file", version = version_string())
igroup = OptionGroup(parser, "Input file options","")
igroup.add_option("-i", "--input", dest="input",
help="The input file, if set to - read STDIN, REQUIRE")
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output",
help="The output file REQUIRE")
parser.add_option_group(ogroup)
fgroup = OptionGroup(parser, "Filters options","")
fgroup.add_option("-q", "--pcq", dest="pcq",
help="Minimun poucentage of query length per alignment REQUIRE ", default=50 )
fgroup.add_option("-p", "--pci", dest="pci",
help="Minimun poucentage of identity per alignment REQUIRE ", default=50 )
parser.add_option_group(fgroup)
(options, args) = parser.parse_args()
# NCBIXML.parse expects a file handle (as in Biopython); honor the '-' STDIN convention
in_fh = sys.stdin if options.input == '-' else open(options.input)
blast_records = NCBIXML.parse(in_fh)
for blast_record in blast_records:
no_annot_regions_saved=[]
no_annot_regions_work=[]
nb_no_annot_region=0
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
keep_hsp = 0
if nb_no_annot_region == 0 :
no_annot_regions_saved.append([0,hsp.query_start])
no_annot_regions_saved.append([hsp.query_end,blast_record.query_letters])
nb_no_annot_region=nb_no_annot_region+1
keep_hsp = 1
else :
for [hole_start,hole_end] in no_annot_regions_work :
# for each hole
delete_no_annot_region = 0
# HSP includes a whole non-annotated region
if hsp.query_start < hole_start and hsp.query_end > hole_end :
delete_no_annot_region = 1
keep_hsp = 1
# HSP start is in a non-annotated region
if hsp.query_start > hole_start and hsp.query_start < hole_end:
# HSP is inside the non-annotated region
if (hsp.query_end < hole_end) :
keep_hsp = 1
# add the 2 new non-annotated regions to the region table
no_annot_regions_saved.append([hole_start,hsp.query_start])
no_annot_regions_saved.append([hsp.query_end,hole_end])
else :
keep_hsp = 1
# update the end of the existing non-annotated region
no_annot_regions_saved.append([hole_start,hsp.query_start])
else :
# the non-annotated region includes the query end
if hsp.query_end > hole_start and hsp.query_end < hole_end :
if hsp.query_start <= hole_start :
keep_hsp = 1
no_annot_regions_saved.append([hsp.query_end,hole_end])
else :
if delete_no_annot_region == 0 :
no_annot_regions_saved.append([hole_start,hole_end])
if keep_hsp :
print blast_record.query +" "+str(hsp.query_start)+" "+str(hsp.query_end) + " " +alignment.hit_def + " " +str(hsp.sbjct_start) + " " +str(hsp.sbjct_end)
no_annot_regions_work=no_annot_regions_saved[:]
no_annot_regions_saved=[]
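The loop above maintains, for each query, a list of not-yet-annotated intervals ("holes"): an HSP is kept only if it still covers or overlaps a hole, and the hole is then split or trimmed. A standalone sketch of that interval bookkeeping on toy coordinates:

# Toy re-implementation of the hole-splitting logic used above (hypothetical data).
def split_holes(holes, start, end):
    """Return the holes remaining after the interval [start, end] is annotated."""
    remaining = []
    for hole_start, hole_end in holes:
        if start > hole_start and start < hole_end:
            remaining.append([hole_start, start])    # left part of the hole survives
        if end > hole_start and end < hole_end:
            remaining.append([end, hole_end])        # right part of the hole survives
        if start >= hole_end or end <= hole_start:
            remaining.append([hole_start, hole_end]) # no overlap: hole unchanged
    return remaining

holes = [[0, 100]]                  # whole query un-annotated
holes = split_holes(holes, 10, 40)  # -> [[0, 10], [40, 100]]
holes = split_holes(holes, 60, 90)  # -> [[0, 10], [40, 60], [90, 100]]
print holes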
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
from optparse import *
import sys,re,datetime
from os.path import join, splitext, basename, exists, split
def read_count_file (file,libname,dict_count):
"""
Extract count from the count file
@param file : the input count file (output of sam2count)
@param libname : the library name
@param dict_count : the dict of contigs to update
"""
fin=open (file, "r")
re_parsed = re.compile("# total read parsed : (\d+)")
re_mapped = re.compile("# total read mapped : (\d+)")
re_parsed_paired = re.compile("# total paired read parsed : (\d+)")
re_mapped_paired = re.compile("# total paired read mapped : (\d+)")
re_mapped_paired_isize = re.compile("# total paired read mapped with good insert size : (\d+)")
total_parsed_read = 0
total_mapped_read = 0
total_parsed_pair = -1
total_mapped_pair = -1
total_mapped_pair_isize = -1
# parse headers
line = fin.readline()
while not line.startswith("#id") :
if line.startswith("# total read parsed") :
m = re_parsed.match(line)
if m != None :
total_parsed_read = int(m.group(1))
if line.startswith("# total read mapped") :
m = re_mapped.match(line)
if m != None :
total_mapped_read = int(m.group(1))
if line.startswith("# total paired read parsed"):
m = re_parsed_paired.match(line)
if m != None :
total_parsed_pair = int(m.group(1))
if line.startswith("# total paired read mapped") :
m = re_mapped_paired.match(line)
if m != None :
total_mapped_pair = int(m.group(1))
if line.startswith("# total paired read mapped with good insert size") :
m = re_mapped_paired_isize.match(line)
if m != None :
total_mapped_pair_isize = int(m.group(1))
line = fin.readline()
#split header
tab = line.split()
headers = tab[2:]
#parse data
line = fin.readline()
while line != "":
tab = line.split("\t") # contig count line
if len(tab) > 0 :
if not dict_count.has_key(tab[0]) : # if new contig
dict_count[tab[0]] = {}
if not dict_count[tab[0]].has_key(libname) : # if new library for this contig
dict_count[tab[0]][libname] = {}
index = 2 # headers/ values begin at 3rd column
for type_val in headers :
dict_count[tab[0]][libname][type_val] = str(int(tab[index]))
index = index+1
line = fin.readline()
fin.close()
return total_parsed_read,total_mapped_read,total_parsed_pair,total_mapped_pair,total_mapped_pair_isize
def version_string ():
"""
Return the merge_count version
"""
return "merge_count.py " + __version__
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog", description = "Merge count files into a global counting file:\
for single-end files use the nb_single column, for paired files use the nb_read_paired_isize column.")
igroup = OptionGroup(parser, "Input options","")
igroup.add_option("-f", "--fof", dest="fof", help="file of count file")
igroup.add_option("-c", "--correct_pair_only", dest="correct_pair_only", help="If paired count file take column of correct pair only (default use column with good insert size count if available)",action="store_true",default=False)
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output", help="the output count file")
parser.add_option_group(ogroup)
(options, args) = parser.parse_args()
if options.fof == None :
sys.stderr.write("Need a file of file\n")
parser.print_help()
sys.exit(1)
if options.output == None:
sys.stderr.write("Output file is missing\n")
parser.print_help()
sys.exit(1)
output_single = join(split(options.output)[0], "single."+ basename(options.output))
output_paired = join(split(options.output)[0], "paired."+ basename(options.output))
dict_count={}
indexlib=0
# iterate over the count files
if options.fof != None :
fin=open (options.fof, "r")
line=fin.readline()
while line != "":
file = line.rstrip()
if exists(file) :
current_libname = splitext(basename(file))[0]
(total_parsed_read,total_mapped_read,total_parsed_pair,total_mapped_pair,total_mapped_pair_isize)=read_count_file(file,current_libname,dict_count)
if not dict_count.has_key("total_parsed") :
dict_count["total_parsed"] = {}
dict_count["total_mapped"] = {}
if not dict_count["total_parsed"].has_key(current_libname) :
dict_count["total_parsed"][current_libname] = {}
dict_count["total_mapped"][current_libname] = {}
dict_count["total_parsed"][current_libname]["nb_single"] = total_parsed_read
dict_count["total_parsed"][current_libname]["nb_read_paired"] = total_parsed_pair
dict_count["total_parsed"][current_libname]["nb_read_paired_isize"] = None
dict_count["total_mapped"][current_libname]["nb_single"] = total_mapped_read
dict_count["total_mapped"][current_libname]["nb_read_paired"] = total_mapped_pair
dict_count["total_mapped"][current_libname]["nb_read_paired_isize"] = total_mapped_pair_isize
else :
sys.stderr.write("File : " + file + " doesn't exist")
sys.exit(1)
line = fin.readline()
indexlib = indexlib+1
fin.close()
fout = open (options.output,"w")
write_paired_as_single = 0
header_total_parsed = [ "#TotalParsed" ]
header_total_mapped = [ "#TotalMapped" ]
header_contig = [ "#contig_name" ]
libs_write_paired_as_single=[]
for lib in sorted(dict_count["total_parsed"].keys()) :
if dict_count["total_parsed"][lib]["nb_read_paired"] > -1 :
write_paired_as_single=1
libs_write_paired_as_single.append(lib)
header_total_parsed.append( str(dict_count["total_parsed"][lib]["nb_read_paired"]) )
if ( options.correct_pair_only or dict_count["total_mapped"][lib]["nb_read_paired_isize"] == -1) :
header_total_mapped.append( str(dict_count["total_mapped"][lib]["nb_read_paired"]) )
else :
header_total_mapped.append( str(dict_count["total_mapped"][lib]["nb_read_paired_isize"]) )
else :
header_total_parsed.append( str(dict_count["total_parsed"][lib]["nb_single"]) )
header_total_mapped.append( str(dict_count["total_mapped"][lib]["nb_single"]) )
header_contig.append( lib )
fout.write ( '\t'.join( header_contig ) +"\n")
fout.write ( '\t'.join( header_total_parsed ) +"\n")
fout.write ( '\t'.join( header_total_mapped ) +"\n")
# Create single output only if paired data are in merged count
f_single_out = None
if write_paired_as_single :
f_single_out = open (output_single,"w")
header_total_single_parsed = [ "#TotalParsed" ]
header_total_single_mapped = [ "#TotalMapped" ]
for lib in sorted(libs_write_paired_as_single) :
header_total_single_parsed.append( str(dict_count["total_parsed"][lib]["nb_single"]) )
header_total_single_mapped.append( str(dict_count["total_mapped"][lib]["nb_single"]) )
f_single_out.write( '\t'.join( ["#contig_name"] + sorted(libs_write_paired_as_single) ) + "\n")
f_single_out.write( '\t'.join(header_total_single_parsed) + "\n")
f_single_out.write( '\t'.join(header_total_single_mapped) + "\n")
for contig_name in sorted(dict_count.keys()) :
line = []
if (contig_name == "total_parsed" or contig_name == "total_mapped") : continue
line.append( contig_name )
line_single = line[:]
for lib in sorted(dict_count[contig_name].keys()) :
if dict_count["total_parsed"][lib]["nb_read_paired"] >- 1 : # If lib is paired
if (options.correct_pair_only or dict_count["total_mapped"][lib]["nb_read_paired_isize"] == -1 ) :
line.append(str(dict_count[contig_name][lib]["nb_read_paired"]) )
else :
line.append(str(dict_count[contig_name][lib]["nb_read_paired_isize"]))
else :
line.append( str(dict_count[contig_name][lib]["nb_single"]) )
for lib in sorted (libs_write_paired_as_single):
line_single.append( str(dict_count[contig_name][lib]["nb_single"]) )
fout.write ( '\t'.join(line) +"\n")
if write_paired_as_single :
f_single_out.write( '\t'.join(line_single) +"\n")
if write_paired_as_single :
f_single_out.close()
fout.close()
sys.exit(0)
\ No newline at end of file
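For reference, read_count_file() above consumes the header-plus-table layout written by sam2count.py (the next file in this commit) and fills a nested dictionary. A minimal sketch of that structure, with made-up numbers:

# dict_count[contig][library][column] = count, as filled by read_count_file().
dict_count = {}
libname = "ERR348568"                          # hypothetical library name
headers = ["nb_single", "nb_read_paired"]      # from the '#id len ...' header line
row = "contig_1\t822\t406\t106".split("\t")    # id, length, then one count per header
dict_count.setdefault(row[0], {}).setdefault(libname, {})
for index, type_val in enumerate(headers):
    dict_count[row[0]][libname][type_val] = str(int(row[index + 2]))
print dict_count  # {'contig_1': {'ERR348568': {'nb_single': '406', 'nb_read_paired': '106'}}} (key order may vary)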
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
from optparse import *
import os,sys,re,datetime
def version_string ():
"""
Return the sam2count version
"""
return "sam2count.py " + __version__
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog", description = "Read a non-sorted SAM file from stdin and count aligned reads per reference; with --paired, also count properly paired reads, optionally filtered on insert size.")
igroup = OptionGroup(parser, "Input options","")
igroup.add_option("-s", "--size-min", dest="min_insert_size", help="minimal insert size for paired data")
igroup.add_option("-m", "--size-max", dest="max_insert_size", help="maximal insert size for paired data")
igroup.add_option("-p", "--paired", dest="paired", help="input is a paired file",action="store_true",default=False)
igroup.add_option("-r", "--rewrite",dest="rewrite" , help="write stdin on stdout",action="store_true",default=False)
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output", help="the output file")
parser.add_option_group(ogroup)
(options, args) = parser.parse_args()
if options.output == None:
sys.stderr.write("Output file is missing")
parser.print_help()
sys.exit(1)
test_insert=False
option_sumary=""
if options.paired :
if options.min_insert_size != None and options.max_insert_size != None :
option_sumary="##Options :\n##\tmin insert size : "+options.min_insert_size+"\n##\tmax insert size: "+options.max_insert_size
test_insert=True
else :
option_sumary="##Options :\n##\tMin and Max insert size not specified"
sys.stderr.write("WARNING : Insert size will not be tested, set parameters --size-min --size-max if needed\n")
total_parsed_read = 0.0
total_mapped_read = 0
total_mapped_pair = 0
total_mapped_pair_plus = 0
total_parsed_pair = 0
#ref -> [len, count single, count seq paired, count seq well paired]
reference={}
#header regexp : @SQ SN:SC_00000001.1.1 LN:4030
header_regexp=re.compile("@SQ\s+SN:(.+)\s+LN:(\d+)")
# read the header
line=sys.stdin.readline()
if options.rewrite : sys.stdout.write(line)
while line.startswith("@") :
m=header_regexp.match(line)
if m!=None :
reference[m.group(1)]=[int(m.group(2)),0,0,0]
line=sys.stdin.readline()
if options.rewrite : sys.stdout.write(line)
if len(reference.keys())==0 :
sys.stderr.write("ERROR : Header in sam format is required\n")
sys.exit(1)
# first line / pair 1
ref=line
# this is the first alignment line, it must be counted
total_parsed_read = 1
while 1 :
data = sys.stdin.readline()
if options.rewrite : sys.stdout.write(data)
if data == '' : #handle last line
if ref != '' :
array_ref = ref.split("\t")
if int(array_ref[1]) & 4 == 0 : # FLAG 0x4 (read unmapped) not set
total_mapped_read=total_mapped_read+1
reference[array_ref[2]][1] = reference[array_ref[2]][1]+1
break
else :
array_ref = ref.split("\t")
array_data = data.split("\t")
# read mapped [FLAG 0x4 (read unmapped) not set]
if int(array_ref[1]) & 4 == 0:
total_mapped_read=total_mapped_read+1
reference[array_ref[2]][1] = reference[array_ref[2]][1]+1
# two lines of the same fragment on the same reference
if array_data[0] == array_ref[0] :
total_parsed_pair=total_parsed_pair+2
if array_data[2] == array_ref[2] and options.paired:
# FLAG & 3 == 3 : the last 2 flag bits, read paired (0x1) & properly paired (0x2)
# pair 1
if int(array_ref[1]) & 3 == 3 :
reference[array_data[2]][2] = reference[array_data[2]][2]+1
total_mapped_pair=total_mapped_pair+1
# and test the insert length
if test_insert and abs(int(array_data[8])) > int(options.min_insert_size) and abs(int(array_ref[8])) > int(options.min_insert_size) \
and abs(int(array_data[8])) < int(options.max_insert_size) and abs(int(array_ref[8])) < int(options.max_insert_size) :
reference[array_data[2]][3] = reference[array_data[2]][3]+1
total_mapped_pair_plus=total_mapped_pair_plus+1
# pair 2
if int(array_data[1]) & 3 == 3 :
reference[array_data[2]][2] = reference[array_data[2]][2]+1
total_mapped_pair=total_mapped_pair+1
# and test the insert length
if test_insert and abs(int(array_data[8])) > int(options.min_insert_size) and abs(int(array_ref[8])) > int(options.min_insert_size) \
and abs(int(array_data[8])) < int(options.max_insert_size) and abs(int(array_ref[8])) < int(options.max_insert_size) :
reference[array_data[2]][3] = reference[array_data[2]][3]+1
total_mapped_pair_plus=total_mapped_pair_plus+1
ref = data
total_parsed_read= total_parsed_read+1
fout=open (options.output, "w")
fout.write ("#sam2count :" + str(datetime.datetime.now()) + "\n")
fout.write (option_summary+"\n")
fout.write ("# total read parsed : "+str(int(total_parsed_read))+"\n")
fout.write ("# total read mapped : "+str(int(total_mapped_read))+"\n")
header_table="#id\tlen\tnb_single"
if int(total_parsed_pair) > 0 and options.paired :
fout.write ("# total paired read parsed : "+str(int(total_parsed_pair))+"\n")
fout.write ("# total paired read mapped : "+str(int(total_mapped_pair))+" (properly pair on same ref)\n")
header_table=header_table+"\tnb_read_paired"
if test_insert :
fout.write ("# total paired read mapped with good insert size : "+str(int(total_mapped_pair_plus))+"\n")
header_table=header_table+"\tnb_read_paired_isize"
fout.write ( header_table+"\n")
outline=""
for refid in sorted(reference.keys()) :
outline = refid+"\t"+str(reference[refid][0])+"\t"+str(reference[refid][1])
if total_parsed_pair > 0 and options.paired :
outline = outline + "\t"+str(reference[refid][2])
if test_insert :
outline = outline +"\t"+ str(reference[refid][3])
fout.write (outline+"\n")
fout.close()
sys.exit(0)
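The FLAG tests above use the standard SAM bit definitions: 0x1 read paired, 0x2 properly paired, 0x4 read unmapped. A two-line illustration (the flag value is only an example):

flag = 99            # e.g. a mapped, properly paired, first-in-pair read
print flag & 4 == 0  # True : bit 0x4 (read unmapped) is not set
print flag & 3 == 3  # True : bits 0x1 and 0x2 (paired, properly paired) are both set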
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
\ No newline at end of file
#
# Copyright (C) 2014 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import pickle
from jflow.component import Component
from weaver.function import PythonFunction
from weaver.abstraction import Map
from ngspipelines.utils import Utils
def best_hit( input , best_output, output ):
"""
@summary : Find the best hit for each query in annotation files. The comparison is based on annotation.innerscore().
"""
import re
from workflows.rnaseqdenovo.lib.annotationfile import AnnotationFile, AnnotationRecord
best_annot = dict()
# Process
out_annot_fh = list()
annotations = AnnotationFile(input, "r")
output = AnnotationFile(output, "w")
for current_annot in annotations:
# The query already has a best HSP
if best_annot.has_key( current_annot.seq_id ):
if float(best_annot[current_annot.seq_id]["annot"].score) < float(current_annot.score):
# Write old best HSP
output.write( best_annot[current_annot.seq_id]["annot"] )
# Replace old best HSP
best_annot[current_annot.seq_id]["annot"] = current_annot
else:
output.write( current_annot )
# The query does not have a best HSP
else :
best_annot[current_annot.seq_id] = dict()
best_annot[current_annot.seq_id]["annot"] = current_annot
annotations.close()
# Write best annotation file
best_annot_fh = AnnotationFile( best_output, "w" )
for seq_id in best_annot.keys():
best_annot_fh.write( best_annot[seq_id]["annot"] )
best_annot_fh.close()
class BestAnnotSearch (Component):
"""
@summary : Extract into separate file(s) the best annotation and the other annotations for each sequence.
The comparison is based on annotation.innerscore(databank_weight).
"""
def define_parameters(self, input_files):
"""
@param input_files : The annotations files
"""
# Files
self.add_input_file( "input_file", "The annotations file.",
default=input_files, required=True )
self.add_output_file( "std_annotations", "The annotations files without the best annotation.",
filename="annotations.std_annot.gff", file_format="gff" )
# get best output
self.add_output_file( "best_annotations", "the annotations file with only the best annotation of each sequences.",
filename="annotations.best.gff", file_format="gff" )
def process(self):
self.add_python_execution(best_hit, inputs=[self.input_file], outputs=[self.best_annotations, self.std_annotations])
\ No newline at end of file
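A standalone sketch of the same best-hit selection with plain (seq_id, score) tuples instead of AnnotationFile records: the highest-scoring record per query is retained, every superseded or weaker record goes to the standard output.

# Toy data; in best_hit() above the records come from an AnnotationFile.
records = [("contig_1", 50.0), ("contig_1", 80.0), ("contig_2", 30.0)]
best = {}
others = []
for seq_id, score in records:
    if seq_id not in best:
        best[seq_id] = (seq_id, score)
    elif score > best[seq_id][1]:
        others.append(best[seq_id])    # demote the previous best
        best[seq_id] = (seq_id, score)
    else:
        others.append((seq_id, score))
print best    # {'contig_1': ('contig_1', 80.0), 'contig_2': ('contig_2', 30.0)}
print others  # [('contig_1', 50.0)]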
#
# Copyright (C) 2014 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from jflow.component import Component
class Blast2Annot (Component):
"""
@summary : Filter blast XML results and convert to annotation file.
"""
def define_parameters( self, blast_files, min_identity=0.1, min_coverage=0.1, accession_to_gi_file=None,
gi_to_gene_file=None):
"""
@param blast_files : [list] Blast files to filter and convert.
@param accession_to_gi_file : [str] Path to the NCBI file that maps accession numbers to GI numbers.
@param gi_to_gene_file : [str] Path to the NCBI file that maps GI numbers to gene names.
"""
self.conversion_options=""
# Options
if accession_to_gi_file != None:
self.conversion_options += " -a " + accession_to_gi_file
if gi_to_gene_file != None:
self.conversion_options += " -g " + gi_to_gene_file
self.add_parameter( "min_identity", "The minimum identity value to keep an HSP.", default=min_identity, type="float" )
self.add_parameter( "min_coverage", "The minimum coverage value to keep an HSP.", default=min_coverage, type="float" )
self.filter_options = " --min-identity " + str(min_identity) + " --min-coverage " + str(min_coverage)
# Files
self.add_input_file_list( "blast_files", "The blast files to align.", default=blast_files, file_format="xml", required=True )
self.add_input_file( "accession_to_gi_file", "NCBI's file who contains link between accession number and gene ID.", default=accession_to_gi_file )
self.add_input_file( "gi_to_gene_file", "NCBI's file who contains link between gene name and gene ID.", default=gi_to_gene_file )
self.add_output_file_list( "output_files", "Annotation files.", pattern='{basename_woext}.gff', items=blast_files, file_format="gff" )
self.add_output_file( "stderr", "The stderr output file.", filename="blast.stderr" )
def get_version(self):
cmd = [self.get_exec_path("blast2annot2.py"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr
def process(self):
# Filter and conversion
self.add_shell_execution(self.get_exec_path("blast2annot2.py") + " " + self.filter_options + " " + self.conversion_options + " -i $1 -o $2 ", cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr,
inputs=self.blast_files, outputs=self.output_files,
map=True)
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os, types
from jflow.component import Component
class ConcatenateFiles (Component):
"""
@summary : Concatenates files.
"""
def define_parameters(self, input_files, output_name, wext=False):
"""
@param input_files : [list] Paths to the files that will be concatenated.
@param output_name : [str] Filename for the output file (with extension : see param wext).
@param wext : [bool] True if the parameter output_name contains all extensions for the final file.
Otherwise the extensions are automatically retrieved from the input files.
"""
# Parameters
self.add_parameter( "output_name", "Filename for the output file (with extension : see param wext).", default=output_name, required=True )
self.add_parameter( "wext", "True if the parameter output_name contains all extensions for the final file. Otherwise th extensions are automatically retrieved from the input files.", default=wext, type="bool")
final_name = output_name
if not wext:
extensions = os.path.basename(input_files[0]).split(".")[1:]
final_name = output_name + "." + ".".join( extensions )
# Files
self.add_input_file_list( "input_files", "Files that will be concatenated.", default=input_files, required=True )
out_format = input_files.file_format if hasattr(input_files, 'file_format') else "any"
self.add_output_file( "output_file", "The concatenated file.", filename=final_name, file_format=out_format )
def process(self):
files_list_str = " ".join( self.input_files )
# If the files are not gzipped
if not self.output_file.endswith(".gz"):
self.add_shell_execution('cat ' + files_list_str + ' > $1', cmd_format='{EXE} {OUT}', outputs=self.output_file, includes=self.input_files)
# If the files are gzipped
else:
self.add_shell_execution('zcat ' + files_list_str + ' | gzip - > $1', cmd_format='{EXE} {OUT}', outputs=self.output_file, includes=self.input_files)
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from jflow.component import Component
from weaver.abstraction import Map
from weaver.function import ShellFunction, PythonFunction
def filter_orf( max_orf_nb, orf_file, filtered_file ):
"""
@summary : Limits the number of ORFs kept per sequence in an ORF file. It keeps the longest ones; if several ORFs have the same length, they are all kept.
@param max_orf_nb : [int] the maximum number of ORFs per sequence.
@param orf_file : [str] path to the processed file.
@param filtered_file : [str] path to the output.
"""
def write_ORF( orfs, max_length_orf_nb ):
sorted_orfs = sorted( orfs, key=lambda orf: -len(orf[2]) ) # sort ORFs by decreasing length
nb_length = 0
i = 0
# Count a new distinct length before writing, so ties on the last kept length are all written
while i < len(sorted_orfs):
# If the length of the current ORF differs from the previous ORF, a new distinct length starts
if i == 0 or len(sorted_orfs[i][2]) != len(sorted_orfs[i-1][2]):
nb_length += 1
if nb_length > max_length_orf_nb:
break
seqio.writefasta(out_fh, [sorted_orfs[i]])
i += 1
import jflow.seqio as seqio
# Main
previous_seq_id = None
orfs = list()
out_fh = open(filtered_file, "w")
reader = seqio.FastaReader( orf_file )
for id, desc, seq, qual in reader:
current_seq_id = id.split("_")[:-1]
if previous_seq_id is None:
previous_seq_id = current_seq_id
# If ORF target has changed
if current_seq_id != previous_seq_id:
write_ORF( orfs, int(max_orf_nb) )
# Next ORF
orfs = list()
previous_seq_id = current_seq_id
orfs.append( [id, desc, seq, qual] )
# Write last sequence
if len(orfs) != 0:
write_ORF( orfs, int(max_orf_nb) )
class GetORF (Component):
"""
@summary : Find and extract open reading frames (ORFs).
@see : http://emboss.sourceforge.net/apps/cvs/emboss/apps/getorf.html
"""
def define_parameters(self, fasta_files, max_orf_nb=None, min_size=30, max_size=1000000, find=1, table=0):
"""
@param fasta_files : [list] sequences files to process.
@param max_orf_nb : [int] the maximum number of ORFs per sequence (if several ORFs have the same length, they are all kept).
@param min_size : [int] minimum nucleotide size of ORF to report.
@param max_size : [int] maximum nucleotide size of ORF to report.
@param find : [int] There are two possible definitions of an open reading frame: it can either be a region that is free of STOP codons or a region that begins with a START codon and ends with a STOP codon. The last three options are probably only of interest to people who wish to investigate the statistical properties of the regions around potential START or STOP codons. The last option assumes that ORF lengths are calculated between two STOP codons.
0 (Translation of regions between STOP codons)
1 (Translation of regions between START and STOP codons)
2 (Nucleic sequences between STOP codons)
3 (Nucleic sequences between START and STOP codons)
4 (Nucleotides flanking START codons)
5 (Nucleotides flanking initial STOP codons)
6 (Nucleotides flanking ending STOP codons)
@param table : [int] Code to use.
0 Standard
1 Standard (with alternative initiation codons)
2 Vertebrate Mitochondrial
3 Yeast Mitochondrial
4 Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma
5 Invertebrate Mitochondrial
6 Ciliate Macronuclear and Dasycladacean
9 Echinoderm Mitochondrial
10 Euplotid Nuclear
11 Bacterial
12 Alternative Yeast Nuclear
13 Ascidian Mitochondrial
14 Flatworm Mitochondrial
15 Blepharisma Macronuclear
16 Chlorophycean Mitochondrial
21 Trematode Mitochondrial
22 Scenedesmus obliquus
23 Thraustochytrium Mitochondrial
"""
# Parameters
self.add_parameter( "min_size", "Minimum nucleotide size of ORF to report.", default=min_size, type=int )
self.add_parameter( "max_size", "Maximum nucleotide size of ORF to report.", default=max_size, type=int )
self.add_parameter( "find",
"ORF definition : 0 (Translation of regions between STOP codons) " +\
"1 (Translation of regions between START and STOP codons) " +\
"2 (Nucleic sequences between STOP codons) " +\
"3 (Nucleic sequences between START and STOP codons) " +\
"4 (Nucleotides flanking START codons) " +\
"5 (Nucleotides flanking initial STOP codons) " +\
"6 (Nucleotides flanking ending STOP codons).",
default=find, choices=[0, 1, 2, 3, 4, 5, 6], type=int )
self.add_parameter( "table",
"Code to use : 0 Standard " +\
"1 Standard (with alternative initiation codons) " +\
"2 Vertebrate Mitochondrial " +\
"3 Yeast Mitochondrial " +\
"4 Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma " +\
"5 Invertebrate Mitochondrial " +\
"6 Ciliate Macronuclear and Dasycladacean " +\
"9 Echinoderm Mitochondrial " +\
"10 Euplotid Nuclear " +\
"11 Bacterial " +\
"12 Alternative Yeast Nuclear " +\
"13 Ascidian Mitochondrial " +\
"14 Flatworm Mitochondrial " +\
"15 Blepharisma Macronuclear " +\
"16 Chlorophycean Mitochondrial " +\
"21 Trematode Mitochondrial " +\
"22 Scenedesmus obliquus " +\
"23 Thraustochytrium Mitochondrial ",
default=table, choices=[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23], type=int )
self.add_parameter( "max_orf_nb", "The maximum number of ORF by sequence (if several ORF have the same length they are all kept).", default=max_orf_nb, type=int )
# Files
self.add_input_file_list( "fasta_files", "Sequences files to process.", default=fasta_files, file_format="fasta", required=True )
self.add_output_file_list( "output_files", "The ORFs fasta files.", pattern='{basename_woext}_orf.fasta', items=fasta_files, file_format="fasta" )
self.add_output_file_list( "orf_stderr_files", "The ORFs stderr files.", pattern='{basename_woext}_orf.stderr', items=fasta_files, file_format="any" )
self.add_output_file( "stderr", "The stderr output file.", filename="filter_getorf.stderr" )
def process(self):
# Temp files
get_orf_outputs = self.output_files
if self.max_orf_nb != None:
get_orf_outputs= self.get_outputs('{basename_woext}_before_filter.tmp', self.fasta_files)
# Find ORF
self.add_shell_execution(self.get_exec_path("getorf") + " -sequence $1 -outseq $2 --minsize " + str(self.min_size) + " --maxsize " + str(self.max_size) +
" --find " + str(self.find) + " --table " + str(self.table) + " 2>> $3",
cmd_format='{EXE} {IN} {OUT}',inputs=self.fasta_files, outputs=[get_orf_outputs,self.orf_stderr_files],map=True)
# Filter
if self.max_orf_nb != None:
filter = PythonFunction( filter_orf, cmd_format='{EXE} ' + str(self.max_orf_nb) + ' {IN} {OUT} 2>> ' + self.stderr )
Map( filter, inputs=get_orf_outputs, outputs=self.output_files )
\ No newline at end of file
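A toy run of the length filter in write_ORF() above: with max_orf_nb = 2, every ORF whose length is one of the two longest distinct lengths is kept, ties included.

# Hypothetical ORFs as (id, description, sequence, quality) records.
orfs = [("orf_1", "", "ATGAAATGA", None),     # length 9
        ("orf_2", "", "ATGAAAAAATGA", None),  # length 12
        ("orf_3", "", "ATGCCCCCCTGA", None),  # length 12 (tie)
        ("orf_4", "", "ATGTGA", None)]        # length 6
sorted_orfs = sorted(orfs, key=lambda orf: -len(orf[2]))
kept, nb_length = [], 0
for i in range(len(sorted_orfs)):
    if i == 0 or len(sorted_orfs[i][2]) != len(sorted_orfs[i - 1][2]):
        nb_length += 1
    if nb_length > 2:  # max_orf_nb = 2
        break
    kept.append(sorted_orfs[i][0])
print kept  # ['orf_2', 'orf_3', 'orf_1']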
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.component import Component
def cleanDesc( in_path, out_path ):
"""
@summary : Replace invalid characters in sequence description ('>' => '%3E').
@param in_path : [str] path to the file to convert.
@param out_path : [str] path to the output file.
"""
import jflow.seqio as seqio
reader = seqio.SequenceReader( in_path )
out_fh = open( out_path, "w" )
for id, desc, seq, qual in reader:
desc = desc.replace('>', '%3E')
# If sequences file is a FASTQ
if reader.__class__.__name__ == "FastqReader":
seqio.writefastq(out_fh, [[id, desc, seq, qual]])
# If sequences file is a FASTA
elif reader.__class__.__name__ == "FastaReader":
seqio.writefasta(out_fh, [[id, desc, seq, qual]])
out_fh.close()
def toGFF( in_path, out_path ):
"""
@summary : Write a GFF file from the RepeatMasker output file.
@param in_path : [str] path to the file to convert.
@param out_path : [str] path to the output file.
"""
from jflow.featureio import GFF3Record, GFF3IO
header = True
fh_output = GFF3IO( out_path , "w" )
fh_rm = open( in_path )
for line in fh_rm:
if line.strip() == "":
header = False
elif not header:
# Line example : ' 11 31.6 1.7 1.6 CHMP7_CRIGR.1.1 1238 1297 (2723) + (GATGAA)n Simple_repeat 1 61 (0) 1'
fields = line.strip().split()
record = GFF3Record()
record.seq_id = fields[4]
record.source = "RepeatMasker"
if fields[10] == "Low_complexity" :
record.type = "low_complexity_region"
elif fields[10] == "Simple_repeat" :
record.type = "repeat_region"
else:
record.type = "similarity"
record.start = fields[5]
record.end = fields[6]
record.score = fields[0]
record.strand = fields[8] if fields[8] == "+" else "-"
record.phase = "."
record.attributes = dict()
record.addToAttribute( "sw_score", fields[0] )
record.addToAttribute( "substitution_perc", fields[1] )
record.addToAttribute( "deletion_perc", fields[2] )
record.addToAttribute( "insertion_perc", fields[3] )
record.addToAttribute( "nucleotid_after_match", fields[7][1:-1] )
record.addToAttribute( "sbjct_start", fields[11] )
record.addToAttribute( "sbjct_end", fields[12] )
record.addToAttribute( "desc", fields[10] + " " + fields[9] )
fh_output.write( record )
fh_rm.close()
class RepeatMasker (Component):
"""
@summary : Annotates and masks highly repetitive DNA sequences on fasta files.
@see : http://www.repeatmasker.org/
"""
def define_parameters(self, input_files, species, engine="crossmatch", speed="default", gccalc=True, custom_library=None):
"""
@param input_files : [list] fasta files to annotate and mask.
@param species : [str] query species for repeatmasker.
@param engine : [str] engine used to search the sequences.
- 'crossmatch' is slower but often more sensitive than the other engines.
- 'abblast' (formerly known as WU-BLAST) is very fast with a slight cost in sensitivity.
- 'rmblast' is a RepeatMasker compatible version of the NCBI Blast tool suite.
- 'hmmer' uses the new nhmmer program to search sequences against the new Dfam database ( human only ).
@param speed : [str] sensitivity of your search. The more sensitive the longer the processing time.
- 'slow' is 0-5% more sensitive, 2-3 times slower than default.
- 'default'.
- 'quick' is 5-10% less sensitive, 2-5 times faster than default.
- 'rush' is about 10% less sensitive, 4->10 times faster than default.
@param gccalc : [bool] force RepeatMasker to calculate the GC content even for batch files/small seqs.
@param custom_library : [str] path to a custom library of lineage-specific repeats.
"""
# Parameters
self.add_parameter( "species", "The species of queries.", default=species, required=True )
self.add_parameter( "engine", "Engine use when searching the sequence", default=engine, choices=["crossmatch", "abblast", "rmblast", "hmmer"] )
self.add_parameter( "speed", "Sensitivity of your search. The more sensitive the longer the processing time.", default=speed, choices=["slow", "default", "quick", "rush"] )
self.add_parameter( "gccalc", "Force RepeatMasker to calculate the GC content even for batch files/small seqs.", default=gccalc, type="bool" )
self.add_parameter( "custom_library", "The custom annotate lineage specific repeats.", default=custom_library )
# Options
self.options = "-engine " + self.engine
speed_flags = { "slow":"-s", "default":"", "quick":"-q", "rush":" -qq" }
self.options += " " + speed_flags[self.speed]
if self.gccalc:
self.options += " -gccalc"
if self.custom_library != None:
self.options += " -lib " + self.custom_library
else:
self.options += ' -species "' + self.species + '"'
# Files
self.add_input_file_list( "input_files", "Fasta file to annot and mask.", default=input_files, file_format="fasta", required=True )
self.add_output_file_list( "cleaned_fasta", "Fasta file to annot and mask without descriptions.", pattern="{basename_woext}.fasta", items=input_files, file_format="fasta" )
self.add_output_file_list( "output_files", "The sequences annotations : location of repeats and interspersed.", pattern="{basename_woext}.gff", items=input_files, file_format="gff" )
self.add_output_file_list( "masked", "The submitted file in which all recognized interspersed or simple repeats have been masked.", pattern="{basename_woext}.fasta.masked", items=input_files, file_format="fasta" )
self.add_output_file_list( "summary", "A table summarizing the repeat content of the queries sequences.", pattern="{basename_woext}.fasta.tbl", items=input_files )
self.add_output_file_list( "aln", "Alignments of the queries with the matching repeats.", pattern="{basename_woext}.fasta.cat", items=input_files )
self.add_output_file( "stderr", "The stderr output file.", filename="repeatmasker.stderr" )
def get_version(self):
cmd = self.get_exec_path("repeatmasker") + " -v"
p = os.popen(cmd)
return p.readline().strip().split()[2]
def process(self):
# Clean description
#cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr ,
self.add_python_execution(cleanDesc,
inputs=self.input_files, outputs=self.cleaned_fasta , map=True)
# Process
RM_output = self.get_outputs( '{basename}.out', self.cleaned_fasta )
self.add_shell_execution(self.get_exec_path("repeatmasker") + " -dir " + self.output_directory + " " + self.options + " $1", cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr,
inputs=self.cleaned_fasta, outputs=[RM_output, self.masked, self.summary, self.aln], map=True)
# To GFF3
#cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr ,
self.add_python_execution(toGFF,
inputs=RM_output, outputs=self.output_files, map=True)
\ No newline at end of file
File deleted
File deleted
workflows/transcript/data/GGallus.png (131 KiB)
File deleted
grep -f 1.txt /work2/project/sigenae/Project_RNA_Refinery/runDrapOut/ERR348568/e-RMBT/ERR348568_R12_to_all_contigs.4633590.idxstat | awk '{print "ERR348568_" $1 "\t" $2 "\t" $3 "\t" $4}' >contigs.idxstat
grep -f 2.txt /work2/project/sigenae/Project_RNA_Refinery/runDrapOut/ERR348580/e-RMBT/ERR348580_R12_to_all_contigs.4633611.idxstat | awk '{print "ERR348580_" $1 "\t" $2 "\t" $3 "\t" $4}' >>contigs.idxstat
grep -f 3.txt /work2/project/sigenae/Project_RNA_Refinery/runDrapOut/SRR924585/e-RMBT/SRR924585_R12_to_all_contigs.107829.idxstat | awk '{print "SRR924585_" $1 "\t" $2 "\t" $3 "\t" $4}' >>contigs.idxstat
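The same selection expressed in Python for readability: keep the idxstat rows whose contig name appears in the per-run list and prefix the name with the run accession. This sketch assumes the *.txt lists hold exact contig names (grep -f does pattern matching) and uses shortened, illustrative paths.

def prefix_idxstat(list_file, idxstat_file, accession, out_fh):
    # contig names to keep, one per line
    wanted = set(line.strip() for line in open(list_file) if line.strip())
    for line in open(idxstat_file):
        fields = line.split()
        if fields and fields[0] in wanted:
            # idxstat columns: name, length, mapped, unmapped
            out_fh.write(accession + "_" + "\t".join(fields[:4]) + "\n")

out = open("contigs.idxstat", "w")
prefix_idxstat("1.txt", "ERR348568_R12_to_all_contigs.idxstat", "ERR348568", out)
prefix_idxstat("2.txt", "ERR348580_R12_to_all_contigs.idxstat", "ERR348580", out)
prefix_idxstat("3.txt", "SRR924585_R12_to_all_contigs.idxstat", "SRR924585", out)
out.close()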
ERR348568_CL611Contig1_1 822 406 106
ERR348568_CL8315Contig1_1 252 91 47
ERR348568_CL13894Contig1_1 478 174 53
ERR348568_CL31035Contig1_1 263 33 8
ERR348568_CL31587Contig1_1 283 121 12
ERR348568_CL34628Contig1_1 335 169 28
ERR348568_CL42562Contig1_1 507 318 109
ERR348568_CL32575Contig1_1 438 133 52
ERR348568_CL44950Contig1_1 429 275 53
ERR348568_CL45462Contig1_1 243 50 7
ERR348568_CL15868Contig1_1 309 174 31
ERR348568_CL21513Contig1_1 386 134 9
ERR348568_CL32558Contig1_1 800 679 167
ERR348568_CL52570Contig1_1 339 162 46
ERR348568_CL3054Contig1_1 690 723 182
ERR348568_CL8727Contig1_1 290 87 20
ERR348568_CL21771Contig1_1 264 60 14
ERR348568_CL40879Contig1_1 421 146 72
ERR348568_CL41778Contig1_1 439 88 8
ERR348568_CL51547Contig1_1 570 567 93
ERR348568_k25_Locus_696_Transcript_1_1 336 71 25
ERR348568_k25_Locus_23836_Transcript_1_1 298 147 88
ERR348568_k25_Locus_28135_Transcript_1_1 400 80 12
ERR348568_k25_Locus_32834_Transcript_1_1 455 71 11
ERR348568_k25_Locus_34410_Transcript_1_1 290 256 83
ERR348568_k25_Locus_65250_Transcript_1_1 317 186 67
ERR348568_k25_Locus_69608_Transcript_1_1 322 183 29
ERR348568_k25_Locus_70988_Transcript_1_1 642 117 1
ERR348568_k25_Locus_87199_Transcript_1_1 372 180 46
ERR348568_k25_Locus_95224_Transcript_1_1 381 251 57
ERR348568_k25_Locus_99727_Transcript_1_1 428 229 97
ERR348568_k25_Locus_116667_Transcript_1_1 258 189 141
ERR348568_k25_Locus_127120_Transcript_1_1 333 111 41
ERR348568_k25_Locus_143441_Transcript_1_1 426 403 120
ERR348568_k25_Locus_145789_Transcript_1_1 287 136 58
ERR348568_k25_Locus_162634_Transcript_1_1 461 226 92
ERR348568_k25_Locus_169812_Transcript_1_1 304 78 19
ERR348568_k25_Locus_187584_Transcript_1_1 324 192 62
ERR348568_k25_Locus_239307_Transcript_1_1 423 78 33
ERR348568_k25_Locus_246769_Transcript_1_1 288 150 36
ERR348568_k25_Locus_263122_Transcript_1_1 421 227 147
ERR348568_k31_Locus_25050_Transcript_1_1 315 439 55
ERR348568_k31_Locus_55450_Transcript_1_1 330 217 37
ERR348568_k31_Locus_77275_Transcript_1_1 499 333 102
ERR348568_k31_Locus_112994_Transcript_1_1 347 138 33
ERR348568_k31_Locus_137768_Transcript_1_1 397 254 82
ERR348568_k31_Locus_216922_Transcript_1_1 420 93 54
ERR348568_k37_Locus_32424_Transcript_1_1 342 68 28
ERR348568_k37_Locus_38530_Transcript_1_1 379 359 112
ERR348568_k37_Locus_47061_Transcript_1_1 414 113 53
ERR348568_k37_Locus_49711_Transcript_1_1 515 586 39
ERR348568_k37_Locus_62110_Transcript_1_1 716 284 72
ERR348568_k37_Locus_62770_Transcript_1_1 505 320 84
ERR348568_k43_Locus_29526_Transcript_1_1 306 212 210
ERR348568_k43_Locus_53591_Transcript_1_1 365 61 42
ERR348568_k43_Locus_82264_Transcript_1_1 573 817 100
ERR348568_k43_Locus_173325_Transcript_1_1 328 203 69
ERR348568_k49_Locus_93256_Transcript_1_1 343 183 194
ERR348568_k49_Locus_134698_Transcript_1_1 322 226 188
ERR348568_k49_Locus_170475_Transcript_1_1 453 417 43
ERR348568_k49_Locus_174124_Transcript_1_1 344 175 44
ERR348568_k55_Locus_128685_Transcript_1_1 257 51 20
ERR348580_CL32568Contig1_1 341 39 16
ERR348580_CL6357Contig1_1 617 374 85
ERR348580_CL10472Contig1_1 375 150 41
ERR348580_CL23023Contig1_1 528 274 49
ERR348580_k25_Locus_17508_Transcript_1_1 357 69 27
ERR348580_k25_Locus_55720_Transcript_1_1 364 43 35
ERR348580_k25_Locus_57470_Transcript_1_1 385 99 23
ERR348580_k25_Locus_73252_Transcript_1_1 298 50 16
ERR348580_k25_Locus_92103_Transcript_1_1 315 32 9
ERR348580_k25_Locus_93808_Transcript_1_1 340 24 19
ERR348580_k25_Locus_97710_Transcript_1_1 348 35 14
ERR348580_k25_Locus_120502_Transcript_1_1 335 342 59
ERR348580_k25_Locus_157841_Transcript_1_1 270 24 8
ERR348580_k31_Locus_526_Transcript_3_1 722 85 20
ERR348580_k31_Locus_96163_Transcript_1_1 400 55 32
ERR348580_k37_Locus_91980_Transcript_1_1 293 62 18
ERR348580_k43_Locus_2327_Transcript_1_1 388 258 52
ERR348580_k43_Locus_41277_Transcript_1_1 282 97 24
ERR348580_k43_Locus_82247_Transcript_1_1 297 23 9
ERR348580_k49_Locus_72132_Transcript_1_1 280 97 31
SRR924585_CL120Contig1_1 1385 1587 32
SRR924585_CL8784Contig1_1 372 359 100
SRR924585_CL15501Contig1_1 723 668 158
SRR924585_CL45199Contig1_1 337 119 59
SRR924585_CL47810Contig1_1 344 111 27
SRR924585_CL48754Contig1_1 364 72 26
SRR924585_CL47409Contig1_1 279 172 50
SRR924585_CL31954Contig1_1 358 125 119
SRR924585_CL3692Contig1_1 566 306 70
SRR924585_CL10604Contig1_1 462 387 17
SRR924585_CL17559Contig1_1 223 284 181
SRR924585_CL19990Contig1_1 281 143 39
SRR924585_k25_Locus_30129_Transcript_1_1 425 257 239
SRR924585_k25_Locus_89365_Transcript_1_1 615 506 61
SRR924585_k25_Locus_115467_Transcript_1_1 321 133 49
SRR924585_k25_Locus_136345_Transcript_1_1 389 278 95
SRR924585_k25_Locus_157099_Transcript_1_1 392 323 36
SRR924585_k25_Locus_185389_Transcript_1_1 370 134 18
SRR924585_k25_Locus_191871_Transcript_1_1 432 322 271
SRR924585_k25_Locus_232686_Transcript_1_1 434 194 88
SRR924585_k25_Locus_239026_Transcript_1_1 354 143 57
SRR924585_k25_Locus_271328_Transcript_1_1 276 117 22
SRR924585_k31_Locus_4861_Transcript_1_1 721 532 95
SRR924585_k31_Locus_48394_Transcript_1_1 553 331 88
SRR924585_k31_Locus_164810_Transcript_1_1 426 311 112
SRR924585_k37_Locus_22437_Transcript_1_1 609 720 97
SRR924585_k37_Locus_49276_Transcript_1_1 282 93 43
SRR924585_k37_Locus_80571_Transcript_1_1 317 122 40
SRR924585_k37_Locus_134248_Transcript_1_1 395 111 16
SRR924585_k37_Locus_173336_Transcript_1_1 476 219 20
SRR924585_k37_Locus_222027_Transcript_1_1 312 236 105
SRR924585_k37_Locus_247262_Transcript_1_1 383 92 64
SRR924585_k43_Locus_41242_Transcript_1_1 613 711 111
SRR924585_k43_Locus_107404_Transcript_1_1 529 257 143
SRR924585_k43_Locus_238949_Transcript_1_1 364 186 68
SRR924585_k49_Locus_160347_Transcript_1_1 387 130 36
ERR348580_k43_Locus_82247_Transcript_1_1
SRR924585_k25_Locus_89365_Transcript_1_1
ERR348568_k37_Locus_32424_Transcript_1_1
SRR924585_CL47409Contig1_1
ERR348568_k55_Locus_128685_Transcript_1_1
ERR348568_k25_Locus_87199_Transcript_1_1
ERR348568_CL51547Contig1_1
ERR348580_k25_Locus_17508_Transcript_1_1
ERR348580_k49_Locus_72132_Transcript_1_1
ERR348568_CL45462Contig1_1
ERR348568_CL21771Contig1_1
ERR348568_CL34628Contig1_1
ERR348568_CL52570Contig1_1
ERR348568_CL32575Contig1_1
ERR348580_CL32568Contig1_1
ERR348568_k43_Locus_173325_Transcript_1_1
SRR924585_k43_Locus_238949_Transcript_1_1
ERR348580_k25_Locus_57470_Transcript_1_1
ERR348568_k25_Locus_187584_Transcript_1_1
ERR348568_k25_Locus_239307_Transcript_1_1
SRR924585_CL15501Contig1_1
SRR924585_k25_Locus_157099_Transcript_1_1
ERR348568_k25_Locus_127120_Transcript_1_1
ERR348568_CL13894Contig1_1
ERR348580_CL6357Contig1_1
SRR924585_CL45199Contig1_1
ERR348580_k25_Locus_92103_Transcript_1_1
ERR348568_k25_Locus_696_Transcript_1_1
ERR348580_CL23023Contig1_1
ERR348568_CL8727Contig1_1
ERR348568_k25_Locus_70988_Transcript_1_1
ERR348568_k37_Locus_47061_Transcript_1_1
ERR348568_k31_Locus_216922_Transcript_1_1
ERR348568_k31_Locus_77275_Transcript_1_1
ERR348580_CL10472Contig1_1
ERR348568_k25_Locus_169812_Transcript_1_1
ERR348568_k25_Locus_116667_Transcript_1_1
SRR924585_k31_Locus_164810_Transcript_1_1
ERR348568_k43_Locus_53591_Transcript_1_1
SRR924585_CL48754Contig1_1
ERR348568_CL40879Contig1_1
ERR348568_k25_Locus_32834_Transcript_1_1
ERR348568_k25_Locus_69608_Transcript_1_1
ERR348568_k49_Locus_170475_Transcript_1_1
ERR348568_CL31035Contig1_1
SRR924585_CL10604Contig1_1
SRR924585_k31_Locus_48394_Transcript_1_1
ERR348568_k37_Locus_38530_Transcript_1_1
ERR348580_k25_Locus_73252_Transcript_1_1
ERR348568_CL41778Contig1_1
ERR348568_k25_Locus_65250_Transcript_1_1
ERR348568_CL44950Contig1_1
ERR348568_k37_Locus_49711_Transcript_1_1
ERR348568_k25_Locus_99727_Transcript_1_1
ERR348568_CL611Contig1_1
SRR924585_k37_Locus_247262_Transcript_1_1
ERR348568_k31_Locus_55450_Transcript_1_1
ERR348580_k25_Locus_93808_Transcript_1_1
ERR348568_CL8315Contig1_1
ERR348568_k31_Locus_25050_Transcript_1_1
ERR348568_k43_Locus_82264_Transcript_1_1
ERR348568_k43_Locus_29526_Transcript_1_1
ERR348580_k43_Locus_2327_Transcript_1_1
ERR348580_k31_Locus_526_Transcript_3_1
SRR924585_k25_Locus_136345_Transcript_1_1
SRR924585_CL31954Contig1_1
SRR924585_k37_Locus_80571_Transcript_1_1
SRR924585_k37_Locus_22437_Transcript_1_1
SRR924585_k37_Locus_173336_Transcript_1_1
SRR924585_k25_Locus_30129_Transcript_1_1
SRR924585_k43_Locus_41242_Transcript_1_1
SRR924585_k25_Locus_239026_Transcript_1_1
SRR924585_CL8784Contig1_1
ERR348568_k25_Locus_162634_Transcript_1_1
ERR348568_k25_Locus_145789_Transcript_1_1
SRR924585_k25_Locus_191871_Transcript_1_1
SRR924585_k25_Locus_185389_Transcript_1_1
ERR348568_k31_Locus_112994_Transcript_1_1
ERR348580_k25_Locus_120502_Transcript_1_1
ERR348568_k25_Locus_246769_Transcript_1_1
ERR348568_k25_Locus_23836_Transcript_1_1
ERR348568_CL31587Contig1_1
ERR348568_k25_Locus_263122_Transcript_1_1
ERR348580_k25_Locus_55720_Transcript_1_1
SRR924585_k43_Locus_107404_Transcript_1_1
ERR348568_k25_Locus_95224_Transcript_1_1
SRR924585_k37_Locus_49276_Transcript_1_1
ERR348568_k49_Locus_134698_Transcript_1_1
SRR924585_k37_Locus_134248_Transcript_1_1
ERR348580_k31_Locus_96163_Transcript_1_1
ERR348580_k43_Locus_41277_Transcript_1_1
ERR348580_k25_Locus_97710_Transcript_1_1
ERR348568_k49_Locus_174124_Transcript_1_1
ERR348568_CL32558Contig1_1
ERR348568_CL42562Contig1_1
ERR348580_k25_Locus_157841_Transcript_1_1
SRR924585_k25_Locus_232686_Transcript_1_1
SRR924585_k31_Locus_4861_Transcript_1_1
SRR924585_k25_Locus_115467_Transcript_1_1
SRR924585_k25_Locus_271328_Transcript_1_1
ERR348568_k25_Locus_34410_Transcript_1_1
ERR348568_k25_Locus_143441_Transcript_1_1
ERR348568_k31_Locus_137768_Transcript_1_1
SRR924585_CL120Contig1_1
SRR924585_CL19990Contig1_1
ERR348580_k37_Locus_91980_Transcript_1_1
ERR348568_k37_Locus_62770_Transcript_1_1
SRR924585_k37_Locus_222027_Transcript_1_1
ERR348568_k25_Locus_28135_Transcript_1_1
SRR924585_k49_Locus_160347_Transcript_1_1
SRR924585_CL3692Contig1_1
SRR924585_CL47810Contig1_1
ERR348568_CL3054Contig1_1
SRR924585_CL17559Contig1_1
ERR348568_CL21513Contig1_1
ERR348568_k49_Locus_93256_Transcript_1_1
ERR348568_CL15868Contig1_1
ERR348568_k37_Locus_62110_Transcript_1_1