Commit 89a8c74e authored by Jerome Mariette's avatar Jerome Mariette
Browse files

add --qual option to 16Scleaner

parent 9cbfa475
......@@ -14,11 +14,14 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# v1.1 (06/2011) : Add the option --qual option to clean and trim quality file as well
# v1.0 (09/2010) : 16Scleaner first versio
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__version__ = '1.1'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
......@@ -178,7 +181,7 @@ def filter_reads (seqs, options):
seq_to_check = reads_record
if is_reversed :
seq_to_check.seq = seq_to_check.seq.reverse_complement()
seq_to_check = seq_to_check.reverse_complement(id=True, description=True)
if seq_to_check.seq[start:stop].count("N") + seq_to_check.seq[start:stop].count("n") > 0:
reads_ok[i] = 1
......@@ -220,7 +223,7 @@ def filter_reads (seqs, options):
if reads_ok[i] == 0:
if options.switch_strand and is_reversed :
reads_record.seq = reads_record.seq.reverse_complement()
reads_record = reads_record.reverse_complement(id=True, description=True)
reversed_sequence += 1
log.write(reads_record.id + " information -> Sequence has been reversed and complemented.\n")
if fwd_present and rvs_present :
......@@ -236,8 +239,9 @@ def get_seqs (options):
Converts input seqs in a BioPython seq table
@param options : the options asked by the user
"""
# First get fasta input files
# First get fasta or/and qual input files
qual_file = ""
if options.format == "sff":
sff_file = options.input
......@@ -258,8 +262,12 @@ def get_seqs (options):
format = "fasta"
if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file):
#First extract the sff file to fasta, qual and xml file
cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file
cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file
os.system(cmd)
else :
log = open(options.log_file, "a+")
log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n")
log.close()
elif options.format == "fasta" or options.format == "fastq":
format = options.format
fasta_file = options.input
......@@ -267,14 +275,32 @@ def get_seqs (options):
print "Error : " + options.format + " is not a supported format!"
sys.exit(1)
if options.qual_file:
qual_file = options.qual_file
# If we got a quality file
if qual_file :
quals = {}
# If the file is gziped
if qual_file.endswith(".gz"):
for qual in SeqIO.parse(gzip.open(qual_file), "qual") :
quals[qual.id] = qual
else :
for qual in SeqIO.parse(open(qual_file), "qual") :
quals[qual.id] = qual
# If the fasta file is gziped
if fasta_file.endswith(".gz"):
seqs = []
for record in SeqIO.parse(gzip.open(fasta_file), format) :
if qual_file :
record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
seqs.append(record)
else :
seqs = []
for record in SeqIO.parse(open(fasta_file), format) :
if qual_file :
record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
seqs.append(record)
# Clean temporary files
......@@ -328,6 +354,8 @@ if __name__ == '__main__':
igroup = OptionGroup(parser, "Input files options","")
igroup.add_option("-i", "--in", dest="input",
help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE")
igroup.add_option("-q", "--qual", dest="qual_file",
help="The quality file to use if input file is fasta", metavar="FILE")
igroup.add_option("-m", "--format", dest="format",
help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff")
parser.add_option_group(igroup)
......@@ -440,6 +468,11 @@ if __name__ == '__main__':
fasta = open(output, "w")
SeqIO.write(seqs, fasta, "fasta")
fasta.close()
if options.format == "sff" or options.qual_file:
output = os.path.join(options.output, os.path.splitext(os.path.basename(options.input))[0] + ".clean.qual")
qual = open(output, "w")
SeqIO.write(seqs, qual, "qual")
qual.close()
# 4 - Display the result summary
log.write("## header (global) : nb_reads_at_begining\tnb_reads_after_clean_up\tlength\tns\tns_in_area\thomopolymers\ttriming\tfull_sequence_length\tfull_sequence\treversed_sequence\n")
......
......@@ -3,7 +3,7 @@ classification = sequence / cleaning
[parameters]
;The input format (if using list, all inputs have to be in the same format)
$;FORMAT$; = sff
$;FORMAT$; = [ sff|fastq|fasta ]
; Choose one or more between :
; --clean-length : Filter reads with a legnth in between [x:y]
; --clean-ns : Filter reads with to much N
......@@ -43,6 +43,8 @@ $;INPUT_FILE$; =
$;INPUT_DIRECTORY$; =
;; the following is only used when iterating over an INPUT_DIRECTORY
$;INPUT_EXTENSION$; = sff
; the quality file list if fasta is the input format
$;QUAL_FILE_LIST$; =
; The run config file the analyse belongs to
$;NG6CFG_FILE$; =
; OR The project id the analyse belongs to
......@@ -53,6 +55,7 @@ $;OUTPUT_TOKEN$; = default
$;OUTPUT_DIRECTORY$; = $;REPOSITORY_ROOT$;/output_repository/$;COMPONENT_NAME$;/$;PIPELINEID$;_$;OUTPUT_TOKEN$;
$;FASTQ_OUTPUT_LIST$; = $;OUTPUT_DIRECTORY$;/$;COMPONENT_NAME$;.fastq.list
$;FASTA_OUTPUT_LIST$; = $;OUTPUT_DIRECTORY$;/$;COMPONENT_NAME$;.fasta.list
$;QUAL_OUTPUT_LIST$; = $;OUTPUT_DIRECTORY$;/$;COMPONENT_NAME$;.qual.list
$;LOG_OUTPUT_LIST$; = $;OUTPUT_DIRECTORY$;/$;COMPONENT_NAME$;.log.list
[component]
......
......@@ -8,11 +8,63 @@
<name>16Scleaner</name>
<state>incomplete</state>
<executable>python $;BIN_DIR$;/16Scleaner.py</executable>
<arg>-i "$;I_FILE_PATH$;" -o "$;OUTPUT_DIRECTORY$;/$;ITERATOR_NAME$;/g$;GROUP_NUMBER$;/" -m "$;FORMAT$;" $;CLEANING_OPTIONS$; -k "$;MIN_FULL_LENGTH_LENGTH$;" -x "$;MIN_LENGTH$;" -y "$;MAX_LENGTH$;" -p "$;NS_PERCENT$;" -f $;FWD_PRIMER$; -r $;RVRS_PRIMER$; -s $;RVRS_MINMATCH$; -b $;FWD_MINMATCH$; -w $;HOMOPOLYLER_LENGTH$;</arg>
<arg>$;CLEANING_OPTIONS$;</arg>
<param>
<key>--in</key>
<value>"$;I_FILE_PATH$;"</value>
</param>
<param>
<key>--qual</key>
<value>"$;SECOND_FILE_PATH$;"</value>
</param>
<param>
<key>--log</key>
<value>$;I_FILE_BASE$;.log</value>
</param>
<param>
<key>--format</key>
<value>"$;FORMAT$;"</value>
</param>
<param>
<key>--out</key>
<value>"$;OUTPUT_DIRECTORY$;/$;ITERATOR_NAME$;/g$;GROUP_NUMBER$;/"</value>
</param>
<param>
<key>--full-length-min-size</key>
<value>"$;MIN_FULL_LENGTH_LENGTH$;"</value>
</param>
<param>
<key>--min-length</key>
<value>"$;MIN_LENGTH$;"</value>
</param>
<param>
<key>--max-length</key>
<value>"$;MAX_LENGTH$;"</value>
</param>
<param>
<key>--ns-percent</key>
<value>"$;NS_PERCENT$;"</value>
</param>
<param>
<key>--forward</key>
<value>"$;FWD_PRIMER$;"</value>
</param>
<param>
<key>--reverse</key>
<value>"$;RVRS_PRIMER$;"</value>
</param>
<param>
<key>--rvrs-minmatch</key>
<value>"$;RVRS_MINMATCH$;"</value>
</param>
<param>
<key>--fwd-minmatch</key>
<value>"$;FWD_MINMATCH$;"</value>
</param>
<param>
<key>--homopolymers-length</key>
<value>"$;HOMOPOLYLER_LENGTH$;"</value>
</param>
<param>
<key>stdout</key>
<value>$;OUTPUT_DIRECTORY$;/$;ITERATOR_NAME$;/g$;GROUP_NUMBER$;/$;I_FILE_BASE$;.$;COMPONENT_NAME$;.stdout</value>
......
......@@ -19,10 +19,17 @@
<executable>mkdir</executable>
<arg>-p -m 777 $;TMP_DIR$;</arg>
</command>
<command>
<type>RunUnixCommand</type>
<name>create 16Scleaner iterator list</name>
<state>incomplete</state>
<executable>$;BIN_DIR$;/create_2files_iterator_list</executable>
<arg>--input_file=$;INPUT_FILE$; --input_file_list=$;INPUT_FILE_LIST$; --second_file_list=$;QUAL_FILE_LIST$; --input_directory=$;INPUT_DIRECTORY$; --input_directory_extension=$;INPUT_EXTENSION$; --output_iter_list=$;OUTPUT_DIRECTORY$;/16Scleaner_iterator.list</arg>
</command>
<!--Processing-->
<!--Iterator-->
<INCLUDE file="$;DOCS_DIR$;/file_iterator_template.xml" keys="$;ITERATOR_NAME$;=ITERATOR1,$;ITERATOR_XML$;=ITERATOR1_XML">
<!--Postprocessing-->
<INCLUDE file="$;DOCS_DIR$;/iterator_template.xml" keys="$;ITERATOR_NAME$;=ITERATOR1,$;ITERATOR_XML$;=ITERATOR1_XML,$;ITERATOR_LIST$;=$;OUTPUT_DIRECTORY$;/16Scleaner_iterator.list"/>
<!--Post-Processing-->
<command>
<type>RunUnixCommand</type>
<name>create fastq list</name>
......@@ -59,6 +66,24 @@
<value>$;FASTA_OUTPUT_LIST$;</value>
</param>
</command>
<command>
<type>RunUnixCommand</type>
<name>create qual list</name>
<state>incomplete</state>
<executable>$;BIN_DIR$;/create_list_file</executable>
<param>
<key>--directory</key>
<value>$;OUTPUT_DIRECTORY$;</value>
</param>
<param>
<key>--regex</key>
<value>".*\.qual"</value>
</param>
<param>
<key>--output_list</key>
<value>$;QUAL_OUTPUT_LIST$;</value>
</param>
</command>
<command>
<type>RunUnixCommand</type>
<name>create log list</name>
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment