Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Maintenance - Mise à jour mensuelle Lundi 6 Février entre 7h00 et 9h00
Open sidebar
genotoul-bioinfo
ng6
Commits
89a8c74e
Commit
89a8c74e
authored
Jun 10, 2011
by
Jerome Mariette
Browse files
add --qual option to 16Scleaner
parent
9cbfa475
Changes
4
Hide whitespace changes
Inline
Side-by-side
metagenomic/bin/16Scleaner.py
View file @
89a8c74e
...
...
@@ -14,11 +14,14 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# v1.1 (06/2011) : Add the option --qual option to clean and trim quality file as well
# v1.0 (09/2010) : 16Scleaner first versio
#
__author__
=
'Plateforme bioinformatique Midi Pyrenees'
__copyright__
=
'Copyright (C) 2009 INRA'
__license__
=
'GNU General Public License'
__version__
=
'1.
0
'
__version__
=
'1.
1
'
__email__
=
'support.genopole@toulouse.inra.fr'
__status__
=
'beta'
...
...
@@ -178,7 +181,7 @@ def filter_reads (seqs, options):
seq_to_check
=
reads_record
if
is_reversed
:
seq_to_check
.
seq
=
seq_to_check
.
seq
.
reverse_complement
()
seq_to_check
=
seq_to_check
.
reverse_complement
(
id
=
True
,
description
=
True
)
if
seq_to_check
.
seq
[
start
:
stop
].
count
(
"N"
)
+
seq_to_check
.
seq
[
start
:
stop
].
count
(
"n"
)
>
0
:
reads_ok
[
i
]
=
1
...
...
@@ -220,7 +223,7 @@ def filter_reads (seqs, options):
if
reads_ok
[
i
]
==
0
:
if
options
.
switch_strand
and
is_reversed
:
reads_record
.
seq
=
reads_record
.
seq
.
reverse_complement
()
reads_record
=
reads_record
.
reverse_complement
(
id
=
True
,
description
=
True
)
reversed_sequence
+=
1
log
.
write
(
reads_record
.
id
+
" information -> Sequence has been reversed and complemented.
\n
"
)
if
fwd_present
and
rvs_present
:
...
...
@@ -236,8 +239,9 @@ def get_seqs (options):
Converts input seqs in a BioPython seq table
@param options : the options asked by the user
"""
# First get fasta input files
# First get fasta or/and qual input files
qual_file
=
""
if
options
.
format
==
"sff"
:
sff_file
=
options
.
input
...
...
@@ -258,8 +262,12 @@ def get_seqs (options):
format
=
"fasta"
if
not
os
.
path
.
isfile
(
fasta_file
)
or
not
os
.
path
.
isfile
(
qual_file
)
or
not
os
.
path
.
isfile
(
xml_file
):
#First extract the sff file to fasta, qual and xml file
cmd
=
"sff_extract.py -c -s "
+
fasta_file
+
" -q "
+
qual_file
+
" -x "
+
xml_file
+
" "
+
sff_file
cmd
=
"sff_extract.py -c -s "
+
fasta_file
+
" -q "
+
qual_file
+
" -x "
+
xml_file
+
" "
+
sff_file
+
" >> "
+
options
.
output
+
"/"
+
options
.
log_file
os
.
system
(
cmd
)
else
:
log
=
open
(
options
.
log_file
,
"a+"
)
log
.
write
(
fasta_file
+
", "
+
qual_file
+
", "
+
xml_file
+
" already exist, working on existing files
\n
"
)
log
.
close
()
elif
options
.
format
==
"fasta"
or
options
.
format
==
"fastq"
:
format
=
options
.
format
fasta_file
=
options
.
input
...
...
@@ -267,14 +275,32 @@ def get_seqs (options):
print
"Error : "
+
options
.
format
+
" is not a supported format!"
sys
.
exit
(
1
)
if
options
.
qual_file
:
qual_file
=
options
.
qual_file
# If we got a quality file
if
qual_file
:
quals
=
{}
# If the file is gziped
if
qual_file
.
endswith
(
".gz"
):
for
qual
in
SeqIO
.
parse
(
gzip
.
open
(
qual_file
),
"qual"
)
:
quals
[
qual
.
id
]
=
qual
else
:
for
qual
in
SeqIO
.
parse
(
open
(
qual_file
),
"qual"
)
:
quals
[
qual
.
id
]
=
qual
# If the fasta file is gziped
if
fasta_file
.
endswith
(
".gz"
):
seqs
=
[]
for
record
in
SeqIO
.
parse
(
gzip
.
open
(
fasta_file
),
format
)
:
if
qual_file
:
record
.
letter_annotations
[
"phred_quality"
]
=
quals
[
record
.
id
].
letter_annotations
[
"phred_quality"
]
seqs
.
append
(
record
)
else
:
seqs
=
[]
for
record
in
SeqIO
.
parse
(
open
(
fasta_file
),
format
)
:
if
qual_file
:
record
.
letter_annotations
[
"phred_quality"
]
=
quals
[
record
.
id
].
letter_annotations
[
"phred_quality"
]
seqs
.
append
(
record
)
# Clean temporary files
...
...
@@ -328,6 +354,8 @@ if __name__ == '__main__':
igroup
=
OptionGroup
(
parser
,
"Input files options"
,
""
)
igroup
.
add_option
(
"-i"
,
"--in"
,
dest
=
"input"
,
help
=
"The file to clean, can be [sff|fastq|fasta]"
,
metavar
=
"FILE"
)
igroup
.
add_option
(
"-q"
,
"--qual"
,
dest
=
"qual_file"
,
help
=
"The quality file to use if input file is fasta"
,
metavar
=
"FILE"
)
igroup
.
add_option
(
"-m"
,
"--format"
,
dest
=
"format"
,
help
=
"The format of the input file [sff|fastq|fasta] default is sff"
,
type
=
"string"
,
default
=
"sff"
)
parser
.
add_option_group
(
igroup
)
...
...
@@ -440,6 +468,11 @@ if __name__ == '__main__':
fasta
=
open
(
output
,
"w"
)
SeqIO
.
write
(
seqs
,
fasta
,
"fasta"
)
fasta
.
close
()
if
options
.
format
==
"sff"
or
options
.
qual_file
:
output
=
os
.
path
.
join
(
options
.
output
,
os
.
path
.
splitext
(
os
.
path
.
basename
(
options
.
input
))[
0
]
+
".clean.qual"
)
qual
=
open
(
output
,
"w"
)
SeqIO
.
write
(
seqs
,
qual
,
"qual"
)
qual
.
close
()
# 4 - Display the result summary
log
.
write
(
"## header (global) : nb_reads_at_begining
\t
nb_reads_after_clean_up
\t
length
\t
ns
\t
ns_in_area
\t
homopolymers
\t
triming
\t
full_sequence_length
\t
full_sequence
\t
reversed_sequence
\n
"
)
...
...
metagenomic/wfcomponent/16Scleaner.config
View file @
89a8c74e
...
...
@@ -3,7 +3,7 @@ classification = sequence / cleaning
[
parameters
]
;
The
input
format
(
if
using
list
,
all
inputs
have
to
be
in
the
same
format
)
$;
FORMAT
$; =
sff
$;
FORMAT
$; =
[
sff
|
fastq
|
fasta
]
;
Choose
one
or
more
between
:
; --
clean
-
length
:
Filter
reads
with
a
legnth
in
between
[
x
:
y
]
; --
clean
-
ns
:
Filter
reads
with
to
much
N
...
...
@@ -43,6 +43,8 @@ $;INPUT_FILE$; =
$;
INPUT_DIRECTORY
$; =
;;
the
following
is
only
used
when
iterating
over
an
INPUT_DIRECTORY
$;
INPUT_EXTENSION
$; =
sff
;
the
quality
file
list
if
fasta
is
the
input
format
$;
QUAL_FILE_LIST
$; =
;
The
run
config
file
the
analyse
belongs
to
$;
NG6CFG_FILE
$; =
;
OR
The
project
id
the
analyse
belongs
to
...
...
@@ -53,6 +55,7 @@ $;OUTPUT_TOKEN$; = default
$;
OUTPUT_DIRECTORY
$; = $;
REPOSITORY_ROOT
$;/
output_repository
/$;
COMPONENT_NAME
$;/$;
PIPELINEID
$;
_
$;
OUTPUT_TOKEN
$;
$;
FASTQ_OUTPUT_LIST
$; = $;
OUTPUT_DIRECTORY
$;/$;
COMPONENT_NAME
$;.
fastq
.
list
$;
FASTA_OUTPUT_LIST
$; = $;
OUTPUT_DIRECTORY
$;/$;
COMPONENT_NAME
$;.
fasta
.
list
$;
QUAL_OUTPUT_LIST
$; = $;
OUTPUT_DIRECTORY
$;/$;
COMPONENT_NAME
$;.
qual
.
list
$;
LOG_OUTPUT_LIST
$; = $;
OUTPUT_DIRECTORY
$;/$;
COMPONENT_NAME
$;.
log
.
list
[
component
]
...
...
metagenomic/wfcomponent/16Scleaner.i1.xml
View file @
89a8c74e
...
...
@@ -8,11 +8,63 @@
<name>
16Scleaner
</name>
<state>
incomplete
</state>
<executable>
python $;BIN_DIR$;/16Scleaner.py
</executable>
<arg>
-i "$;I_FILE_PATH$;" -o "$;OUTPUT_DIRECTORY$;/$;ITERATOR_NAME$;/g$;GROUP_NUMBER$;/" -m "$;FORMAT$;" $;CLEANING_OPTIONS$; -k "$;MIN_FULL_LENGTH_LENGTH$;" -x "$;MIN_LENGTH$;" -y "$;MAX_LENGTH$;" -p "$;NS_PERCENT$;" -f $;FWD_PRIMER$; -r $;RVRS_PRIMER$; -s $;RVRS_MINMATCH$; -b $;FWD_MINMATCH$; -w $;HOMOPOLYLER_LENGTH$;
</arg>
<arg>
$;CLEANING_OPTIONS$;
</arg>
<param>
<key>
--in
</key>
<value>
"$;I_FILE_PATH$;"
</value>
</param>
<param>
<key>
--qual
</key>
<value>
"$;SECOND_FILE_PATH$;"
</value>
</param>
<param>
<key>
--log
</key>
<value>
$;I_FILE_BASE$;.log
</value>
</param>
<param>
<key>
--format
</key>
<value>
"$;FORMAT$;"
</value>
</param>
<param>
<key>
--out
</key>
<value>
"$;OUTPUT_DIRECTORY$;/$;ITERATOR_NAME$;/g$;GROUP_NUMBER$;/"
</value>
</param>
<param>
<key>
--full-length-min-size
</key>
<value>
"$;MIN_FULL_LENGTH_LENGTH$;"
</value>
</param>
<param>
<key>
--min-length
</key>
<value>
"$;MIN_LENGTH$;"
</value>
</param>
<param>
<key>
--max-length
</key>
<value>
"$;MAX_LENGTH$;"
</value>
</param>
<param>
<key>
--ns-percent
</key>
<value>
"$;NS_PERCENT$;"
</value>
</param>
<param>
<key>
--forward
</key>
<value>
"$;FWD_PRIMER$;"
</value>
</param>
<param>
<key>
--reverse
</key>
<value>
"$;RVRS_PRIMER$;"
</value>
</param>
<param>
<key>
--rvrs-minmatch
</key>
<value>
"$;RVRS_MINMATCH$;"
</value>
</param>
<param>
<key>
--fwd-minmatch
</key>
<value>
"$;FWD_MINMATCH$;"
</value>
</param>
<param>
<key>
--homopolymers-length
</key>
<value>
"$;HOMOPOLYLER_LENGTH$;"
</value>
</param>
<param>
<key>
stdout
</key>
<value>
$;OUTPUT_DIRECTORY$;/$;ITERATOR_NAME$;/g$;GROUP_NUMBER$;/$;I_FILE_BASE$;.$;COMPONENT_NAME$;.stdout
</value>
...
...
metagenomic/wfcomponent/16Scleaner.xml
View file @
89a8c74e
...
...
@@ -19,10 +19,17 @@
<executable>
mkdir
</executable>
<arg>
-p -m 777 $;TMP_DIR$;
</arg>
</command>
<command>
<type>
RunUnixCommand
</type>
<name>
create 16Scleaner iterator list
</name>
<state>
incomplete
</state>
<executable>
$;BIN_DIR$;/create_2files_iterator_list
</executable>
<arg>
--input_file=$;INPUT_FILE$; --input_file_list=$;INPUT_FILE_LIST$; --second_file_list=$;QUAL_FILE_LIST$; --input_directory=$;INPUT_DIRECTORY$; --input_directory_extension=$;INPUT_EXTENSION$; --output_iter_list=$;OUTPUT_DIRECTORY$;/16Scleaner_iterator.list
</arg>
</command>
<!--Processing-->
<!--Iterator-->
<INCLUDE
file=
"$;DOCS_DIR$;/
file_
iterator_template.xml"
keys=
"$;ITERATOR_NAME$;=ITERATOR1,$;ITERATOR_XML$;=ITERATOR1_XML
"
>
<!--Post
p
rocessing-->
<INCLUDE
file=
"$;DOCS_DIR$;/iterator_template.xml"
keys=
"$;ITERATOR_NAME$;=ITERATOR1,$;ITERATOR_XML$;=ITERATOR1_XML
,$;ITERATOR_LIST$;=$;OUTPUT_DIRECTORY$;/16Scleaner_iterator.list"
/
>
<!--Post
-P
rocessing-->
<command>
<type>
RunUnixCommand
</type>
<name>
create fastq list
</name>
...
...
@@ -59,6 +66,24 @@
<value>
$;FASTA_OUTPUT_LIST$;
</value>
</param>
</command>
<command>
<type>
RunUnixCommand
</type>
<name>
create qual list
</name>
<state>
incomplete
</state>
<executable>
$;BIN_DIR$;/create_list_file
</executable>
<param>
<key>
--directory
</key>
<value>
$;OUTPUT_DIRECTORY$;
</value>
</param>
<param>
<key>
--regex
</key>
<value>
".*\.qual"
</value>
</param>
<param>
<key>
--output_list
</key>
<value>
$;QUAL_OUTPUT_LIST$;
</value>
</param>
</command>
<command>
<type>
RunUnixCommand
</type>
<name>
create log list
</name>
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment