Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bios4biol
bioinfo-utils
Commits
b85e0ddd
Commit
b85e0ddd
authored
Jun 26, 2014
by
Celine Noirot
Browse files
find from 2 files (paired) the pair present in the two files.
parent
a8a11fe6
Changes
1
Hide whitespace changes
Inline
Side-by-side
bin/fastqcombinepair.py
0 → 100755
View file @
b85e0ddd
#!/bin/env python
import
sys
from
optparse
import
*
import
os
#####Authors: Team Ladner-Barshis-Tepolt
######Usage########
######This script takes a set of separate quality trimmed paired end .fastq files
######and pairs the reads that still have a mate and combined the solitary reads
######into a single file of orphans for input into denovo assembly software (e.g. CLC Genomics).
######Command line usage: fastqcombinepairedend.py "seqheader" "delimiter" infiledirection1.fastq infiledirection2.fastq
#### ~227GB of RAM in order to use this script on my data which consisted of two paired-end files 58GB and 29GB
__name__
=
"fastqcombinepair.py"
__synopsis__
=
"fastqcombinepair.py -f file_name.fasta"
__example__
=
"python fastqcombinepair.py TODO"
__date__
=
"2013/10"
__authors__
=
"Team Ladner-Barshis-Tepolt review by C.Noirot"
__keywords__
=
"fastq pair combine"
__description__
=
"This script extract pairs combine from 2 fastq; "
__version__
=
'1.0'
class
Sequence
:
"""qualities is a string and it contains the qualities encoded as ascii(qual+33)."""
def
__init__
(
self
,
name
,
sequence
,
qualities
):
"""Set qualities to None if there are no quality values"""
self
.
name
=
name
self
.
sequence
=
sequence
self
.
qualities
=
qualities
def
__getitem__
(
self
,
key
):
"""slicing"""
return
self
.
__class__
(
self
.
name
,
self
.
sequence
[
key
],
self
.
qualities
[
key
]
if
self
.
qualities
is
not
None
else
None
)
def
__repr__
(
self
):
qstr
=
''
if
self
.
qualities
is
not
None
:
qstr
=
'
\'
, qualities="{0}"'
.
format
(
_shorten
(
self
.
qualities
))
return
'Sequence(name="{0}", sequence="{1}"{2})'
.
format
(
_shorten
(
self
.
name
),
_shorten
(
self
.
sequence
),
qstr
)
def
__len__
(
self
):
return
len
(
self
.
sequence
)
def
__eq__
(
self
,
other
):
return
self
.
name
==
other
.
name
and
\
self
.
sequence
==
other
.
sequence
and
\
self
.
qualities
==
other
.
qualities
def
__ne__
(
self
,
other
):
return
not
self
.
__eq__
(
other
)
class
FastqReader
(
object
):
"""
Reader for FASTQ files. Does not support multi-line FASTQ files.
"""
def
__init__
(
self
,
file
,
colorspace
=
False
,
skip_color
=
0
):
"""
file is a filename or a file-like object.
If file is a filename, then .gz files are supported.
colorspace -- Usually (when this is False), there must be n characters in the sequence and
n quality values. When this is True, there must be n+1 characters in the sequence and n quality values.
"""
if
isinstance
(
file
,
basestring
):
file
=
xopen
(
file
,
"r"
)
self
.
fp
=
file
self
.
colorspace
=
colorspace
self
.
skip_color
=
skip_color
self
.
twoheaders
=
False
def
__iter__
(
self
):
"""
Return tuples: (name, sequence, qualities).
qualities is a string and it contains the unmodified, encoded qualities.
"""
lengthdiff
=
1
if
self
.
colorspace
else
0
for
i
,
line
in
enumerate
(
self
.
fp
):
if
i
%
4
==
0
:
if
not
line
.
startswith
(
'@'
):
raise
FormatError
(
"at line {0}, expected a line starting with '+'"
.
format
(
i
+
1
))
name
=
line
.
strip
()[
1
:]
elif
i
%
4
==
1
:
sequence
=
line
.
strip
()
elif
i
%
4
==
2
:
line
=
line
.
strip
()
if
not
line
.
startswith
(
'+'
):
raise
FormatError
(
"at line {0}, expected a line starting with '+'"
.
format
(
i
+
1
))
if
len
(
line
)
>
1
:
self
.
twoheaders
=
True
if
not
line
[
1
:]
==
name
:
raise
FormatError
(
"At line {0}: Two sequence descriptions are given in "
"the FASTQ file, but they don't match "
"('{1}' != '{2}')"
" perhaps you should try the 'sra-fastq' format?"
.
format
(
i
+
1
,
name
,
line
.
rstrip
()[
1
:]))
elif
i
%
4
==
3
:
qualities
=
line
.
rstrip
(
"
\n\r
"
)[
self
.
skip_color
:]
if
len
(
qualities
)
+
lengthdiff
!=
len
(
sequence
):
raise
ValueError
(
"Length of quality sequence and length of read do not match (%d+%d!=%d)"
%
(
len
(
qualities
),
lengthdiff
,
len
(
sequence
)))
yield
Sequence
(
name
,
sequence
,
qualities
)
def
__enter__
(
self
):
if
self
.
fp
is
None
:
raise
ValueError
(
"I/O operation on closed FastqReader"
)
return
self
def
__exit__
(
self
,
*
args
):
self
.
fp
.
close
()
def
version_string
():
"""
Return the version
"""
return
"%prog "
+
__version__
if
__name__
==
'__main__'
:
parser
=
OptionParser
(
usage
=
"Usage: %prog"
,
description
=
__description__
)
igroup
=
OptionGroup
(
parser
,
"Input options"
,
""
)
igroup
.
add_option
(
"-1"
,
"--pair1"
,
dest
=
"pair1"
,
help
=
"fastq pair1"
)
igroup
.
add_option
(
"-2"
,
"--pair2"
,
dest
=
"pair2"
,
help
=
"fastq pair2"
)
parser
.
add_option_group
(
igroup
)
ogroup
=
OptionGroup
(
parser
,
"Output files options"
,
""
)
ogroup
.
add_option
(
"-o"
,
"--output"
,
dest
=
"output"
,
help
=
"the output directory"
)
parser
.
add_option_group
(
ogroup
)
(
options
,
args
)
=
parser
.
parse_args
()
if
options
.
output
==
None
:
sys
.
stderr
.
write
(
"Output directory is missing"
)
parser
.
print_help
()
sys
.
exit
(
1
)
if
not
os
.
path
.
exists
(
options
.
pair1
)
or
not
os
.
path
.
exists
(
options
.
pair2
):
sys
.
stderr
.
write
(
"Pair1 and pair2 fastq files must exist"
)
parser
.
print_help
()
sys
.
exit
(
1
)
names1
=
[]
names2
=
[]
seqs1
=
{}
seqs2
=
{}
quals1
=
{}
quals2
=
{}
#Starts a for loop to read through the infile line by line
count
=
0
reader
=
seqio
.
SequenceReader
(
options
.
pair1
)
for
id
,
desc
,
seq
,
qualities
in
reader
:
names1
.
append
(
id
)
seqs1
[
id
]
=
seq
quals1
[
id
]
=
qualities
print
'finished reading in: %s'
%
(
options
.
pair1
)
reader
=
seqio
.
SequenceReader
(
options
.
pair2
)
for
id
,
desc
,
seq
,
qualities
in
reader
:
names2
.
append
(
id
)
seqs2
[
id
]
=
seq
quals2
[
id
]
=
qualities
# print 'here'
print
'finished reading in: %s'
%
(
options
.
pair2
)
paired
=
list
(
set
(
names1
)
&
set
(
names2
))
print
len
(
paired
)
print
'number of paired reads: %d'
%
(
len
(
paired
))
single
=
list
(
set
(
names1
)
^
set
(
names2
))
print
len
(
single
)
single1
=
list
(
set
(
single
)
&
set
(
names1
))
print
len
(
single1
)
print
'number of single reads left from %s: %d'
%
(
setoffiles
[
0
],
len
(
single1
))
single2
=
list
(
set
(
single
)
&
set
(
names2
))
print
len
(
single2
)
print
'number of single reads left from %s: %d'
%
(
setoffiles
[
1
],
len
(
single2
))
base_output
=
os
.
path
.
join
(
options
.
output
,
os
.
path
.
splitext
(
os
.
path
.
basename
(
options
.
pair1
))[
0
])
SIN
=
open
(
base_output
+
".single.fastq"
,
'w'
)
PAIR1
=
open
(
base_output
+
".1.fastq"
,
'w'
)
PAIR2
=
open
(
base_output
+
".2.fastq"
,
'w'
)
for
label
in
single1
:
SIN
.
write
(
'@'
+
str
(
label
)
+
' /1
\n
'
+
str
(
seqs1
[
label
])
+
'
\n
+'
+
str
(
label
)
+
' /1
\n
'
+
str
(
quals1
[
label
])
+
'
\n
'
)
for
label
in
single2
:
SIN
.
write
(
'@'
+
str
(
label
)
+
' /2
\n
'
+
str
(
seqs2
[
label
])
+
'
\n
+'
+
str
(
label
)
+
' /2
\n
'
+
str
(
quals2
[
label
])
+
'
\n
'
)
for
label
in
paired
:
PAIR1
.
write
(
'@'
+
str
(
label
)
+
' 1
\n
'
+
str
(
seqs1
[
label
])
+
'
\n
+'
+
str
(
label
)
+
' 1
\n
'
+
str
(
quals1
[
label
])
+
'
\n
'
)
PAIR2
.
write
(
'@'
+
str
(
label
)
+
' 2
\n
'
+
str
(
seqs2
[
label
])
+
'
\n
+'
+
str
(
label
)
+
' 2
\n
'
+
str
(
quals2
[
label
])
+
'
\n
'
)
SIN
.
close
()
PAIR1
.
close
()
PAIR2
.
close
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment