Commit b85e0ddd authored by Celine Noirot's avatar Celine Noirot
Browse files

find from 2 files (paired) the pair present in the two files.

parent a8a11fe6
#!/bin/env python
import sys
from optparse import *
import os
#####Authors: Team Ladner-Barshis-Tepolt
######This script takes a set of separate quality trimmed paired end .fastq files
######and pairs the reads that still have a mate and combined the solitary reads
######into a single file of orphans for input into denovo assembly software (e.g. CLC Genomics).
######Command line usage: "seqheader" "delimiter" infiledirection1.fastq infiledirection2.fastq
#### ~227GB of RAM in order to use this script on my data which consisted of two paired-end files 58GB and 29GB
__name__ = ""
__synopsis__ = " -f file_name.fasta"
__example__ = "python TODO"
__date__ = "2013/10"
__authors__ = "Team Ladner-Barshis-Tepolt review by C.Noirot"
__keywords__ = "fastq pair combine"
__description__ = "This script extract pairs combine from 2 fastq; "
__version__ = '1.0'
class Sequence:
"""qualities is a string and it contains the qualities encoded as ascii(qual+33)."""
def __init__(self, name, sequence, qualities):
"""Set qualities to None if there are no quality values""" = name
self.sequence = sequence
self.qualities = qualities
def __getitem__(self, key):
return self.__class__(, self.sequence[key], self.qualities[key] if self.qualities is not None else None)
def __repr__(self):
qstr = ''
if self.qualities is not None:
qstr = '\', qualities="{0}"'.format(_shorten(self.qualities))
return 'Sequence(name="{0}", sequence="{1}"{2})'.format(_shorten(, _shorten(self.sequence), qstr)
def __len__(self):
return len(self.sequence)
def __eq__(self, other):
return == and \
self.sequence == other.sequence and \
self.qualities == other.qualities
def __ne__(self, other):
return not self.__eq__(other)
class FastqReader(object):
Reader for FASTQ files. Does not support multi-line FASTQ files.
def __init__(self, file, colorspace=False, skip_color=0):
file is a filename or a file-like object.
If file is a filename, then .gz files are supported.
colorspace -- Usually (when this is False), there must be n characters in the sequence and
n quality values. When this is True, there must be n+1 characters in the sequence and n quality values.
if isinstance(file, basestring):
file = xopen(file, "r")
self.fp = file
self.colorspace = colorspace
self.skip_color = skip_color
self.twoheaders = False
def __iter__(self):
Return tuples: (name, sequence, qualities).
qualities is a string and it contains the unmodified, encoded qualities.
lengthdiff = 1 if self.colorspace else 0
for i, line in enumerate(self.fp):
if i % 4 == 0:
if not line.startswith('@'):
raise FormatError("at line {0}, expected a line starting with '+'".format(i+1))
name = line.strip()[1:]
elif i % 4 == 1:
sequence = line.strip()
elif i % 4 == 2:
line = line.strip()
if not line.startswith('+'):
raise FormatError("at line {0}, expected a line starting with '+'".format(i+1))
if len(line) > 1:
self.twoheaders = True
if not line[1:] == name:
raise FormatError(
"At line {0}: Two sequence descriptions are given in "
"the FASTQ file, but they don't match "
"('{1}' != '{2}')"
" perhaps you should try the 'sra-fastq' format?".format(i+1, name, line.rstrip()[1:]))
elif i % 4 == 3:
qualities = line.rstrip("\n\r")[self.skip_color:]
if len(qualities) + lengthdiff != len(sequence):
raise ValueError("Length of quality sequence and length of read do not match (%d+%d!=%d)" % (len(qualities), lengthdiff, len(sequence)))
yield Sequence(name, sequence, qualities)
def __enter__(self):
if self.fp is None:
raise ValueError("I/O operation on closed FastqReader")
return self
def __exit__(self, *args):
def version_string ():
Return the version
return "%prog " + __version__
if __name__ == '__main__':
parser = OptionParser(usage="Usage: %prog", description = __description__)
igroup = OptionGroup(parser, "Input options","")
igroup.add_option("-1", "--pair1", dest="pair1", help="fastq pair1")
igroup.add_option("-2", "--pair2", dest="pair2", help="fastq pair2")
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output", help="the output directory")
(options, args) = parser.parse_args()
if options.output == None:
sys.stderr.write("Output directory is missing")
if not os.path.exists(options.pair1) or not os.path.exists(options.pair2):
sys.stderr.write("Pair1 and pair2 fastq files must exist")
#Starts a for loop to read through the infile line by line
reader = seqio.SequenceReader(options.pair1)
for id, desc, seq, qualities in reader:
print 'finished reading in: %s'%(options.pair1)
reader = seqio.SequenceReader(options.pair2)
for id, desc, seq, qualities in reader:
# print 'here'
print 'finished reading in: %s'%(options.pair2)
paired = list(set(names1) & set(names2))
print len(paired)
print 'number of paired reads: %d'%(len(paired))
single = list(set(names1) ^ set(names2))
print len(single)
single1 = list(set(single) & set(names1))
print len(single1)
print 'number of single reads left from %s: %d'%(setoffiles[0],len(single1))
single2 = list(set(single) & set(names2))
print len(single2)
print 'number of single reads left from %s: %d'%(setoffiles[1],len(single2))
SIN = open(base_output+".single.fastq", 'w')
PAIR1 = open(base_output+".1.fastq", 'w')
PAIR2 = open(base_output+".2.fastq", 'w')
for label in single1:
SIN.write('@' + str(label) + ' /1\n' + str(seqs1[label]) + '\n+' + str(label) + ' /1\n' + str(quals1[label]) + '\n')
for label in single2:
SIN.write('@' + str(label) + ' /2\n' + str(seqs2[label]) + '\n+' + str(label) + ' /2\n' + str(quals2[label]) + '\n')
for label in paired:
PAIR1.write('@' + str(label) + ' 1\n' + str(seqs1[label]) + '\n+' + str(label) + ' 1\n' + str(quals1[label]) + '\n')
PAIR2.write('@' + str(label) + ' 2\n' + str(seqs2[label]) + '\n+' + str(label) + ' 2\n' + str(quals2[label]) + '\n')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment