Commit 4dce7047 authored by Penom Nom's avatar Penom Nom
Browse files

Delete copy version.

parent cec3f14c
#!/usr/bin/python
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '2.2'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
import argparse,re,sys
def cigar2statesRead (sam_line_obj):
"""
Returns the status of each position on read from the CIGAR field.
@param sam_line_obj : an hashage from a line in SAM (See
parseSamLine)
@return : [S,S,S,M,M,M,I,M,M,...]
"""
cigar = sam_line_obj['CIGAR']
cigar=re.sub('([MIDNSHPX=])', r"\1 ", cigar)
cigar_sub = cigar.split()
status_on_read = ''
for section in cigar_sub :
result = re.search("(\d+)([MIDNSHPX=])", section)
(nb_pos, state)=result.group(1,2)
if( state != "D" and state != "P" ) :
status_on_read += (state * int(nb_pos))
return status_on_read
def getPositionSatus(sam_line_obj):
"""
Returns the status of each position on read (status : clipping,
insertion, match, mismatch and undefined).
@param sam_line_obj : an hashage from a line in SAM (See
parseSamLine)
@return : ['clipping','match','match','mismatch',...]
"""
pos = 0
prec = ""
isDel = False
status_by_pos_length = len(sam_line_obj['SEQ'])
status_by_pos=[0 for i in xrange(0, status_by_pos_length)]
#Retrieve information from the CIGAR field
cigar_states = cigar2statesRead(sam_line_obj)
#Process the start clipping
while(re.search("[HS]",cigar_states[pos])):
status_by_pos[pos]='clipping'
pos+=1
#For each character in the MD field
for md_char in sam_line_obj['MD'] :
#If the character is a number
if( re.search("\d",md_char)):
#If we were in a deleted area
if( isDel == True) :
isDel = False #End of the deleted area
prec += str(md_char)
#If the character is a number in an undeleted area
elif( re.search("[ATGCN]", md_char) and not isDel ):
if( prec != "" ):
for i in range(0,int(prec)):
#While the position corresponds to an insertion
while( cigar_states[pos] == 'I' ):
status_by_pos[pos]='insertion'
pos+=1
#Count the match
status_by_pos[pos]='match'
pos+=1
prec =""
#While the position corresponds to an insertion
while( cigar_states[pos] == 'I' ):
status_by_pos[pos]='insertion'
pos+=1
#Count the mismatch
status_by_pos[pos]='mismatch'
pos+=1
#If the character is the start marker of deletion
elif(re.search("\^",md_char)):
#If you had one match or more before
if( prec != "" ):
for i in range(0, int(prec)):
#While the position corresponds to an insertion
while( cigar_states[pos] == 'I' ):
status_by_pos[pos]='insertion'
pos+=1
#Count the match
status_by_pos[pos]='match'
pos+=1
prec =""
#Indicate the start of an deleted area
isDel = True ;
#If the MD field ends with matches
if( prec != "" ):
for i in range(0, int(prec)):
#While the position corresponds to an insertion
while( cigar_states[pos] == 'I'):
status_by_pos[pos]='insertion'
pos+=1
#Count the match
status_by_pos[pos]='match'
pos+=1
prec =""
#Finish the positions of the read with the information present in the CIGAR field
while( pos < len(cigar_states) ):
#The position corresponds to one clipping
if( re.search("[HS]", cigar_states[pos] )):
status_by_pos[pos]='clipping'
#The position corresponds to one insertion
elif( re.search("[I]",cigar_states[pos]) ):
status_by_pos[pos]='insertion'
#The position exists in the CIGAR field but not in the MD when it should be
else:
status_by_pos[pos]='undef'
sys.stderr.write("[WARNING] " + sam_line_obj['QUERY'] + " the position " + str(pos) + " in the read isn't in MD field.\n")
pos+=1
#If the query is forward
if( isForward(sam_line_obj) is False ):
#Reverse status
status_by_pos.reverse()
return status_by_pos
def isAligned(sam_line_obj):
"""
Returns True if the read in sam_line_obj is aligned.
@param sam_line_obj : an hashage from a line in SAM (See
parseSamLine)
@return : boolean
"""
return ((sam_line_obj['REFNAME'] != '*') and (sam_line_obj['CIGAR'] != '*'))
def isForward(sam_line_obj):
"""
Returns True if the strand of the query is forward.
@param sam_line_obj : an hashage from a line in SAM (See
parseSamLine)
@return : boolean
"""
isForward = False
if( (int(sam_line_obj['FLAG']) & 16) == 0 ):
isForward = True
return isForward
def isPaired(sam_line_obj):
"""
Returns True if the query is paired.
@param sam_line_obj : an hashage from a line in SAM (See
parseSamLine)
@return : boolean
"""
isPaired = False
if (int(sam_line_obj['FLAG']) & 1) != 0 :
isPaired = True
return isPaired
def getReadNumber(sam_line_obj):
"""
Returns 1 if the query is the first in pair or if it is not paired.
@param sam_line_obj : an hashage from a line in SAM (See
parseSamLine)
@return : int
"""
readNumber = 1
if (int(sam_line_obj['FLAG']) & 128) != 0 :
readNumber = 2
return readNumber
def parseSamLine(line):
"""
Returns an hash that contains each field of read line indexed by the
name of field.
@param line : a line of one read extract from a SAM file
@return : {QUERY : HXV1, MD : 101, ...}
"""
line_info = {}
line.rstrip("\n")
line_subdivisions = line.split( "\t")
nb_fields = len(line_subdivisions)
#Mandatory fields
line_info = {
'QUERY' : line_subdivisions[0],
'FLAG' : line_subdivisions[1],
'REFNAME' : line_subdivisions[2],
'CIGAR' : line_subdivisions[5],
'SEQ' : line_subdivisions[9] }
#Optional fields
regex=re.compile("([^\:]+)\:[^\:]+\:([^\s]*)")
for i in range (11, nb_fields) :
result=re.search(regex,line_subdivisions[i])
if(result is not None) :
line_info[result.group(1)] = result.group(2)
return line_info
def sumStatus(input, nb_by_pos):
"""
Sum by position the status of reads.
@param input : the SAM path.
@param nb_by_pos : the hash use to write count by position. {read1
: [], read2 : []}
"""
#For each line on SAM file
for current_line in input :
line_info = parseSamLine( current_line )
#If the read is aligned
if ( isAligned(line_info) ):
read_number = 1
status_by_pos = []
#Unperfect alignment
if( (re.search("^\d+M$", line_info['CIGAR']) is None) or (re.search("^\d+$", line_info['MD']) is None) ):
read_number = getReadNumber(line_info)
status_by_pos = getPositionSatus(line_info)
#Perfect alignment
else:
read_number = getReadNumber(line_info)
seq_length = len(line_info['SEQ'])
status_by_pos = ['match' for e in range( seq_length )]
#If the read is longer than nb_by_pos
if len(status_by_pos) > len(nb_by_pos["read"+str(read_number)]) :
start_pos = len(nb_by_pos["read"+str(read_number)])
#Create the missing positions
for pos in range(start_pos, len(status_by_pos) ):
nb_by_pos["read"+str(read_number)].append({'mismatch' : 0, 'match' : 0, 'clipping' : 0, 'insertion' : 0, 'undef' : 0})
#Add alignment information to global count
for pos in range( 0, len(status_by_pos) ):
nb_by_pos["read"+str(read_number)][pos][status_by_pos[pos]] += 1
input.close()
def writeOutput (output, separator, *reads):
"""
Writes one output file from a list of count.
@param output : the output file path.
@param separator : the separator for each field of one line.
@param reads : a list of array. Example of array :
[{match : 2, clipping : 3,...}, #pos 1
{match : 4, clipping : 1,...}, #pos 2
{match : 4, clipping : 1,...}] #pos 3
"""
status_list = 'match', 'mismatch', 'clipping', 'insertion'
#Find the length of the longest read
nb_pos = 0
for read in reads:
nb_pos = max( nb_pos, len(read) )
if nb_pos == 0 :
sys.stderr.write("[WARNING] " + str(output) + " has no reads.\n")
#Write output header
output.write("pos" + separator + separator.join(status_list) + "\n")
#For each position of the longest read
for i in range(0, nb_pos):
status_at_pos = str(i+1)
#For each status
for status in status_list:
nb = 0
for read in reads:
if len(read) > i:
nb += read[i][status]
status_at_pos += separator + str(nb)
output.write(status_at_pos + "\n")
output.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="description : Sum by position the status of reads. This provides for example the number of reads that have a mismatch at first base.",
usage='%(prog)s -i INPUT_SAM_FILE -o OUTPUT_FILE [ OUTPUT_FILE_R2 --readsplit]')
parser.add_argument("-i", "--input", type=argparse.FileType('r'), default=sys.stdin, help="The SAM file")
parser.add_argument("-o", "--output", nargs='*', type=argparse.FileType('w'), required=True, help="The file(s) with count.")
parser.add_argument("--readsplit", action="store_true", help="if specified, reads statstics are splitted in two outputs (reads 1 and reads 2).")
parser.add_argument("--separator", dest="separator", default="\t", help="The fields separator in output file(s) (default : tabulation).")
parser.add_argument("--version", action='version', version=__version__)
#Getting arguments from the command line
args = parser.parse_args()
if len(args.output)!= 1 and len(args.output)!= 2:
raise ValueError, "[ERROR] Incorrect number of output files, expected 1 output file or 2 output files with --readsplit option."
if len(args.output)!=2 and args.readsplit:
raise ValueError, "[ERROR] Incorrect number of output files : with --readsplit option you need 2 output files."
#Init count
nb_by_pos = {'read1':[],'read2':[]}
#Sum by position the status of reads
sumStatus( args.input, nb_by_pos )
#Write output
if len(args.output)==2:
writeOutput( args.output[0], args.separator, nb_by_pos["read1"] )
writeOutput( args.output[1], args.separator, nb_by_pos["read2"] )
else:
writeOutput( args.output[0], args.separator, nb_by_pos["read1"], nb_by_pos["read2"] )
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment