Skip to content
Snippets Groups Projects
Commit cbc36628 authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Add vawk

parent 91cf5217
No related branches found
No related tags found
No related merge requests found
vawk 0 → 100755
#!/usr/bin/env python
import argparse, sys, re
from argparse import RawTextHelpFormatter
from subprocess import Popen, PIPE, STDOUT
__author__ = "Colby Chiang (cc2qe@virginia.edu)"
__version__ = "$Revision: 0.0.2 $"
__date__ = "$Date: 2015-02-28 15:32 $"
# --------------------------------------
# define functions
def get_args():
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,
add_help=True, description="\
vawk\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: An awk-like VCF parser")
parser.add_argument('-v', '--var', action='append', default=[], required=False, type=str, help='declare an external variable (e.g.: SIZE=10000)')
parser.add_argument('-c', '--col', dest='info_col', required=False, default=8, type=int, help='column of the INFO field [8]')
parser.add_argument('--header', required=False, action='store_true', help='print VCF header')
parser.add_argument('cmd', nargs=1, help='''vawk command syntax is exactly the same as awk syntax with
a few additional features. The INFO field can be split using
the I$ prefix and the SAMPLE field can be split using
the S$ prefix. For example, I$AF prints the allele frequency of
each variant and S$NA12878 prints the entire SAMPLE field for the
NA12878 individual for each variant. S$* returns all samples.
The SAMPLE field can be further split based on the keys in the
FORMAT field of the VCF (column 9). For example, S$NA12877$GT
returns the genotype of the NA12878 individual.
ex: '{ if (I$AF>0.5) print $1,$2,$3,I$AN,S$NA12878,S$NA12877$GT }'
''')
parser.add_argument('vcf', nargs='?', type=str, default=None, help='VCF file (default: stdin)')
parser.add_argument('--debug', action='store_true', help='debugging level verbosity')
# parse the arguments
args = parser.parse_args()
# if no input, check if part of pipe and if so, read stdin.
if args.vcf == None:
if sys.stdin.isatty():
parser.print_help()
exit(1)
else:
args.vcf = '-'
# send back the user input
return args
# parse the vawk string into BEGIN PER-LINE and END portions
def parse(raw):
begin = ''
perline = ''
end = ''
op_brace = 0
b = 0
a = 0
begin_idx = [0,-1]
end_idx = [0,-1]
# get BEGIN script
try:
begin_idx[0] = re.search(r'BEGIN\s*{', raw).start()
a = begin_idx[0] + 5
b = a
while begin == '':
if raw[b] == '{':
op_brace += 1
elif raw[b] == '}':
op_brace -= 1
if op_brace == 0:
begin = raw[a:b+1].strip()[1:-1]
begin_idx[1] = b
break
b += 1
except AttributeError:
pass
raw = raw[:begin_idx[0]] + raw[begin_idx[1]+1:]
# get END script
try:
end_idx[0] = re.search(r'END\s*{', raw).start()
a = re.search(r'END\s*{', raw).start() + 3
b = a
while end == '':
if raw[b] == '{':
op_brace += 1
elif raw[b] == '}':
op_brace -= 1
if op_brace == 0:
end = raw[a:b+1].strip()[1:-1]
end_idx[1] = b
break
b += 1
except AttributeError:
pass
raw = raw[:end_idx[0]] + raw[end_idx[1]+1:]
# get PER-LINE script (remainder of string)
perline = raw.strip()
if perline.startswith('{') and perline.endswith('}'):
perline = perline[1:-1]
else:
perline = 'if (' + perline + ') print ; '
return (begin, perline, end)
# a class for the tokenized sample string
class Sample(object):
def __init__(self, s):
self.string_raw = s
# tokenize the string
s_tok = self.string_raw.split('$')
self.name_raw = s_tok[0]
self.name_clean = None
self.name_escaped = None
# set the field
if len(s_tok) == 1:
self.field = "ALL"
elif len(s_tok) == 2:
self.field = s_tok[1]
else:
print 'Error: format should be S$[ID]$[FMT]'
exit(1)
return
# clean weird punctuation from name
def clean(self):
s = self.name_raw
s_esc = self.name_raw
gremlin_list = ("-", ".")
for gremlin in gremlin_list:
s = s.replace(gremlin, "_")
s_esc = s_esc.replace(gremlin, "\\" + gremlin)
self.name_clean = s
self.name_escaped = s_esc
return
# primary function
def vawk(debug, header, var, info_col, vawk_string, vcf_file):
# parse the external variables
format_col = info_col + 1
samples_start_col = info_col + 2
var_list = []
for v in var:
var_list = var_list + ['-v', v]
# parse the vawk string in to BEGIN, END, and PERLINE
(begin, perline, end) = parse(vawk_string[0])
if debug:
print 'begin:', begin
print 'perline:', perline
print 'end:', end
# get the requested info keys from the command line (format: I$FIELD)
# [\w]: any word character [a-zA-Z0-9_]
# [\W]: any non-word character [^a-zA-Z0-9_]
info_keys = set(re.findall(r'[\W]I\$([\w]*)', perline))
if re.search('I\$([^\W]*)',perline):
# split and parse the relevant INFO fields
# THE SPLIT INFO MATCHING CAN BE SPED UP BY DOING MULTI GREP AND MOVING THE OR STATEMENT INSIDE
perline = 'split($' + str(info_col) + ',x,";"); for (i=1;i<=length(x);++i) { ' \
+ ' '.join(['if (x[i]~"^' + mykey + '=" || x[i]=="' + mykey + '") { split(x[i],info,"="); if (length(info)==1) { INFO_' + mykey + '=1} else {INFO_' + mykey + '=info[2]} }' for mykey in info_keys]) \
+ '} ' \
+ perline
# replace the toxic $ in the awk command
for mykey in info_keys:
perline = re.sub('I\$' + mykey, 'INFO_' + mykey, perline)
# --------------------------------------
# sample info
samples_raw = set(re.findall(r'[\W]S\$([a-zA-Z0-9_\-\.*^$]*)', perline))
sample_queries = []
for s in samples_raw:
# make new Sample object and clean it
s_obj = Sample(s)
s_obj.clean()
# add it to the sample queries
sample_queries.append(s_obj)
if debug:
print "name_raw: " + s_obj.name_raw
print "name_clean: " + s_obj.name_clean
print "name_escaped: " + s_obj.name_escaped
# parse the per-sample data fields
for s_obj in sample_queries:
# return all data for each sample
if s_obj.field == 'ALL':
# concatenate all sample columns with S$*
if s_obj.name_clean == '*':
perline = 'ALLSAMPLES_' + s_obj.field + '=""; for (COL=' + str(samples_start_col) + ';COL<=NF-1;++COL) { ALLSAMPLES_' + s_obj.field + '=ALLSAMPLES_' + s_obj.field + '""$COL"\\t" }; ALLSAMPLES_' + s_obj.field + '=ALLSAMPLES_' + s_obj.field + '""$NF; ' + perline
# user-specified sample name
else:
perline = 'SAMPLE_' + '_'.join([s_obj.name_clean, s_obj.field]) + '=$SAMPLE["' + s_obj.name_raw + '"]; ' + perline
# return user-specified sample fields
else:
# special case for wildcard (all samples)
if s_obj.name_clean == '*':
perline = 'ALLSAMPLES_' + s_obj.field + '=""; for(i=1;i<=length(fmt);++i) { if (fmt[i]=="' + s_obj.field + '") { fmt_key=i; break } }; for (COL=' + str(samples_start_col) + ';COL<=NF-1;++COL) { split($COL,samp,":"); ALLSAMPLES_' + s_obj.field + '=ALLSAMPLES_' + s_obj.field + '""samp[fmt_key]"\\t" }; split($NF,samp,":"); ALLSAMPLES_' + s_obj.field + '=ALLSAMPLES_' + s_obj.field + '""samp[fmt_key]; ' + perline
else:
perline = 'split($SAMPLE["' + s_obj.name_raw + '"],samp,":"); for(i=1;i<=length(fmt);++i) { if (fmt[i]=="' + s_obj.field + '") { SAMPLE_' + '_'.join([s_obj.name_clean, s_obj.field]) + '=samp[i]; break } } ' + perline
# rename query (ex: S$NA12878$GT to SAMPLE_NA12878_GT)
look_ahead = '(?![$_\-\.a-zA-Z0-9])'
if re.search('S\$',perline):
for s_obj in sample_queries:
if s_obj.field == 'ALL':
if s_obj.name_clean == '*':
perline = re.sub(r'S\$\*' + look_ahead, 'ALLSAMPLES_' + s_obj.field, perline)
perline = perline
else:
perline = re.sub('S\$' + s_obj.name_escaped + look_ahead, 'SAMPLE_' + '_'.join([s_obj.name_clean, s_obj.field]), perline)
else:
if s_obj.name_clean == '*':
perline = re.sub('S\$\*\$' + s_obj.field + look_ahead, 'ALLSAMPLES_' + s_obj.field, perline)
else:
perline = re.sub('S\$' + '\$'.join([s_obj.name_escaped, s_obj.field]) + look_ahead, 'SAMPLE_' + '_'.join([s_obj.name_clean, s_obj.field]), perline)
# clear data from previous line
for mykey in info_keys:
perline = 'INFO_' + mykey + '=""; ' + perline
for s_obj in sample_queries:
if s_obj.name_clean != '*':
perline = 'SAMPLE_' + '_'.join([s_obj.name_clean, s_obj.field]) + '=""; ' + perline
# split up the FORMAT field and tack it to the beginning of the per-line statement
perline = 'split($' + str(format_col) + ',fmt,":"); ' + perline
# print the header if requested
if header:
perline = 'if ($0~"^#") {if ($0!~"^##") { for (i=' + str(samples_start_col) + ';i<=NF;++i) SAMPLE[$i]=i; }; print} else {' + perline + '}'
else:
perline = 'if ($0~"^#") {if ($0!~"^##") { for (i=' + str(samples_start_col) + ';i<=NF;++i) SAMPLE[$i]=i; }; next} else {' + perline + '}'
vawk_string = 'BEGIN {FS="\\t"; OFS="\\t"; ' + begin + '}' + ' {' + perline + '} ' + 'END {' + end + '}'
cmd = ['gawk'] + var_list + [vawk_string, vcf_file]
if debug:
print ' '.join(cmd)
p = Popen(cmd, stdout=PIPE, stderr=STDOUT)
for line in iter(p.stdout.readline, b''):
print line.rstrip()
return
# --------------------------------------
# main function
def main():
# parse the command line args
args = get_args()
# call primary function
vawk(args.debug, args.header, args.var, args.info_col, args.cmd, args.vcf)
# initialize the script
if __name__ == '__main__':
try:
sys.exit(main())
except IOError, e:
if e.errno != 32: # ignore SIGPIPE
raise
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment