Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
popsim
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
SVdetection
popsim
Commits
cbc36628
Commit
cbc36628
authored
7 years ago
by
Floreal Cabanettes
Browse files
Options
Downloads
Patches
Plain Diff
Add vawk
parent
91cf5217
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
vawk
+277
-0
277 additions, 0 deletions
vawk
with
277 additions
and
0 deletions
vawk
0 → 100755
+
277
−
0
View file @
cbc36628
#!/usr/bin/env python
import
argparse
,
sys
,
re
from
argparse
import
RawTextHelpFormatter
from
subprocess
import
Popen
,
PIPE
,
STDOUT
__author__
=
"
Colby Chiang (cc2qe@virginia.edu)
"
__version__
=
"
$Revision: 0.0.2 $
"
__date__
=
"
$Date: 2015-02-28 15:32 $
"
# --------------------------------------
# define functions
def
get_args
():
parser
=
argparse
.
ArgumentParser
(
formatter_class
=
RawTextHelpFormatter
,
add_help
=
True
,
description
=
"
\
vawk
\n\
author:
"
+
__author__
+
"
\n\
version:
"
+
__version__
+
"
\n\
description: An awk-like VCF parser
"
)
parser
.
add_argument
(
'
-v
'
,
'
--var
'
,
action
=
'
append
'
,
default
=
[],
required
=
False
,
type
=
str
,
help
=
'
declare an external variable (e.g.: SIZE=10000)
'
)
parser
.
add_argument
(
'
-c
'
,
'
--col
'
,
dest
=
'
info_col
'
,
required
=
False
,
default
=
8
,
type
=
int
,
help
=
'
column of the INFO field [8]
'
)
parser
.
add_argument
(
'
--header
'
,
required
=
False
,
action
=
'
store_true
'
,
help
=
'
print VCF header
'
)
parser
.
add_argument
(
'
cmd
'
,
nargs
=
1
,
help
=
'''
vawk command syntax is exactly the same as awk syntax with
a few additional features. The INFO field can be split using
the I$ prefix and the SAMPLE field can be split using
the S$ prefix. For example, I$AF prints the allele frequency of
each variant and S$NA12878 prints the entire SAMPLE field for the
NA12878 individual for each variant. S$* returns all samples.
The SAMPLE field can be further split based on the keys in the
FORMAT field of the VCF (column 9). For example, S$NA12877$GT
returns the genotype of the NA12878 individual.
ex:
'
{ if (I$AF>0.5) print $1,$2,$3,I$AN,S$NA12878,S$NA12877$GT }
'
'''
)
parser
.
add_argument
(
'
vcf
'
,
nargs
=
'
?
'
,
type
=
str
,
default
=
None
,
help
=
'
VCF file (default: stdin)
'
)
parser
.
add_argument
(
'
--debug
'
,
action
=
'
store_true
'
,
help
=
'
debugging level verbosity
'
)
# parse the arguments
args
=
parser
.
parse_args
()
# if no input, check if part of pipe and if so, read stdin.
if
args
.
vcf
==
None
:
if
sys
.
stdin
.
isatty
():
parser
.
print_help
()
exit
(
1
)
else
:
args
.
vcf
=
'
-
'
# send back the user input
return
args
# parse the vawk string into BEGIN PER-LINE and END portions
def
parse
(
raw
):
begin
=
''
perline
=
''
end
=
''
op_brace
=
0
b
=
0
a
=
0
begin_idx
=
[
0
,
-
1
]
end_idx
=
[
0
,
-
1
]
# get BEGIN script
try
:
begin_idx
[
0
]
=
re
.
search
(
r
'
BEGIN\s*{
'
,
raw
).
start
()
a
=
begin_idx
[
0
]
+
5
b
=
a
while
begin
==
''
:
if
raw
[
b
]
==
'
{
'
:
op_brace
+=
1
elif
raw
[
b
]
==
'
}
'
:
op_brace
-=
1
if
op_brace
==
0
:
begin
=
raw
[
a
:
b
+
1
].
strip
()[
1
:
-
1
]
begin_idx
[
1
]
=
b
break
b
+=
1
except
AttributeError
:
pass
raw
=
raw
[:
begin_idx
[
0
]]
+
raw
[
begin_idx
[
1
]
+
1
:]
# get END script
try
:
end_idx
[
0
]
=
re
.
search
(
r
'
END\s*{
'
,
raw
).
start
()
a
=
re
.
search
(
r
'
END\s*{
'
,
raw
).
start
()
+
3
b
=
a
while
end
==
''
:
if
raw
[
b
]
==
'
{
'
:
op_brace
+=
1
elif
raw
[
b
]
==
'
}
'
:
op_brace
-=
1
if
op_brace
==
0
:
end
=
raw
[
a
:
b
+
1
].
strip
()[
1
:
-
1
]
end_idx
[
1
]
=
b
break
b
+=
1
except
AttributeError
:
pass
raw
=
raw
[:
end_idx
[
0
]]
+
raw
[
end_idx
[
1
]
+
1
:]
# get PER-LINE script (remainder of string)
perline
=
raw
.
strip
()
if
perline
.
startswith
(
'
{
'
)
and
perline
.
endswith
(
'
}
'
):
perline
=
perline
[
1
:
-
1
]
else
:
perline
=
'
if (
'
+
perline
+
'
) print ;
'
return
(
begin
,
perline
,
end
)
# a class for the tokenized sample string
class
Sample
(
object
):
def
__init__
(
self
,
s
):
self
.
string_raw
=
s
# tokenize the string
s_tok
=
self
.
string_raw
.
split
(
'
$
'
)
self
.
name_raw
=
s_tok
[
0
]
self
.
name_clean
=
None
self
.
name_escaped
=
None
# set the field
if
len
(
s_tok
)
==
1
:
self
.
field
=
"
ALL
"
elif
len
(
s_tok
)
==
2
:
self
.
field
=
s_tok
[
1
]
else
:
print
'
Error: format should be S$[ID]$[FMT]
'
exit
(
1
)
return
# clean weird punctuation from name
def
clean
(
self
):
s
=
self
.
name_raw
s_esc
=
self
.
name_raw
gremlin_list
=
(
"
-
"
,
"
.
"
)
for
gremlin
in
gremlin_list
:
s
=
s
.
replace
(
gremlin
,
"
_
"
)
s_esc
=
s_esc
.
replace
(
gremlin
,
"
\\
"
+
gremlin
)
self
.
name_clean
=
s
self
.
name_escaped
=
s_esc
return
# primary function
def
vawk
(
debug
,
header
,
var
,
info_col
,
vawk_string
,
vcf_file
):
# parse the external variables
format_col
=
info_col
+
1
samples_start_col
=
info_col
+
2
var_list
=
[]
for
v
in
var
:
var_list
=
var_list
+
[
'
-v
'
,
v
]
# parse the vawk string in to BEGIN, END, and PERLINE
(
begin
,
perline
,
end
)
=
parse
(
vawk_string
[
0
])
if
debug
:
print
'
begin:
'
,
begin
print
'
perline:
'
,
perline
print
'
end:
'
,
end
# get the requested info keys from the command line (format: I$FIELD)
# [\w]: any word character [a-zA-Z0-9_]
# [\W]: any non-word character [^a-zA-Z0-9_]
info_keys
=
set
(
re
.
findall
(
r
'
[\W]I\$([\w]*)
'
,
perline
))
if
re
.
search
(
'
I\$([^\W]*)
'
,
perline
):
# split and parse the relevant INFO fields
# THE SPLIT INFO MATCHING CAN BE SPED UP BY DOING MULTI GREP AND MOVING THE OR STATEMENT INSIDE
perline
=
'
split($
'
+
str
(
info_col
)
+
'
,x,
"
;
"
); for (i=1;i<=length(x);++i) {
'
\
+
'
'
.
join
([
'
if (x[i]~
"
^
'
+
mykey
+
'
=
"
|| x[i]==
"'
+
mykey
+
'"
) { split(x[i],info,
"
=
"
); if (length(info)==1) { INFO_
'
+
mykey
+
'
=1} else {INFO_
'
+
mykey
+
'
=info[2]} }
'
for
mykey
in
info_keys
])
\
+
'
}
'
\
+
perline
# replace the toxic $ in the awk command
for
mykey
in
info_keys
:
perline
=
re
.
sub
(
'
I\$
'
+
mykey
,
'
INFO_
'
+
mykey
,
perline
)
# --------------------------------------
# sample info
samples_raw
=
set
(
re
.
findall
(
r
'
[\W]S\$([a-zA-Z0-9_\-\.*^$]*)
'
,
perline
))
sample_queries
=
[]
for
s
in
samples_raw
:
# make new Sample object and clean it
s_obj
=
Sample
(
s
)
s_obj
.
clean
()
# add it to the sample queries
sample_queries
.
append
(
s_obj
)
if
debug
:
print
"
name_raw:
"
+
s_obj
.
name_raw
print
"
name_clean:
"
+
s_obj
.
name_clean
print
"
name_escaped:
"
+
s_obj
.
name_escaped
# parse the per-sample data fields
for
s_obj
in
sample_queries
:
# return all data for each sample
if
s_obj
.
field
==
'
ALL
'
:
# concatenate all sample columns with S$*
if
s_obj
.
name_clean
==
'
*
'
:
perline
=
'
ALLSAMPLES_
'
+
s_obj
.
field
+
'
=
""
; for (COL=
'
+
str
(
samples_start_col
)
+
'
;COL<=NF-1;++COL) { ALLSAMPLES_
'
+
s_obj
.
field
+
'
=ALLSAMPLES_
'
+
s_obj
.
field
+
'""
$COL
"
\\
t
"
}; ALLSAMPLES_
'
+
s_obj
.
field
+
'
=ALLSAMPLES_
'
+
s_obj
.
field
+
'""
$NF;
'
+
perline
# user-specified sample name
else
:
perline
=
'
SAMPLE_
'
+
'
_
'
.
join
([
s_obj
.
name_clean
,
s_obj
.
field
])
+
'
=$SAMPLE[
"'
+
s_obj
.
name_raw
+
'"
];
'
+
perline
# return user-specified sample fields
else
:
# special case for wildcard (all samples)
if
s_obj
.
name_clean
==
'
*
'
:
perline
=
'
ALLSAMPLES_
'
+
s_obj
.
field
+
'
=
""
; for(i=1;i<=length(fmt);++i) { if (fmt[i]==
"'
+
s_obj
.
field
+
'"
) { fmt_key=i; break } }; for (COL=
'
+
str
(
samples_start_col
)
+
'
;COL<=NF-1;++COL) { split($COL,samp,
"
:
"
); ALLSAMPLES_
'
+
s_obj
.
field
+
'
=ALLSAMPLES_
'
+
s_obj
.
field
+
'""
samp[fmt_key]
"
\\
t
"
}; split($NF,samp,
"
:
"
); ALLSAMPLES_
'
+
s_obj
.
field
+
'
=ALLSAMPLES_
'
+
s_obj
.
field
+
'""
samp[fmt_key];
'
+
perline
else
:
perline
=
'
split($SAMPLE[
"'
+
s_obj
.
name_raw
+
'"
],samp,
"
:
"
); for(i=1;i<=length(fmt);++i) { if (fmt[i]==
"'
+
s_obj
.
field
+
'"
) { SAMPLE_
'
+
'
_
'
.
join
([
s_obj
.
name_clean
,
s_obj
.
field
])
+
'
=samp[i]; break } }
'
+
perline
# rename query (ex: S$NA12878$GT to SAMPLE_NA12878_GT)
look_ahead
=
'
(?![$_\-\.a-zA-Z0-9])
'
if
re
.
search
(
'
S\$
'
,
perline
):
for
s_obj
in
sample_queries
:
if
s_obj
.
field
==
'
ALL
'
:
if
s_obj
.
name_clean
==
'
*
'
:
perline
=
re
.
sub
(
r
'
S\$\*
'
+
look_ahead
,
'
ALLSAMPLES_
'
+
s_obj
.
field
,
perline
)
perline
=
perline
else
:
perline
=
re
.
sub
(
'
S\$
'
+
s_obj
.
name_escaped
+
look_ahead
,
'
SAMPLE_
'
+
'
_
'
.
join
([
s_obj
.
name_clean
,
s_obj
.
field
]),
perline
)
else
:
if
s_obj
.
name_clean
==
'
*
'
:
perline
=
re
.
sub
(
'
S\$\*\$
'
+
s_obj
.
field
+
look_ahead
,
'
ALLSAMPLES_
'
+
s_obj
.
field
,
perline
)
else
:
perline
=
re
.
sub
(
'
S\$
'
+
'
\$
'
.
join
([
s_obj
.
name_escaped
,
s_obj
.
field
])
+
look_ahead
,
'
SAMPLE_
'
+
'
_
'
.
join
([
s_obj
.
name_clean
,
s_obj
.
field
]),
perline
)
# clear data from previous line
for
mykey
in
info_keys
:
perline
=
'
INFO_
'
+
mykey
+
'
=
""
;
'
+
perline
for
s_obj
in
sample_queries
:
if
s_obj
.
name_clean
!=
'
*
'
:
perline
=
'
SAMPLE_
'
+
'
_
'
.
join
([
s_obj
.
name_clean
,
s_obj
.
field
])
+
'
=
""
;
'
+
perline
# split up the FORMAT field and tack it to the beginning of the per-line statement
perline
=
'
split($
'
+
str
(
format_col
)
+
'
,fmt,
"
:
"
);
'
+
perline
# print the header if requested
if
header
:
perline
=
'
if ($0~
"
^#
"
) {if ($0!~
"
^##
"
) { for (i=
'
+
str
(
samples_start_col
)
+
'
;i<=NF;++i) SAMPLE[$i]=i; }; print} else {
'
+
perline
+
'
}
'
else
:
perline
=
'
if ($0~
"
^#
"
) {if ($0!~
"
^##
"
) { for (i=
'
+
str
(
samples_start_col
)
+
'
;i<=NF;++i) SAMPLE[$i]=i; }; next} else {
'
+
perline
+
'
}
'
vawk_string
=
'
BEGIN {FS=
"
\\
t
"
; OFS=
"
\\
t
"
;
'
+
begin
+
'
}
'
+
'
{
'
+
perline
+
'
}
'
+
'
END {
'
+
end
+
'
}
'
cmd
=
[
'
gawk
'
]
+
var_list
+
[
vawk_string
,
vcf_file
]
if
debug
:
print
'
'
.
join
(
cmd
)
p
=
Popen
(
cmd
,
stdout
=
PIPE
,
stderr
=
STDOUT
)
for
line
in
iter
(
p
.
stdout
.
readline
,
b
''
):
print
line
.
rstrip
()
return
# --------------------------------------
# main function
def
main
():
# parse the command line args
args
=
get_args
()
# call primary function
vawk
(
args
.
debug
,
args
.
header
,
args
.
var
,
args
.
info_col
,
args
.
cmd
,
args
.
vcf
)
# initialize the script
if
__name__
==
'
__main__
'
:
try
:
sys
.
exit
(
main
())
except
IOError
,
e
:
if
e
.
errno
!=
32
:
# ignore SIGPIPE
raise
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment