Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
genotoul-bioinfo
ng6
Commits
b1116c43
Commit
b1116c43
authored
Jul 18, 2014
by
Penom Nom
Browse files
add ustack component to radseq workflow
parent
9699b1c6
Changes
4
Hide whitespace changes
Inline
Side-by-side
workflows/radseq/__init__.py
View file @
b1116c43
...
...
@@ -19,7 +19,7 @@ import os
import
glob
import
sys
from
ng6.ng6workflow
import
NG6Workflow
,
BasicNG6Workflow
from
ng6.ng6workflow
import
NG6Workflow
class
RADseq
(
NG6Workflow
):
...
...
@@ -33,34 +33,42 @@ class RADseq (NG6Workflow):
raise
ValueError
,
"Duplicated pool id."
+
p
[
'id'
]
pools
[
p
[
"id"
]]
=
(
p
,
[])
barcodes
=
[]
# array of tuples
indivs_by_name
=
{}
for
indiv
in
self
.
args
[
"individu"
]:
pool_id
=
indiv
[
'pool_id'
]
if
not
pools
.
has_key
(
pool_id
):
raise
ValueError
,
"The pool id "
+
pool_id
+
" does not exists in (individual "
+
indiv
[
'indiv_name'
]
+
")"
pools
[
pool_id
][
1
].
append
(
indiv
)
barcodes
.
append
((
indiv
[
'indiv_name'
],
indiv
[
'barcode'
]))
if
indivs_by_name
.
has_key
(
indiv
[
'indiv_name'
])
:
raise
ValueError
,
"Duplicated individual name "
+
indiv
[
'indiv_name'
]
indivs_by_name
[
indiv
[
'indiv_name'
]
]
=
indiv
# write barcode file
barcode_file
=
self
.
get_temporary_file
()
with
open
(
barcode_file
,
"w"
)
as
ff
:
for
i
in
barcodes
:
ff
.
write
(
i
[
0
]
+
"
\t
"
+
i
[
1
]
+
"
\n
"
)
# ----- DEMULTIPLEX -----
# process each pairs of fastq for each pool
# prepare fastq files and create a barcode file per pool
barcode_files
=
[]
indiv_names
=
[]
fastq_files_1
=
[]
fastq_files_2
=
[]
for
pool_id
,
data
in
pools
.
iteritems
()
:
pooldata
=
data
[
0
]
splitbc
=
self
.
add_component
(
"SplitBC"
,
[
pooldata
[
'read1'
],
pooldata
[
'read2'
]
if
pooldata
[
'read2'
]
else
None
,
indivs_by_name
.
keys
(),
barcode_file
,
self
.
args
[
'enzyme'
],
self
.
args
[
'mismatches'
],
self
.
args
[
'tag_mismatch'
],
self
.
args
[
'trim'
],
self
.
args
[
'forward'
]],
component_prefix
=
pool_id
)
indivs
=
data
[
1
]
fastq_files_1
.
append
(
pooldata
[
'read1'
]);
if
pooldata
[
'read2'
]
:
fastq_files_2
.
append
(
pooldata
[
'read2'
])
#ustacks = self.add_component("Ustacks", [self.args["read_1"]])
ustacks
=
self
.
add_component
(
"Ustacks"
,
[],
{
"indiv_dic"
:
self
.
args
[
"individu"
],
"read1_files"
:
splitbc
.
output_read1
,
"max_locus"
:
3
}
,
component_prefix
=
"ustacks"
)
# write barcode file
barcode_file
=
self
.
get_temporary_file
()
barcode_files
.
append
(
barcode_file
)
inames
=
[]
with
open
(
barcode_file
,
"w"
)
as
ff
:
for
indiv
in
indivs
:
inames
.
append
(
indiv
[
'indiv_name'
])
ff
.
write
(
indiv
[
'indiv_name'
]
+
"
\t
"
+
indiv
[
'barcode'
]
+
"
\n
"
)
indiv_names
.
append
(
inames
)
splitbc
=
self
.
add_component
(
"SplitBC"
,
[
fastq_files_1
,
fastq_files_2
if
fastq_files_2
else
None
,
indiv_names
,
barcode_files
,
self
.
args
[
'enzyme'
],
self
.
args
[
'mismatches'
],
self
.
args
[
'tag_mismatch'
],
self
.
args
[
'trim'
],
self
.
args
[
'forward'
]])
ustacks
=
self
.
add_component
(
"Ustacks"
,
[],
{
"indiv_dic"
:
indivs_by_name
,
"read1_files"
:
splitbc
.
output_read1
,
"max_locus"
:
3
}
)
#cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.args["catalog_mismatches"]])
workflows/radseq/components/splitbc.py
View file @
b1116c43
...
...
@@ -17,12 +17,13 @@
import
os
from
jflow.component
import
Component
from
jflow.iotypes
import
OutputFileList
,
OutputFile
,
InputFile
,
InputFileList
,
Formats
from
jflow.abstraction
import
MultiMap
from
weaver.function
import
ShellFunction
class
SplitBC
(
Component
):
from
ng6.analysis
import
Analysis
class
SplitBC
(
Analysis
):
ENZYMES
=
{
'sbfI'
:
{
...
...
@@ -37,15 +38,28 @@ class SplitBC (Component):
return
(
self
.
ENZYMES
[
name
][
'rad'
],
self
.
ENZYMES
[
name
][
'radtag'
])
def
define_parameters
(
self
,
fastq_file1
,
fastq_file2
,
indiv_name
s
,
barcode_file
,
enzyme
,
def
define_parameters
(
self
,
fastq_file1
,
fastq_file2
,
matrix_
indiv_name
,
barcode_file
,
enzyme
,
mismatches
,
tag_mismatch
,
trim
=
True
,
forward
=
True
):
self
.
indiv_names
=
indiv_names
self
.
fastq1
=
InputFile
(
fastq_file1
,
Formats
.
FASTQ
)
"""
@param fastq_file1: list of fastq_files path
@param fastq_file2: list of fastq_files path
@param matrix_indiv_name: list of list of individual names
@param barcode_file: list of bascode file path
@param enzyme: enzyme name
"""
check_len
=
len
(
fastq_file1
)
==
len
(
matrix_indiv_name
)
==
len
(
barcode_file
)
self
.
fastq1
=
OutputFileList
(
fastq_file1
,
Formats
.
FASTQ
)
self
.
fastq2
=
None
if
fastq_file2
is
not
None
:
self
.
fastq2
=
InputFile
(
fastq_file2
,
Formats
.
FASTQ
)
self
.
barcode_file
=
InputFile
(
barcode_file
,
Formats
.
FASTQ
)
check_len
=
len
(
fastq_file1
)
==
len
(
matrix_indiv_name
)
==
len
(
fastq_file1
)
==
len
(
barcode_file
)
self
.
fastq2
=
OutputFileList
(
fastq_file2
,
Formats
.
FASTQ
)
if
not
check_len
:
raise
Exception
(
"length of fastq_file1, fastq_file2, matrix_indiv_name and barcode_file must be the same"
)
self
.
barcode_file
=
OutputFileList
(
barcode_file
)
self
.
mismatches
=
mismatches
self
.
rad
,
self
.
rad_tag
=
self
.
get_enzyme
(
enzyme
)
self
.
tag_mismatch
=
tag_mismatch
...
...
@@ -53,28 +67,54 @@ class SplitBC (Component):
self
.
forward
=
forward
self
.
prefix_r1
=
os
.
path
.
join
(
self
.
output_directory
,
"%_1.fq"
)
self
.
prefix_r2
=
os
.
path
.
join
(
self
.
output_directory
,
"%_2.fq"
)
self
.
output_read1
=
OutputFileList
(
self
.
get_outputs
(
'{basename_woext}_1.fq'
,
self
.
indiv_names
),
Formats
.
FASTQ
)
self
.
output_read2
=
OutputFileList
(
self
.
get_outputs
(
'{basename_woext}_2.fq'
,
self
.
indiv_names
),
Formats
.
FASTQ
)
self
.
stdout
=
OutputFile
(
os
.
path
.
join
(
self
.
output_directory
,
"splitbc.stdout"
))
self
.
matrix_read1
=
[]
self
.
matrix_read2
=
[]
self
.
output_read1
=
[]
self
.
output_read2
=
[]
self
.
stdout
=
[]
for
id
,
inames
in
enumerate
(
matrix_indiv_name
)
:
outr1
=
OutputFileList
(
self
.
get_outputs
(
'{basename_woext}_1.fq'
,
inames
),
Formats
.
FASTQ
)
outr2
=
OutputFileList
(
self
.
get_outputs
(
'{basename_woext}_2.fq'
,
inames
),
Formats
.
FASTQ
)
self
.
matrix_read1
.
append
(
outr1
)
self
.
matrix_read2
.
append
(
outr2
)
self
.
output_read1
+=
outr1
self
.
output_read2
+=
outr2
self
.
stdout
.
append
(
OutputFile
(
os
.
path
.
join
(
self
.
output_directory
,
"splitbc"
+
str
(
id
)
+
".stdout"
)))
def
get_version
(
self
):
return
"1.1"
def
define_analysis
(
self
):
self
.
name
=
"splitbc"
self
.
description
=
"Demultiplex individual"
self
.
software
=
"splitbc.pl"
self
.
options
=
' '
def
process
(
self
):
strand
=
"--bol"
if
self
.
forward
else
"--eol"
if
self
.
fastq2
is
not
None
:
command
=
" "
.
join
([
self
.
get_exec_path
(
"splitbc.pl"
),
self
.
fastq1
,
self
.
fastq2
,
"--mismatches"
,
self
.
mismatches
,
"--bcfile"
,
self
.
barcode_file
,
"--rad"
,
self
.
rad
,
"--radTAG"
,
self
.
rad_tag
,
"--TAG_mismatch"
,
self
.
tag_mismatch
,
"--trim"
if
self
.
trim
else
""
,
"--prefix-r1"
,
self
.
prefix_r1
,
"--prefix-r2"
,
self
.
prefix_r2
,
strand
,
' 2>&1 >
'
,
self
.
stdout
self
.
get_exec_path
(
"splitbc.pl"
),
"$1"
,
"$2"
,
"--mismatches"
,
self
.
mismatches
,
"--bcfile"
,
"$3"
,
"--rad"
,
self
.
rad
,
"--radTAG"
,
self
.
rad_tag
,
"--TAG_mismatch"
,
self
.
tag_mismatch
,
"--trim"
if
self
.
trim
else
""
,
"--prefix-r1"
,
self
.
prefix_r1
,
"--prefix-r2"
,
self
.
prefix_r2
,
strand
,
' 2>&1 >
> $4 '
])
splitbc
=
ShellFunction
(
command
,
cmd_format
=
'{EXE} {IN} {OUT}'
)
splitbc
(
inputs
=
[
self
.
fastq1
,
self
.
fastq2
],
outputs
=
[
self
.
out
put_read1
,
self
.
output
_read
2
,
self
.
stdout
])
MultiMap
(
splitbc
,
inputs
=
[
self
.
fastq1
,
self
.
fastq2
,
self
.
barcode_file
],
outputs
=
[
self
.
std
out
,
self
.
matrix
_read
1
,
self
.
matrix_read2
])
else
:
command
=
" "
.
join
([
self
.
get_exec_path
(
"splitbc.pl"
),
self
.
fastq1
,
"--mismatches"
,
self
.
mismatches
,
"--bcfile"
,
self
.
barcode_file
,
"--rad"
,
self
.
rad
,
"--radTAG"
,
self
.
rad_tag
,
"--TAG_mismatch"
,
self
.
tag_mismatch
,
"--trim"
if
self
.
trim
else
""
,
"--prefix-r1"
,
self
.
prefix_r1
,
strand
,
' 2>&1 >
'
,
self
.
stdout
self
.
get_exec_path
(
"splitbc.pl"
),
"$1"
,
"--mismatches"
,
self
.
mismatches
,
"--bcfile"
,
"$2"
,
"--rad"
,
self
.
rad
,
"--radTAG"
,
self
.
rad_tag
,
"--TAG_mismatch"
,
self
.
tag_mismatch
,
"--trim"
if
self
.
trim
else
""
,
"--prefix-r1"
,
self
.
prefix_r1
,
strand
,
' 2>&1 >
$3 '
])
splitbc
=
ShellFunction
(
command
,
cmd_format
=
'{EXE} {IN} {OUT}'
)
splitbc
(
inputs
=
self
.
fastq1
,
outputs
=
[
self
.
output_read1
,
self
.
stdout
])
MultiMap
(
splitbc
,
inputs
=
[
self
.
fastq1
,
self
.
barcode_file
],
outputs
=
[
self
.
stdout
,
self
.
matrix_read1
])
def
post_process
(
self
):
print
""
\ No newline at end of file
workflows/radseq/components/ustacks.py
View file @
b1116c43
...
...
@@ -55,17 +55,37 @@ class Ustacks (Analysis):
self
.
software
=
"ustacks"
self
.
options
=
"-m "
+
str
(
self
.
min_cov
)
+
" -M "
+
str
(
self
.
primary_mismatch
)
+
" -N "
+
str
(
self
.
secondary_mismatch
)
+
\
" --max_locus_stacks "
+
str
(
self
.
max_locus
)
+
" -r -d -t gzfastq --model_type "
+
self
.
model_type
" --max_locus_stacks "
+
str
(
self
.
max_locus
)
+
" -r -d -t gzfastq --model_type "
+
str
(
self
.
model_type
)
if
self
.
model_type
==
"snp"
or
self
.
model_type
==
"bounded"
:
self
.
options
+=
" --alpha "
+
self
.
alpha
if
self
.
model_type
==
"snp"
or
self
.
model_type
==
"bounded"
:
self
.
options
+=
" --alpha "
+
str
(
self
.
alpha
)
if
self
.
model_type
==
"bounded"
:
self
.
options
+=
" --bound_low "
+
self
.
bound_low
+
" --bound_high "
+
self
.
bound_high
self
.
options
+=
" --bound_low "
+
str
(
self
.
bound_low
)
+
" --bound_high "
+
str
(
self
.
bound_high
)
if
self
.
model_type
==
"fixed"
:
self
.
options
+=
" --bc_err_freq "
+
self
.
bc_err_freq
self
.
options
+=
" --bc_err_freq "
+
str
(
self
.
bc_err_freq
)
def
get_version
(
self
):
cmd
=
[
self
.
get_exec_path
(
"ustacks"
),
"--version"
]
p
=
Popen
(
cmd
,
stdout
=
PIPE
,
stderr
=
PIPE
)
stdout
,
stderr
=
p
.
communicate
()
return
stderr
.
split
()[
1
]
def
process
(
self
):
for
idx
,
input
in
enumerate
(
self
.
read1_files
):
parts
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
input
))
indiv
=
parts
[
0
]
if
parts
[
1
]
==
".gz"
:
indiv
=
os
.
path
.
splitext
(
parts
[
0
])
indiv
=
re
.
sub
(
r
'_1$'
,
''
,
indiv
)
individ
=
self
.
indiv_dic
[
indiv
][
"id"
]
format
=
"fastq"
if
os
.
path
.
splitext
(
os
.
path
.
basename
(
input
))[
1
]
!=
".gz"
else
"gzfastq"
ustacks
=
ShellFunction
(
self
.
get_exec_path
(
"ustacks"
)
+
" -t "
+
format
+
" -f $1 -d -r -o "
+
self
.
output_directory
+
" -i "
+
str
(
individ
)
+
" -m "
+
str
(
self
.
min_cov
)
+
" -M "
+
str
(
self
.
primary_mismatch
)
+
" -N "
+
str
(
self
.
secondary_mismatch
)
+
" 2> $2"
,
cmd_format
=
'{EXE} {IN} {OUT}'
)
ustacks
(
inputs
=
input
,
outputs
=
[
self
.
stderrs
[
idx
],
self
.
tags
[
idx
],
self
.
alleles
[
idx
],
self
.
snps
[
idx
]])
def
post_process
(
self
):
# Parse stderr
nb_tot
,
nb_used
=
0
,
0
...
...
@@ -146,22 +166,7 @@ class Ustacks (Analysis):
self
.
_add_result_element
(
"all"
,
"meansnp_cov"
,
meansnp_cov
)
self
.
_add_result_element
(
"all"
,
"mediansnp_cov"
,
mediansnp_cov
)
def
get_version
(
self
):
cmd
=
[
self
.
get_exec_path
(
"ustacks"
),
"--version"
]
p
=
Popen
(
cmd
,
stdout
=
PIPE
,
stderr
=
PIPE
)
stdout
,
stderr
=
p
.
communicate
()
return
stderr
.
split
()[
1
]
def
process
(
self
):
for
input
in
self
.
read1_files
:
indiv
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
input
))[
0
]
if
os
.
path
.
splitext
(
os
.
path
.
basename
(
input
))[
1
]
!=
".gz"
else
os
.
path
.
splitext
(
os
.
path
.
splitext
(
os
.
path
.
basename
(
input
))[
0
])[
0
]
id
=
self
.
indiv_dic
[
indiv
][
"id"
]
format
=
"fastq"
if
os
.
path
.
splitext
(
os
.
path
.
basename
(
input
))[
1
]
!=
".gz"
else
gzfastq
ustacks
=
ShellFunction
(
self
.
get_exec_path
(
"ustacks"
)
+
" -t "
+
format
+
" -f $1 -d -r -o "
+
self
.
output_directory
+
" -i "
+
id
+
\
" -m "
+
str
(
self
.
min_cov
)
+
" -M "
+
str
(
self
.
primary_mismatch
)
+
" -N "
+
str
(
self
.
secondary_mismatch
)
+
\
" 2> $2"
,
cmd_format
=
'{EXE} {IN} {OUT}'
)
ustacks
(
inputs
=
input
,
outputs
=
[
self
.
stderrs
[
id
],
self
.
tags
[
id
],
self
.
alleles
[
id
],
self
.
snps
[
id
]])
def
__parse_stderr
(
self
,
stderr
):
nb_seq_tot
,
nb_seq_used
=
""
,
""
for
line
in
open
(
stderr
).
readlines
():
...
...
workflows/radseq/data/radseq.cfg
View file @
b1116c43
...
...
@@ -52,6 +52,20 @@ read1=workflows/radseq/data/omymune_r1.fq.gz
read2=workflows/radseq/data/omymune_r2.fq.gz
#run
--project-id
1
--name
radseq
--description
The radseq demonstration workflow.
--date
02/01/2014
--data-nature
DNA
--sequencer
HiSeq 2000
--type
Region 1
--species
Escherichia coli
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment