Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
genotoul-bioinfo
ng6
Commits
1a0a3f8c
Commit
1a0a3f8c
authored
Aug 09, 2019
by
ckuchly
Browse files
Add process for 10X data
parent
e5178b47
Changes
2
Hide whitespace changes
Inline
Side-by-side
src/ng6/config_reader.py
View file @
1a0a3f8c
...
...
@@ -178,7 +178,7 @@ class NG6ConfigReader(object):
for
barcode
in
barcodes
:
#print("config reader")
#print(barcode[0])
logging
.
getLogger
(
"ng6"
).
debug
(
"barcode[0] = "
+
barcode
[
0
].
upper
())
#
logging.getLogger("ng6").debug("barcode[0] = " + barcode[0].upper())
barcode_array
[
barcode
[
0
].
upper
()]
=
barcode
[
1
].
upper
()
return
barcode_array
except
:
...
...
src/ng6/ng6workflow.py
View file @
1a0a3f8c
...
...
@@ -33,7 +33,7 @@ from ng6.project import Project
from
ng6.run
import
Run
from
ng6.sample
import
Sample
from
ng6.utils
import
Utils
from
ng6.config_reader
import
NG6ConfigReader
class
BasicNG6Workflow
(
Workflow
):
...
...
@@ -108,6 +108,7 @@ class BasicNG6Workflow (Workflow):
raise
ValueError
(
"Component "
+
cmpt_object
.
__class__
.
__name__
+
" with prefix "
+
cmpt_object
.
prefix
+
" already exist in this pipeline!"
)
self
.
component_nameids
[
cmpt_object
.
get_nameid
()]
=
None
return
cmpt_object
else
:
raise
ImportError
(
component_name
+
" component cannot be loaded, available components are: {0}"
.
format
(
...
...
@@ -141,6 +142,7 @@ class NG6Workflow (BasicNG6Workflow):
self
.
samples
=
[]
self
.
reads1
=
[]
self
.
reads2
=
[]
self
.
index
=
[]
self
.
samples_names
=
[]
self
.
reads1_indexes
=
[]
self
.
reads2_indexes
=
[]
...
...
@@ -167,10 +169,11 @@ class NG6Workflow (BasicNG6Workflow):
self
.
add_parameter_list
(
"metadata"
,
"Add metadata to the sample"
,
type
=
'samplemetadata'
,
add_to
=
"input_sample"
)
self
.
add_input_file_list
(
"read1"
,
"Read 1 data file path"
,
required
=
True
,
add_to
=
"input_sample"
)
self
.
add_input_file_list
(
"read2"
,
"Read 2 data file path"
,
add_to
=
"input_sample"
)
self
.
add_input_file_list
(
"index"
,
"Index data file path"
,
add_to
=
"input_sample"
)
def
__create_samples__
(
self
):
for
sd
in
self
.
input_sample
:
sp_object
=
Sample
(
sd
[
'sample_id'
],
sd
[
'read1'
],
sd
[
'read2'
],
name
=
sd
[
'sample_name'
],
description
=
sd
[
'sample_description'
],
type
=
sd
[
'type'
],
sp_object
=
Sample
(
sd
[
'sample_id'
],
sd
[
'read1'
],
sd
[
'read2'
],
sd
[
'index'
],
name
=
sd
[
'sample_name'
],
description
=
sd
[
'sample_description'
],
type
=
sd
[
'type'
],
insert_size
=
sd
[
'insert_size'
],
species
=
sd
[
'species'
]
)
for
metadata
in
sd
[
'metadata'
]
:
...
...
@@ -205,6 +208,9 @@ class NG6Workflow (BasicNG6Workflow):
for
rfile
in
sample
.
reads2
:
self
.
reads2_indexes
.
append
(
sample
.
sample_id
)
self
.
reads2
.
append
(
rfile
)
for
rfile
in
sample
.
index
:
self
.
index
.
append
(
rfile
)
if
len
(
self
.
samples_names
)
!=
0
:
if
len
(
self
.
samples_names
)
!=
len
(
self
.
samples
)
:
...
...
@@ -215,8 +221,9 @@ class NG6Workflow (BasicNG6Workflow):
return
self
.
reads1
elif
type
==
'read2'
:
return
self
.
reads2
return
self
.
reads1
+
self
.
reads2
elif
type
==
'index'
:
return
self
.
index
return
self
.
reads1
+
self
.
reads2
+
self
.
index
def
get_files_index
(
self
,
type
=
None
):
if
type
==
'read1'
:
...
...
@@ -225,7 +232,6 @@ class NG6Workflow (BasicNG6Workflow):
return
self
.
reads2_indexes
return
self
.
reads1_indexes
+
self
.
reads2_indexes
def is_paired_end(self):
    """
    Tell whether this workflow holds read 2 files.
      @return: True when self.reads2 contains at least one entry, False otherwise
    """
    read2_count = len(self.reads2)
    return read2_count != 0
...
...
@@ -288,6 +294,25 @@ def get_files_from_casava(casava_directory, project_name, lane_number):
if
file
.
endswith
(
".fastq.gz"
)
and
re
.
search
(
".*_L00"
+
str
(
lane
)
+
"_.*"
,
file
):
files
.
append
(
filepath
);
return
files
def bcl2fastq_10X(directory, pname, lane):
    """
    longranger: list the fastq files of one lane from a 10X demultiplexing directory.

    The sub-directories of the lane are read from the "l<lane>_SUBDIRS" entry of
    <directory>/SampleSheet_10X.mk (if the entry appears several times, the last
    one wins). Only sub-directories of the requested project
    ("Project_<pname>/Sample_...") and the "Undetermined_indices" ones are scanned.
      @param directory: path of the demultiplexing output directory
      @param pname: project name, matched literally (regex metacharacters are escaped)
      @param lane: lane number, used both in the sample sheet key and in "_L00<lane>_"
      @return: list of "<directory>/<subdir>/<file>" paths of the *.fastq.gz files
               whose name contains "_L00<lane>_"; empty if the lane is not described
    """
    lane_key = "l" + str(lane) + "_SUBDIRS"
    subdirs_list = []
    with open(os.path.join(directory, "SampleSheet_10X.mk")) as fh:
        for line in fh:
            if line.startswith(lane_key):
                # partition() tolerates a line without ":=" and split() without
                # argument ignores repeated/leading blanks between entries
                subdirs_list = line.strip().partition(":=")[2].split()

    if not subdirs_list:
        return []

    # the project name is data, not a pattern: escape it so that e.g. "my.proj"
    # does not also select "myXproj"
    project_pattern = re.compile("Project_" + re.escape(pname) + "/Sample_.+")
    lane_pattern = re.compile("_L00" + re.escape(str(lane)) + "_")

    files = []
    # parse samples
    for subdir in subdirs_list:
        # filter on project name
        if not (project_pattern.match(subdir) or subdir.startswith("Undetermined_indices")):
            continue
        # paths are built by plain concatenation on purpose: the resulting strings
        # are compared for equality elsewhere in this file (_process_casava_10X)
        subdir_path = directory + "/" + subdir
        for file_name in os.listdir(subdir_path):
            if file_name.endswith(".fastq.gz") and lane_pattern.search(file_name):
                files.append(subdir_path + "/" + file_name)
    return files
def
bcl2fastq_216
(
directory
,
pname
,
lane
):
"""bcl2fastq >= 1.9"""
...
...
@@ -315,6 +340,8 @@ def get_files_from_casava(casava_directory, project_name, lane_number):
return
bcl2fastq_18
(
casava_directory
,
project_name
,
lane_number
)
elif
os
.
path
.
exists
(
os
.
path
.
join
(
casava_directory
,
'Stats'
,
'DemultiplexingStats.xml'
))
:
return
bcl2fastq_216
(
casava_directory
,
project_name
,
lane_number
)
elif
os
.
path
.
exists
(
os
.
path
.
join
(
casava_directory
,
"SampleSheet_10X.mk"
))
:
return
bcl2fastq_10X
(
casava_directory
,
project_name
,
lane_number
)
...
...
@@ -326,8 +353,10 @@ class CasavaNG6Workflow(NG6Workflow):
self
.
group_prefix
=
None
self
.
undetermined_reads1
=
[]
self
.
undetermined_reads2
=
[]
self
.
undetermined_index
=
[]
self
.
log_files
=
[]
self
.
is_casava
=
False
self
.
is_10Xcasava
=
False
def
__add_sample_parameters__
(
self
):
self
.
add_multiple_parameter
(
'casava'
,
'Provide the options to retrieve samples from a CASAVA directory'
,
group
=
"Sample description"
)
...
...
@@ -383,9 +412,16 @@ class CasavaNG6Workflow(NG6Workflow):
logging
.
getLogger
(
"ng6"
).
debug
(
"CasavaNG6Workflow.__create_samples__ before self._process_casava_18"
)
all_samples
,
all_samples_id
=
self
.
_process_casava_18
(
casava_directory
,
project_name
,
lane_number
,
input_files
)
logging
.
getLogger
(
"ng6"
).
debug
(
"CasavaNG6Workflow.__create_samples__ before self._process_casava_18"
)
elif
os
.
path
.
exists
(
os
.
path
.
join
(
casava_directory
,
'Stats'
,
'DemultiplexingStats.xml'
))
:
all_samples
,
all_samples_id
=
self
.
_process_casava_216
(
casava_directory
,
project_name
,
lane_number
,
input_files
)
elif
os
.
path
.
exists
(
os
.
path
.
join
(
casava_directory
,
"SampleSheet_10X.mk"
)):
logging
.
getLogger
(
"ng6"
).
debug
(
"CasavaNG6Workflow.__create_samples__ before self._process_casava_10X"
)
all_samples
,
all_samples_id
=
self
.
_process_casava_10X
(
casava_directory
,
project_name
,
lane_number
,
input_files
)
self
.
is_10Xcasava
=
True
logging
.
getLogger
(
"ng6"
).
debug
(
"CasavaNG6Workflow.__create_samples__ after self._process_casava_10X"
)
selected_samples
=
self
.
casava
[
'select_sample_id'
]
logging
.
getLogger
(
"CasavaNG6Workflow"
).
debug
(
"__create_samples__. all_samples_id = a"
+
", "
.
join
(
all_samples_id
)
+
"a"
)
if
selected_samples
:
...
...
@@ -450,7 +486,6 @@ class CasavaNG6Workflow(NG6Workflow):
# filter on project name
if
re
.
match
(
"Project_"
+
project_name
+
"/Sample_.+"
,
sample
[
'subdir'
])
or
sample
[
'subdir'
].
startswith
(
"Undetermined_indices"
):
for
file
in
os
.
listdir
(
casava_directory
+
"/"
+
sample
[
'subdir'
]):
filepath
=
casava_directory
+
"/"
+
sample
[
'subdir'
]
+
"/"
+
file
if
file
.
endswith
(
".fastq.gz"
)
and
re
.
search
(
".*_L00"
+
str
(
lane_number
)
+
"_.*"
,
file
):
...
...
@@ -477,7 +512,6 @@ class CasavaNG6Workflow(NG6Workflow):
all_samples
.
append
(
sp_object
)
all_samples_id
.
append
(
sample
[
'sample_id'
])
for
file
in
os
.
listdir
(
casava_directory
):
filepath
=
casava_directory
+
"/"
+
file
if
file
.
endswith
(
".log"
):
...
...
@@ -485,9 +519,104 @@ class CasavaNG6Workflow(NG6Workflow):
logging
.
getLogger
(
"ng6"
).
debug
(
"CasavaNG6Workflow._process_casava_18 self.log_files = "
+
","
.
join
(
self
.
log_files
))
logging
.
getLogger
(
"ng6"
).
debug
(
"CasavaNG6Workflow._process_casava_18 all_samples_id = "
+
","
.
join
(
all_samples_id
))
logging
.
getLogger
(
"ng6"
).
debug
(
"CasavaNG6Workflow._process_casava_18 exiting"
)
return
all_samples
,
all_samples_id
def _process_casava_10X(self, casava_directory, project_name, lane_number, input_files):
    """
    Creates samples from casavadir from longranger demultiplexing

    The lane description (barcodes, sample ids, sub-directories) is read from
    <casava_directory>/SampleSheet_10X.mk. Files of "Undetermined_indices"
    sub-directories are stored in self.undetermined_reads1 / undetermined_reads2 /
    undetermined_index; *.log files found in casava_directory are appended to
    self.log_files.
      @param casava_directory: path of the demultiplexing output directory
      @param project_name: project name, matched literally against "Project_<name>/Sample_..."
      @param lane_number: lane number ("l<lane>_..." sample sheet keys, "_L00<lane>_" in file names)
      @param input_files: list of file paths; every entry attributed to a sample (or to
                          the undetermined lists) is removed from this list in place
      @return: tuple (all_samples, all_samples_id)
      @raise AssertionError: when the barcodes, sample ids and sub-directories of the
                             lane do not have the same number of entries
      @raise KeyError: presumably when a barcode is missing from the mapping returned
                       by NG6ConfigReader.get_10X_indexs() -- to be confirmed
    """
    logger = logging.getLogger("ng6")
    logger.debug("CasavaNG6Workflow._process_casava_10X enter")
    print("Process Casava 10X ")
    logger.debug("CasavaNG6Workflow._process_casava_10X casava_directory = %s, project_name = %s",
                 casava_directory, project_name)

    all_samples = []
    all_samples_id = []

    # open casava samplesheet again to associate our files with a sample
    lane_prefix = "l" + str(lane_number)
    barcodes_list = []
    sample_ids_list = []
    subdirs_list = []
    with open(os.path.join(casava_directory, "SampleSheet_10X.mk")) as fh:
        for line in fh:
            # the three lists are split the same way (any run of blanks) so that
            # their entries stay aligned; partition() tolerates a missing ":="
            values = line.strip().partition(":=")[2].split()
            if line.startswith(lane_prefix + "_BARCODES"):
                barcodes_list = [re.sub(r"[_\s]+", "", x) for x in values]
            elif line.startswith(lane_prefix + "_SAMPLEIDS"):
                sample_ids_list = values
            elif line.startswith(lane_prefix + "_SUBDIRS"):
                subdirs_list = values

    # explicit raise instead of an assert statement: the check must survive "python -O"
    if not (len(barcodes_list) == len(sample_ids_list) == len(subdirs_list)):
        raise AssertionError("Invalid lane {0} in SampleSheet_10X.mk".format(lane_number))

    indexs = NG6ConfigReader().get_10X_indexs()
    lane_pattern = re.compile("_L00" + re.escape(str(lane_number)) + "_")
    # file name marker -> (key of the sample read list, list used for undetermined files)
    read_kinds = (
        ("_R1_", 'reads1', self.undetermined_reads1),
        ("_R2_", 'reads2', self.undetermined_reads2),
        ("_I1_", 'index', self.undetermined_index),
    )

    # parse samples
    for raw_barcode, sample_id, subdir in zip(barcodes_list, sample_ids_list, subdirs_list):
        barcode = raw_barcode if raw_barcode == 'Undetermined' else indexs[raw_barcode]
        is_undetermined = subdir.startswith("Undetermined_indices")

        # filter on project name (escaped: the name is data, not a pattern)
        in_project = re.match("Project_" + re.escape(project_name) + "/Sample_.+", subdir)
        if not (in_project or is_undetermined):
            continue

        reads = {'reads1': [], 'reads2': [], 'index': []}
        # plain concatenation on purpose: paths are compared for equality with input_files
        subdir_path = casava_directory + "/" + subdir
        for file_name in os.listdir(subdir_path):
            if not (file_name.endswith(".fastq.gz") and lane_pattern.search(file_name)):
                continue
            try:
                position = input_files.index(subdir_path + "/" + file_name)
            except ValueError:
                # file present on disk but not requested by the caller
                continue
            # each input file is consumed once
            iofile = input_files.pop(position)
            for marker, read_key, undetermined_files in read_kinds:
                if marker not in file_name:
                    continue
                if is_undetermined:
                    undetermined_files.append(iofile)
                else:
                    if read_key == 'index':
                        logger.debug("CasavaNG6Workflow.__process_casava_10X__index_files = %s", iofile)
                    reads[read_key].append(iofile)

        if not is_undetermined:
            sp_object = Sample(barcode, reads['reads1'], reads2=reads['reads2'],
                               index=reads['index'], name=sample_id)
            sp_object.add_metadata('barcode', barcode)
            sp_object.add_metadata('is_casava', True)
            all_samples.append(sp_object)
            all_samples_id.append(sample_id)

    for file_name in os.listdir(casava_directory):
        if file_name.endswith(".log"):
            self.log_files.append(casava_directory + "/" + file_name)

    logger.debug("CasavaNG6Workflow._process_casava_10X all_samples_id = " + ",".join(all_samples_id))
    logger.debug("CasavaNG6Workflow._process_casava_10X exiting")
    return all_samples, all_samples_id
def
_process_casava_216
(
self
,
casava_directory
,
project_name
,
lane_number
,
input_files
):
"""
Creates samples from casavadir (>=1.9) using input files
...
...
@@ -509,12 +638,21 @@ class CasavaNG6Workflow(NG6Workflow):
logging
.
getLogger
(
"ng6"
).
debug
(
"illumina_process self.is_casava"
)
if
len
(
self
.
log_files
)
>
0
:
add_log
=
self
.
add_component
(
"BasicAnalysis"
,
[
self
.
log_files
,
"Log Files"
,
"Log files generated during primary analysis"
,
"-"
,
"-"
,
"-"
,
"gz"
,
""
,
"log.gz"
])
if
len
(
self
.
undetermined_reads1
)
>
0
:
if
self
.
casava
[
'mismatch_index'
]
:
demultiplex_stats
=
self
.
add_component
(
"DemultiplexStats"
,
[
self
.
get_all_reads
(
"read1"
),
self
.
undetermined_reads1
,
self
.
get_files_index
(
'read1'
)])
elif
self
.
is_10Xcasava
:
logging
.
getLogger
(
"ng6"
).
debug
(
"illumina_process self.is_10Xcasava = "
)
logging
.
getLogger
(
"ng6"
).
debug
(
self
.
get_all_reads
(
"read1"
))
logging
.
getLogger
(
"ng6"
).
debug
(
"illumina_process undetermined reads = "
)
logging
.
getLogger
(
"ng6"
).
debug
(
self
.
undetermined_reads1
)
logging
.
getLogger
(
"ng6"
).
debug
(
"illumina_process file index ="
)
logging
.
getLogger
(
"ng6"
).
debug
(
self
.
get_files_index
(
"read1"
))
#demultiplex_stats = self.add_component("Demultiplex10XStats", [self.get_all_reads("read1"), self.undetermined_reads1, self.get_files_index("read1")])
else
:
demultiplex_stats
=
self
.
add_component
(
"DemultiplexStats"
,
[
self
.
get_all_reads
(
"read1"
),
self
.
undetermined_reads1
])
demultiplex_stats
=
self
.
add_component
(
"DemultiplexStats"
,
[
self
.
get_all_reads
(
"read1"
),
self
.
undetermined_reads1
])
if
self
.
keep_reads
!=
"all"
:
logging
.
getLogger
(
"ng6"
).
debug
(
"illumina_process self.keep_reads != all"
)
...
...
@@ -568,9 +706,9 @@ class CasavaNG6Workflow(NG6Workflow):
except
:
pass
# contamination_search
if
contam
:
if
self
.
contamination_databank
:
contam
.
extend
(
self
.
contamination_databank
)
contamination_search
=
self
.
add_component
(
"ContaminationSearch"
,
[
filtered_read1_files
,
contam
,
list
((
Utils
.
get_group_basenames
(
filtered_read
1
_files
,
"read"
)).
keys
())
],
parent
=
fastqilluminafilter
)
#
if contam :
#
if self.contamination_databank: contam.extend(self.contamination_databank)
#
contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files
+
filtered_read
2
_files,
contam, reads_prefixes
], parent = fastqilluminafilter)
# make some statistics on raw file
fastqc
=
self
.
add_component
(
"FastQC"
,
[
filtered_read1_files
+
filtered_read2_files
,
(
self
.
group_prefix
is
not
None
),
self
.
no_group
,
"fastqc.tar.gz"
],
parent
=
fastqilluminafilter
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment