Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
genotoul-bioinfo
metagWGS
Commits
57925b24
Commit
57925b24
authored
Jun 01, 2021
by
Celine Noirot
Browse files
Add depth info
parent
02cc9686
Changes
1
Hide whitespace changes
Inline
Side-by-side
bin/merge_idxstats_percontig_lineage.py
View file @
57925b24
...
...
@@ -42,6 +42,9 @@ merge idstats and .percontig.tsv files for one sample.')
# Command-line options: all three input files are mandatory.
# (Explicit backslash continuations removed — they are redundant inside
# parentheses and a known lint smell.)
parser.add_argument('-i', '--idxstats_file', required=True,
                    help='idxstats file.')
parser.add_argument('-m', '--mosdepth_file', required=True,
                    help='depth per contigs from mosdepth (regions.bed.gz).')
parser.add_argument('-c', '--percontig_file', required=True,
                    help='.percontig.tsv file.')
...
...
@@ -57,41 +60,47 @@ args = parser.parse_args()
# Load the samtools idxstats table.  It has no header line, so pandas
# assigns positional integer column labels (0 = contig name, 2 = mapped reads).
idxstats = pd.read_csv(
    args.idxstats_file,
    sep='\t',
    header=None,
)
# Load per-contig depth from mosdepth (regions.bed.gz).  The start/end
# interval columns are never used downstream, so select only contig and
# depth at read time instead of loading everything and dropping columns.
mosdepth = pd.read_csv(args.mosdepth_file, sep='\t', header=None,
                       compression='gzip',
                       names=["contig", "start", "end", "depth"],
                       usecols=["contig", "depth"])
# Load the .percontig.tsv taxonomy table.  Everything is read as str so
# taxonomy identifiers are not silently coerced to numbers.
percontig = pd.read_csv(
    args.percontig_file,
    sep='\t',
    dtype=str,
)
# Merge idxstats and .percontig.tsv files.
# idxstats has no header, so its contig-name column is the positional
# integer label 0; percontig names the same key '#contig'.  Outer join
# keeps contigs present in only one of the two tables.
merge = pd.merge(idxstats, percontig, left_on=0, right_on='#contig', how='outer')
# Add the mosdepth mean depth per contig (outer join again, so contigs
# without depth information survive with NaN depth).
merge = pd.merge(merge, mosdepth, left_on=0, right_on='contig', how='outer')
# Group by lineage and aggregate per taxon:
#   column 0 (contig name, from idxstats) -> ';'-joined contig list and count,
#   column 2 (mapped read count, from idxstats) -> total reads,
#   depth (from mosdepth) -> mean depth.
# NOTE(review): the diff left a superseded depth-less groupby and a debug
# print(res.head()) before this statement; both were dead code (the frame
# was immediately recomputed) and have been removed.
res = merge.groupby(
    ['consensus_lineage', 'consensus_tax_id', 'tax_id_by_level']).agg(
    {0: [';'.join, 'count'], 2: 'sum', 'depth': 'mean'}).reset_index()
# Positional relabel: group keys first, then the aggregates in agg-dict order.
res.columns = ['lineage_by_level', 'consensus_tax_id', 'tax_id_by_level',
               'name_contigs', 'nb_contigs', 'nb_reads', 'depth']
# Fill the NaN by 0.
res.fillna(0, inplace=True)
# Split tax_id_by_level (';'-separated, one id per taxonomic rank) into
# one column per rank.
# NOTE(review): this assumes every row has exactly 7 ranks
# (superkingdom..species) — confirm the upstream affiliation step
# guarantees it, otherwise the positional relabel below raises.
res_split_tax_id = res.join(
    res['tax_id_by_level'].str.split(pat=";", expand=True))
# Positional relabel.  res columns are
# [lineage, taxid, tax_id_by_level, name_contigs, nb_contigs, nb_reads, depth]
# followed by the 7 split rank columns, so 'nb_reads' must come BEFORE
# 'depth' (the committed list had them swapped, mislabelling both columns).
# A dead duplicate .columns assignment and a debug print were removed.
res_split_tax_id.columns = ['consensus_lineage', 'consensus_taxid',
                            'tax_id_by_level', 'name_contigs', 'nb_contigs',
                            'nb_reads', 'depth',
                            "superkingdom_tax_id", "phylum_tax_id",
                            "order_tax_id", "class_tax_id", "family_tax_id",
                            "genus_tax_id", "species_tax_id"]
# Ranks with no affiliation become 'no_affi'.
res_split_tax_id.fillna(value='no_affi', inplace=True)
# Split consensus_lineage (';'-separated names, one per rank) into one
# column per rank, parallel to the tax-id split above.
res_split = res_split_tax_id.join(
    res_split_tax_id['consensus_lineage'].str.split(pat=";", expand=True))
# Positional relabel: 7 original columns + 7 tax-id columns + 7 lineage
# columns = 21 labels.  NOTE(review): the diff also carried a stale
# depth-less list (20 labels, which would raise on a 21-column frame);
# that dead assignment has been removed.
res_split.columns = ['consensus_lineage', 'consensus_taxid',
                     'tax_id_by_level', 'name_contigs', 'nb_contigs',
                     'nb_reads', 'depth',
                     "superkingdom_tax_id", "phylum_tax_id", "order_tax_id",
                     "class_tax_id", "family_tax_id", "genus_tax_id",
                     "species_tax_id",
                     "superkingdom_lineage", "phylum_lineage",
                     "order_lineage", "class_lineage", "family_lineage",
                     "genus_lineage", "species_lineage"]
# Ranks with no affiliation become 'no_affi'.
res_split.fillna(value='no_affi', inplace=True)
# Per-rank aggregation tables.  The agg output column order follows the
# agg dict: group keys, then name_contigs, nb_contigs, nb_reads, depth.
# NOTE(review): the committed levels_columns listed name_contigs LAST,
# which positionally mislabelled all four aggregated columns — fixed here.
# The seven stale depth-less groupbys (immediately overwritten) were removed
# and the seven remaining copy-pasted blocks collapsed into one helper.
levels_columns = ['tax_id_by_level', 'lineage_by_level',
                  'name_contigs', 'nb_contigs', 'nb_reads', 'depth']


def _aggregate_level(df, level):
    """Group *df* by <level>_tax_id / <level>_lineage and aggregate the
    contig names, contig count, read count and mean depth for that rank."""
    out = df.groupby([level + '_tax_id', level + '_lineage']).agg(
        {'name_contigs': [';'.join],
         'nb_contigs': 'sum',
         'nb_reads': 'sum',
         'depth': 'mean'}).reset_index()
    out.columns = levels_columns
    return out


level_superkingdom = _aggregate_level(res_split, 'superkingdom')
level_phylum = _aggregate_level(res_split, 'phylum')
level_order = _aggregate_level(res_split, 'order')
level_class = _aggregate_level(res_split, 'class')
level_family = _aggregate_level(res_split, 'family')
level_genus = _aggregate_level(res_split, 'genus')
level_species = _aggregate_level(res_split, 'species')
# Write merge data frame in output files.
# NOTE(review): args.output_name comes from a CLI option outside this
# hunk; the per-level tables are presumably written further down — confirm.
res.to_csv(args.output_name + ".tsv", sep="\t", index=False)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment