Commit 57925b24 authored by Celine Noirot's avatar Celine Noirot
Browse files

Add depth info

parent 02cc9686
......@@ -42,6 +42,9 @@ merge idstats and .percontig.tsv files for one sample.')
# Required input files for the merge.
parser.add_argument(
    '-i', '--idxstats_file', required=True,
    help='idxstats file.')
parser.add_argument(
    '-m', '--mosdepth_file', required=True,
    help='depth per contigs from mosdepth (regions.bed.gz).')
parser.add_argument(
    '-c', '--percontig_file', required=True,
    help='.percontig.tsv file.')
......@@ -57,41 +60,47 @@ args = parser.parse_args()
# --- Load the three per-contig tables -----------------------------------
# idxstats has no header: column 0 is the contig name; column 2 is summed
# downstream as the read count.
idxstats = pd.read_csv(args.idxstats_file, sep='\t', header=None)
# mosdepth regions.bed.gz: keep only the contig name and its depth.
mosdepth = pd.read_csv(args.mosdepth_file, sep='\t', header=None, compression='gzip')
mosdepth.columns = ["contig", "start", "end", "depth"]
mosdepth.drop(["start", "end"], inplace=True, axis=1)
# .percontig.tsv: read every column as a string (keyed by '#contig').
percontig = pd.read_csv(args.percontig_file, sep='\t', dtype=str)
# Outer-join everything on the contig name so contigs missing from one
# input are still kept.
merge = pd.merge(idxstats, percontig, left_on=0, right_on='#contig', how='outer')
# Attach the mosdepth depth column.
merge = pd.merge(merge, mosdepth, left_on=0, right_on='contig', how='outer')
# Group contigs by consensus lineage: concatenate contig names, count the
# contigs, sum the mapped reads (idxstats column 2) and average the
# per-contig depth.
# FIX: the two statements below were fused onto a single line
# (…reset_index()res.columns=…), which is a SyntaxError; the superseded
# pre-change duplicate (without 'depth') is removed.
res = merge.groupby(['consensus_lineage', 'consensus_tax_id', 'tax_id_by_level']).agg(
    {0: [';'.join, 'count'], 2: 'sum', 'depth': 'mean'}).reset_index()
# Names follow the agg output order: join, count, sum(reads), mean(depth).
res.columns = ['lineage_by_level', 'consensus_tax_id', 'tax_id_by_level',
               'name_contigs', 'nb_contigs', 'nb_reads', 'depth']
print(res.head())
# Fill the NaN by 0.
res.fillna(0, inplace=True)
# Split the semicolon-separated tax ids into one column per taxonomic level.
# NOTE(review): assumes tax_id_by_level always holds exactly 7 ranks — confirm upstream.
res_split_tax_id = res.join(res['tax_id_by_level'].str.split(pat=";", expand=True))
# FIX: the first seven names must match res's actual column order
# ('nb_reads' then 'depth'); the previous rename swapped the two labels,
# mislabeling both columns. Stale pre-change duplicate rename removed.
res_split_tax_id.columns = ['consensus_lineage', 'consensus_taxid', 'tax_id_by_level',
                            'name_contigs', 'nb_contigs', 'nb_reads', 'depth',
                            "superkingdom_tax_id", "phylum_tax_id", "order_tax_id",
                            "class_tax_id", "family_tax_id", "genus_tax_id",
                            "species_tax_id"]
res_split_tax_id.fillna(value='no_affi', inplace=True)
print(res_split_tax_id.head())
# Split the lineage string the same way to get one lineage column per level.
res_split = res_split_tax_id.join(res_split_tax_id['consensus_lineage'].str.split(pat=";", expand=True))
res_split.columns = ['consensus_lineage', 'consensus_taxid', 'tax_id_by_level',
                     'name_contigs', 'nb_contigs', 'nb_reads', 'depth',
                     "superkingdom_tax_id", "phylum_tax_id", "order_tax_id",
                     "class_tax_id", "family_tax_id", "genus_tax_id", "species_tax_id",
                     "superkingdom_lineage", "phylum_lineage", "order_lineage",
                     "class_lineage", "family_lineage", "genus_lineage",
                     "species_lineage"]
res_split.fillna(value='no_affi', inplace=True)
# Re-aggregate the contigs at each taxonomic rank: join contig names, sum
# contig/read counts, average the depth.
# FIX: the shared column-name list must follow the agg output order
# (group keys, then name_contigs, nb_contigs, nb_reads, depth); the
# previous list put 'name_contigs' last, mislabeling four columns in every
# per-level table. The superseded pre-change (depth-less) block is removed.
levels_columns = ['tax_id_by_level', 'lineage_by_level',
                  'name_contigs', 'nb_contigs', 'nb_reads', 'depth']

def _aggregate_level(rank):
    """Group res_split on one taxonomic rank and aggregate contig stats."""
    grouped = res_split.groupby([rank + '_tax_id', rank + '_lineage']).agg(
        {'name_contigs': [';'.join], 'nb_contigs': 'sum',
         'nb_reads': 'sum', 'depth': 'mean'}).reset_index()
    grouped.columns = levels_columns
    return grouped

level_superkingdom = _aggregate_level('superkingdom')
level_phylum = _aggregate_level('phylum')
level_order = _aggregate_level('order')
level_class = _aggregate_level('class')
level_family = _aggregate_level('family')
level_genus = _aggregate_level('genus')
level_species = _aggregate_level('species')
# Write the per-lineage summary table to <output_name>.tsv.
output_path = args.output_name + ".tsv"
res.to_csv(output_path, sep="\t", index=False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment