Commit 49ac5e5f authored by MARTIN Pierre's avatar MARTIN Pierre
Browse files

Merge branch 'dsl2' of forgemia.inra.fr:genotoul-bioinfo/metagwgs into dsl2

parents 2aef4c74 62b9ae1b
......@@ -43,7 +43,7 @@ except ImportError as error:
# Variables
# These are identities normalized with query coverage:
MIN_IDENTITY_TAXA = (0.40,0.50,0.60,0.70,0.80,0.90,0.95)
MIN_IDENTITY_TAXA = (0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95)
# Fraction of weights needed to assign at a specific level,
# a measure of consensus at that level.
......@@ -60,12 +60,14 @@ d_taxonomy = {}
# Define taxonomy main levels
global main_level
main_level = \
["superkingdom", "phylum", "class", "order", "family", "genus", "species"]
["superkingdom", "phylum", "class", "order", "family", "genus", "species"]
##### SAME AS Script_renameVcn.py
# SAME AS Script_renameVcn.py
prot_prefix = "CDS_"
# Definition of the class Node
class Node:
def __init__(self):
self.tax_id = 0 # Number of the tax id.
......@@ -73,11 +75,12 @@ class Node:
self.children = [] # List of the children of this node
self.tip = 0 # Tip=1 if it's a terminal node, 0 if not.
self.name = "" # Name of the node: taxa if it's a terminal node,
# numero if not.
# numero if not.
self.level = "None"
def genealogy(self): # Trace genealogy from root to leaf
ancestors = [] # Initialize the list of all nodes
# from root to leaf.
# from root to leaf.
tax_id = self.tax_id # Define leaf
while 1:
if tax_id in d_taxonomy:
......@@ -91,9 +94,10 @@ class Node:
ancestors.append(tax_id)
break
return ancestors # Return the list
def fullnamelineage(self): # Trace genealogy from root to leaf
def fullnamelineage(self): # Trace genealogy from root to leaf
ancestors = [] # Initialise the list of all nodes
# from root to leaf.
# from root to leaf.
tax_id = self.tax_id # Define leaf
while 1:
if tax_id in d_taxonomy:
......@@ -104,15 +108,16 @@ class Node:
if tax_id == "1":
break
ancestors.reverse()
return "; ".join(ancestors) # Return the list
return "; ".join(ancestors) # Return the list
def genealogy_main_level(self):
ancestors = ["None"] * 7 # Initialise the list of all nodes
# from root to leaf.
# from root to leaf.
tax_id = self.tax_id
while 1:
if tax_id in d_taxonomy:
cur_level = d_taxonomy[tax_id].level
if cur_level in main_level :
if cur_level in main_level:
ancestors[main_level.index(cur_level)] = tax_id
tax_id = d_taxonomy[tax_id].parent
else:
......@@ -120,27 +125,30 @@ class Node:
if tax_id == "1":
# If it is the root, we reached the end.
break
return ancestors # Return the list
return ancestors # Return the list
def lineage_main_level(self):
    """Collect, for each of the seven main taxonomy levels, the taxon
    name and tax_id found on the path from this node up to the root.

    Returns a tuple of two "; "-joined strings (names, tax_ids); any
    level absent from the lineage keeps the placeholder "None".
    Relies on the module-level ``d_taxonomy`` dict and ``main_level``
    list being populated.
    """
    names = ["None"] * 7            # one slot per main level
    ids = ["None"] * 7              # matching tax_ids, as strings
    node_id = self.tax_id
    while True:
        # Stop climbing as soon as the current id is unknown.
        if node_id not in d_taxonomy:
            break
        rank = d_taxonomy[node_id].level
        if rank in main_level:
            slot = main_level.index(rank)
            names[slot] = d_taxonomy[node_id].name
            ids[slot] = str(node_id)
        node_id = d_taxonomy[node_id].parent
        if node_id == "1":
            # Reached the root: the lineage is complete.
            break
    return ("; ".join(names), "; ".join(ids))
# Function to find common ancestor between two nodes or more
def common_ancestor(node_list):
global d_taxonomy
# Define the whole genealogy of the first node
......@@ -150,15 +158,16 @@ def common_ancestor(node_list):
list2 = d_taxonomy[node].genealogy()
ancestral_list = []
for i in list1:
if i in list2: # Identify common nodes between the two genealogy
if i in list2: # Identify common nodes between the two genealogy
ancestral_list.append(i)
list1 = ancestral_list # Reassing ancestral_list to list 1.
list1 = ancestral_list # Reassing ancestral_list to list 1.
# Finally, the first node of the ancestra_list is the common ancestor
# of all nodes.
common_ancestor = ancestral_list[0]
# Return a node
return common_ancestor
def load_taxonomy(directory):
# Load taxonomy
global d_taxonomy
......@@ -177,9 +186,8 @@ def load_taxonomy(directory):
d_name_by_tax_id[tax_id] = name # ... and load them
d_name_by_tax_id_reverse[name] = tax_id # ... into dictionaries
# Load taxonomy NCBI file ("nodes.dmp")
with open(os.path.join(directory, "nodes.dmp"), "r") as taxonomy_file :
with open(os.path.join(directory, "nodes.dmp"), "r") as taxonomy_file:
for line in taxonomy_file:
line = line.rstrip().replace("\t", "")
tab = line.split("|")
......@@ -194,10 +202,10 @@ def load_taxonomy(directory):
if tax_id not in d_taxonomy:
d_taxonomy[tax_id] = Node()
d_taxonomy[tax_id].tax_id = tax_id # Assign tax_id
d_taxonomy[tax_id].parent = tax_id_parent # Assign tax_id parent
d_taxonomy[tax_id].parent = tax_id_parent # Assign tax_id parent
d_taxonomy[tax_id].name = name # Assign name
d_taxonomy[tax_id].level = str(tab[2].strip()) # Assign level
if tax_id_parent in d_taxonomy:
d_taxonomy[tax_id].level = str(tab[2].strip()) # Assign level
if tax_id_parent in d_taxonomy:
children = d_taxonomy[tax_id].children # If parent is already in the object
children.append(tax_id) # ... we found its children
d_taxonomy[tax_id].children = children # ... so add them to the parent.
......@@ -205,6 +213,7 @@ def load_taxonomy(directory):
# END Functions for taxonomy taxdump.tar.gz
################################################
def read_query_length_file(query_length_file):
lengths = {}
for line in open(query_length_file):
......@@ -212,16 +221,17 @@ def read_query_length_file(query_length_file):
lengths[queryid] = float(length)
return lengths
def read_blast_input(blastinputfile,lengths,min_identity,max_matches,min_coverage):
#c1.Prot_00001 EFK63346.1 100.0 85 0 0 1 85 62 146 1.6e-36 158.3 85 \
# 146 EFK63346.1 LOW QUALITY PROTEIN: hypothetical protein HMPREF9008_04720, partial [Parabacteroides sp. 20_3]
def read_blast_input(blastinputfile, lengths, min_identity, max_matches, min_coverage):
# c1.Prot_00001 EFK63346.1 100.0 85 0 0 1 85 62 146 1.6e-36 158.3 85 \
# 146 EFK63346.1 LOW QUALITY PROTEIN: hypothetical protein HMPREF9008_04720, partial [Parabacteroides sp. 20_3]
#queryId, subjectId, percIdentity, alnLength, mismatchCount, gapOpenCount,
#queryStart, queryEnd, subjectStart, subjectEnd, eVal, bitScore, queryLength, subjectLength, subjectTitle
matches = defaultdict(list)
accs = Counter()
nmatches = Counter();
nmatches = Counter()
with open(blastinputfile) as blast_handler:
reader = csv.DictReader(blast_handler, delimiter='\t')
......@@ -231,24 +241,24 @@ def read_blast_input(blastinputfile,lengths,min_identity,max_matches,min_coverag
# queryStart, queryEnd, subjectStart, subjectEnd, eVal, bitScore, queryLength, subjectLength, subjectTitle) \
# = line.rstrip().split("\t")
if aln['sseqid'].startswith("gi|") :
if aln['sseqid'].startswith("gi|"):
m = re.search(r"gi\|.*?\|.*\|(.*)\|", aln['sseqid'])
acc = m.group(1)
else :
else:
acc = aln['sseqid']
qLength = lengths[aln['qseqid']]
alnLength_in_query = abs(int(aln['qend']) - int(aln['qstart'])) + 1
fHit = float(alnLength_in_query) / qLength
coverage = fHit * 100
fHit *= float(aln['pident']) / 100.0
fHit = min(1.0,fHit)
fHit = min(1.0, fHit)
#hits[queryId] = hits[queryId] + 1
if float(aln['pident']) > min_identity and nmatches[aln['qseqid']] < max_matches and float(coverage) > min_coverage:
matches[aln['qseqid']].append((acc, fHit))
nmatches[aln['qseqid']] += 1
accs[acc] +=1
accs[acc] += 1
return (OrderedDict(sorted(matches.items(), key = lambda t: t[0])), list(accs.keys()))
return (OrderedDict(sorted(matches.items(), key=lambda t: t[0])), list(accs.keys()))
def map_accessions(accs, mapping_file):
......@@ -263,53 +273,55 @@ def map_accessions(accs, mapping_file):
return mappings
def get_consensus (collate_table):
#From collapse_hit retrieve consensus tax_id and lineage
#Compute best lineage consensus
def get_consensus(collate_table):
# From collapse_hit retrieve consensus tax_id and lineage
# Compute best lineage consensus
for depth in range(6, -1, -1):
collate = collate_table[depth]
dWeight = sum(collate.values())
sortCollate = sorted(list(collate.items()), key = operator.itemgetter(1), reverse = True)
sortCollate = sorted(list(collate.items()), key=operator.itemgetter(1), reverse=True)
nL = len(collate)
if nL > 0:
dP = 0.0
if dWeight > 0.0:
dP = float(sortCollate[0][1]) / dWeight
if dP > MIN_FRACTION:
(fullnamelineage_text, fullnamelineage_ids) = d_taxonomy[str(sortCollate[0][0])].lineage_main_level()
(fullnamelineage_text, fullnamelineage_ids) = d_taxonomy[str(
sortCollate[0][0])].lineage_main_level()
tax_id_keep = str(sortCollate[0][0])
return (tax_id_keep, fullnamelineage_text, fullnamelineage_ids)
return (1,"Unable to found taxonomy consensus",1)
return (1, "Unable to found taxonomy consensus", 1)
def main(argv):
parser = argparse.ArgumentParser()
parser.add_argument("aln_input_file", \
help = "file with blast/diamond matches expected format m8 \
parser.add_argument("aln_input_file",
help="file with blast/diamond matches expected format m8 \
\nqueryId, subjectId, percIdentity, alnLength, mismatchCount, gapOpenCount,\
queryStart, queryEnd, subjectStart, subjectEnd, eVal, bitScore")
parser.add_argument("query_length_file", \
help = "tab delimited file of query lengths")
parser.add_argument('-a','--acc_taxaid_mapping_file', \
help = "mapping from accession to taxaid gzipped")
parser.add_argument('-t','--taxonomy', \
help = "path of taxdump.tar.gz extracted directory")
parser.add_argument('-o','--output_file', type = str, \
default = "taxonomyassignation", help = ("string specifying output file"))
parser.add_argument('-i','--identity', default = 60, \
help = "percentage of identity")
parser.add_argument('-m','--max_matches', default = 10, \
help = "max number of matches to analyze")
parser.add_argument('-c','--min_coverage', default = 70, \
help = "percentage of coverage")
parser.add_argument("query_length_file",
help="tab delimited file of query lengths")
parser.add_argument('-a', '--acc_taxaid_mapping_file',
help="mapping from accession to taxaid gzipped")
parser.add_argument('-t', '--taxonomy',
help="path of taxdump.tar.gz extracted directory")
parser.add_argument('-o', '--output_file', type=str,
default="taxonomyassignation", help=("string specifying output file"))
parser.add_argument('-i', '--identity', default=60,
help="percentage of identity")
parser.add_argument('-m', '--max_matches', default=10,
help="max number of matches to analyze")
parser.add_argument('-c', '--min_coverage', default=70,
help="percentage of coverage")
args = parser.parse_args()
lengths = read_query_length_file(args.query_length_file)
print("Finished reading lengths file")
nb_total_prot = len(lengths)
(matches,accs) = read_blast_input(args.aln_input_file, lengths, \
args.identity,args.max_matches,args.min_coverage)
(matches, accs) = read_blast_input(args.aln_input_file, lengths,
args.identity, args.max_matches, args.min_coverage)
print("Finished reading in blast results file")
nb_prot_annotated = len(matches)
......@@ -320,10 +332,10 @@ def main(argv):
print("Finished loading taxa directory " + args.taxonomy)
re_contig = re.compile('(.*)\.' + prot_prefix)
with open (args.output_file + ".pergene.tsv", "w") as out, \
open (args.output_file + ".percontig.tsv", "w") as outpercontig, \
open (args.output_file + ".warn.tsv", "w") as outdisc :
#Write header
with open(args.output_file + ".pergene.tsv", "w") as out, \
open(args.output_file + ".percontig.tsv", "w") as outpercontig, \
open(args.output_file + ".warn.tsv", "w") as outdisc:
# Write header
out.write("#prot_id\tconsensus_tax_id\tconsensus_lineage\ttax_id_by_level\n")
outpercontig.write("#contig\tconsensus_tax_id\tconsensus_lineage\ttax_id_by_level\n")
outdisc.write("#prot_id\tlist nr hit not found in taxo\n")
......@@ -340,45 +352,49 @@ def main(argv):
contig_id = None
for prot, matchs in list(matches.items()):
hit_sorted = sorted(matchs, key = lambda x: x[1], reverse = True)
hit_sorted = sorted(matchs, key=lambda x: x[1], reverse=True)
####
# handle contig consensus
match = re_contig.match(prot)
if match :
if match:
contig_id = match.group(1)
if prev_contig == None :
if prev_contig == None:
prev_contig = contig_id
if prev_contig != contig_id :
if prev_contig != contig_id:
###
(tax_id_keep, fullnamelineage_text, fullnamelineage_ids) = get_consensus(collate_hits_per_contig)
(tax_id_keep, fullnamelineage_text, fullnamelineage_ids) = get_consensus(
collate_hits_per_contig)
count_genealogy_contig[d_taxonomy[str(tax_id_keep)].level] += 1
outpercontig.write(prev_contig + "\t" + str(tax_id_keep) + "\t" + fullnamelineage_text + "\t" + str(fullnamelineage_ids) + "\n")
outpercontig.write(prev_contig + "\t" + str(tax_id_keep) + "\t" +
fullnamelineage_text + "\t" + str(fullnamelineage_ids) + "\n")
collate_hits_per_contig = list()
for depth in range(7):
collate_hits_per_contig.append(Counter())
prev_contig = contig_id
best_hit = hit_sorted [0][0]
fHit = hit_sorted [0][1]
best_hit = hit_sorted[0][0]
fHit = hit_sorted[0][1]
if mapping[best_hit] > -1:
#Retrieve hit taxa id
# Retrieve hit taxa id
tax_id = mapping[best_hit]
if str(tax_id) in d_taxonomy : # taxid in taxonomy ?
#Retreive lineage on main level only (no no rank)
if str(tax_id) in d_taxonomy: # taxid in taxonomy ?
# Retreive lineage on main level only (no no rank)
hits = d_taxonomy[str(tax_id)].genealogy_main_level()
for depth in range(7):
if hits[depth] != "None":
weight = (fHit - MIN_IDENTITY_TAXA[depth]) / (1.0 - MIN_IDENTITY_TAXA[depth])
weight = max(weight,0.0)
weight = (fHit - MIN_IDENTITY_TAXA[depth]
) / (1.0 - MIN_IDENTITY_TAXA[depth])
weight = max(weight, 0.0)
if weight > 0.0:
collate_hits_per_contig[depth][hits[depth]] += weight #could put a transform in here
# could put a transform in here
collate_hits_per_contig[depth][hits[depth]] += weight
# end handle contig consensus
####
####
#Handle a protein, init variable
# Handle a protein, init variable
added_matches = []
collate_hits = list()
protTaxaNotFound = list()
......@@ -387,50 +403,56 @@ def main(argv):
for depth in range(7):
collate_hits.append(Counter())
#For each hit, retrieve taxon id and compute weight in lineage
# For each hit, retrieve taxon id and compute weight in lineage
for (match, fHit) in hit_sorted:
#taxid id found in acc_taxaid_mapping_file
# taxid id found in acc_taxaid_mapping_file
if mapping[match] > -1:
#Retrieve hit taxa id
# Retrieve hit taxa id
tax_id = mapping[match]
if tax_id not in added_matches: # Only add the best hit per species
added_matches.append(tax_id)
if str(tax_id) in d_taxonomy : # taxid in taxonomy ?
#Retreive lineage on main level only (no no rank)
if str(tax_id) in d_taxonomy: # taxid in taxonomy ?
# Retreive lineage on main level only (no no rank)
hits = d_taxonomy[str(tax_id)].genealogy_main_level()
for depth in range(7):
if hits[depth] != "None":
weight = (fHit - MIN_IDENTITY_TAXA[depth]) / (1.0 - MIN_IDENTITY_TAXA[depth])
weight = max(weight,0.0)
weight = (
fHit - MIN_IDENTITY_TAXA[depth]) / (1.0 - MIN_IDENTITY_TAXA[depth])
weight = max(weight, 0.0)
if weight > 0.0:
collate_hits[depth][hits[depth]] += weight #could put a transform in here
else :
# could put a transform in here
collate_hits[depth][hits[depth]] += weight
else:
if tax_id not in protTaxaNotFound:
protTaxaNotFound.append(tax_id)
else :
protTaxaNotFound.append(tax_id)
else:
if match not in protMatchNotFound:
protMatchNotFound.append(match)
if len (added_matches) > 0 :
if len(added_matches) > 0:
(tax_id_keep, fullnamelineage_text, fullnamelineage_ids) = get_consensus(collate_hits)
count_genealogy[d_taxonomy[str(tax_id_keep)].level] += 1
#write simple output
out.write(prot + "\t" + str(tax_id_keep) + "\t" + fullnamelineage_text + "\t" + str(fullnamelineage_ids) + "\n")
# write simple output
out.write(prot + "\t" + str(tax_id_keep) + "\t" +
fullnamelineage_text + "\t" + str(fullnamelineage_ids) + "\n")
nb_prot_assigned += 1
#write discarded values.
if len(protTaxaNotFound) > 0 :
outdisc.write(prot + "\t" + "No taxid in taxdump\t" + ",".join(map(str, protTaxaNotFound)) + "\n")
if len(protMatchNotFound) > 0 :
outdisc.write(prot + "\t" + "No protid correspondance file\t" + ",".join(map(str, protMatchNotFound)) + "\n")
# write discarded values.
if len(protTaxaNotFound) > 0:
outdisc.write(prot + "\t" + "No taxid in taxdump\t" +
",".join(map(str, protTaxaNotFound)) + "\n")
if len(protMatchNotFound) > 0:
outdisc.write(prot + "\t" + "No protid correspondance file\t" +
",".join(map(str, protMatchNotFound)) + "\n")
if(os.path.getsize(args.aln_input_file) != 0):
#handle last record of contigs consensus.
# handle last record of contigs consensus.
(tax_id_keep, fullnamelineage_text, fullnamelineage_ids) = get_consensus(collate_hits_per_contig)
count_genealogy_contig[d_taxonomy[str(tax_id_keep)].level] += 1
outpercontig.write(str(prev_contig) + "\t" + str(tax_id_keep) + "\t" + str(fullnamelineage_text) + "\t" + str(fullnamelineage_ids) + "\n")
outpercontig.write(str(prev_contig) + "\t" + str(tax_id_keep) + "\t" +
str(fullnamelineage_text) + "\t" + str(fullnamelineage_ids) + "\n")
#graphs
# graphs
try:
os.makedirs("graphs")
except OSError:
......@@ -439,8 +461,8 @@ def main(argv):
# Sort dictionaries
count_genealogy_ord = OrderedDict(sorted(count_genealogy.items(), key=lambda t: t[0]))
count_genealogy_contig_ord = OrderedDict(sorted(count_genealogy_contig.items(), key=lambda t: t[0]))
count_genealogy_contig_ord = OrderedDict(
sorted(count_genealogy_contig.items(), key=lambda t: t[0]))
# Figures
pyplot.bar(range(len(count_genealogy_ord.values())), count_genealogy_ord.values())
......@@ -448,22 +470,28 @@ def main(argv):
pyplot.xlabel("Taxonomy level")
pyplot.ylabel("Number of proteins")
pyplot.title(args.aln_input_file + " number of proteins at different taxonomy levels")
pyplot.savefig("graphs/" + args.aln_input_file.split(sep="/")[-1] + "_prot_taxonomy_level.pdf")
pyplot.savefig("graphs/" + args.aln_input_file.split(sep="/")
[-1] + "_prot_taxonomy_level.pdf")
pyplot.close()
pyplot.bar(range(len(count_genealogy_contig_ord.values())), count_genealogy_contig_ord.values())
pyplot.xticks(range(len(count_genealogy_contig_ord.values())), count_genealogy_contig_ord.keys())
pyplot.bar(range(len(count_genealogy_contig_ord.values())),
count_genealogy_contig_ord.values())
pyplot.xticks(range(len(count_genealogy_contig_ord.values())),
count_genealogy_contig_ord.keys())
pyplot.xlabel("Taxonomy level")
pyplot.ylabel("Number of contigs")
pyplot.title(args.aln_input_file + " number of contigs at different taxonomy levels")
pyplot.savefig("graphs/" + args.aln_input_file.split(sep="/")[-1] + "_contig_taxonomy_level.pdf")
pyplot.savefig("graphs/" + args.aln_input_file.split(sep="/")
[-1] + "_contig_taxonomy_level.pdf")
pyplot.close()
list_graphs = [nb_total_prot, nb_prot_annotated, nb_prot_assigned]
pyplot.bar(range(len(list_graphs)), list_graphs)
pyplot.xticks(range(len(list_graphs)), ["Total","Annotated","Assigned"])
pyplot.xticks(range(len(list_graphs)), ["Total", "Annotated", "Assigned"])
pyplot.ylabel("Number of proteins")
pyplot.title(args.aln_input_file + " number of annotated and assigned proteins")
pyplot.savefig("graphs/" + args.aln_input_file.split(sep="/")[-1] + "_nb_prot_annotated_and_assigned.pdf")
pyplot.savefig("graphs/" + args.aln_input_file.split(sep="/")
[-1] + "_nb_prot_annotated_and_assigned.pdf")
pyplot.close()
if __name__ == "__main__":
main(sys.argv[1:])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment