Commit b5e6bb3e authored by Celine Noirot's avatar Celine Noirot

Add enzymatic digestion

parent e80d0f64
#
# insilicoRRBSdigestion
# Copyright (C) 2014 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2015 INRA'
__license__ = 'GNU General Public License'
__version__ = '1'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'alpha'
from optparse import OptionParser, OptionGroup
import os, sys
import numpy as np
from subprocess import Popen, PIPE
import tempfile
import json
import re
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', '..', 'src'))
import jflow.seqio as seqio
GRAPH_CLASS_PARTITION = 100
MAX_FRAGMENT_LEN_IN_HISTOGRAM = 5000  # fragments longer than this are excluded from the default histogram
def version_string ():
"""
Return the insilicoRRBSdigestion version
"""
return "insilicoRRBSdigestion " + __version__
def nb_col_file(file):
    """
    Return the number of tab-separated columns of the first non-comment line.
    @param file : the file to inspect
    @return : the number of columns, 0 if no data line is found
    """
    fh = open(file)
    for line in fh:
        if line.startswith("#"):
            continue
        row = line.split("\t")
        fh.close()
        return len(row)
    fh.close()
    return 0
def bedtools_intersect(bedtools, external_bed, fragment_bed, output):
    """
    Run bedtools intersect -wo between two bed files and write the result.
    @param bedtools : the bedtools program path
    @param external_bed : the bed file passed as -a
    @param fragment_bed : the bed file passed as -b
    @param output : the output bed file
    """
    fh_out = open(output, "w")
    cmd = [bedtools, "intersect", "-wo", "-a", external_bed, "-b", fragment_bed]
    p_intersect = Popen(cmd, stdout=fh_out, stderr=PIPE)
    stderr = p_intersect.communicate()[1]
    fh_out.close()
    if stderr.decode("utf-8") != "":
        print("error on intersect bed between " + external_bed + " and " + fragment_bed)
        print(stderr.decode("utf-8"))
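# With -wo, bedtools writes, on each line, the columns of -a, then the columns
# of -b, then the number of overlapping bases. For a 4-column -a file and the
# 4-column fragment bed (chrom, start, end, length) a line looks like
# (values hypothetical):
#   chr1  1000  1200  CpG  chr1  1150  1350  200  50
# get_intersection_stats() below relies on this layout to find the fragment
# length (column nb_col_first_file+3) and the overlap (column nb_col_first_file+4).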
def get_intersection_stats(intersection_bed_fragment_file, nb_col_first_file, ranges):
    """
    Parse a bedtools intersect -wo output and compute overlap statistics,
    globally and per fragment length range.
    @param intersection_bed_fragment_file : the bedtools intersect -wo output
    @param nb_col_first_file : the number of columns of the -a bed file
    @param ranges : dict of min_range -> {coord: [min_range, max_range]}
    @return : dict of global and per-range statistics
    """
    base_overlap = 0
    fh = open(intersection_bed_fragment_file)
    result = {"global": {}, "ranges": {}}
    dict_fragment_a_overlap = {}
    dict_fragment_b_overlap = {}
    for key in ranges:
        result["ranges"][key] = {"nb_base": 0,
                                 "list_fragment_a": {},
                                 "list_fragment_b": {}}
for line in fh :
row=line.rstrip().split("\t")
keya="_".join(row[0:3])
keyb="_".join(row[nb_col_first_file:nb_col_first_file+2])
base_overlap+=int(row[nb_col_first_file+4])
dict_fragment_a_overlap[keya]=1
dict_fragment_b_overlap[keyb]=1
fragment_size=int(row[nb_col_first_file+3])
        # compute coverage per range (bounds are inclusive)
        for key, rng in ranges.items():
            if fragment_size >= rng['coord'][0] and fragment_size <= rng['coord'][1]:
                result["ranges"][key]['nb_base'] += int(row[nb_col_first_file+4])
                result["ranges"][key]['list_fragment_a'][keya] = 1
                result["ranges"][key]['list_fragment_b'][keyb] = 1
fh.close()
    # replace each range's fragment lists by the number of distinct fragments
for key in result["ranges"]:
result["ranges"][key]["nb_fragment_a"]=len(result["ranges"][key]["list_fragment_a"].keys())
result["ranges"][key]["nb_fragment_b"]=len(result["ranges"][key]["list_fragment_b"].keys())
del result["ranges"][key]["list_fragment_a"]
del result["ranges"][key]["list_fragment_b"]
result["global"]={"nb_fragment_a":len(dict_fragment_a_overlap.keys()),
"nb_fragment_b":len(dict_fragment_b_overlap.keys()),
"nb_overlap_base":base_overlap}
return(result)
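# Example of the returned structure (values hypothetical):
# {"global": {"nb_fragment_a": 250, "nb_fragment_b": 180, "nb_overlap_base": 12000},
#  "ranges": {"30": {"nb_base": 800, "nb_fragment_a": 20, "nb_fragment_b": 15}, ...}}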
def count_bp_element_in_bed(bedfile):
    """
    Count the total number of bases and the number of elements of a bed file.
    @param bedfile : the bed file to parse
    @return : (number of bases, number of elements)
    """
    sum_base = 0
    nb_element = 0
    bfh = open(bedfile)
    for line in bfh:
        val = line.split("\t")
        if len(val) > 2:
            sum_base += int(val[2]) - int(val[1])
            nb_element += 1
    bfh.close()
    return (sum_base, nb_element)
def get_fragment_histogram(fragments_length, bins=None):
    """
    Compute the histogram of fragment lengths.
    @param fragments_length : list of fragment lengths
    @param bins : optional list of bin edges; otherwise 10 bins over [0, MAX_FRAGMENT_LEN_IN_HISTOGRAM]
    @return : dict with the bin edges, the counts and a description of the selection
    """
    if bins is not None:
        hist, edges = np.histogram(fragments_length, bins=bins)
        results = {"edges": edges.tolist(), "hist": hist.tolist(), "select": "select only fragment length in bins"}
    else:
        hist, edges = np.histogram(fragments_length, range=(0, MAX_FRAGMENT_LEN_IN_HISTOGRAM))
        results = {"edges": edges.tolist(), "hist": hist.tolist(), "select": "exclude fragment longer than " + str(MAX_FRAGMENT_LEN_IN_HISTOGRAM)}
    return results
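# For instance (hypothetical lengths), np.histogram([40, 60, 120], bins=[30, 79, 129])
# returns hist=[2, 1] and edges=[30, 79, 129]: two fragments fall in [30, 79)
# and one in [79, 129] (the last bin is closed).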
def split_genome(fasta, enz_dict, bed=None, bed_select=None, min=30, max=500, bedCpG=None, double_selection=False):
    """
    Digest each sequence of a fasta file in silico with the given enzymes.
    @param fasta : the fasta file of the genome to digest
    @param enz_dict : dict of enzyme_name -> [sequence, pattern]
    @param bed : output bed file of all fragments
    @param bed_select : output bed file of the fragments whose length is in [min, max]
    @param min : minimal fragment length of the selection
    @param max : maximal fragment length of the selection
    @param bedCpG : output bed file of the CpG positions
    @param double_selection : if True, only select fragments cut by a different enzyme at each end
    @return : (all fragment lengths, selected fragment lengths, number of CpG, genome size)
    """
    # Results compared to http://www.restrictionmapper.org/cgi-bin/sitefind3.pl
    fa_fh = open(fasta, "rU")
reader = seqio.FastaReader(fa_fh)
fragments_len = []
fragments_len_selection = []
bed_fh = open(bed,"w")
nb_cpg=0
genome_size=0
select_fh=None
    if bed_select is not None:
select_fh=open(bed_select,"w")
    bedCpG_fh = None
    if bedCpG is not None:
        bedCpG_fh = open(bedCpG, "w")
    # if we only want to keep fragments cut by both enzymes, build the regexp pattern
pattern=""
if double_selection and len(enz_dict.keys()) == 2 :
keys_enz=list(enz_dict)
pattern1 = "^"+enz_dict[keys_enz[0]][1].split('*')[1]+ ".*"+enz_dict[keys_enz[1]][1].split('*')[0]+"$"
pattern2 = "^"+enz_dict[keys_enz[1]][1].split('*')[1]+ ".*"+enz_dict[keys_enz[0]][1].split('*')[0]+"$"
pattern=pattern1+'|'+pattern2
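        # Example with MspI (C*CGG) and TaqI (T*CGA): pattern1 = "^CGG.*T$" and
        # pattern2 = "^CGA.*C$", i.e. a fragment is kept only if it starts with
        # the downstream part of one enzyme site and ends with the upstream part
        # of the other one.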
for id, desc, seq, qual in reader:
genome_size+=len(seq)
        # for each enzyme, mark its cut sites by replacing the site with its pattern
new_seq=seq
for key, val in enz_dict.items():
new_seq = new_seq.replace(val[0],val[1])
position=0
        for frag in new_seq.split("*"):
            if select_fh is not None and len(frag) >= min and len(frag) <= max:
                # without double selection, every fragment in the size range is selected;
                # otherwise the fragment must match one of the two double-digestion patterns
                if not double_selection or re.match(pattern, frag):
                    select_fh.write("\t".join([id, str(position), str(position+len(frag)), str(len(frag))])+"\n")
                    fragments_len_selection.append(len(frag))
            fragments_len.append(len(frag))
            bed_fh.write("\t".join([id, str(position), str(position+len(frag)), str(len(frag))])+"\n")
            position += len(frag)
        # compute the position of each CpG (0-based bed interval of the C),
        # counting each CpG exactly once
        for match in re.finditer("CG", seq):
            if bedCpG_fh is not None:
                bedCpG_fh.write("\t".join([id, str(match.start()), str(match.start()+1), "CpG"])+"\n")
            nb_cpg += 1
fa_fh.close()
bed_fh.close()
    if bedCpG_fh is not None:
        bedCpG_fh.close()
    if select_fh is not None:
        select_fh.close()
print ("genome_size :" , genome_size)
return fragments_len,fragments_len_selection,nb_cpg,genome_size
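# Example with MspI alone (CCGG -> C*CGG): "AACCGGTT" becomes "AAC*CGGTT" and is
# split into "AAC" (bed interval 0-3) and "CGGTT" (3-8). split("*") removes the
# marker, so the cumulated fragment lengths stay in original genome coordinates.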
def get_enzymes_pattern(all_enz_dict, enzyme_list=["MspI"]) :
"""
    Return a dictionary restricted to the enzymes listed in parameter
    @param all_enz_dict : the enzyme dictionary
    @param enzyme_list : the list of enzyme names to retrieve
    @return : dict of enzyme_name -> [sequence, pattern] for the requested enzymes
    """
enz_dict = {}
for key in all_enz_dict :
if key in enzyme_list:
enz_dict[key]=all_enz_dict[key]
return enz_dict
def get_available_enzymes(file):
"""
    Read an enzyme file and return a dictionary
    @param file : the file with the enzymes to load, tab-separated format : MspI CCGG C*CGG
    @return : dict of enzyme_name -> [sequence, pattern]
"""
prog_seq = re.compile("^[ACGT]*$")
prog_pattern = re.compile("^[ACGT\*]*$")
enz_fh = open(file, "rU")
enz_dict = {}
msg=""
    for line in enz_fh:
        a_enz = line.rstrip().split("\t")
        if len(a_enz) == 3:
            if prog_seq.match(a_enz[1]) is not None and prog_pattern.match(a_enz[2]) is not None:
                enz_dict[a_enz[0]] = a_enz[1:]
            else:
                msg += "Only ACGT are allowed in the sequence and ACGT plus '*' in the pattern, not true for " + a_enz[0] + "\n"
        else:
            msg += "Enzyme file format doesn't correspond to ENZ_NAME\tSEQTOCUT\tPATTERN*CUT\n"
    if msg != "":
        print(msg)
enz_fh.close()
return enz_dict
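# The enzyme file is tab-separated, one enzyme per line, with the recognition
# site and the same site with the cut position marked by "*", for instance:
#   MspI    CCGG    C*CGG
#   TaqI    TCGA    T*CGA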
def generate_bins(bins_size,min,max):
"""
    Generate the list of ranges from min to max with a step of bins_size
    @param bins_size : size of the bins from min to max
    @param min : minimal fragment length
    @param max : maximal fragment length
    @return bins : array of the histogram bin edges, usable by np.histogram
    @return ranges : dict of min_range -> {'coord': [min_range, max_range]}
"""
index=int(min)
ranges={}
bins=[index]
while ((index+bins_size-1) <= max):
ranges[str(index)]={'coord':[index,index+bins_size-1]}
bins.append(index+bins_size-1)
index=index+bins_size
    # add the last, possibly shorter, range and close the histogram bins with max
    if index <= max:
        ranges[str(index)] = {'coord': [index, max]}
        bins.append(max)
    return (bins, ranges)
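# Example: generate_bins(50, 30, 500) returns bins = [30, 79, 129, ..., 479, 500]
# and ranges = {'30': {'coord': [30, 79]}, '80': {'coord': [80, 129]}, ...,
# '480': {'coord': [480, 500]}}.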
def compute_fragment_size_per_range(ranges, fragments_length):
    """
    Count, in place in ranges, the fragments and cumulated bases per length range.
    @param ranges : dict of min_range -> {coord: [min_range, max_range]}
    @param fragments_length : list of fragment lengths
    """
    for key in ranges.keys():
        ranges[key]["nb_fragment"] = 0
        ranges[key]["size"] = 0
    for i in fragments_length:
        for key in ranges.keys():
            # bounds are inclusive, consistently with get_intersection_stats()
            if i >= ranges[key]["coord"][0] and i <= ranges[key]["coord"][1]:
                ranges[key]["nb_fragment"] += 1
                ranges[key]["size"] += i
                break
if __name__ == "__main__":
    # Get available enzymes
    all_enzymes = get_available_enzymes(os.path.join(os.path.dirname(os.path.realpath(__file__)), "enzyme.txt"))
    usage = "usage: %prog -i file.fa -o output.stats -c CpGisland.bed"
    desc = "The script simulates a complete enzymatic digestion of the genome to \
identify all the potential restriction fragments. Then, for each size \
range (from min to max fragment length, with bins of the given size) it computes different metrics."
    parser = OptionParser(usage=usage, version=version_string(), description=desc)
igroup = OptionGroup(parser, "Input files options","")
igroup.add_option("-i", "--input", dest="fasta_file",
help="The genome to digest [fasta]", metavar="FILE")
igroup.add_option("-c", "--bed", dest="bedfile",
help="A bed file of region of interrest (eg CpG island), to get coverage information with enzymatic digestion ", metavar="FILE")
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output directory options","")
ogroup.add_option("-o", "--output", dest="output",
help="The output directory where output file will be stored", metavar="DIR")
ogroup.add_option("-j", "--json", dest="json_file",
help="The json file for web visualisation", metavar="FILE")
parser.add_option_group(ogroup)
cgroup = OptionGroup(parser, "Fragment options","")
cgroup.add_option("-e", "--enzyme", dest="enzyme", default=None, metavar="STRING",
help="Enzyme to use for digestion, if two enzyme or more, perform double digestion or more eg : -e 'MspI TaqI'\n\
Available enzyme : "+ " ".join(all_enzymes.keys()))
cgroup.add_option("-y", "--enzyme-file", dest="enzyme_file", default=None,
help="Provide your own enzyme file with following format :\n\
MspI CCGG C*CGG")
cgroup.add_option("-d", "--double-selection", dest="double_selection" ,action="store_true",default=False,
help="Set if you only want to select fragment which are cut by different enzyme at each ends")
cgroup.add_option("-f", "--bins", dest="bins_size", default=50,type="int",
help="The bins size for fragment statistics")
cgroup.add_option("-m", "--min", dest="min_size", default=30,type="int",
help="The minimum fragment size")
cgroup.add_option("-a", "--max", dest="max_size", default=500,type="int",
help="The maximum fragment size")
parser.add_option_group(cgroup)
bgroup = OptionGroup(parser, "Dependencies options","")
bgroup.add_option("-l", "--bedtools", dest="bedtools",
help="The bedtools program path [REQUIRED If not in path]", default='bedtools')
parser.add_option_group(bgroup)
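    # Example invocation (hypothetical paths):
    #   python insilicoRRBS.py -i genome.fa -o results_dir -e 'MspI' \
    #       -c cpg_islands.bed -j results_dir/digestion.json -f 50 -m 30 -a 500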
(options, args) = parser.parse_args()
    # TODO: check the enzyme names against the available enzymes
    msg = ""
    if options.fasta_file is None:
        msg += "Input file is required\n"
    if options.output is None:
        msg += "Output directory is required\n"
    if options.enzyme is None and options.enzyme_file is None:
        msg += "Enzyme file OR enzyme name is required\n"
    if options.output is not None and os.path.exists(options.output):
        msg += "Output dir already exists\n"
    if msg != "":
print ("ERROR " + msg)
parser.print_help()
sys.exit(1)
try:
p = Popen([options.bedtools, '--version'], stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
    except OSError:
print ("Bedtools is required [in path or specified with -l]\n")
parser.print_help()
sys.exit(1)
os.makedirs(options.output)
    # retrieve the enzymes to process
if options.enzyme_file is not None :
all_enzymes = get_available_enzymes(options.enzyme_file)
enz_dict = all_enzymes
if options.enzyme is not None :
enz_dict = get_enzymes_pattern(all_enzymes, options.enzyme.split(" "))
if options.double_selection and len(enz_dict.keys()) != 2 :
print ("ERROR option double-selection/-d can only be use with 2 enzymes selected")
parser.print_help()
sys.exit(1)
fasta_basename = os.path.basename(options.fasta_file)
digestion_genome_bed = os.path.join(options.output,fasta_basename+".all_genome.bed")
CpG_genome_bed = os.path.join(options.output,fasta_basename+".CpG.bed")
digestion_selection_bed = os.path.join(options.output,fasta_basename+".select"+str(options.min_size)+'-'+str(options.max_size)+".bed")
json_file=options.json_file
stats_file=os.path.join(options.output,fasta_basename+".stats.txt")
#digest genome with enzyme
fragments_length,fragments_length_selection,nb_cpg,genome_size = split_genome(options.fasta_file,enz_dict, digestion_genome_bed,digestion_selection_bed,options.min_size,options.max_size,CpG_genome_bed,options.double_selection)
dict_genome={"fasta":options.fasta_file,
"size" :genome_size,
"nb_CpG": nb_cpg ,
"bed_CpG": CpG_genome_bed}
result={"genome":dict_genome}
result["enzyme"]=enz_dict
    # save fragment information (the fragments tile the genome, hence size == genome_size)
dict_fragments={"bed":digestion_genome_bed,
"size" :genome_size,
"nb_fragment":len(fragments_length),
"distribution":get_fragment_histogram(fragments_length)}
result["fragments"]=dict_fragments
# generate bins
(bins,ranges) = generate_bins(int(options.bins_size), options.min_size, options.max_size)
compute_fragment_size_per_range(ranges,fragments_length_selection)
    # statistics of the selected fragments, per bin
nb_base,nb_fragment = count_bp_element_in_bed(digestion_selection_bed)
dict_selection={"bed":digestion_selection_bed,
"size" :nb_base,
"nb_fragment":nb_fragment,
"distribution":get_fragment_histogram(fragments_length, bins),
"ranges":ranges}
#stats with bed
dict_user_bed={}
if options.bedfile is not None :
nb_base_user_bed,nb_fragment_user_bed = count_bp_element_in_bed(options.bedfile)
dict_user_bed={"bed":options.bedfile,
"size" :nb_base_user_bed,
"nb_fragment":nb_fragment_user_bed}
result["user_bed"]=dict_user_bed
#compute intersection between CpG file and fragment selection
intersection_CpG_fragment_file=os.path.join(options.output,"Intersect_CpG_fragment.bed")
    # compute the global intersection of the selected fragments with the CpG positions
bedtools_intersect(options.bedtools,
CpG_genome_bed,
digestion_selection_bed,
intersection_CpG_fragment_file)
dict_selection["intersectionCpG"]=get_intersection_stats(intersection_CpG_fragment_file,nb_col_file(CpG_genome_bed), ranges)
dict_selection["intersectionCpG"]["filea"]="CpG"
dict_selection["intersectionCpG"]["fileb"]="fragment selection"
dict_selection["intersectionCpG"]["bed"]=intersection_CpG_fragment_file
#write_bed_files_per_ranges(ranges, digestion_genome_bed, tmp_dir,digestion_selection_bed)
#compute_ranges_intersection(options.bedtools, ranges, CpG_genome_bed, nb_cpg, genome_size, options.bedfile,)
#result["selection_statistics"]=ranges
#compute intersection between external bed and fragment selection
if options.bedfile is not None :
intersection_externalbed_fragment_file=os.path.join(options.output,"intersect_userbed_fragment.bed")
        # compute the global intersection of the selected fragments with the user bed
bedtools_intersect(options.bedtools,
options.bedfile,
digestion_selection_bed,
intersection_externalbed_fragment_file)
dict_selection["intersectionBed"]=get_intersection_stats(intersection_externalbed_fragment_file,nb_col_file(options.bedfile),ranges)
dict_selection["intersectionBed"]["bed"]=intersection_externalbed_fragment_file
dict_selection["intersectionBed"]["filea"]="user bed"
dict_selection["intersectionBed"]["fileb"]="fragment selection"
result["selection"]=dict_selection
    # write the statistics file: global statistics, then per-range CpG and user bed intersections
fh_out = open(stats_file,"w")
fh_out.write("### This file contains statistics of digestion ("+options.fasta_file+") with "+ " and ".join(result["enzyme"].keys())+"\n")
fh_out.write("### Computed digestion of the genome: "+ result["fragments"]["bed"]+"\n")
fh_out.write("### Computed CpG position on the genome: "+ result["genome"]["bed_CpG"]+"\n")
fh_out.write("### Fragment selection on length ["+str(options.min_size)+'-'+str(options.max_size)+"]: "+ result["selection"]["bed"]+"\n")
fh_out.write("### Statistics on inputs:"+"\n")
fh_out.write("##### Genome size: "+ str(result["genome"]["size"]) +"bp\n")
fh_out.write("##### Nb Cpg: "+ str(result["genome"]["nb_CpG"]) +"\n")
if options.bedfile is not None :
fh_out.write("##### Nb fragments in bed: "+ str(result["user_bed"]["nb_fragment"]) +"\n")
fh_out.write("##### Nb bases in bed: "+ str(result["user_bed"]["size"]) +"bp\n")
fh_out.write("### Histogram values on fragment length after digestion: ("+result["fragments"]["distribution"]["select"]+")"+"\n")
fh_out.write("##### Nb fragments: "+str(result["fragments"]["nb_fragment"])+"\n")
fh_out.write("##### Bins: "+str(result["fragments"]["distribution"]["edges"])+"\n")
fh_out.write("##### Values: "+str(result["fragments"]["distribution"]["hist"])+"\n")
fh_out.write("### Statistics on fragment selection:\n")
fh_out.write("##### Nb fragments:"+str(result["selection"]["nb_fragment"])+"\n")
fh_out.write("##### Nb bases:"+str(result["selection"]["size"])+"bp ("+str(round(result["selection"]["size"]*100/result["genome"]["size"],2))+"%)\n")
fh_out.write("### Histogram values on fragment length after selection:"+"\n")
fh_out.write("##### Bins:"+str(result["selection"]["distribution"]["edges"])+"\n")
fh_out.write("##### Values:"+str(result["selection"]["distribution"]["hist"])+"\n")
fh_out.write("### Statistics on intersection between CpG position and fragment selection:\n")
fh_out.write("##### Nb CpG covered :"+str(result["selection"]["intersectionCpG"]["global"]["nb_fragment_a"])+"\n")
fh_out.write("##### Nb fragment covered :"+str(result["selection"]["intersectionCpG"]["global"]["nb_fragment_b"])+"\n")
fh_out.write("##### Nb overlap base :"+str(result["selection"]["intersectionCpG"]["global"]["nb_overlap_base"])+"\n")
header = "#min\tmax\tnb base select\tnb fragment select\tnb CpG base\tnb fragment covered by CpG"
if options.bedfile is not None :
fh_out.write("### Statistics on intersection between bed provided by user and fragment selection:"+"\n")
fh_out.write("##### Nb user fragment covered :"+str(result["selection"]["intersectionBed"]["global"]["nb_fragment_a"])+"\n")
fh_out.write("##### Nb fragment covered :"+str(result["selection"]["intersectionBed"]["global"]["nb_fragment_b"])+"\n")
fh_out.write("##### Nb overlap base :"+str(result["selection"]["intersectionBed"]["global"]["nb_overlap_base"])+"\n")
header += "\tnb base bed\tnb fragment select covered by bed\tnb fragment from bed covered"
fh_out.write(header+"\n")
    keys_int = sorted([int(i) for i in result["selection"]["intersectionCpG"]["ranges"].keys()])
for key_int in keys_int :
key = str(key_int)
fh_out.write("\t".join([str(result["selection"]["ranges"][key]['coord'][0]),
str(result["selection"]["ranges"][key]['coord'][1]),
str(result["selection"]["ranges"][key]["size"]),
str(result["selection"]["ranges"][key]["nb_fragment"]),
str(result["selection"]["intersectionCpG"]["ranges"][key]['nb_base']),
str(result["selection"]["intersectionCpG"]["ranges"][key]['nb_fragment_b'])]))
if options.bedfile is not None :
fh_out.write("\t"+"\t".join([str(result["selection"]["intersectionBed"]["ranges"][key]['nb_base']),
str(result["selection"]["intersectionBed"]["ranges"][key]['nb_fragment_b']),
str(result["selection"]["intersectionBed"]["ranges"][key]['nb_fragment_a'])]))
fh_out.write("\n")
fh_out.close()
    if json_file:
fh_out = open(json_file,"w")
json.dump(result, fh_out , sort_keys=True, indent=4, separators=(',', ': '))
fh_out.close()