Commit dc2650a9 authored by Celine Noirot's avatar Celine Noirot
Browse files

add simulation of different sequencing length to get statistics on genome coverage

parent a4456f74
#
# insilicoRRBSsequencing
# Copyright (C) 2014 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2015 INRA'
__license__ = 'GNU General Public License'
__version__ = '1'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'alpha'
from optparse import *
import os, sys
from subprocess import Popen, PIPE
import json
def version_string ():
"""
Return the insilicoRRBSsequencing version
"""
return "insilicoRRBSsequencing " + __version__
def generate_bed_from_read_size (file,read_size,output):
fh=open(file)
fh_out=open(output,"w")
for line in fh:
val=line.rstrip().split("\t")
if len(val)>=2 :
#read len superieur au fragment
if read_size > int(val[2])-int(val[1]):
fh_out.write(line)
else :
start_5=val[1]
end_5=int(val[1])+read_size
start_3=int(val[2])-read_size
end_3=val[2]
fh_out.write("\t".join([val[0],str(start_5),str(end_5)]+val[3:])+"\n")
fh_out.write("\t".join([val[0],str(start_3),str(end_3)]+val[3:])+"\n")
fh.close()
fh_out.close()
def bedtools_intersect(bedtools,filea,fileb, options=[]):
'''Validate with R'''
cmd = [bedtools, "intersect", "-a", filea, "-b", fileb]+options
p_intersect = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p_intersect.communicate()
sum_base_intersected=0
nb_element_intesected=0
if stderr.decode("utf-8") != "" :
print ("error on intersect bed between " + filea + " and " + fileb)
print (stderr.decode("utf-8"))
return 0
for line in stdout.decode("utf-8").split("\n") :
val = line.split("\t")
if len(val) > 2 :
sum_base_intersected+=int(val[2])-int(val[1])
nb_element_intesected+=1
return(sum_base_intersected,nb_element_intesected)
if __name__ == "__main__":
# Get available enzyme
parser = OptionParser(usage="Usage: insilicoRRBS.py -i FILE [options]")
usage = "usage: %prog -i genome.select50-500.bed [-b genome.CpG.bed -c genome.CpGisland.bed] -o output.stats"
desc = "The script simulates a complete enzymatic digestion of the genome to\
identify all the potential restriction fragments. Then, for each size\
range (from min to max fragment length of bins size ) it computes different metrics"
available_enzyme=["MspI","TakI","Alu"]
parser = OptionParser(usage = usage, version = version_string(), description = desc)
igroup = OptionGroup(parser, "Input files options","")
igroup.add_option("-i", "--input", dest="digestion_selection",
help="The bed file of digestion selection", metavar="FILE")
igroup.add_option("-b", "--cpg", dest="cpg_genome",
help="The bed file of CpG on the genome", metavar="FILE")
igroup.add_option("-c", "--cpg-island", dest="cpg_island",
help="A bed file of CpG island, to get coverage information", metavar="FILE")
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output files options","")
ogroup.add_option("-o", "--output", dest="output",
help="The output statistics file", metavar="FILE")
ogroup.add_option("-j", "--json", dest="json_file",
help="The json file for web visualisation", metavar="FILE")
parser.add_option_group(ogroup)
cgroup = OptionGroup(parser, "Sequencing options","")
cgroup.add_option("-r", "--read-min-size", dest="read_min_size", default=50,type="int",
help="The minimun read size of sequencing")
cgroup.add_option("-m", "--read-max-size", dest="read_max_size", default=150,type="int",
help="The maximun read size of sequencing")
cgroup.add_option("-s", "--step", dest="step", default=10,type="int",
help="The step from min to max to compute statistics")
parser.add_option_group(cgroup)
bgroup = OptionGroup(parser, "Dependencies options","")
bgroup.add_option("-l", "--bedtools", dest="bedtools",
help="The bedtools program path [REQUIRED If not in path]", default='bedtools')
parser.add_option_group(bgroup)
(options, args) = parser.parse_args()
# to do check enzyme name
if options.digestion_selection == None or (options.cpg_genome == None and options.cpg_island == None) or options.output == None:
parser.print_help()
sys.exit(1)
try:
p = Popen([options.bedtools, '--version'], stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
except:
print ("Bedtools is required [in path or specified with -l]\n")
parser.print_help()
sys.exit(1)
result = {"options":{
"digestion_selection": options.digestion_selection,
"min_read_size": options.read_min_size,
"max_read_size": options.read_max_size,
"read_size": options.step}
}
ranges={}
result["stats"]={}
i=int(options.read_min_size)
while i<=int(options.read_max_size) :
result["stats"][i]={"read_size":i}
read_digestion_selection_bed = os.path.splitext(options.output)[0]+".read"+str(i)+".bed"
generate_bed_from_read_size(options.digestion_selection,i,read_digestion_selection_bed)
nb_base, nb_fgt = bedtools_intersect(options.bedtools,options.cpg_island,read_digestion_selection_bed,["-u"])
result["stats"][i]["cpg_island"]={"nb_base":nb_base, "nb_fragment": nb_fgt}
nb_base, nb_fgt = bedtools_intersect(options.bedtools,options.cpg_genome,read_digestion_selection_bed,["-u"])
result["stats"][i]["cpg"]={"nb_base":nb_base, "nb_fragment": nb_fgt}
i=i+int(options.step)
fh_out = open(options.output,"w")
fh_out.write("### This file contains statistics of sequencing a digestion ("+options.digestion_selection+") \n")
fh_out.write("### Compute statistics for reads from "+options.read_min_size+" to "+options.read_max_size+ " with step "+options.step+"\n")
fh_out.write("### Statistics on selection:"+"\n")
header = "#read size\tnb CpG base"
if options.cpg_island is not None :
header += "\tnb CpG_island base\tnb CpG_island fragment"
fh_out.write(header+"\n")
for key in sorted(result["stats"].keys()) :
fh_out.write(str(key)+"\t" +str(result["stats"][key]["cpg"]["nb_base"]))
if options.cpg_island:
fh_out.write("\t"+ str(result["stats"][key]["cpg_island"]["nb_base"])+"\t"+str(result["stats"][key]["cpg_island"]["nb_fragment"]))
fh_out.write("\n")
fh_out.close()
if options.json_file is not None:
fh_out = open(options.json_file,"w")
json.dump(result, fh_out)
fh_out.close()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment