analyzebed.py 10.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from ng6.analysis import Analysis
from weaver.function import ShellFunction
from weaver.abstraction import Map
Penom Nom's avatar
Penom Nom committed
19
from jflow import utils
20
21
22
from collections import Counter
import os.path
import numpy
Penom Nom's avatar
Penom Nom committed
23
import itertools
24
25
import subprocess
import re
26
27
28

class Analyzebed (Analysis):
    
29
    def define_parameters(self, files, p, li, i_list, options):
30
        self.add_input_file_list( "input_files", "The files to be archived.", default=files, required=True )
Penom Nom's avatar
Penom Nom committed
31
32
33
34
        self.add_output_file_list( "output_links", "Link to the files.", pattern='link2_{basename}' , items=self.input_files )
        self.add_output_file( "software_merge", "TODO", filename="software_merge.bed")
        self.add_parameter("overlap_percent", "Minimum threshold of overlap percent between 2 SV", type=float, default=p)
        self.add_parameter("li_range", "Range of localisation interval around breakpoints", type=int, default=li)
35
        self.add_parameter("param_sum", "Summary of used parameters", type=str, default=options)
Penom Nom's avatar
Penom Nom committed
36
37
        self.add_parameter_list("indiv_list", "String list of studied individuals name", default=i_list, type=str, required=True)
    
38
    def get_version(self):
39
40
41
42
43
44
45
46
47
48
49
        pindel_help = subprocess.Popen(["pindel","-h"], universal_newlines=True, stdout=subprocess.PIPE)
        pindel_version = re.search("Pindel version ([0-9\.a-b]+),", pindel_help.stdout.read())
        cnvnator_help = subprocess.Popen(["cnvnator"], universal_newlines=True, stderr=subprocess.PIPE)
        cnvnator_version = re.search("CNVnator v([0-9\.]+)", cnvnator_help.stderr.read())
        delly_help = subprocess.Popen(["delly"], universal_newlines=True, stdout=subprocess.PIPE)
        delly_version = re.search("Delly \(Version: ([0-9\.]+)\)", delly_help.stdout.read())
        breakdancer_help = subprocess.Popen(["breakdancer-max"], universal_newlines=True, stderr=subprocess.PIPE)
        breakdancer_version = re.search("breakdancer-max version ([0-9\.]+)", breakdancer_help.stderr.read())
        versions = "Pindel: {0} / CNVnator: {1} / Delly: {2} / Breakdancer: {3}".format(pindel_version.groups()[0],
                     cnvnator_version.groups()[0], delly_version.groups()[0], breakdancer_version.groups()[0])
        return versions
50
51
        
    def define_analysis(self):
52
53
        self.options = self.param_sum
        self.name = "SV Detection"
Penom Nom's avatar
Penom Nom committed
54
55
56
        self.description = "A few stats on SV repartition"
        self.software = "Pindel, CNVnator, Delly, Breakdancer"
        
57
    def post_process(self):
58
59
60
        for i, bedfile in enumerate(self.output_links):
            sample = os.path.splitext(os.path.basename(bedfile))[0]
            soft = self.indiv_list[i]
61
            #for boxplot
62
63
            sizes_list = [] 
            sizes_by_type = {}
64
            #for accumulation curve
65
            n_list = []
Penom Nom's avatar
Penom Nom committed
66
            n_by_type = {}
67
68
            
            #PARSE FILE
69
            with open(bedfile) as result_file:
Penom Nom's avatar
Penom Nom committed
70
71
72
                for line in result_file:
                    sv = line.split()
                    size = int(sv[2])-int(sv[1])
73
                    sv_type = sv[4]
Penom Nom's avatar
Penom Nom committed
74
                    members = sv[7].split(";")
75
                    n = len(members)
76
77
                    
                    sizes_list.append(size)
Penom Nom's avatar
Penom Nom committed
78
                    n_list.append(n)
79
                    if sv_type in sizes_by_type: #fill
80
81
                        sizes_by_type[sv_type].append(size)
                        n_by_type[sv_type].append(n)
82
                    else: #if not, create
83
84
85
86
                        sizes_by_type[sv_type] = [size]
                        n_by_type[sv_type] = [n]

            #GENERATE DB INFOS
87
            #add "ALL" type
Penom Nom's avatar
Penom Nom committed
88
89
90
91
92
93
            sizes_by_type["ALL"] = sizes_list
            n_by_type["ALL"] = n_list
            for type in sizes_by_type:
                sizes = numpy.array(sizes_by_type[type])
                boxplot = self.boxplot(sizes)
                accu = self.accumulation(n_by_type[type])
94
                db_entry = {'boxplot':boxplot[0],
Penom Nom's avatar
Penom Nom committed
95
96
97
98
99
100
101
102
103
104
                            'outliers':boxplot[1],
                            'n_tag':accu[0],
                            'n_count':accu[1],
                            'mean':round(numpy.mean(sizes),1),
                            'min':min(sizes),
                            'max':max(sizes),
                            'sd': round(numpy.std(sizes),1),
                            'number':len(sizes),
                            'median':int(numpy.median(sizes))
                            }
105
                #fill db on soft-specific data
Penom Nom's avatar
Penom Nom committed
106
                self._add_result_element(sample, "soft", soft, type)
107
108
109
110
111
112
                for key in db_entry:
                    self._add_result_element(sample, key, db_entry[key], type)
        
        
        #CREATE VENN TABLE
        venn_table = {}
Penom Nom's avatar
Penom Nom committed
113
114
115
116
117
        for key in self.powerset(self.indiv_list):
            if key == ():
                venn_table["all"] = 0
            else:
                venn_table[";".join(key)] = 0
118
119
120
        
        venn_table_by_type = {}
        
Penom Nom's avatar
Penom Nom committed
121
122
123
        with open(self.software_merge) as merge_list:
            for line in merge_list:
                used_software = []
124
125
126
127
                svr = line.split()
                svr_type = svr[4]
                
                for column in svr[6].split(";"):
128
                    #get members
Penom Nom's avatar
Penom Nom committed
129
                    used_software.append("".join(column.split("-")[:-1]))
130
                if svr_type not in venn_table_by_type:  # create the dic
131
132
133
134
135
                    for key in self.powerset(self.indiv_list):
                        if key == ():
                            venn_table_by_type[svr_type] = {"all":0}
                        else:
                            venn_table_by_type[svr_type][";".join(key)] = 0
136
137
138
139
                #update the dic
                venn_table_by_type[svr_type][";".join(self.unique(used_software))] += 1
                venn_table[";".join(self.unique(used_software))] += 1
        #add "ALL" type
140
141
142
143
144
        venn_table_by_type['ALL'] = venn_table
        
        #GENERATE DB INFOS
        for type in venn_table_by_type:
            v_table = venn_table_by_type[type]
145
            #good format
146
            db_venn = "".join([y for y in str(v_table)[1:-1] if y not in ["\'", " "]])
147
            #fill the db for inter-soft data
148
149
            self._add_result_element("software_merge", "venn_count", db_venn, type)
        
150
        #output files
151
152
153
        result_files = [self.output_links, self.software_merge]
        self._create_and_archive(self.output_links, None) #TODO pass result_files

154
    def process(self):
155
        #create links
156
        link = ShellFunction(self.get_exec_path("ln") + " -s $1 $2", cmd_format='{EXE} {IN} {OUT}')
Penom Nom's avatar
Penom Nom committed
157
        Map(link, inputs=self.input_files, outputs=self.output_links)
158
        # MERGE INTERSOFT
Penom Nom's avatar
Penom Nom committed
159
160
161
162
163
164
165
166
167
        mrg = ShellFunction(self.get_exec_path("merge_sv.py") + \
                                    " --output $1 \
                                      --tag $2\
                                      --p $3\
                                      --li $4" \
                                    + " --names" + utils.get_argument_pattern(self.indiv_list, 5)[0] \
                                    + " --files" + utils.get_argument_pattern(self.input_files, len(self.indiv_list)+5)[0],
                                    cmd_format='{EXE} {OUT} {ARG} {IN}')
        mrg(outputs=self.software_merge, inputs=self.input_files, arguments=["soft_merge", self.overlap_percent, self.li_range, self.indiv_list])
168
        
169
    def js_prepross(self, my_list):
170
171
172
173
174
175
        """
        facilitate data transfer and comprehension for javascript via jflow & php
        
        :Example: self.js_prepross(['rock', 'scissor', 'paper'])
        >>> "rock;scissor;paper" 
        """
176
177
        my_string = ";".join(map(str,my_list))
        return my_string
Penom Nom's avatar
Penom Nom committed
178
179
        
    def accumulation(self, all_n):
180
181
182
183
184
185
186
187
        """
        Count how many SV are present in at least 1..N individuals
        
        :return: 1..N & associated count as strings
        
        :Example: self.accumulation([1,1,1,1,2,3,3])
        >>> ["1;2;3", "4;1;2"]
        """
Penom Nom's avatar
Penom Nom committed
188
189
        n_count = []
        all_n = Counter(all_n)
190
        for i,n in enumerate(all_n): # Rare bug if one n is missing in 1..N --> bug in GUI
Penom Nom's avatar
Penom Nom committed
191
192
193
194
            tmp = 0
            for j in list(reversed(range(i+1, len(all_n)+1))):
                tmp = tmp + all_n[j]
            n_count.append(tmp)
195
        n_tag = self.js_prepross(all_n.keys())
Penom Nom's avatar
Penom Nom committed
196
197
198
199
        n_count = self.js_prepross(n_count)
        return([n_tag, n_count])
        
    def boxplot(self, sizes):
200
201
202
203
204
205
206
207
208
        """
        give the values needed for drawing a boxplot
        
        :return: ["lower whisker;lower quartile;median;upper quartile;upper whisker", number of outliers]
        
        :Example: self.boxplot([13799,12699,101,...])
        >>> ["101;509;1099;5349;12299", 4]
        """
        #boxplot
Penom Nom's avatar
Penom Nom committed
209
210
211
212
213
214
        median = numpy.median(sizes)
        upper_quartile = numpy.percentile(sizes, 75)
        lower_quartile = numpy.percentile(sizes, 25)
        iqr = upper_quartile - lower_quartile
        upper_whisker = sizes[sizes<=upper_quartile+1.5*iqr].max()
        lower_whisker = sizes[sizes>=lower_quartile-1.5*iqr].min()
215
216
        boxplot = self.js_prepross([lower_whisker,lower_quartile,median,upper_quartile,upper_whisker])
        #outliers
Penom Nom's avatar
Penom Nom committed
217
218
219
220
221
        outliers = sizes[sizes > upper_whisker]
        numpy.append(outliers, sizes[sizes < lower_whisker])
        outliers = len(outliers)
        return([boxplot, outliers])

222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
    def unique(self, seq, idfun=None):
        """
            unique([1,2,2,3,27,1,2,4]) --> [1,2,3,27,4]
            keep order of appearance intact
        """
        # found on peterbe.com
        if idfun is None:
            def idfun(x): return x
        seen = {}
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen: continue
            seen[marker] = 1
            result.append(item)
        return result

Penom Nom's avatar
Penom Nom committed
239
    def powerset(self, iterable):
240
241
242
243
244
245
        """ 
        return all possible sub-sets from an iterable
        
        :Example: powerset("abc")
        >>> (''),('a'),('b'),('c'),('a','b'),('a','c'),('b','c'),('a','b','c')
        """
Penom Nom's avatar
Penom Nom committed
246
247
        s = list(iterable)
        return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s)+1))