build_genome_features.R 13.2 KB
Newer Older
Aurelien Brionne's avatar
Aurelien Brionne committed
1
2
3
4
5
6
#' @title  build a convenient genomic features object.
#' @description build a convenient \code{\link{genome_features-class}} object needed for find overlaps beetween
#' selected features and genomic coordinates in a single step.
#' @importFrom  AnnotationDbi select keys as.data.frame metadata
#' @importFrom  GenomicFeatures promoters transcripts fiveUTRsByTranscript threeUTRsByTranscript exonsBy intronsByTranscript cdsBy
#' @importFrom  GenomicRanges flank
Aurelien Brionne's avatar
update    
Aurelien Brionne committed
7
#' @importFrom rtracklayer import
Aurelien Brionne's avatar
Aurelien Brionne committed
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#' @importFrom  BiocGenerics as.data.frame
#' @importFrom  methods new
#' @import data.table
#' @param txdb a txdb database built with \pkg{GenomicFeatures} (see example).
#' @param features a \code{\link[base]{vector}} of features \code{\link[base]{character}} with by
#' default all the availables features ("promoter", "UTR5", "UTR3", "exon", "intron", "cds", and "downstream").
#' @param parameters a \code{\link[base]{list}} of parameters to use in order to define promoter and
#' downstream ranges distance to TSS (Transcript Start Site) or TES (Transcript End Site).
#' \itemize{
#'  \item{promoter_ranges: a \code{\link[base]{list}} with upstream and dowstream  elements distance (in base) to TSS}.
#'  \item{downstream_range: the dowstream distance (in base) to the TES}.
#' }
#' @details This method build a convenient \code{\link{genomic_features-class}} object, in order to be used for finding overlaps beetween
#' all selected features and genomic coordinates with the \code{\link{genome_features_overlaps}} method.
#' @return a \code{\link{genome_features-class}} object.
#' @include genome_features.R
#' @examples
#'
#' ###################
#  # import the chicken genome GFF from NCBI ftp (galGal5 in this case)
#' utils::download.file("ftp://ftp.ncbi.nlm.nih.gov/genomes/Gallus_gallus/GFF/ref_Gallus_gallus-5.0_top_level.gff3.gz",
#' quiet=T,destfile = "galGal5_gff.gz")
#'
#' ##################
#' #  uncompress the file
#' R.utils::gunzip("galGal5_gff.gz")
#'
#' ###################
#' # build the database
#' toplevel=GenomicFeatures::makeTxDbFromGFF(
#'  file="galGal5_gff",
#'  format="gff",
#'  organism="Gallus gallus",
#'  taxonomyId=9031,
#'  dbxrefTag="ID"
#' )
#'
#' ###################
#' # build genome_features object from txdb
#' features<-GenomeFeatures::build_genome_features(
#'  toplevel,
#'  features=base::c("promoter","UTR5","UTR3","exon","intron","cds","downstream"),
#'  parameters = base::list(
#'    promoter_ranges=base::list(
#'      upstream=3000,
#'      downstream=500
#'    ),
#'    downstream_range=1000
#'  )
#' )
#' @export
setGeneric(name="build_genome_features",def=function(txdb,features=base::c("promoter","UTR5","UTR3","exon","intron","cds","downstream"),
parameters=base::list(promoter_ranges=base::list(upstream=3000,downstream=500),downstream_range=1000)){standardGeneric("build_genome_features")})

62
setMethod("build_genome_features",definition=function(txdb,features,parameters) {
Aurelien Brionne's avatar
Aurelien Brionne committed
63
64
65
66
67
68
69
70
71
72
73
74
75

  ###################
  # check txdb
  ###################
  if(base::class(txdb)!="TxDb") base::stop("txdb must be a TxDb class object")

  ###################
  # txdb ids crossref
  ###################

  ###################
  # build ids cross reference
  crossref<-data.table::data.table(
Aurelien Brionne's avatar
Aurelien Brionne committed
76
77
78
79
80
81
    AnnotationDbi::select(
      txdb,
      columns=c("TXID","TXNAME"),
      keytype="GENEID",
      keys=AnnotationDbi::keys(txdb)
    )
Aurelien Brionne's avatar
Aurelien Brionne committed
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
  )

  ###################
  # convert TXID to character
  crossref[,TXID:=base::as.character(TXID)]

  ###################
  # promoters
  ###################

  promoters=NULL;promoters_DT=NULL

  if("promoter"%in%features){

    ###################
    # get promoters
    promoters <- GenomicFeatures::promoters(
      txdb,
      upstream=parameters$promoter_ranges$upstream,
101
      downstream=parameters$promoter_ranges$downstream
Aurelien Brionne's avatar
Aurelien Brionne committed
102
103
104
105
    )

    ###################
    # convert to table
106
    promoters_DT<-data.table::data.table(AnnotationDbi::as.data.frame(promoters,row.names = NULL))[,.(tx_id,tx_name)]
Aurelien Brionne's avatar
Aurelien Brionne committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

    ###################
    # convert to character
    promoters_DT[,tx_id:=base::as.character(tx_id)]

    ###################
    # merge with crossref
    promoters_DT<-merge(promoters_DT,crossref,by.x="tx_id",by.y="TXID",sort=F,all.x=T)

    ###################
    # rename
    base::names(promoters_DT)[1:2]<-base::c("group","feature")

    ###################
    # format feature
    promoters_DT[,  tx_name:=feature]

    ###################
    # format feature
    promoters_DT[,`:=`(
      feature=base::paste(GENEID,feature,"promoter",sep=":"),
      GENEID=NULL,
      TXNAME=NULL
    )]
  }

  ###################
  # downstream
  ###################

  downstream=NULL;downstream_DT=NULL

  if("downstream"%in%features){

    ###################
    # get downstream
    downstream<- GenomicFeatures::transcripts(txdb)
    downstream<-GenomicRanges::flank(downstream,parameters$downstream_range,start=F)

    ###################
    # convert to table
148
    downstream_DT<-data.table::data.table(AnnotationDbi::as.data.frame(downstream,,row.names = NULL))[,.(tx_id,tx_name)]
Aurelien Brionne's avatar
Aurelien Brionne committed
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

    ###################
    # convert to character
    downstream_DT[,tx_id:=base::as.character(tx_id)]

    ###################
    # merge with crossref
    downstream_DT<-merge(downstream_DT,crossref,by.x="tx_id",by.y="TXID",sort=F,all.x=T)

    ###################
    # rename
    base::names(downstream_DT)[1:2]<-base::c("group","feature")

    ###################
    # format feature
    downstream_DT[,tx_name:=feature]

    ###################
    # format feature
    downstream_DT[,`:=`(
      feature=base::paste(GENEID,feature,"downstream",sep=":"),
      GENEID=NULL,
      TXNAME=NULL
    )]
  }

  ###################
  # 5' UTR
  ###################

  UTR5=NULL;UTR5_DT=NULL

  if("UTR5"%in%features){

    ###################
    # get 5' UTR in GRangesList
    UTR5<-GenomicFeatures::fiveUTRsByTranscript(txdb,use.names=F)

    ###################
    # convert to table
189
    UTR5_DT<-data.table::data.table(BiocGenerics::as.data.frame(UTR5,,row.names = NULL))[,.(group_name,exon_rank)]
Aurelien Brionne's avatar
Aurelien Brionne committed
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224

    ###################
    # convert to GRanges
    UTR5<-base::unlist(UTR5)

    ###################
    # merge with crossref
    UTR5_DT<-merge(UTR5_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T)

    ###################
    # format feature
    UTR5_DT[,`:=`(
      feature=base::paste(GENEID,":",TXNAME,":5'UTR(exon",exon_rank,")",sep=""),
      exon_rank=NULL,
      GENEID=NULL,
      tx_name=TXNAME,
      TXNAME=NULL,
      group_name=NULL
    )]
  }

  ###################
  # 3' UTR
  ###################

  UTR3=NULL;UTR3_DT=NULL

  if("UTR3"%in%features){

    ###################
    # get 3' UTR
    UTR3<-GenomicFeatures::threeUTRsByTranscript(txdb,use.names=F)

    ###################
    # convert to table
225
    UTR3_DT<-data.table::data.table(AnnotationDbi::as.data.frame(UTR3,,row.names = NULL))[,.(group_name,exon_rank)]
Aurelien Brionne's avatar
Aurelien Brionne committed
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260

    ###################
    # convert to GRanges
    UTR3<-base::unlist(UTR3)

    ###################
    # merge with crossref
    UTR3_DT<-merge(UTR3_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T)

    ###################
    # format feature
    UTR3_DT[,`:=`(
      feature=base::paste(GENEID,":",TXNAME,":3'UTR(exon",exon_rank,")",sep=""),
      exon_rank=NULL,
      GENEID=NULL,
      tx_name=TXNAME,
      TXNAME=NULL,
      group_name=NULL
    )]
  }

  ###################
  # Exons
  ###################

  exons=NULL;exons_DT=NULL

  if("exon"%in%features){

    ###################
    # get exons
    exons<-GenomicFeatures::exonsBy(txdb,use.names=F)

    ###################
    # convert to table
261
    exons_DT<-data.table::data.table(AnnotationDbi::as.data.frame(exons,row.names = NULL))[,.(group_name,exon_rank)]
Aurelien Brionne's avatar
Aurelien Brionne committed
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305

    ###################
    # convert to GRanges
    exons<-base::unlist(exons)

    ###################
    # count exons by transcripts
    exons_nb<-exons_DT[!base::is.na(group_name),.N,by=group_name]

    ###################
    # merge count to exons_DT
    exons_DT<-merge(exons_DT,exons_nb,by="group_name",all.x=T,sort=F)

    ###################
    # merge with crossref
    exons_DT<-merge(exons_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T)

    ###################
    # format feature
    exons_DT[,`:=`(
      feature=base::paste(GENEID,":",TXNAME,":exon",exon_rank,"/",N,sep=""),
      exon_rank=NULL,
      GENEID=NULL,
      tx_name=TXNAME,
      TXNAME=NULL,
      N=NULL,
      group_name=NULL
    )]
  }

  ###################
  # Introns
  ###################

  introns=NULL;introns_DT=NULL

  if("intron"%in%features){

    ###################
    # get introns
    introns<-GenomicFeatures::intronsByTranscript(txdb,use.names=F)

    ###################
    # convert to table
306
    introns_DT<-data.table::data.table(AnnotationDbi::as.data.frame(introns,row.names = NULL))[,.(group_name,strand)]
Aurelien Brionne's avatar
Aurelien Brionne committed
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359

    ###################
    # convert to GRanges
    introns<-base::unlist(introns)

    ###################
    # count introns by transcripts
    introns_nb<-introns_DT[!base::is.na(group_name),.N,by=group_name]

    ###################
    # merge count to introns_DT
    introns_DT<-merge(introns_DT,introns_nb,by="group_name",all.x=T,sort=F)

    ###################
    # add intron_rank
    introns_DT[,intron_rank:=1:base::length(N),by=group_name]

    ###################
    # reverse intron_rank for reverse strand
    introns_DT[strand=="-",intron_rank:=base::rev(intron_rank),by=group_name]

    ###################
    # merge with crossref
    introns_DT<-merge(introns_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T)

    ###################
    # format feature
    introns_DT[,`:=`(
      feature=base::paste(GENEID,":",TXNAME,":intron",intron_rank,"/",N,sep=""),
      intron_rank=NULL,
      strand=NULL,
      GENEID=NULL,
      tx_name=TXNAME,
      TXNAME=NULL,
      N=NULL,
      group_name=NULL
    )]
  }

  ###################
  # CDS
  ###################

  cds=NULL;cds_DT=NULL

  if("cds"%in%features){

    ###################
    # get exons
    cds<-GenomicFeatures::cdsBy(txdb,use.names=F)

    ###################
    # convert to table
360
    cds_DT<-data.table::data.table(AnnotationDbi::as.data.frame(cds,row.names = NULL))[,.(group_name,exon_rank)]
Aurelien Brionne's avatar
Aurelien Brionne committed
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391

    ###################
    # convert to GRanges
    cds<-base::unlist(cds)

    ###################
    # count exons by transcripts
    cds_nb<-cds_DT[!base::is.na(group_name),.N,by=group_name]

    ###################
    # merge count to exons_DT
    cds_DT<-merge(cds_DT,cds_nb,by="group_name",all.x=T,sort=F)

    ###################
    # merge with crossref
    cds_DT<-merge(cds_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T)

    ###################
    # format feature
    cds_DT[,`:=`(
      feature=base::paste(GENEID,":",TXNAME,":exon",exon_rank,"/",N,sep=""),
      exon_rank=NULL,
      GENEID=NULL,
      tx_name=TXNAME,
      TXNAME=NULL,
      N=NULL,
      group_name=NULL
    )]
  }

  ###################
Aurelien Brionne's avatar
Aurelien Brionne committed
392
  # load annotation complement
Aurelien Brionne's avatar
Aurelien Brionne committed
393
394
395
  ###################

    ###################
Aurelien Brionne's avatar
Aurelien Brionne committed
396
397
398
399
    # check file name
    file=AnnotationDbi::metadata(txdb)$value[3]

    if(!base::is.na(file)){
Aurelien Brionne's avatar
Aurelien Brionne committed
400

401
      ###################
Aurelien Brionne's avatar
Aurelien Brionne committed
402
403
      # download
      if(base::length(grep("ftp.+\\.gz$",file))>0){
Aurelien Brionne's avatar
Aurelien Brionne committed
404

Aurelien Brionne's avatar
Aurelien Brionne committed
405
406
407
        ###################
        #  output name
        output=base::sub("^.+/","",file)
Aurelien Brionne's avatar
Aurelien Brionne committed
408

Aurelien Brionne's avatar
Aurelien Brionne committed
409
410
411
        ###################
        # import the GFF
        utils::download.file(file,quiet=T,destfile=output)
412

Aurelien Brionne's avatar
Aurelien Brionne committed
413
414
415
        ##################
        #  uncompress
        R.utils::gunzip(output)
416

Aurelien Brionne's avatar
Aurelien Brionne committed
417
418
419
420
        ##################
        # file to download
        file=output
      }
Aurelien Brionne's avatar
Aurelien Brionne committed
421

Aurelien Brionne's avatar
Aurelien Brionne committed
422
423
424
      ###################
      # read the gene file as GRanges
      gene_table<-rtracklayer::import(file)
Aurelien Brionne's avatar
Aurelien Brionne committed
425

Aurelien Brionne's avatar
Aurelien Brionne committed
426
427
428
429
430
431
432
433
434
435
436
437
      ###################
      # convert to data.table
      gene_table<-data.table::data.table(
        BiocGenerics::as.data.frame(gene_table,,row.names = NULL)
      )[ data.table::like(type,"RNA|transcript"),]


      if(base::length(base::grep("gtf$",file))>0){

        ###################
        # select columns from gtf
        gene_table<-gene_table[,
Aurelien Brionne's avatar
Aurelien Brionne committed
438
          .(seqnames,start,end,strand,type,gene_name,transcript_id,gene_id)
Aurelien Brionne's avatar
Aurelien Brionne committed
439
440
441
442
443
444
445
446
447
448
449
450
451
        ]

      }else{

        ###################
        # select columns from gff
        gene_table<-gene_table[,
          .(seqnames,start,end,strand,type,Dbxref,gene,transcript_id)
        ]

        ###################
        # extract gene_id
        gene_table[,
Aurelien Brionne's avatar
Aurelien Brionne committed
452
          gene_id:=base::sub("^.+:","",Dbxref[[1]][1]),
Aurelien Brionne's avatar
Aurelien Brionne committed
453
          by=1:base::nrow(gene_table)
Aurelien Brionne's avatar
Aurelien Brionne committed
454
455
456
457
458
459
        ][,Dbxref:=NULL]

        ###################
        # reneme gene to gene_name
        base::names(gene_table)[6]<-"gene_name"

Aurelien Brionne's avatar
Aurelien Brionne committed
460
      }
461
462

  }else{
Aurelien Brionne's avatar
Aurelien Brionne committed
463

464
465
    ###################
    # else NA
Aurelien Brionne's avatar
Aurelien Brionne committed
466
    gene_table=data.table::data.table(gene_table=NA)
467
  }
Aurelien Brionne's avatar
Aurelien Brionne committed
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483

  ###################
  # create genome_features
  ###################

  ###################
  # create genome_features
  methods::new("genome_features",
    metadata=base::list(data.table::data.table(AnnotationDbi::metadata(txdb))),
    promoter=base::list(GRanges=promoters,DT=promoters_DT),
    UTR5=base::list(GRanges=UTR5,DT=UTR5_DT),
    UTR3=base::list(GRanges=UTR3,DT=UTR3_DT),
    exon=base::list(GRanges=exons,DT=exons_DT),
    intron=base::list(GRanges=introns,DT=introns_DT),
    cds=base::list(GRanges=cds,DT=cds_DT),
    downstream=base::list(GRanges=downstream,DT=downstream_DT),
Aurelien Brionne's avatar
Aurelien Brionne committed
484
    gene_table=gene_table
Aurelien Brionne's avatar
Aurelien Brionne committed
485
486
  )
})