#' @title build a convenient genomic features object. #' @description build a convenient \code{\link{genome_features-class}} object needed for find overlaps beetween #' selected features and genomic coordinates in a single step. #' @importFrom AnnotationDbi select keys as.data.frame metadata #' @importFrom GenomicFeatures promoters transcripts fiveUTRsByTranscript threeUTRsByTranscript exonsBy intronsByTranscript cdsBy #' @importFrom GenomicRanges flank #' @importFrom rtracklayer import #' @importFrom BiocGenerics as.data.frame #' @importFrom methods new #' @import data.table #' @param txdb a txdb database built with \pkg{GenomicFeatures} (see example). #' @param features a \code{\link[base]{vector}} of features \code{\link[base]{character}} with by #' default all the availables features ("promoter", "UTR5", "UTR3", "exon", "intron", "cds", and "downstream"). #' @param parameters a \code{\link[base]{list}} of parameters to use in order to define promoter and #' downstream ranges distance to TSS (Transcript Start Site) or TES (Transcript End Site). #' \itemize{ #' \item{promoter_ranges: a \code{\link[base]{list}} with upstream and dowstream elements distance (in base) to TSS}. #' \item{downstream_range: the dowstream distance (in base) to the TES}. #' } #' @details This method build a convenient \code{\link{genomic_features-class}} object, in order to be used for finding overlaps beetween #' all selected features and genomic coordinates with the \code{\link{genome_features_overlaps}} method. #' @return a \code{\link{genome_features-class}} object. #' @include genome_features.R #' @examples #' #' ################### # # import the chicken genome GFF from NCBI ftp (galGal5 in this case) #' utils::download.file("ftp://ftp.ncbi.nlm.nih.gov/genomes/Gallus_gallus/GFF/ref_Gallus_gallus-5.0_top_level.gff3.gz", #' quiet=T,destfile = "galGal5_gff.gz") #' #' ################## #' # uncompress the file #' R.utils::gunzip("galGal5_gff.gz") #' #' ################### #' # build the database #' toplevel=GenomicFeatures::makeTxDbFromGFF( #' file="galGal5_gff", #' format="gff", #' organism="Gallus gallus", #' taxonomyId=9031, #' dbxrefTag="ID" #' ) #' #' ################### #' # build genome_features object from txdb #' features<-GenomeFeatures::build_genome_features( #' toplevel, #' features=base::c("promoter","UTR5","UTR3","exon","intron","cds","downstream"), #' parameters = base::list( #' promoter_ranges=base::list( #' upstream=3000, #' downstream=500 #' ), #' downstream_range=1000 #' ) #' ) #' @export setGeneric(name="build_genome_features",def=function(txdb,features=base::c("promoter","UTR5","UTR3","exon","intron","cds","downstream"), parameters=base::list(promoter_ranges=base::list(upstream=3000,downstream=500),downstream_range=1000)){standardGeneric("build_genome_features")}) setMethod("build_genome_features",definition=function(txdb,features,parameters) { ################### # check txdb ################### if(base::class(txdb)!="TxDb") base::stop("txdb must be a TxDb class object") ################### # txdb ids crossref ################### ################### # build ids cross reference crossref<-data.table::data.table( AnnotationDbi::select( txdb, columns=c("TXID","TXNAME"), keytype="GENEID", keys=AnnotationDbi::keys(txdb) ) ) ################### # convert TXID to character crossref[,TXID:=base::as.character(TXID)] ################### # promoters ################### promoters=NULL;promoters_DT=NULL if("promoter"%in%features){ ################### # get promoters promoters <- GenomicFeatures::promoters( txdb, upstream=parameters$promoter_ranges$upstream, downstream=parameters$promoter_ranges$downstream ) ################### # convert to table promoters_DT<-data.table::data.table(AnnotationDbi::as.data.frame(promoters,row.names = NULL))[,.(tx_id,tx_name)] ################### # convert to character promoters_DT[,tx_id:=base::as.character(tx_id)] ################### # merge with crossref promoters_DT<-merge(promoters_DT,crossref,by.x="tx_id",by.y="TXID",sort=F,all.x=T) ################### # rename base::names(promoters_DT)[1:2]<-base::c("group","feature") ################### # format feature promoters_DT[, tx_name:=feature] ################### # format feature promoters_DT[,`:=`( feature=base::paste(GENEID,feature,"promoter",sep=":"), GENEID=NULL, TXNAME=NULL )] } ################### # downstream ################### downstream=NULL;downstream_DT=NULL if("downstream"%in%features){ ################### # get downstream downstream<- GenomicFeatures::transcripts(txdb) downstream<-GenomicRanges::flank(downstream,parameters$downstream_range,start=F) ################### # convert to table downstream_DT<-data.table::data.table(AnnotationDbi::as.data.frame(downstream,,row.names = NULL))[,.(tx_id,tx_name)] ################### # convert to character downstream_DT[,tx_id:=base::as.character(tx_id)] ################### # merge with crossref downstream_DT<-merge(downstream_DT,crossref,by.x="tx_id",by.y="TXID",sort=F,all.x=T) ################### # rename base::names(downstream_DT)[1:2]<-base::c("group","feature") ################### # format feature downstream_DT[,tx_name:=feature] ################### # format feature downstream_DT[,`:=`( feature=base::paste(GENEID,feature,"downstream",sep=":"), GENEID=NULL, TXNAME=NULL )] } ################### # 5' UTR ################### UTR5=NULL;UTR5_DT=NULL if("UTR5"%in%features){ ################### # get 5' UTR in GRangesList UTR5<-GenomicFeatures::fiveUTRsByTranscript(txdb,use.names=F) ################### # convert to table UTR5_DT<-data.table::data.table(BiocGenerics::as.data.frame(UTR5,,row.names = NULL))[,.(group_name,exon_rank)] ################### # convert to GRanges UTR5<-base::unlist(UTR5) ################### # merge with crossref UTR5_DT<-merge(UTR5_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T) ################### # format feature UTR5_DT[,`:=`( feature=base::paste(GENEID,":",TXNAME,":5'UTR(exon",exon_rank,")",sep=""), exon_rank=NULL, GENEID=NULL, tx_name=TXNAME, TXNAME=NULL, group_name=NULL )] } ################### # 3' UTR ################### UTR3=NULL;UTR3_DT=NULL if("UTR3"%in%features){ ################### # get 3' UTR UTR3<-GenomicFeatures::threeUTRsByTranscript(txdb,use.names=F) ################### # convert to table UTR3_DT<-data.table::data.table(AnnotationDbi::as.data.frame(UTR3,,row.names = NULL))[,.(group_name,exon_rank)] ################### # convert to GRanges UTR3<-base::unlist(UTR3) ################### # merge with crossref UTR3_DT<-merge(UTR3_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T) ################### # format feature UTR3_DT[,`:=`( feature=base::paste(GENEID,":",TXNAME,":3'UTR(exon",exon_rank,")",sep=""), exon_rank=NULL, GENEID=NULL, tx_name=TXNAME, TXNAME=NULL, group_name=NULL )] } ################### # Exons ################### exons=NULL;exons_DT=NULL if("exon"%in%features){ ################### # get exons exons<-GenomicFeatures::exonsBy(txdb,use.names=F) ################### # convert to table exons_DT<-data.table::data.table(AnnotationDbi::as.data.frame(exons,row.names = NULL))[,.(group_name,exon_rank)] ################### # convert to GRanges exons<-base::unlist(exons) ################### # count exons by transcripts exons_nb<-exons_DT[!base::is.na(group_name),.N,by=group_name] ################### # merge count to exons_DT exons_DT<-merge(exons_DT,exons_nb,by="group_name",all.x=T,sort=F) ################### # merge with crossref exons_DT<-merge(exons_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T) ################### # format feature exons_DT[,`:=`( feature=base::paste(GENEID,":",TXNAME,":exon",exon_rank,"/",N,sep=""), exon_rank=NULL, GENEID=NULL, tx_name=TXNAME, TXNAME=NULL, N=NULL, group_name=NULL )] } ################### # Introns ################### introns=NULL;introns_DT=NULL if("intron"%in%features){ ################### # get introns introns<-GenomicFeatures::intronsByTranscript(txdb,use.names=F) ################### # convert to table introns_DT<-data.table::data.table(AnnotationDbi::as.data.frame(introns,row.names = NULL))[,.(group_name,strand)] ################### # convert to GRanges introns<-base::unlist(introns) ################### # count introns by transcripts introns_nb<-introns_DT[!base::is.na(group_name),.N,by=group_name] ################### # merge count to introns_DT introns_DT<-merge(introns_DT,introns_nb,by="group_name",all.x=T,sort=F) ################### # add intron_rank introns_DT[,intron_rank:=1:base::length(N),by=group_name] ################### # reverse intron_rank for reverse strand introns_DT[strand=="-",intron_rank:=base::rev(intron_rank),by=group_name] ################### # merge with crossref introns_DT<-merge(introns_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T) ################### # format feature introns_DT[,`:=`( feature=base::paste(GENEID,":",TXNAME,":intron",intron_rank,"/",N,sep=""), intron_rank=NULL, strand=NULL, GENEID=NULL, tx_name=TXNAME, TXNAME=NULL, N=NULL, group_name=NULL )] } ################### # CDS ################### cds=NULL;cds_DT=NULL if("cds"%in%features){ ################### # get exons cds<-GenomicFeatures::cdsBy(txdb,use.names=F) ################### # convert to table cds_DT<-data.table::data.table(AnnotationDbi::as.data.frame(cds,row.names = NULL))[,.(group_name,exon_rank)] ################### # convert to GRanges cds<-base::unlist(cds) ################### # count exons by transcripts cds_nb<-cds_DT[!base::is.na(group_name),.N,by=group_name] ################### # merge count to exons_DT cds_DT<-merge(cds_DT,cds_nb,by="group_name",all.x=T,sort=F) ################### # merge with crossref cds_DT<-merge(cds_DT,crossref,by.x="group_name",by.y="TXID",sort=F,all.x=T) ################### # format feature cds_DT[,`:=`( feature=base::paste(GENEID,":",TXNAME,":exon",exon_rank,"/",N,sep=""), exon_rank=NULL, GENEID=NULL, tx_name=TXNAME, TXNAME=NULL, N=NULL, group_name=NULL )] } ################### # load annotation complement ################### ################### # check file name file=AnnotationDbi::metadata(txdb)$value[3] if(!base::is.na(file)){ ################### # download if(base::length(grep("ftp.+\\.gz$",file))>0){ ################### # output name output=base::sub("^.+/","",file) ################### # import the GFF utils::download.file(file,quiet=T,destfile=output) ################## # uncompress R.utils::gunzip(output) ################## # file to download file=output } ################### # read the gene file as GRanges gene_table<-rtracklayer::import(file) ################### # convert to data.table gene_table<-data.table::data.table( BiocGenerics::as.data.frame(gene_table,,row.names = NULL) )[ data.table::like(type,"RNA|transcript"),] if(base::length(base::grep("gtf$",file))>0){ ################### # select columns from gtf gene_table<-gene_table[, .(seqnames,start,end,strand,type,gene_name,transcript_id,gene_id) ] }else{ ################### # select columns from gff gene_table<-gene_table[, .(seqnames,start,end,strand,type,Dbxref,gene,transcript_id) ] ################### # extract gene_id gene_table[, gene_id:=base::sub("^.+:","",Dbxref[[1]][1]), by=1:base::nrow(gene_table) ][,Dbxref:=NULL] ################### # reneme gene to gene_name base::names(gene_table)[6]<-"gene_name" } }else{ ################### # else NA gene_table=data.table::data.table(gene_table=NA) } ################### # create genome_features ################### ################### # create genome_features methods::new("genome_features", metadata=base::list(data.table::data.table(AnnotationDbi::metadata(txdb))), promoter=base::list(GRanges=promoters,DT=promoters_DT), UTR5=base::list(GRanges=UTR5,DT=UTR5_DT), UTR3=base::list(GRanges=UTR3,DT=UTR3_DT), exon=base::list(GRanges=exons,DT=exons_DT), intron=base::list(GRanges=introns,DT=introns_DT), cds=base::list(GRanges=cds,DT=cds_DT), downstream=base::list(GRanges=downstream,DT=downstream_DT), gene_table=gene_table ) })