Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • noemien.maillard/EAGLE
1 result
Show changes
Commits on Source (3)
# recursive ignorance of cache memory
**/__pycache__
**/.Rhistory
**/.RData
MIT License
Copyright (c) 2023 MAILLARD NOEMIEN
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Creation date: 2023/10/16
# Last review: 2023/12/12
# By Noémien MAILLARD
# Laboratory: GENEPI
# Project: EAGLE
# Script aim: Generate ROC curves to determine best DeepBind threshold(s)
# Modules versions: R version 4.3.2, caret 6.0-94, GenomicRanges 1.54.1, ggplot2 3.4.4, pracma 2.4.4, reshape2 1.4.4, ROCR 1.0-11, rtracklayer 1.62.0, tibble 3.2.1
library(caret)
library(GenomicRanges)
library(ggplot2)
library(pracma)
library(reshape2)
library(ROCR)
library(rtracklayer)
library(tibble)
# ggplot theme settings
theme_update(plot.title = element_text(hjust = 0.5)) # to set centered title for all plots
theme_update(plot.subtitle = element_text(hjust = 0.5)) # to set centered subtitle for all plots
theme_update(panel.background=element_rect(fill="white", color='black'))
theme_update(panel.grid=element_blank())
# load bins file
bins <- read.csv('/home/nomaillard/Documents/predictions_from_Sscrofa_refseq/deepbind/20220722_Analyse_listQTL1_2429_SNP_v11.bins.bed', header=F, sep='\t', col.names=c('chr', 'start', 'end'))
# load deepbind bigwig and keep scores > 0 (peak probability of 50%)
deepbind <- import.bw('/home/nomaillard/Documents/predictions_from_Sscrofa_refseq/deepbind/bigwig/D00328.018_CTCF_Homo_sapiens_ChIP-seq.bw')
deepbind$prob <- sigmoid(deepbind$score)
# load observations (merging of bed files)
ctcf_obs_liver <- import.bed('/home/nomaillard/Documents/03_ncbigeo/ranged_bed/CTCF_liver_merged_ranged.bed')
ctcf_obs_spleen <- import.bed('/home/nomaillard/Documents/03_ncbigeo/ranged_bed/CTCF_spleen_merged_ranged.bed')
ctcf_obs_lung <- import.bed('/home/nomaillard/Documents/03_ncbigeo/ranged_bed/GSM4799689_CTCF_Lung_P348_Peaks_ranged.bed')
# limit start-stop ranges to deepbind predictions ranges
ctcf_obs_liver <- ctcf_obs_liver[start(ctcf_obs_liver)>bins$start[1] & end(ctcf_obs_liver)<tail(bins$end, n=1)]
ctcf_obs_spleen <- ctcf_obs_spleen[start(ctcf_obs_spleen)>bins$start[1] & end(ctcf_obs_spleen)<tail(bins$end, n=1)]
ctcf_obs_lung <- ctcf_obs_lung[start(ctcf_obs_lung)>bins$start[1] & end(ctcf_obs_lung)<tail(bins$end, n=1)]
# bins (df) to gr
gr_bins <- GRanges(
seqnames=Rle(bins$chr),
ranges=IRanges(start=bins$start, end=bins$end)
)
# transform obs dataframe to GRanges object
df_to_bins <- function(gr_bins=gr_bins, df){
return( gr_bins[queryHits(findOverlaps(gr_bins, df))] )
}
# setup labels function
set_labels <- function(gr_pred, gr_obs){
true_labels <- findOverlaps(gr_pred, gr_obs)
gr_pred$labels <- 0
# replace 0 by 1 for all true labels
for (i in 1:length(true_labels)){
gr_pred[queryHits(true_labels[i])]$labels <- 1
}
# store proba and labels (related to obs data)
df <- data.frame(
pred=gr_pred$prob,
labels=gr_pred$labels
)
return(df)
}
# ROC and PR curves
calc_roc <- function(table, plot=FALSE){
# use ROCR library to calculate data for ROC
pred <- prediction(table$pred, table$labels)
perf <- performance(pred, "tpr", "fpr")
# store youden and threshold (in percentage) to assign to predictions associated with tpr, fpr
df_roc <- data.frame(
tpr=unlist(perf@y.values),
fpr=unlist(perf@x.values),
threshold=unlist(perf@alpha.values),
youden=unlist(perf@y.values)-unlist(perf@x.values)
)
# plot ROC with lines associated with optimal youden
if(plot){
plot(perf)
abline(
h=df_roc$tpr[df_roc$youden==max(df_roc$youden)],
v=df_roc$fpr[df_roc$youden==max(df_roc$youden)],
col='red',
lty='dotted'
)
}
return(df_roc)
}
calc_prc <- function(table, plot=FALSE){
pred <- prediction(table$pred, table$labels)
perf <- performance(pred, "tpr", "fpr")
# P=precision, R=recall
df_prc <- data.frame(
P=unlist(pred@tp)/(unlist(pred@tp)+unlist(pred@fp)),
R=unlist(pred@tp)/(unlist(pred@tp)+unlist(pred@fn)),
threshold=unlist(perf@alpha.values),
f.measure=(2*unlist(pred@tp))/(2*unlist(pred@tp)+unlist(pred@fp)+unlist(pred@fn))
)
# correct NA (=infinity) values
df_prc$P[1] <- 1
df_prc$threshold[1] <- 1
# plot PR curve
if(plot){
plot(
df_prc$R,
df_prc$P,
type='l',
xlab='Recall',
ylab='Precision'
)
}
return(df_prc)
}
# confusion matrix
plot_matrix <- function(gr_obs, gr_pred, bins){
# setup theme
theme_update(panel.background=element_rect(fill="white", color="white", linetype="solid"))
theme_update(panel.grid=element_line(color="white"))
# Find overlapping ranges
overlaps <- findOverlaps(gr_pred, gr_obs)
# calc observed pos/neg
obs_pos <- length(gr_obs)
obs_neg <- nrow(bins)-length(gr_obs)
# calc true/false pos/neg
true_positive <- length(queryHits(overlaps))
false_positive <- length(gr_pred)-length(queryHits(overlaps))
false_negative <- obs_pos-true_positive
true_negative <- nrow(bins)-sum(true_positive, false_positive, false_negative)
# factor positive and negative observations
lvl <- c('Peak', 'No Peak')
observed <- factor(
rep(lvl, times=c(obs_pos, obs_neg)),
levels=rev(lvl)
)
# factor prediction against observations (true/false)
predicted <- factor(
c(
rep(lvl, times=c(true_positive, false_negative)),
rep(lvl, times=c(false_positive, true_negative))
),
levels=rev(lvl)
)
# init confusion matrix (cm) and turn into dataframe (plt)
cm <- confusionMatrix(predicted, observed, dnn=c('Predicted', 'Observed'))
plt <- as.data.frame(cm$table)
# factorize predictions
plt$Predicted <- factor(plt$Predicted, levels=rev(levels(plt$Predicted)))
# generate plot with gradient
cm_plot <- ggplot(plt, aes(Observed, Predicted, fill= Freq)) +
geom_tile(color='black') + geom_text(aes(label=Freq), size=10) +
scale_fill_gradient(low="#56e0ab", high="#f4fdfa") +
labs(x = "Observed",y = "Predicted") +
scale_x_discrete(labels=c("No Peak","Peak")) +
scale_y_discrete(labels=c("Peak","No Peak")) +
theme(
axis.text=element_text(size=20),
axis.text.y=element_text(angle=90, hjust=0.5),
axis.title=element_text(size=24),
axis.ticks=element_blank()
)
return(cm_plot)
}
run_sample <- function(smpl_df, gr_bins=gr_bins, gr_deepbind=deepbind, plot=FALSE){
# binarize sample
gr_binned_smpl <- df_to_bins(gr_bins=gr_bins, df=smpl_df)
# label predictions according to observed data
df_labeled <- set_labels(gr_pred=deepbind, gr_obs=gr_binned_smpl)
# generate roc and prc curves
roc_df <- calc_roc(table=df_labeled, plot=plot)
prc_df <- calc_prc(table=df_labeled, plot=plot)
# set best threshold and confusion matrix with it
roc_df50 <- roc_df[roc_df$threshold>=0.5,]
best_threshold <- roc_df50$threshold[roc_df50$youden==max(roc_df50$youden)]
if (plot){
# confusion matrix before threshold setting (predicted peak=proba 50%)
print(plot_matrix(
gr_obs=gr_binned_smpl,
gr_pred=gr_deepbind[gr_deepbind$prob>=0.5],
bins=bins
))
# confusion matrix after threshold setting (predicted peak=best_threshold>=50%)
print(plot_matrix(
gr_obs=gr_binned_smpl,
gr_pred=gr_deepbind[gr_deepbind$prob>=best_threshold],
bins=bins
))
}
return(roc_df50[roc_df50$youden==max(roc_df50$youden),])
}
##### Use concatenated functions for each sample
for (smpl in list(ctcf_obs_liver, ctcf_obs_spleen, ctcf_obs_lung)){
write.table(
x=run_sample(smpl_df=smpl, gr_bins=gr_bins, gr_deepbind=deepbind, plot=T),
file="/home/nomaillard/Documents/ROC_PRC/test_youden.txt",
append=TRUE,
sep='\t',
row.names=FALSE
)
}
# Creation date: 2023/11/22
# Last review: 2023/12/12
# By Noémien MAILLARD
# Laboratory: GENEPI
# Project: EAGLE
# Script aim: Split chromosomes/scaffolds into n regular files of n_bins (+ 1 file with less than n_bins if necessary)
# Modules versions: R version 4.1.2, readxl 1.4.3, GenomicRanges 1.54.1, BSgenome 1.70.1, BSgenome.Sscrofa.UCSC.susScr11 1.4.2, seqinr 4.2-36
setwd("/home/nomaillard/Documents/Collab_GenEpi/data/Sscrofa_genome_binned/ChIP_hepG2/")
library(readxl)
library(GenomicRanges)
library(BSgenome.Sscrofa.UCSC.susScr11)
library(seqinr)
window=200 # bins size
n_bins=500 # n bins per file
# store seqnames and seqlengths for each chr/scaffold
df_seq <- data.frame(
"seqname"=names(seqlengths(BSgenome.Sscrofa.UCSC.susScr11)),
"seqlength"=seqlengths(BSgenome.Sscrofa.UCSC.susScr11)
)
bed_save <- '/home/nomaillard/Documents/Collab_GenEpi/results/deepbind/Sscrofa_genome_binned/ChIP_hepG2/'
# WARNING : long to run
# name = scaffold/chromosome
for (name in seqnames(BSgenome.Sscrofa.UCSC.susScr11)){
dir.create(name)
# Note: last range is lower than window size and will be removed during the loop
bin.GR=GRanges(
name,
IRanges(
breakInChunks(
totalsize=df_seq$seqlength[df_seq$seqname==name],
chunksize=window
) # end breakInChunk
) # end IRanges
) # end GRanges
# drop last bin if not of window size
if (width(bin.GR[length(bin.GR)]) != window){
bin.GR <- bin.GR[-c(length(bin.GR))]
}
# write 1 bed file per scaffold
write.table(as.data.frame(bin.GR)[,1:3],file=paste0(bed_save, name, ".bins.bed"),
row.names=F,col.names=F,sep='\t',quote=F)
if (length(bin.GR) < n_bins){
# store the only file into GRangesList
list.bin.GR=GRangesList(bin.GR)
}else if(length(bin.GR) %% n_bins == 0){
# split bin.GR into n_files of n_bins and store as GRangesList
n_files=trunc(length(bin.GR)/n_bins)
list.bin.GR=as(
split(bin.GR, rep(1:n_files, each=n_bins)),
"GRangesList"
) # close as GRangesList
}else{
# split bin.GR into n_files of n_bins + 1 file with remaining bins
n_files=trunc(length(bin.GR)/n_bins)
bin.GR.firsts=bin.GR[c(1:(n_files*n_bins))] # all GR except the last one
bin.GR.last=bin.GR[c((n_files*n_bins+1):length(bin.GR))] # last GR (less than n_bins bins)
list.bin.GR.firsts=as(
split(bin.GR.firsts, rep(1:n_files, each=n_bins)),
"GRangesList"
) # each = n_bins by GR (GR=output file) and keep it uncompressed
# concatenate all bin.GR into 1 GRangesList
list.bin.GR=append(list.bin.GR.firsts, GRangesList(bin.GR.last))
}
# for each GR, get and write fasta seq in numbered file
for (i in 1:length(list.bin.GR)){
bin.seq=getSeq(
BSgenome.Sscrofa.UCSC.susScr11,
names=seqnames(list.bin.GR[[i]]),
start=start(list.bin.GR[[i]]),
end=end(list.bin.GR[[i]])
)
bin.seq=as(bin.seq, "DNAStringSet") # turn DNAString to DNASringSet for uniq bin GR
writeXStringSet(x=bin.seq,filepath=paste0(name, "/", name, ".bins.", i, ".fa"))
}
}
# Creation date: 2023/11/24
# Last review: 2023/12/12
# By Noémien MAILLARD
# Laboratory: GENEPI
# Project: EAGLE
# Script aim: Subset deepbind experiments to choose the ones to predict
# Modules versions: R version 4.3.2, tidyr 1.3.0
library(tidyr)
# set input directory and database (db) files
setwd(dir = "/home/nomaillard/Documents/predictions_from_Sscrofa_refseq/deepbind")
db <- read.csv('db.tsv', header=T, skip=21, sep='\t')
# subset ChIP-seq not deprecated and store column containing cell line
db <- db[db$Experiment == 'ChIP-seq',]
db <- db[db$Labels != 'deprecated',]
cell.line <- as.data.frame(db$Experiment.Details)
colnames(cell.line) <- c('exp.details')
# not elegant way to extract cell lines but it works
cell.line <- cell.line %>%
separate(exp.details, c('cells'), sep=',')
cell.line <- cell.line %>%
separate(cells, c('bracket', 'cells'), sep='\'')
cell.line <- cell.line %>%
separate(cells, c('cell_line', 'cells'), sep='=')
# store unique cell lineS
cell.lines <- cell.line$cells
unique(cell.line$cells)
# write cell line IDs in a file, 1 ID per line
write.table(
data.frame(db$ID[grepl('HepG2', db$Experiment.Details)]),
file='/home/nomaillard/Documents/Collab_GenEpi/programs/deepbind/db/hepG2_IDs.txt',
quote=F,
sep='\t',
col.names=F,
row.names=F
)
# Creation date: 2023/12/12
# Last review: 2023/12/12
# By Noémien MAILLARD
# Laboratory: GENEPI
# Project: EAGLE
# Script aim: Generate bw files from deepbind prediction data. Inspired from 01_construct_bw_from_deepbind file.
# Modules versions: R version 4.3.2, GenomicRanges 1.54.1, rtracklayer 1.62.0, stringr 1.5.1
library(GenomicRanges)
library(rtracklayer)
library(stringr)
# set input directory for bins and deepbind predictions and database (db) files
setwd(dir = "/home/nomaillard/Documents/Collab_GenEpi/data/Sscrofa_genome_binned/ChIP_hepG2/")
deepbind <- read.csv('20220722_Analyse_listQTL1_2429_SNP_v11.bins.deepbind.txt', header=T, sep='\t')
bins <- read.csv('20220722_Analyse_listQTL1_2429_SNP_v11.bins.bed', header=F, sep='\t')
database <- read.csv('db.tsv', header=T, skip=21, sep='\t')
#!/usr/bin/bash
# load deepbind, parameters selected with Rscript (02_select_deepbind_experiement) and directory containing 1 directory per scaffold
deepbind='/home/nomaillard/Documents/Collab_GenEpi/programs/deepbind/deepbind'
params='/home/nomaillard/Documents/Collab_GenEpi/programs/deepbind/db/samples/ChIP_hepG2'
chr_dirs='/home/nomaillard/Documents/Collab_GenEpi/data/Sscrofa_genome_binned/ChIP_hepG2/'
for scaffold in `ls $chr_dirs`
do
# load binned fasta
list_binned_fasta=`ls $chr_dirs$scaffold/*.fa`
# store outputs with same name as binned fasta but with txt extension
outputs=$(for file in $list_binned_fasta; do echo ${file%.*}".txt";done)
# run deepbind
python deepbind_useCPU.py --deepbind $deepbind --params $params --binned-fasta $list_binned_fasta --outputs $outputs -p 22
done
#!/usr/bin/bash
# load directory containing 1 directory per scaffold and output
chr_dirs='/home/nomaillard/Documents/Collab_GenEpi/data/Sscrofa_genome_binned/ChIP_hepG2/'
output_path='/home/nomaillard/Documents/Collab_GenEpi/results/deepbind/Sscrofa_genome_binned/ChIP_hepG2/'
for scaffold in `ls $chr_dirs`;
do
echo "concatenating files for $scaffold..."
# scores_files are files containing predictions
scores_files=`ls $chr_dirs$scaffold/*.txt`
python concat_deepbind.py --files $scores_files --output $output_path$scaffold.bin.deepbind.txt
done
#! /usr/bin/python3
# Creation date: 2023/11/17
# Last review: 2023/12/12
# By Noémien MAILLARD
# Laboratory: GENEPI
# Project: EAGLE
# Script aim: Concatenate deepbind prediction files.
import argparse
from natsort import natsorted
import pandas as pd
def get_options():
parser = argparse.ArgumentParser()
parser.add_argument(
'-f',
'--files',
nargs='+',
help='List of files to concatenate.'
)
parser.add_argument(
'-o',
'--output',
help='Output file name.'
)
parser.add_argument(
'--sep',
default='\t',
help='Input and output tables separator. Default to tab.'
)
return parser.parse_args()
def sort_and_concat(files_list, sep):
# sort files in numerical order
sorted_list = natsorted(files_list)
# read files in list ready to concatenate
df_list = [pd.read_csv(file, sep=sep, engine='python') for file in sorted_list]
return pd.concat(df_list)
if __name__ == '__main__':
args = get_options()
sort_and_concat(files_list=args.files, sep=args.sep).to_csv(args.output, sep=args.sep, index=False)
#! /usr/bin/python3
# Creation date: 2023/11/17
# Last review: 2023/12/12
# By Noémien MAILLARD
# Laboratory: GENEPI
# Project: EAGLE
# Script aim: Parallelize deepbind predictions.
import argparse
import multiprocessing as mp
import subprocess as sp
import sys
def get_options():
parser = argparse.ArgumentParser()
parser.add_argument(
'--deepbind',
help='Deepbind executable file (including path).'
)
parser.add_argument(
'--params',
help='Deepbind parameters path. The last directory should contain all params files.'
)
parser.add_argument(
'-f',
'--binned-fasta',
nargs='+',
help='List of fasta files containing binned sequences.'
)
parser.add_argument(
'-o',
'--outputs',
nargs='+',
help='List of output files name including path.'
)
parser.add_argument(
'-p',
'--processes',
type=int,
default=None,
help='Number of worker processes to use. Default to None so the number returned by os.cpu_count() is used.'
)
return parser.parse_args()
def sp_deepbind(db_path, ids_path, data_path, output_path):
# run sh command
sp.Popen(
f"{db_path} `ls {ids_path} | sed s/.txt//g` < {data_path} > {output_path}",
shell=True
).wait()
if __name__ == '__main__':
args = get_options()
# check as much outputs than inputs
if len(args.binned_fasta) != len(args.outputs):
sys.exit('Binned fasta and outputs have to be of same length. Exit code 1')
# order fasta files and output files
list_binned_fasta = list(sorted(args.binned_fasta))
list_outputs = list(sorted(args.outputs))
# create a list of arguments for each run
runs_args = [[args.deepbind, args.params, fasta, output] for fasta, output in zip(list_binned_fasta, list_outputs)]
# parallelize runs
with mp.Pool(processes=args.processes) as pool:
pool.starmap(sp_deepbind, runs_args)
natsort==8.4.0
numpy==1.26.2
pandas==2.1.3
python-dateutil==2.8.2
pytz==2023.3.post1
setuptools==68.0.0
six==1.16.0
tzdata==2023.3
wheel==0.37.1