Commit 0f8cc7be authored by Elise Maigne's avatar Elise Maigne
Browse files

first commit add script and data

parent 16e12e64
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
# 15EX05_data_correction
Correction of the data 15EX05 (presented in the data paper https://doi.org/10.1051/ocl/2020027)
\ No newline at end of file
Correction of the data 15EX05 (presented in the data paper https://doi.org/10.1051/ocl/2020027)
# Organization
- [scripts/Analysis_Fluidigm_15EX05_Pl01_10.m](./scripts/Analysis_Fluidigm_15EX05_Pl01_10.m) : Script matlab for Amplification efficiency and Ct normalization. The entry data for this script is not provided.
- data/Analysis_Fluidigm_15EX05_results_export_20180430-112453.txt : Resulting dataset from the previous script
- [scripts/1_correct_plate_effect.R](./scripts/1_correct_plate_effect.R) : R script to correct the plate effect, on the previous dataset.
- [scripts/2_correct_field_effect.R](./scripts/2_correct_field_effect.R) : R script to correct the field effect, on the exit file from the previous script.
- [scripts/3_handling_missings.R](./scripts/3_handling_missings.R) : R script to correct the plate effect, on the exit file from the previous script.
"HanXRQChr01g0009121"
"HanXRQChr01g0014581"
"HanXRQChr01g0020591"
"HanXRQChr01g0029831"
"HanXRQChr01g0030841"
"HanXRQChr02g0032221"
"HanXRQChr02g0034201"
"HanXRQChr02g0035161"
"HanXRQChr02g0037321"
"HanXRQChr02g0049881"
"HanXRQChr02g0050641"
"HanXRQChr02g0051881"
"HanXRQChr02g0053231"
"HanXRQChr02g0053371"
"HanXRQChr02g0053591"
"HanXRQChr02g0054031"
"HanXRQChr02g0055811"
"HanXRQChr02g0056421"
"HanXRQChr02g0056481"
"HanXRQChr02g0058891"
"HanXRQChr03g0060421"
"HanXRQChr03g0068781"
"HanXRQChr03g0072181"
"HanXRQChr03g0079641"
"HanXRQChr03g0080881"
"HanXRQChr03g0081711"
"HanXRQChr03g0084781"
"HanXRQChr03g0088971"
"HanXRQChr04g0096011"
"HanXRQChr04g0096181"
"HanXRQChr04g0100411"
"HanXRQChr04g0106071"
"HanXRQChr04g0108981"
"HanXRQChr04g0109421"
"HanXRQChr04g0113841"
"HanXRQChr04g0116751"
"HanXRQChr04g0117421"
"HanXRQChr05g0130741"
"HanXRQChr05g0131121"
"HanXRQChr05g0132681"
"HanXRQChr05g0137501"
"HanXRQChr05g0140571"
"HanXRQChr05g0152981"
"HanXRQChr05g0153191"
"HanXRQChr05g0158111"
"HanXRQChr06g0166611"
"HanXRQChr06g0183171"
"HanXRQChr07g0192751"
"HanXRQChr07g0192881"
"HanXRQChr07g0195691"
"HanXRQChr07g0198131"
"HanXRQChr07g0198391"
"HanXRQChr07g0200781"
"HanXRQChr07g0207111"
"HanXRQChr07g0207121"
"HanXRQChr08g0208191"
"HanXRQChr08g0213601"
"HanXRQChr08g0216831"
"HanXRQChr08g0220911"
"HanXRQChr08g0225461"
"HanXRQChr08g0226861"
"HanXRQChr08g0227171"
"HanXRQChr08g0235581"
"HanXRQChr09g0240721"
"HanXRQChr09g0244621"
"HanXRQChr09g0258321"
"HanXRQChr09g0259101"
"HanXRQChr09g0260121"
"HanXRQChr09g0262941"
"HanXRQChr09g0264011"
"HanXRQChr09g0267551"
"HanXRQChr09g0268481"
"HanXRQChr09g0268631"
"HanXRQChr09g0269271"
"HanXRQChr09g0269581"
"HanXRQChr09g0270841"
"HanXRQChr09g0273481"
"HanXRQChr09g0274011"
"HanXRQChr09g0274431"
"HanXRQChr10g0278051"
"HanXRQChr10g0280681"
"HanXRQChr10g0304701"
"HanXRQChr10g0306801"
"HanXRQChr10g0309591"
"HanXRQChr10g0319451"
"HanXRQChr11g0321991"
"HanXRQChr11g0329911"
"HanXRQChr11g0336031"
"HanXRQChr11g0336051"
"HanXRQChr11g0338851"
"HanXRQChr11g0339211"
"HanXRQChr11g0339601"
"HanXRQChr11g0352091"
"HanXRQChr12g0354231"
"HanXRQChr12g0360991"
"HanXRQChr12g0362371"
"HanXRQChr12g0364951"
"HanXRQChr12g0369571"
"HanXRQChr12g0374021"
"HanXRQChr12g0374161"
"HanXRQChr12g0377601"
"HanXRQChr12g0378811"
"HanXRQChr12g0381101"
"HanXRQChr12g0382231"
"HanXRQChr12g0385811"
"HanXRQChr12g0385841"
"HanXRQChr13g0386971"
"HanXRQChr13g0387461"
"HanXRQChr13g0388001"
"HanXRQChr13g0389791"
"HanXRQChr13g0396741"
"HanXRQChr13g0396801"
"HanXRQChr13g0398561"
"HanXRQChr13g0402161"
"HanXRQChr13g0403931"
"HanXRQChr13g0403981"
"HanXRQChr13g0406481"
"HanXRQChr13g0407071"
"HanXRQChr13g0412021"
"HanXRQChr13g0413941"
"HanXRQChr13g0414871"
"HanXRQChr13g0422911"
"HanXRQChr14g0428901"
"HanXRQChr14g0429291"
"HanXRQChr14g0435881"
"HanXRQChr14g0440401"
"HanXRQChr14g0445511"
"HanXRQChr14g0452731"
"HanXRQChr14g0456831"
"HanXRQChr14g0459681"
"HanXRQChr14g0459931"
"HanXRQChr14g0460621"
"HanXRQChr14g0462071"
"HanXRQChr15g0465671"
"HanXRQChr15g0465681"
"HanXRQChr15g0465701"
"HanXRQChr15g0467741"
"HanXRQChr15g0468081"
"HanXRQChr15g0468981"
"HanXRQChr15g0469281"
"HanXRQChr15g0474211"
"HanXRQChr15g0481941"
"HanXRQChr15g0482591"
"HanXRQChr15g0482801"
"HanXRQChr15g0484281"
"HanXRQChr15g0485901"
"HanXRQChr15g0489891"
"HanXRQChr15g0490471"
"HanXRQChr15g0491791"
"HanXRQChr15g0495951"
"HanXRQChr15g0496991"
"HanXRQChr16g0502851"
"HanXRQChr16g0503081"
"HanXRQChr16g0505371"
"HanXRQChr16g0505951"
"HanXRQChr16g0505981"
"HanXRQChr16g0520631"
"HanXRQChr16g0528301"
"HanXRQChr16g0529251"
"HanXRQChr16g0529981"
"HanXRQChr16g0531801"
"HanXRQChr16g0531871"
"HanXRQChr16g0532271"
"HanXRQChr17g0534211"
"HanXRQChr17g0538981"
"HanXRQChr17g0541581"
"HanXRQChr17g0545171"
"HanXRQChr17g0545981"
"HanXRQChr17g0545991"
"HanXRQChr17g0547571"
"HanXRQChr17g0558531"
"HanXRQChr17g0564191"
"HanXRQChr17g0567501"
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
##########################################################################
#### DESCRIPTION #########################################################
##########################################################################
# Ajuste les niveaux d'expression mesurées (delta Ct) par qPCR fluidigm
# pour corriger l'effet plaque
##########################################################################
#### LIBRAIRY ############################################################
##########################################################################
library(reshape)
library(data.table)
##########################################################################
#### FUNCTION ############################################################
##########################################################################
#' updateSampleLevel
#'
#' Affiche le nom des sample / gènes retirés de l'analyse
#' Mets à jour les data pour que les nom des samples
#' retirés n'apparaisent plus
#'
#' @param data
#'
#' @return data
updateSampleLevel <- function(data) {
# Print names of genes removed from the analysis :
lowQualityGene <- which(table(data$Gene) == 0)
nbLowQualityGene <- length(lowQualityGene)
if (nbLowQualityGene > 0) {
print(paste0("/!\\ ", nbLowQualityGene, " samples discard"))
print(lowQualityGene)
}
# put real levels in changed factors
data$Gene <- as.factor(as.vector(data$Gene))
data$Sample_Name <- as.factor(as.vector(data$Sample_Name))
return(data)
}
##########################################################################
#### DATA ################################################################
##########################################################################
#------------------------------------------------------------------------#
# FIC_IN - A COMPLETER #
#------------------------------------------------------------------------#
# fichier contenant les deltaCt des différents gènes
fic_ct <-
"data/Analysis_Fluidigm_15EX05_results_export_20180430-112453.txt"
#------------------------------------------------------------------------#
# INFO - A COMPLETER #
#------------------------------------------------------------------------#
#
# # liste des gènes de références gardés pour l'analyse
# geneRef = c("HanXRQChr05g0131911","HanXRQChr01g0029581","HanXRQChr01g0021131")
# genotype / echantillon temoin
sampleTemoin = c(61, 500)
# nb sample by plate
nbSampleByPlate = 96
seuil <- 0.6 # for quality
#------------------------------------------------------------------------#
# FIC_OUT - A COMPLETER #
#------------------------------------------------------------------------#
ficOut <-
"results/Analysis_Fluidigm_15EX05_results_export_20180430-112453_plateAdjusted_2fieldEffect.txt"
##########################################################################
#### MAIN ################################################################
##########################################################################
#------------------------------------------------------------------------#
# Read data
#------------------------------------------------------------------------#
# Ct des differents gènes
dataAll <- read.table(file = fic_ct,
header = TRUE,
na.strings = c("NA", "NaN"))
#
# # efficacité des gènes (non necessaire car direct sur deltaCt)
# eff <- read.table(file = fic_eff, header = TRUE, sep = "\t")
#------------------------------------------------------------------------#
# pretraitement des data
#------------------------------------------------------------------------#
# discard sample with low Quality
data <- dataAll[dataAll$Gene_Quality == 1, ]
data <- updateSampleLevel(data = data)
# put '=ref' for reference gene on the second lot
data$Gene[data$Gene == "HanXRQChr05g0131911"] <-
"HanXRQChr05g0131911=ref"
data$Gene[data$Gene == "HanXRQChr01g0029581"] <-
"HanXRQChr01g0029581=ref"
data <- updateSampleLevel(data = data)
# discard sample corresponding to H20 and ADNg
sampleOut <-
which(data$Sample_Name == "H2O" | data$Sample_Name == "ADNg")
data <- data[-sampleOut, ]
data <- updateSampleLevel(data = data)
# count number of samples by gene and Plate
id.na <- which(is.na(data$dCt))
count.byplate <- table(data$Gene[-id.na], data$Plate[-id.na])
count.byplate
# create a quality index by gene
# gene with low quality
countByPlateMat <- as.data.frame(count.byplate)
countByPlateMat[which((countByPlateMat$Freq < (seuil * nbSampleByPlate)) &
(countByPlateMat$Freq > 0)), ]
as.vector(unique(countByPlateMat[which((countByPlateMat$Freq < (seuil * nbSampleByPlate)) &
(countByPlateMat$Freq > 0)), ]$Var1))
# # count without gamme
data.withoutGamme <- data[(data$Sample_Name != "gamme"), ]
id.na <- which(is.na(data.withoutGamme$dCt))
count.byplate.withoutGamme <-
table(data.withoutGamme$Gene[-id.na], data.withoutGamme$Plate[-id.na])
plateName = levels(data$Plate)
geneName = levels(data$Gene)
echName = levels(data$Sample_Name)
genotypeName = levels(data$CROSS_GENOTYPE)
#------------------------------------------------------------------------#
# plate effect
#------------------------------------------------------------------------#
data.sampleTemoin <- data[(data$Sample_Name %in% sampleTemoin), ]
data.sampleTemoin <- updateSampleLevel(data = data.sampleTemoin)
dataTemoinGeneList <- levels(data.sampleTemoin$Gene)
names(dataTemoinGeneList) <- dataTemoinGeneList
plate.effect <-
as.data.frame(lapply(dataTemoinGeneList, function(idGene) {
data.gene <-
data.sampleTemoin[data.sampleTemoin$Gene == idGene,] # récupère les deltaCt du gène sur la gamme
temp <-
lm(formula = data.gene$dCt ~ 1 + data.gene$Plate + as.factor(data.gene$Sample_Name))
temp.coeff <-
temp$coefficients[grep("PlatePl", names(temp$coefficients))]
res <- rep(0, 10)
names(res) <- plateName
name.coef <-
as.matrix(as.data.frame(strsplit(names(temp.coeff), "Plate")))[2, ]
res[name.coef] <- temp.coeff
return(res)
}))
#------------------------------------------------------------------------#
# adjust data
#------------------------------------------------------------------------#
plate.effect_long <- data.table(plate.effect, keep.rownames = T)
setnames(plate.effect_long, "rn", "Plate")
plate.effect_long <-
melt(plate.effect_long,
id.vars = "Plate",
variable.name = "Gene")
plate.effect_long[, Plate := as.factor(Plate)]
data <-
merge(data,
plate.effect_long,
by = c("Plate", "Gene"),
all.x = T)
data$dCt.plateEffect <- data$dCt - data$value
#------------------------------------------------------------------------#
# Format the results
#------------------------------------------------------------------------#
# tableau 1 ligne = 1 echantillon (1 plante) ------------------------#
# garde les genome de reference
# info dispo pour chaque echantillon : coor champs, qualité, type expé
# nom genotype, expression des 180 gènes
# selection des lignes correspondant a un genotype hybride (retire la gamme) et des colonne genotype gene deltaC
setDT(data)
dataAdj2fieldEff <- dcast(
data[Gene %in% dataTemoinGeneList & !is.na(CROSS_GENOTYPE)] ,
CROSS_GENOTYPE + Sample_Name + XTRIAL + YTRIAL + STATUS_EXP + Gene_Quality ~ Gene,
fun.aggregate = mean,
na.rm = TRUE,
value.var = "dCt.plateEffect"
)
##########################################################################
#### SAVE ################################################################
##########################################################################
write.table(
x = dataAdj2fieldEff,
file = ficOut,
sep = "\t",
row.names = FALSE,
col.names = TRUE,
quote = FALSE
)
##########################################################################
#### DESCRIPTION #########################################################
##########################################################################
# This script reads the 15EX05 expression file and outputs a set of plot
# that will be used to choose an adjustment model for this plot
# Adapted from Louise GODY script
# Need to run where asreml is installed
##########################################################################
#### LIBRARY #############################################################
##########################################################################
# library graphique
library(ggplot2)
# library(GGally) #/!\ NOT INSTALLED on lipm-calcul, used by ggpairs()
library(grid)
library(gridExtra)
library(asreml)
# library(asremlPlus) # not available on lipm-calcul ? need links to licence ? used by info.crit.asreml()
# not used library
# library(lme4)
# library(SpATS) # not available on lipm-calcul ? need links to licence ? used ?
##########################################################################
#### FUNCTION ############################################################
##########################################################################
# needed functions are in the personal package:
source("scripts/my_utility.r")
##########################################################################
#### DIRECTORIES, FILES & DATA ###########################################
##########################################################################
#--- INFORMATIONS -------------------------------------------------------#
# name of the experiment
exp <- "15EX05"
#--- SETTING UP DIRECTORIES ---------------------------------------------#
mainDir <- getwd() # setup main directory
inDir <- file.path(mainDir,".") # setup the input directory
outDir <- file.path(mainDir,"results") # setup the output directory
plotDir <- my_create_res_exp_dir(outDir,"adjustment")
# expDir <- my_create_res_exp_dir(outDir,exp)
#--- SETTING UP FILES ---------------------------------------------------#
traits_file <- "data/15EX05_geneList" # List of studied phenotype (in our case list of gene)
exp_file <- "results/Analysis_Fluidigm_15EX05_results_export_20180430-112453_plateAdjusted_2fieldEffect.txt" # Phenotype measurement for each genotype (in our case gene expression measurement)
#--- LOAD FILES ---------------------------------------------------------#
traits <- unlist(read.csv(file = traits_file, header = F, stringsAsFactors = F), use.names = F) # load phenotype of interest
names(traits) <- traits
data <- read.csv(file=exp_file, sep = "\t", stringsAsFactors = FALSE, na.strings = c("x","#VALUE!","NA","")) # load experimental measurement file
datakeep <- data
##########################################################################
#### ANALYSIS ############################################################
##########################################################################
#------------------------------------------------------------------------#
# FILTER DATA #
#------------------------------------------------------------------------#
# reformat experimental data
data <- my_reformat_expe_data(data, createRILcol = FALSE) #createRILcol is TRUE when working with nam_drr date. here we are working with GWA data set createRILcol to FALSE
#removing plots with a plant density below 3 and a quality below 1
data <- cleaning_plots(data)
# my_summary_randomization_plots(data, exp, expDir)
# create results data frame
result_df <- data[c("CROSS_GENOTYPE","XTRIAL","YTRIAL")]
#------------------------------------------------------------------------#
# LOOP ALL TRAIT TO ADJUST #
#------------------------------------------------------------------------#
# Create the dataframe for running lme4 asreml and SpATS (turn some column into factors)
# for each studied traits
### for (name.trait in names(traits[1:2])){ ##### TEST #####
for (name.trait in names(traits)){
# check if current trait is in data file
if (name.trait %in% colnames(data)){
print(name.trait)
data[,name.trait] <- as.numeric(as.character(data[,name.trait]))
data.for.model <- data
# add NA sample to the data to have a complete field
data.for.model <- my_add_empty_plots(data.for.model)
# format data for use models
data.for.model <- my_prepare_data_for_model(data.for.model)
colnames(data.for.model) <- gsub(colnames(data.for.model), pattern = name.trait,replacement = "PHENOTYPE")
########################################################
# modeled the field effect
m1 <- asreml(PHENOTYPE ~ CROSS_GENOTYPE ,
random = ~ YTRIAL + XTRIAL ,
data = data.for.model,
na.method.X = "include")
########################################################
# correct the data ?
result_df <- my_ASREML_add_effects(result_df, m1, name.trait, checks, random = FALSE, mean_trait = mean_trait)
#/!\ part of the function are desctivated as some packages are not available
summary_fitted_values(data.for.model,m1,exp,name.trait)
}
}
# save results
write.csv(result_df, file.path(outDir,paste(exp,"adjusted_phenotypes_field_corrected.csv",sep = "_")))
##########################################################################
#### DESCRIPTION #########################################################
##########################################################################
# This script reads the 15EX05 expression file after correction of plate
# and field effect and handle the missing values
# (curation and imputation)
# Removal of control genotypes
# Removal of genotypes with more than 10% of missing
# Imputation gene by gene
##########################################################################
#### LIBRARY #############################################################
##########################################################################
library(data.table)
##########################################################################
#### LOAD DATA ###########################################################
##########################################################################
data <- fread("results/15EX05_adjusted_phenotypes_field_corrected.csv")
##########################################################################
#### MAIN ################################################################
##########################################################################
##### Control genotypes
data <- data[!(CROSS_GENOTYPE %in% c("ES_AKUSTIC", "EXTRASOL", "LG5450HO", "NK_KONDI"))]
##### Missing by genotype --> removal
data[, V1 := NULL]
data[, XTRIAL := NULL]
data[, YTRIAL := NULL]
datalong <- data.table::melt(data, id.vars = c("CROSS_GENOTYPE"), variable.name = "Gene", value.name = "Expression")
datalong[is.na(Expression), nbmissing := .N, by=.(CROSS_GENOTYPE)]
genotypes_to_remove <- datalong[, .(pctmissing = max(0, nbmissing, na.rm=T)/.N),
by=.(CROSS_GENOTYPE)][pctmissing > 0.1, CROSS_GENOTYPE]
length(genotypes_to_remove)
# [1] 3
datalong <- datalong[!(CROSS_GENOTYPE %in% genotypes_to_remove), .(CROSS_GENOTYPE, Gene, Expression)]
##### Missing by gene --> imputation by mean for the gene
datalong[, meanExpr := mean(Expression, na.rm = T), by=.(Gene)]
datalong[is.na(Expression), Expression := meanExpr]
##########################################################################
#### FORMAT THE DATA #####################################################
##########################################################################
datanoNA <- data.table::dcast(datalong, Gene ~ CROSS_GENOTYPE, value.var = "Expression")