From 36510f3d1645983337b99611c5566f444ed493f6 Mon Sep 17 00:00:00 2001 From: unknown <isabelle.sanchez@inra.fr> Date: Thu, 5 May 2022 11:34:44 +0200 Subject: [PATCH] add EM method for inital parameters optimization - KBO_EM() --- NAMESPACE | 1 + R/kfino.R | 219 ++++++++++++++++++----------- R/utils_functions.R | 325 ++++++++++++++++++++++++++++++++++++++++---- man/KBO_EM.Rd | 60 ++++++++ man/KBO_L.Rd | 11 +- man/KBO_known.Rd | 23 +++- man/kfino_fit.Rd | 26 +++- 7 files changed, 543 insertions(+), 122 deletions(-) create mode 100644 man/KBO_EM.Rd diff --git a/NAMESPACE b/NAMESPACE index 47bb4f5..1a3e3ed 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(KBO_EM) export(KBO_L) export(KBO_known) export(kfino_fit) diff --git a/R/kfino.R b/R/kfino.R index 77e7112..87678b3 100644 --- a/R/kfino.R +++ b/R/kfino.R @@ -6,6 +6,9 @@ #' @param param list, a list of initialization parameters #' @param doOptim logical, if TRUE optimisation of the initial parameters #' default TRUE +#' @param method character, the method used to optimize the initial parameters: +#' Expectation-Maximization algorithm `"EM"` or Maximization +#' Likelihhod `"ML"`, default `"ML"` #' @param threshold numeric, threshold to qualify an observation as outlier #' according to the label_pred, default 0.5 #' @param kappa numeric, truncation setting, default 10 @@ -31,13 +34,14 @@ #' default seq(0.5,0.7,0.1)} #' } #' It has to be given by the user following his knowledge of the animal or -#' the dataset. All parameters are compulsory execpt m0, mm and pp that can be -#' optimized by the algorithm. In the optimisation step, those three parameters +#' the dataset. All parameters are compulsory except m0, mm and pp that can be +#' optimized by the algorithm. In the optimization step, those three parameters #' are initialized according to the input data (between the expert #' range) using quantile of the Y distribution (varying between 0.2 and 0.8 for #' m0 and 0.5 for mm). pp is a sequence varying between 0.5 and 0.7. A #' sub-sampling is performed to speed the algorithm if the number of possible -#' observations studied is greater than 500. +#' observations studied is greater than 500. Optimization is performed using +#' `"EM"` or `"ML"` methods. #' #' @importFrom stats dnorm quantile na.omit #' @importFrom dplyr mutate filter left_join arrange %>% @@ -57,7 +61,7 @@ #' data(spring1) #' library(dplyr) #' -#' # --- With Optimisation on initial parameters +#' # --- With Optimisation on initial parameters - ML method #' t0 <- Sys.time() #' param1<-list(m0=NULL, #' mm=NULL, @@ -73,8 +77,16 @@ #' #' resu1<-kfino_fit(datain=spring1, #' Tvar="dateNum",Yvar="Poids", -#' doOptim=TRUE,param=param1) +#' doOptim=TRUE,method="ML",param=param1) #' Sys.time() - t0 +#' +#' # --- With Optimisation on initial parameters - EM method +#' t0 <- Sys.time() +#' resu1b<-kfino_fit(datain=spring1, +#' Tvar="dateNum",Yvar="Poids", +#' doOptim=TRUE,method="EM",param=param1) +#' Sys.time() - t0 +#' #' # --- Without Optimisation on initial parameters #' t0 <- Sys.time() #' param2<-list(m0=41, @@ -111,11 +123,11 @@ #' seqp=seq(0.5,0.7,0.1)) #' resu3<-kfino_fit(datain=merinos2, #' Tvar="dateNum",Yvar="Poids", -#' doOptim=TRUE,param=param3) +#' doOptim=TRUE,method="ML",param=param3) #' Sys.time() - t0 kfino_fit<-function(datain,Tvar,Yvar, param=NULL, - doOptim=TRUE, + doOptim=TRUE,method="ML", threshold=0.5,kappa=10,kappaOpt=7){ if( any(is.null(param[["expertMin"]]) | @@ -174,22 +186,22 @@ kfino_fit<-function(datain,Tvar,Yvar, dix=sqrt(expertMax - expertMin) #------------------------------------------------------------------------ - # Optimisation sur les paramètres initiaux ou pas + # Optimisation on Initial parameters or not #------------------------------------------------------------------------ if (doOptim == TRUE){ if (N > 500){ - # optim avec sous-echantillonnage + # optim with sub-sampling print("-------:") - print("Optimisation of initial parameters with sub-sampling - result:") + print("Optimisation of initial parameters ") + print("with sub-sampling and ML method - result:") bornem0=quantile(Y[1:N/4], probs = c(.2, .8)) m0opt=quantile(Y[1:N/4], probs = c(.5)) mmopt=quantile(Y[(3*N/4):N], probs = c(.5)) - cat("bornem0: ",bornem0,"\n") - cat("m0opt: ",m0opt,"\n") - cat("mmopt: ",mmopt,"\n") - + cat("range m0: ",bornem0,"\n") + cat("Initial m0opt: ",m0opt,"\n") + cat("Initial mmopt: ",mmopt,"\n") popt=0.5 #--- Saving datain before sub-sampling YY=Y @@ -197,7 +209,6 @@ kfino_fit<-function(datain,Tvar,Yvar, NN=N Subechant=sort(sample(1:NN,50)) - #Subechant=sort(.Internal(sample(NN, 50L, FALSE, NULL))) Y=YY[Subechant] Tps=TpsTps[Subechant] N=50 @@ -217,9 +228,8 @@ kfino_fit<-function(datain,Tvar,Yvar, for (mm in seq((m0-5),(m0+20),2) ){ for (p in seqp){ # A voir si 50 sous-echantillons au hasard suffisent. Comme dans - # Robbins Monroe, permet aussi de reduire l'impact de la troncature + # Robbins Monroe, permet aussi de reduire l'impact de la troncature Subechant=sort(sample(1:NN,50)) - #Subechant=sort(.Internal(sample(NN, 50L, FALSE, NULL))) Y=YY[Subechant] Tps=TpsTps[Subechant] @@ -250,8 +260,10 @@ kfino_fit<-function(datain,Tvar,Yvar, Y=YY Tps=TpsTps N=NN - print("Optimised parameters: ") - print(c(mmopt,popt,m0opt)) + print("Optimised parameters with ML method: ") + cat("Optimized m0: ",m0opt,"\n") + cat("Optimized mm: ",mmopt,"\n") + cat("Optimized pp: ",popt,"\n") print("-------:") resultat=KBO_known(param=list(mm=mmopt, @@ -267,75 +279,118 @@ kfino_fit<-function(datain,Tvar,Yvar, threshold=threshold,Y=Y,Tps=Tps,N=N,kappa=kappa) } else if (N > 50){ - # optim sans sous-echantillonnage - print("-------:") - print("Optimisation of initial parameters - result:") - print("no sub-sampling performed:") - bornem0=quantile(Y[1:N/4], probs = c(.2, .8)) - m0opt=quantile(Y[1:N/4], probs = c(.5)) - mmopt=quantile(Y[(3*N/4):N], probs = c(.5)) - - cat("bornem0: ",bornem0,"\n") - cat("m0opt: ",m0opt,"\n") - cat("mmopt: ",mmopt,"\n") - - popt=0.5 - - Vopt=KBO_L(list(m0=m0opt, - mm=mmopt, - pp=popt, - aa=aa, - expertMin=expertMin, - expertMax=expertMax, - sigma2_mm=sigma2_mm, - sigma2_m0=sigma2_m0, - sigma2_pp=sigma2_pp, - K=K), - Y=Y,Tps=Tps,N=N,dix=dix,kappaOpt=kappaOpt) - - for (m0 in seq(bornem0[1],bornem0[2],2) ){ - for (mm in seq((m0-5),(m0+20),2) ){ - for (p in seqp){ - V=KBO_L(list(m0=m0, - mm=mm, - pp=p, - aa=aa, - expertMin=expertMin, - expertMax=expertMax, - sigma2_mm=sigma2_mm, - sigma2_m0=sigma2_m0, - sigma2_pp=sigma2_pp, - K=K), + # optimization without sub-sampling + if (method == "EM"){ + print("-------:") + print("Optimisation of initial parameters with EM method - result:") + print("no sub-sampling performed:") + bornem0=quantile(Y[1:N/4], probs = c(.2, .8)) + m0opt=quantile(Y[1:N/4], probs = c(.5)) + mmopt=quantile(Y[(3*N/4):N], probs = c(.5)) + + cat("range m0: ",bornem0,"\n") + cat("initial m0opt: ",m0opt,"\n") + cat("initial mmopt: ",mmopt,"\n") + + popt=0.5 + + EMopt=KBO_EM(param=list(m0=m0opt, + mm=mmopt, + pp=popt, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_mm=sigma2_mm, + sigma2_m0=sigma2_m0, + sigma2_pp=sigma2_pp, + K=K), Y=Y,Tps=Tps,N=N,dix=dix,kappaOpt=kappaOpt) - - if (V > Vopt){ - Vopt=V - m0opt=m0 - mmopt=mm - popt=p + print("Optimised parameters with EM method: ") + cat("Optimized m0: ",EMopt$m0[[1]],"\n") + cat("Optimized mm: ",EMopt$mm[[1]],"\n") + cat("Optimized pp: ",EMopt$pp[[1]],"\n") + print("-------:") + resultat=KBO_known(param=list(mm=EMopt$mm[[1]], + pp=EMopt$pp[[1]], + m0=EMopt$m0[[1]], + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_m0=sigma2_m0, + sigma2_mm=sigma2_mm, + sigma2_pp=sigma2_pp, + K=K), + threshold=threshold,Y=Y,Tps=Tps,N=N,kappa=kappa) + + } else if (method == "ML"){ + print("-------:") + print("Optimisation of initial parameters with ML method - result:") + print("no sub-sampling performed:") + bornem0=quantile(Y[1:N/4], probs = c(.2, .8)) + m0opt=quantile(Y[1:N/4], probs = c(.5)) + mmopt=quantile(Y[(3*N/4):N], probs = c(.5)) + + cat("range m0: ",bornem0,"\n") + cat("initial m0opt: ",m0opt,"\n") + cat("initial mmopt: ",mmopt,"\n") + + popt=0.5 + + Vopt=KBO_L(list(m0=m0opt, + mm=mmopt, + pp=popt, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_mm=sigma2_mm, + sigma2_m0=sigma2_m0, + sigma2_pp=sigma2_pp, + K=K), + Y=Y,Tps=Tps,N=N,dix=dix,kappaOpt=kappaOpt) + for (m0 in seq(bornem0[1],bornem0[2],2) ){ + for (mm in seq((m0-5),(m0+20),2) ){ + for (p in seqp){ + V=KBO_L(list(m0=m0, + mm=mm, + pp=p, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_mm=sigma2_mm, + sigma2_m0=sigma2_m0, + sigma2_pp=sigma2_pp, + K=K), + Y=Y,Tps=Tps,N=N,dix=dix,kappaOpt=kappaOpt) + + if (V > Vopt){ + Vopt=V + m0opt=m0 + mmopt=mm + popt=p + } } } } + + print("Optimised parameters: ") + cat("Optimized m0: ",m0opt,"\n") + cat("Optimized mm: ",mmopt,"\n") + cat("Optimized pp: ",popt,"\n") + print("-------:") + resultat=KBO_known(param=list(mm=mmopt, + pp=popt, + m0=m0opt, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_m0=sigma2_m0, + sigma2_mm=sigma2_mm, + sigma2_pp=sigma2_pp, + K=K), + threshold=threshold,Y=Y,Tps=Tps,N=N,kappa=kappa) } - print("Optimised parameters: ") - print(c(mmopt,popt,m0opt)) - print("-------:") - - resultat=KBO_known(param=list(mm=mmopt, - pp=popt, - m0=m0opt, - aa=aa, - expertMin=expertMin, - expertMax=expertMax, - sigma2_m0=sigma2_m0, - sigma2_mm=sigma2_mm, - sigma2_pp=sigma2_pp, - K=K), - threshold=threshold,Y=Y,Tps=Tps,N=N,kappa=kappa) - } else { - # N petit, si trop petit on ne fait rien if (N > 5) { # Not enough data - no optim if (is.null(param[["m0"]])){ @@ -346,7 +401,7 @@ kfino_fit<-function(datain,Tvar,Yvar, } print("-------:") print("Optimisation of initial parameters - result:") - print("Not enough data => so no optimisation performed:") + print("Not enough data => No optimisation performed:") print("Used parameters: ") print(X) print("-------:") diff --git a/R/utils_functions.R b/R/utils_functions.R index 512ca6e..9f59033 100644 --- a/R/utils_functions.R +++ b/R/utils_functions.R @@ -1,3 +1,12 @@ +#------------------------------------------------------------------- +# utils_functions.R +# some useful functions for kfino method +# loi_outlier() +# KBO_known() +# KBO_L() +# KBO_EM() +#------------------------------------------------------------------- + #' loi_outlier This function defines an outlier distribution (Surface of a #' trapezium) and uses input parameters given in the main function kfino_fit() #' @@ -23,18 +32,28 @@ loi_outlier<-function(y, #-------------------------------------------------------------------------- -#' KBO_known +#' KBO_known a function to calculate a likelihood on given parameters #' #' @param param list, a list of 10 input parameters for mm, pp and m0 #' @param threshold numeric, threshold for CI, default 0.5 #' @param kappa numeric, truncation setting, default 10 -#' @param Y num -#' @param Tps num -#' @param N num -#' +#' @param Y character, name of the numeric variable to predict in the +#' data.frame datain +#' @param Tps character, time column name in the data.frame datain, a +#' numeric vector. +#' Tvar can be expressed as a proportion of day in seconds +#' @param N numeric, length of Y #' #' @details uses the same input parameter list than the main function #' @return a list +#' \describe{ +#' \item{prediction}{vector, the prediction of weights} +#' \item{label}{vector, probability to be an outlier} +#' \item{likelihood}{numeric, the calculated likelihood} +#' \item{lwr}{vector of lower bound CI of the prediction } +#' \item{upr}{vector of upper bound CI of the prediction } +#' \item{flag}{char, is an outlier or not} +#' } #' @export #' #' @examples @@ -55,7 +74,7 @@ loi_outlier<-function(y, #' seqp=seq(0.5,0.7,0.1)) #' print(Y) #' KBO_known(param=param2,threshold=0.5,kappa=10,Y=Y,Tps=Tps,N=N) -KBO_known<-function(param,threshold,kappa,Y,Tps,N){ +KBO_known<-function(param,threshold,kappa=10,Y,Tps,N){ # load objects mm=param[["mm"]] pp=param[["pp"]] @@ -94,7 +113,7 @@ KBO_known<-function(param,threshold,kappa,Y,Tps,N){ #iteration (1.1.2) #----------------------- #// Pour l'instant, je fais comme si kappa<N-1 mettre un if sinon - #avant troncature + #before truncation for (k in 1:(kappa-1)){ # je crée le vecteur des nouveaux m, ##pour plus tard: au lieu de les # faire vide, je prend direct m_{u0} @@ -102,7 +121,7 @@ KBO_known<-function(param,threshold,kappa,Y,Tps,N){ sigma2new=rep(0 ,2^(k+1)) pnew=rep(0 ,2^(k+1)) Lnew=rep(0 ,2^(k+1)) - # CR = constante de renormalisation qui intervient dans le dénominateur des pu + # CR: renormalization constant that intervenes in the denominator of the pu qnew=rep(0 ,2^(k+1)) diffTps<-Tps[k+1] - Tps[k] #--- numérateur de pu0 @@ -138,20 +157,20 @@ KBO_known<-function(param,threshold,kappa,Y,Tps,N){ sigma2_new<-c(sigma2_new,sum(p*sigma2) + sum(p*m^2) - (sum(p*m))^2) } - #apres troncature + # after truncation #---------------------- for (k in kappa:(N-1)){ - # je cree le vecteur des nouveaux m, - # pour plus tard: au lieu de les faire vide, je prend direct m_{u0} + # Initialization of the new m vector + # TODO: instead of 0, perhaps set directly at m_{u0} mnew=rep(0,2^(kappa+1)) sigma2new=rep(0 ,2^(kappa+1)) pnew=rep(0 ,2^(kappa+1)) Lnew=rep(0 ,2^(kappa+1)) - # CR = constante de renormalisation qui intervient dans le dénominateur des pu + # CR: renormalization constant that intervenes in the denominator of the pu qnew=rep(0 ,2^(kappa+1)) diffTps<-Tps[k+1] - Tps[k] - #--- numérateur de pu0 + #--- pu0 numerator tpbeta<-loi_outlier(Y[k+1],K,expertMin,expertMax) pnew[1:(2^kappa)]=p[1:(2^kappa)]*(1-pp)*tpbeta @@ -212,9 +231,12 @@ KBO_known<-function(param,threshold,kappa,Y,Tps,N){ #' #' @param param a list of 10 input parameters mm, pp and m0 #' @param kappaOpt numeric, truncation setting, default 7 -#' @param Y num -#' @param Tps num -#' @param N num +#' @param Y character, name of the numeric variable to predict in the +#' data.frame datain +#' @param Tps character, time column name in the data.frame datain, a +#' numeric vector. +#' Tvar can be expressed as a proportion of day in seconds +#' @param N numeric, length of Y #' @param dix num #' #' @details uses the same input parameter list than the main function @@ -239,7 +261,7 @@ KBO_known<-function(param,threshold,kappa,Y,Tps,N){ #' seqp=seq(0.5,0.7,0.1)) #' print(Y) #' KBO_L(param=param2,kappaOpt=7,Y=Y,Tps=Tps,N=N,dix=6) -KBO_L<-function(param,kappaOpt,Y,Tps,N,dix){ +KBO_L<-function(param,kappaOpt=7,Y,Tps,N,dix){ # load objects mm=param[["mm"]] pp=param[["pp"]] @@ -252,9 +274,9 @@ KBO_L<-function(param,kappaOpt,Y,Tps,N,dix){ sigma2_pp<-param[["sigma2_pp"]] K<-param[["K"]] - #---- paramètre de troncature - # Ici je met kappa =7, ca me fais retirer les proba <0.01 au lieu de 0.001 - # comme qd kappa=10 mais ca divise par 10 le temps de calcul + #---- truncation parameter + # Here, kappa is set to 7, the probabilities are < 0.01 instead of 0.001 + # with kappa=10 and so computing time is divided by 10. kappa=kappaOpt # initialisation (1.1.1) @@ -277,13 +299,13 @@ KBO_L<-function(param,kappaOpt,Y,Tps,N,dix){ #iteration (1.1.2) #----------------------- #// Pour l'instant, je fais comme si kappa<N-1 mettre un if sinon - # avant troncature + # before truncation for (k in 1:(kappa-1)){ mnew=rep(0,2^(k+1)) sigma2new=rep(0 ,2^(k+1)) pnew=rep(0 ,2^(k+1)) Lnew=rep(0 ,2^(k+1)) - # CR = constante de renormalisation qui intervient dans le dénominateur des pu + # CR: renormalization constant that intervenes in the denominator of the pu qnew=rep(0 ,2^(k+1)) diffTps<-Tps[k+1] - Tps[k] #--- numérateur de pu0 @@ -310,7 +332,7 @@ KBO_L<-function(param,kappaOpt,Y,Tps,N,dix){ q=dix*qnew } - # apres troncature + # after truncation #---------------------- for (k in kappa:(N-1)){ # je cree le vecteur des nouveaux m, @@ -318,7 +340,7 @@ KBO_L<-function(param,kappaOpt,Y,Tps,N,dix){ sigma2new=rep(0 ,2^(kappa+1)) pnew=rep(0 ,2^(kappa+1)) Lnew=rep(0 ,2^(kappa+1)) - # CR = constante de renormalisation qui intervient dans le dénominateur des pu + # CR: renormalization constant that intervenes in the denominator of the pu qnew=rep(0 ,2^(kappa+1)) diffTps<-Tps[k+1] - Tps[k] #--- numérateur de pu0 @@ -352,4 +374,259 @@ KBO_L<-function(param,kappaOpt,Y,Tps,N,dix){ return(Vraisemblance) } +#------------------------------------------------------------------------ +#' KBO_EM a function to calculate a likelihood on optimized parameters by +#' an Expectation-Maximization (EM) algorithm +#' +#' @param param a list of 10 input parameters mm, pp and m0 +#' @param kappaOpt numeric, truncation setting, default 7 +#' @param Y character, name of the numeric variable to predict in the +#' data.frame datain +#' @param Tps character, time column name in the data.frame datain, a +#' numeric vector. +#' Tvar can be expressed as a proportion of day in seconds +#' @param N numeric, length of Y +#' @param dix num +#' +#' @details uses the same input parameter list than the main function +#' @return a list: +#' \describe{ +#' \item{m0}{numeric, optimized m0} +#' \item{mm}{numeric, optimized mm} +#' \item{pp}{numeric, optimized pp} +#' \item{likelihood}{numeric, the calculated likelihood} +#' } +#' @export +#' +#' @examples +#' set.seed(1234) +#' Y<-rnorm(n=10,mean=50,4) +#' Tps<-seq(1,10) +#' N=10 +#' param2<-list(m0=41, +#' mm=45, +#' pp=0.5, +#' aa=0.001, +#' expertMin=30, +#' expertMax=75, +#' sigma2_m0=1, +#' sigma2_mm=0.05, +#' sigma2_pp=5, +#' K=2, +#' seqp=seq(0.5,0.7,0.1)) +#' print(Y) +#' KBO_EM(param=param2,kappaOpt=7,Y=Y,Tps=Tps,N=N,dix=6) +KBO_EM<-function(param,kappaOpt,Y,Tps,N,dix){ + # load objects + mm<-param[["mm"]] + pp<-param[["pp"]] + m0<-param[["m0"]] + aa<-param[["aa"]] + expertMin<-param[["expertMin"]] + expertMax<-param[["expertMax"]] + sigma2_m0<-param[["sigma2_m0"]] + sigma2_mm<-param[["sigma2_mm"]] + sigma2_pp<-param[["sigma2_pp"]] + K<-param[["K"]] + + #---- truncation parameter + # Here, kappa is set to 7, the probabilities are < 0.01 instead of 0.001 + # with kappa=10 and so computing time is divided by 10. + kappa=kappaOpt + + # initialisation (1.1.1) + #-------------------- + m1= (sigma2_pp*m0 + Y[1]*sigma2_m0)/(sigma2_m0+sigma2_pp) + sigma1=(sigma2_m0*sigma2_pp)/(sigma2_m0+sigma2_pp) + + l0<- loi_outlier(Y[1],K,expertMin,expertMax) + loinorm1<-dnorm(Y[1],m0, sqrt(sigma2_m0+sigma2_pp)) + + p0= ((1-pp)*l0) / (pp*loinorm1 + (1-pp)*l0) + p1=1-p0 + + m=c(m0,m1) + p=c(p0,p1) + sigma2=c(sigma2_m0,sigma1) + L=dix*c(l0,loinorm1) #Attention *2 pr le grandir + q=c(1,1) #attention + + #Pmean=c(p1)#A VIRER + + #nouveaux paramètres + a=c(1,sigma2_pp/(sigma2_m0+sigma2_pp)) + b=c(0,0) + c=c(0, Y[1]*sigma2_m0/(sigma2_m0+sigma2_pp)) + matrice_A=matrix(a*a/(sigma2 + sigma2_pp), ncol=1, nrow=2, byrow=TRUE) + matrice_B=matrix(b*b/(sigma2 + sigma2_pp), ncol=1, nrow=2, byrow=TRUE) + matrice_C=matrix(a*b/(sigma2 + sigma2_pp), ncol=1, nrow=2, byrow=TRUE) + matrice_Ya=matrix((a*(Y[1]-c)/(sigma2 + sigma2_pp)), ncol=1, nrow=2, byrow=TRUE) + matrice_Yb=matrix((b*(Y[1]-c)/(sigma2 + sigma2_pp)), ncol=1, nrow=2, byrow=TRUE) + + Z=matrix(c(0,1), ncol=1, nrow=2, byrow=TRUE) + #iteration (1.1.2) + #----------------------- + # kappa < N-1 for now. add an if condition if necessary + # before truncation + + for (k in 1:(kappa-1)){ + mnew=rep(0,2^(k+1)) + sigma2new=rep(0 ,2^(k+1)) + pnew=rep(0 ,2^(k+1)) + Lnew=rep(0 ,2^(k+1)) + anew=rep(0 ,2^(k+1)) + bnew=rep(0 ,2^(k+1)) + cnew=rep(0 ,2^(k+1)) + + # CR: renormalization constant that intervenes in the denominator of the pu + qnew=rep(0 ,2^(k+1)) + diffTps<-Tps[k+1] - Tps[k] + #--- numérateur de pu0 + tpbeta<-loi_outlier(Y[k+1],K,expertMin,expertMax) + pnew[1:(2^k)]=p[1:(2^k)]*(1-pp)*tpbeta + Lnew[1:(2^k)]=L[1:(2^k)]*tpbeta + mnew[1:(2^k)]= m[1:(2^k)]*exp(-aa*diffTps) + mm*(1-exp(-aa*diffTps)) #m_u0 + sigma2new[1:(2^k)] = sigma2[1:(2^k)]*exp(-2*aa*(diffTps)) + + (1-exp(-2*aa*(diffTps)))*sigma2_mm/(2*aa) + qnew[1:(2^k)] <- q[1:(2^k)]*(1-pp) + qnew[(1+2^k):2^(k+1)] <- q[1:(2^k)]*pp + # new parameters + anew[1:(2^k)]= a[1:(2^k)]*exp(-aa*diffTps) + bnew[1:(2^k)]= b[1:(2^k)]*exp(-aa*diffTps) + (1-exp(-aa*diffTps)) + cnew[1:(2^k)]= c[1:(2^k)]*exp(-aa*diffTps) + + # continuation + sommevar=sigma2new[1:(2^k)] + sigma2_pp + tpnorm<-dnorm(Y[k+1], mnew[1:(2^k)], sqrt(sommevar)) + pnew[(1+2^k):2^(k+1)]=p[1:(2^k)]*pp*tpnorm + Lnew[(1+2^k):2^(k+1)]=L[1:(2^k)]*tpnorm + mnew[(1+2^k):2^(k+1)] = (sigma2new[1:(2^k)]*Y[k+1] + mnew[1:(2^k)]*sigma2_pp)/sommevar + sigma2new[(1+2^k):2^(k+1)] = sigma2new[1:(2^k)] * sigma2_pp/sommevar + # new parameters + anew[(1+2^k):2^(k+1)]= a[1:(2^k)]*sigma2_pp/sommevar + bnew[(1+2^k):2^(k+1)]= b[1:(2^k)]*sigma2_pp/sommevar + cnew[(1+2^k):2^(k+1)]= c[1:(2^k)]*sigma2_pp/sommevar + Y[k+1]*sigma2new[1:(2^k)]/sommevar + + #Proba1<-sum(pnew[(1+2^k):2^(k+1)])#A VIRER + #Pmean=c(Pmean,Proba1/sum(pnew))#A VIRER + + m=mnew + sigma2=sigma2new + p=pnew/sum(pnew) + # new parameters + a=anew + b=bnew + c=cnew + + matrice_A=cbind(rbind(matrice_A,matrice_A), a*a/(sigma2 + sigma2_pp)) + matrice_B=cbind(rbind(matrice_B,matrice_B), b*b/(sigma2 + sigma2_pp)) + matrice_C=cbind(rbind(matrice_C,matrice_C), a*b/(sigma2 + sigma2_pp)) + matrice_Ya=cbind(rbind(matrice_Ya,matrice_Ya), a*(Y[k+1]-c)/(sigma2 + sigma2_pp)) + matrice_Yb=cbind(rbind(matrice_Yb,matrice_Yb), b*(Y[k+1]-c)/(sigma2 + sigma2_pp)) + + Znew=matrix(rep(0,(k+1)*2^(k+1)), ncol=k+1, nrow=2^(k+1), byrow=TRUE) + Znew[(1:2^k),]=cbind(Z, rep(0,2^k)) + Znew[(2^k+1):(2^(k+1)),]=cbind(Z, rep(1,2^k)) + Z=Znew + + L=dix*Lnew # fois 2 pr le grandir + q=dix*qnew + } + + # after truncation + #---------------------- + for (k in kappa:(N-1)){ + # Initialisation of the new m vectors + mnew=rep(0,2^(kappa+1)) + sigma2new=rep(0 ,2^(kappa+1)) + pnew=rep(0 ,2^(kappa+1)) + Lnew=rep(0 ,2^(kappa+1)) + anew=rep(0 ,2^(kappa+1)) + bnew=rep(0 ,2^(kappa+1)) + cnew=rep(0 ,2^(kappa+1)) + # CR: renormalization constant that intervenes in the denominator of the pu + qnew=rep(0 ,2^(kappa+1)) + diffTps<-Tps[k+1] - Tps[k] + #--- pu0 numerator + tpbeta<-loi_outlier(Y[k+1],K,expertMin,expertMax) + pnew[1:(2^kappa)]=p[1:(2^kappa)]*(1-pp)*tpbeta + Lnew[1:(2^kappa)]=L[1:(2^kappa)]*tpbeta + # m_uO + mnew[1:(2^kappa)]= m[1:(2^kappa)]*exp(-aa*diffTps) + mm*(1-exp(-aa*diffTps)) + sigma2new[1:(2^kappa)] = sigma2[1:(2^kappa)]*exp(-2*aa*(diffTps)) + + (1-exp(-2*aa*(diffTps)))*sigma2_mm/(2*aa) + qnew[1:(2^kappa)]=q[1:(2^kappa)]*(1-pp) + qnew[(1+2^kappa):2^(kappa+1)]=q[1:(2^kappa)]*pp + # new parameters + anew[1:(2^kappa)]= a[1:(2^kappa)]*exp(-aa*diffTps) + bnew[1:(2^kappa)]= b[1:(2^kappa)]*exp(-aa*diffTps) + (1-exp(-aa*diffTps)) + cnew[1:(2^kappa)]= c[1:(2^kappa)]*exp(-aa*diffTps) + sommevar=sigma2new[1:(2^kappa)] + sigma2_pp + tpnorm<-dnorm(Y[k+1], mnew[1:(2^kappa)], sqrt(sommevar)) + pnew[(1+2^kappa):2^(kappa+1)]=p[1:(2^kappa)]*pp*tpnorm + Lnew[(1+2^kappa):2^(kappa+1)]=L[1:(2^kappa)]*tpnorm + # m_u1 + mnew[(1+2^kappa):2^(kappa+1)] = (sigma2new[1:(2^kappa)]*Y[k+1] + + mnew[1:(2^kappa)]*sigma2_pp)/sommevar + sigma2new[(1+2^kappa):2^(kappa+1)]=sigma2new[1:(2^kappa)] * sigma2_pp/sommevar + + # new parameters + anew[(1+2^kappa):2^(kappa+1)]= a[1:(2^kappa)]*sigma2_pp/sommevar + bnew[(1+2^kappa):2^(kappa+1)]= b[1:(2^kappa)]*sigma2_pp/sommevar + cnew[(1+2^kappa):2^(kappa+1)]= c[1:(2^kappa)]*sigma2_pp/sommevar + + Y[k+1]*sigma2new[1:(2^kappa)]/sommevar + + selection=order(pnew, decreasing=T)[1:2^kappa] + + #Proba1<-sum(pnew[(1+2^kappa):2^(kappa+1)]) #A VIRER + #Pmean=c(Pmean,Proba1/sum(pnew))#A VIRER + + m=mnew[selection] + sigma2=sigma2new[selection] + p=pnew[selection]/sum(pnew[selection]) + L=dix*Lnew[selection] #fois 2 pr le grandir + q=dix*qnew[selection] + + Znew=matrix(rep(0,(k+1)*2^(kappa+1)), ncol=k+1, nrow=2^(kappa+1), byrow=TRUE) + Znew[1:2^kappa,]=cbind(Z, rep(0,2^kappa)) + Znew[((1+2^kappa):2^(kappa+1)),]=cbind(Z, rep(1,2^kappa)) + + Z=Znew[selection,] + + a=anew[selection] + b=bnew[selection] + c=cnew[selection] + matrice_A=(cbind(rbind(matrice_A,matrice_A), + anew*anew/(sigma2new + sigma2_pp)))[selection,] + matrice_B=(cbind(rbind(matrice_B,matrice_B), + bnew*bnew/(sigma2new + sigma2_pp)))[selection,] + matrice_C=(cbind(rbind(matrice_C,matrice_C), + anew*bnew/(sigma2new + sigma2_pp)))[selection,] + matrice_Ya=(cbind(rbind(matrice_Ya,matrice_Ya), + anew*(Y[k+1]-cnew)/(sigma2new + sigma2_pp)))[selection,] + matrice_Yb=(cbind(rbind(matrice_Yb,matrice_Yb), + bnew*(Y[k+1]-cnew)/(sigma2new + sigma2_pp)))[selection,] + } + + Vraisemblance=L%*%q + + A=(Z[,1]%*%p)/(sigma2_m0+sigma2_pp) + sum((p%*%(matrice_A*Z))[1:(N-1)]) + Ya=(Z[,1]%*%p)*Y[1]/(sigma2_m0+sigma2_pp) + sum((p%*%(matrice_Ya*Z))[1:(N-1)]) + B= sum((p%*%(matrice_B*Z))[1:(N-1)]) + C= sum((p%*%(matrice_C*Z))[1:(N-1)]) + Yb= sum((p%*%(matrice_Yb*Z))[1:(N-1)]) + + newm0= (C*Yb-B*Ya)/(C^2-A*B) + newmm= (C*Ya-A*Yb)/(C^2-A*B) + ppnew=sum(p%*%Z)/N + + # Outputs + resultat=list("m0"=newm0, + "mm"=newmm, + "pp"=ppnew, + "likelihood"=Vraisemblance) + return(resultat) +} + + #------------------------ End of file ----------------------------------- diff --git a/man/KBO_EM.Rd b/man/KBO_EM.Rd new file mode 100644 index 0000000..578b5da --- /dev/null +++ b/man/KBO_EM.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_functions.R +\name{KBO_EM} +\alias{KBO_EM} +\title{KBO_EM a function to calculate a likelihood on optimized parameters by +an Expectation-Maximization (EM) algorithm} +\usage{ +KBO_EM(param, kappaOpt, Y, Tps, N, dix) +} +\arguments{ +\item{param}{a list of 10 input parameters mm, pp and m0} + +\item{kappaOpt}{numeric, truncation setting, default 7} + +\item{Y}{character, name of the numeric variable to predict in the +data.frame datain} + +\item{Tps}{character, time column name in the data.frame datain, a +numeric vector. +Tvar can be expressed as a proportion of day in seconds} + +\item{N}{numeric, length of Y} + +\item{dix}{num} +} +\value{ +a list: +\describe{ + \item{m0}{numeric, optimized m0} + \item{mm}{numeric, optimized mm} + \item{pp}{numeric, optimized pp} + \item{likelihood}{numeric, the calculated likelihood} +} +} +\description{ +KBO_EM a function to calculate a likelihood on optimized parameters by +an Expectation-Maximization (EM) algorithm +} +\details{ +uses the same input parameter list than the main function +} +\examples{ +set.seed(1234) +Y<-rnorm(n=10,mean=50,4) +Tps<-seq(1,10) +N=10 +param2<-list(m0=41, + mm=45, + pp=0.5, + aa=0.001, + expertMin=30, + expertMax=75, + sigma2_m0=1, + sigma2_mm=0.05, + sigma2_pp=5, + K=2, + seqp=seq(0.5,0.7,0.1)) +print(Y) +KBO_EM(param=param2,kappaOpt=7,Y=Y,Tps=Tps,N=N,dix=6) +} diff --git a/man/KBO_L.Rd b/man/KBO_L.Rd index bb30361..ba5adef 100644 --- a/man/KBO_L.Rd +++ b/man/KBO_L.Rd @@ -4,18 +4,21 @@ \alias{KBO_L} \title{KBO_L a function to calculate a likelihood on optimized parameters} \usage{ -KBO_L(param, kappaOpt, Y, Tps, N, dix) +KBO_L(param, kappaOpt = 7, Y, Tps, N, dix) } \arguments{ \item{param}{a list of 10 input parameters mm, pp and m0} \item{kappaOpt}{numeric, truncation setting, default 7} -\item{Y}{num} +\item{Y}{character, name of the numeric variable to predict in the +data.frame datain} -\item{Tps}{num} +\item{Tps}{character, time column name in the data.frame datain, a +numeric vector. +Tvar can be expressed as a proportion of day in seconds} -\item{N}{num} +\item{N}{numeric, length of Y} \item{dix}{num} } diff --git a/man/KBO_known.Rd b/man/KBO_known.Rd index f16a86a..641f638 100644 --- a/man/KBO_known.Rd +++ b/man/KBO_known.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/utils_functions.R \name{KBO_known} \alias{KBO_known} -\title{KBO_known} +\title{KBO_known a function to calculate a likelihood on given parameters} \usage{ -KBO_known(param, threshold, kappa, Y, Tps, N) +KBO_known(param, threshold, kappa = 10, Y, Tps, N) } \arguments{ \item{param}{list, a list of 10 input parameters for mm, pp and m0} @@ -13,17 +13,28 @@ KBO_known(param, threshold, kappa, Y, Tps, N) \item{kappa}{numeric, truncation setting, default 10} -\item{Y}{num} +\item{Y}{character, name of the numeric variable to predict in the +data.frame datain} -\item{Tps}{num} +\item{Tps}{character, time column name in the data.frame datain, a +numeric vector. +Tvar can be expressed as a proportion of day in seconds} -\item{N}{num} +\item{N}{numeric, length of Y} } \value{ a list +\describe{ + \item{prediction}{vector, the prediction of weights} + \item{label}{vector, probability to be an outlier} + \item{likelihood}{numeric, the calculated likelihood} + \item{lwr}{vector of lower bound CI of the prediction } + \item{upr}{vector of upper bound CI of the prediction } + \item{flag}{char, is an outlier or not} +} } \description{ -KBO_known +KBO_known a function to calculate a likelihood on given parameters } \details{ uses the same input parameter list than the main function diff --git a/man/kfino_fit.Rd b/man/kfino_fit.Rd index d6396eb..9278756 100644 --- a/man/kfino_fit.Rd +++ b/man/kfino_fit.Rd @@ -10,6 +10,7 @@ kfino_fit( Yvar, param = NULL, doOptim = TRUE, + method = "ML", threshold = 0.5, kappa = 10, kappaOpt = 7 @@ -28,6 +29,10 @@ Tvar can be expressed as a proportion of day in seconds} \item{doOptim}{logical, if TRUE optimisation of the initial parameters default TRUE} +\item{method}{character, the method used to optimize the initial parameters: +Expectation-Maximization algorithm `"EM"` or Maximization +Likelihhod `"ML"`, default `"ML"`} + \item{threshold}{numeric, threshold to qualify an observation as outlier according to the label_pred, default 0.5} @@ -70,19 +75,20 @@ The initialization parameter list param contains: default seq(0.5,0.7,0.1)} } It has to be given by the user following his knowledge of the animal or -the dataset. All parameters are compulsory execpt m0, mm and pp that can be -optimized by the algorithm. In the optimisation step, those three parameters +the dataset. All parameters are compulsory except m0, mm and pp that can be +optimized by the algorithm. In the optimization step, those three parameters are initialized according to the input data (between the expert range) using quantile of the Y distribution (varying between 0.2 and 0.8 for m0 and 0.5 for mm). pp is a sequence varying between 0.5 and 0.7. A sub-sampling is performed to speed the algorithm if the number of possible -observations studied is greater than 500. +observations studied is greater than 500. Optimization is performed using +`"EM"` or `"ML"` methods. } \examples{ data(spring1) library(dplyr) -# --- With Optimisation on initial parameters +# --- With Optimisation on initial parameters - ML method t0 <- Sys.time() param1<-list(m0=NULL, mm=NULL, @@ -98,8 +104,16 @@ param1<-list(m0=NULL, resu1<-kfino_fit(datain=spring1, Tvar="dateNum",Yvar="Poids", - doOptim=TRUE,param=param1) + doOptim=TRUE,method="ML",param=param1) +Sys.time() - t0 + +# --- With Optimisation on initial parameters - EM method +t0 <- Sys.time() +resu1b<-kfino_fit(datain=spring1, + Tvar="dateNum",Yvar="Poids", + doOptim=TRUE,method="EM",param=param1) Sys.time() - t0 + # --- Without Optimisation on initial parameters t0 <- Sys.time() param2<-list(m0=41, @@ -136,6 +150,6 @@ param3<-list(m0=NULL, seqp=seq(0.5,0.7,0.1)) resu3<-kfino_fit(datain=merinos2, Tvar="dateNum",Yvar="Poids", - doOptim=TRUE,param=param3) + doOptim=TRUE,method="ML",param=param3) Sys.time() - t0 } -- GitLab