diff --git a/NAMESPACE b/NAMESPACE index 9ad4cc09efbaff49d04368de53b2aad59e914aba..4ec3f732cda1d0160df93983aed056ffc6e5cd4a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,9 @@ # Generated by roxygen2: do not edit by hand +export(KBO_L) +export(KBO_known) export(kfino_fit) +export(kfino_fit2) export(kfino_plot) importFrom(dplyr,"%>%") importFrom(dplyr,.data) diff --git a/R/kfino2.R b/R/kfino2.R new file mode 100644 index 0000000000000000000000000000000000000000..0daa68b1a9edeaecc159198c88c08d29846b4649 --- /dev/null +++ b/R/kfino2.R @@ -0,0 +1,436 @@ +#' kafino_fit2 a function to detect outlier with a Kalman Filtering approach +#' @param datain an input data.frame of one time course to study (unique IDE) +#' @param Tvar char, time column name in the data.frame datain, a numeric vector +#' Tvar can be expressed as a proportion of day in seconds +#' @param Yvar char, name of the variable to predict in the data.frame datain +#' @param param list, a list of initialization parameter +#' @param doOptim logical, if TRUE optimisation of the initial parameters +#' default TRUE +#' @param threshold numeric, threshold to qualify an observation as outlier +#' according to the label_pred, default 0.5 +#' +#' @details The initialization parameter list param contains: +#' \describe{ +#' \item{mm}{target weight, NULL if the user wants to optimize it} +#' \item{pp}{probability to be correctly weighed, NULL if the user wants to +#' optimize it} +#' \item{m0}{Initial weight, NULL if the user wants to optimize it} +#' \item{aa}{numeric, rate of weight change, default 0.001 } +#' \item{expertMin}{numeric, the minimal weight expected by the user} +#' \item{expertMax}{numeric, the maximal weight expected by the user} +#' \item{sigma2_m0}{variance of m0} +#' \item{sigma2_mm}{numeric, variance of mm, related to the unit of Tvar, +#' default 0.05} +#' \item{sigma2_pp}{numeric, variance of pp, related to the unit of Yvar, +#' default 5} +#' \item{K}{numeric, a constant value in the outlier function (trapezoid), +#' by default K=2 - increasing K, XXX} +#' \item{seqp}{numeric, minimal pp probability to be correctly weighted.} +#' } +#' It can be given by the user following his knowledge of the animal or dataset +#' or entirely constructed by the function. In the optimisation step, this +#' vector is initialized according to the input data (between the expert +#' range) using quantile of the Y distribution (varying between 0.2 and 0.8 for +#' m0 and 0.5 for mm). pp is a sequence varying between minp to (minp+0.3). A +#' sub-sampling is performed to speed the algorithm if the number of possible +#' observations studied is greater than 500. +#' +#' @importFrom stats dnorm quantile na.omit +#' @importFrom dplyr mutate filter left_join arrange %>% +#' @importFrom dplyr .data if_else row_number select +#' @return a S3 list with two data frames and a list of vectors of +#' kafino results +#' \describe{ +#' \item{detectOutlier}{The whole dataset with the detected outliers flagged +#' and prediction} +#' \item{PredictionOK}{A dataset with the predictions on possible values} +#' \item{kafino.results}{kfino results (a list of vectors) on optimized input +#' parameters or not} +#' } +#' @export +#' @examples +#' data(spring1) +#' library(dplyr) +#' +#' # --- With Optimisation on initial parameters +#' t0 <- Sys.time() +#' param1<-list(m0=NULL, +#' mm=NULL, +#' pp=NULL, +#' aa=0.001, +#' expertMin=30, +#' expertMax=75, +#' sigma2_m0=1, +#' sigma2_mm=0.05, +#' sigma2_pp=5, +#' K=2, +#' seqp=seq(0.5,0.7,0.1)) +#' +#' resu1<-kfino_fit2(datain=spring1, +#' Tvar="dateNum",Yvar="Poids", +#' doOptim=TRUE,param=param1) +#' Sys.time() - t0 +#' # --- Without Optimisation on initial parameters +#' t0 <- Sys.time() +#' param2<-list(m0=41, +#' mm=45, +#' pp=0.5, +#' aa=0.001, +#' expertMin=30, +#' expertMax=75, +#' sigma2_m0=1, +#' sigma2_mm=0.05, +#' sigma2_pp=5, +#' K=2, +#' seqp=seq(0.5,0.7,0.1)) +#' resu2<-kfino_fit2(datain=spring1, +#' Tvar="dateNum",Yvar="Poids", +#' param=param2, +#' doOptim=FALSE) +#' Sys.time() - t0 +#' +#' # complex data on merinos2 dataset +#' data(merinos2) +#' +#' t0 <- Sys.time() +#' param3<-list(m0=NULL, +#' mm=NULL, +#' pp=NULL, +#' aa=0.001, +#' expertMin=10, +#' expertMax=45, +#' sigma2_m0=1, +#' sigma2_mm=0.05, +#' sigma2_pp=5, +#' K=2, +#' seqp=seq(0.5,0.7,0.1)) +#' resu3<-kfino_fit2(datain=merinos2, +#' Tvar="dateNum",Yvar="Poids", +#' doOptim=TRUE,param=param3) +#' Sys.time() - t0 +kfino_fit2<-function(datain,Tvar,Yvar, + param=NULL, + doOptim=TRUE,threshold=0.5){ + + if( any(is.null(param[["expertMin"]]) | + is.null(param[["expertMax"]])) ) + stop('You have to define expertMin and expertMax.') + if( any(!is.numeric(param[["expertMin"]]) | + !is.numeric(param[["expertMax"]])) ) + stop('expertMin and expertMax must be numeric.') + if( !is.numeric(t(datain[,Yvar]))) + stop('Input parameter Yvar must contain numeric values.') + if( !is.numeric(t(datain[,Tvar]))) + stop('Input parameter Tvar must contain numeric values.') + + #------------------------------------------------------------- + # load objects + m0<-param[["m0"]] + mm<-param[["mm"]] + pp<-param[["pp"]] + if(is.null(param[["aa"]])){ aa<-0.001 } else { aa<-param[["aa"]] } + expertMin<-param[["expertMin"]] + expertMax<-param[["expertMax"]] + if(is.null(param[["sigma2_m0"]])){ + sigma2_m0<-1 + } else {sigma2_m0<-param[["sigma2_m0"]]} + if(is.null(param[["sigma2_mm"]])){ + sigma2_mm<-0.05 + } else {sigma2_mm<-param[["sigma2_mm"]]} + if(is.null(param[["sigma2_pp"]])){ + sigma2_pp<-5 + } else {sigma2_pp<-param[["sigma2_pp"]]} + if(is.null(param[["K"]])){ K<-5 } else { K<-param[["K"]] } + seqp<-param[["seqp"]] + #------------------------------------------------------------- + + # Flag impossible weight values (expert knowledge) + # the algorithm works on the dataset with possible values + # Create an artificial column numbering the rows for joining purpose + datain<-as.data.frame(datain) + datain<-datain %>% + mutate(rowNum=row_number(), + flag1=if_else(.data[[Yvar]] > expertMin & + .data[[Yvar]] <= expertMax,"OK","Bad")) + tp.dt<-datain %>% filter(.data$flag1 == "OK") + + Y<-tp.dt[,Yvar] + N<-nrow(tp.dt) + Tps<-tp.dt[,Tvar] + + #WARNING WARNING: AU lieu de calculer L qui est arrondi à 0, + # je calcule 10^N fois Lu et 10^N q. Au total pr chaque donnée + # je multiplie par 100 mais comme c'est l'ordre de grandeur de loioutlier() + # ca ne me parait pas disproportionné. + # Au lieu de 10 et 10 je fais simplement sqrt(expertMax - expertMin) + dix=sqrt(expertMax - expertMin) + + #------------------------------------------------------------------------ + # Optimisation sur les paramètres initiaux ou pas + #------------------------------------------------------------------------ + if (doOptim == TRUE){ + + if (N > 500){ + # optim avec sous-echantillonnage + print("-------:") + print("Optimisation of initial parameters with sub-sampling - result:") + bornem0=quantile(Y[1:N/4], probs = c(.2, .8)) + m0opt=quantile(Y[1:N/4], probs = c(.5)) + mmopt=quantile(Y[(3*N/4):N], probs = c(.5)) + + cat("bornem0: ",bornem0,"\n") + cat("m0opt: ",m0opt,"\n") + cat("mmopt: ",mmopt,"\n") + + popt=0.5 + #--- Saving datain before sub-sampling + YY=Y + TpsTps=Tps + NN=N + + Subechant=sort(sample(1:NN,50)) + Y=YY[Subechant] + Tps=TpsTps[Subechant] + N=50 + Vopt=KBO_L(list(m0=m0opt, + mm=mmopt, + pp=popt, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_mm=sigma2_mm, + sigma2_m0=sigma2_m0, + sigma2_pp=sigma2_pp, + K=K), + Y,Tps,N,dix) + + for (m0 in seq(bornem0[1],bornem0[2],2) ){ + for (mm in seq((m0-5),(m0+20),2) ){ + for (p in seqp){ + # A voir si 50 sous-echantillons au hasard suffisent. Comme dans + # Robbins Monroe, permet aussi de reduire l'impact de la troncature + Subechant=sort(sample(1:NN,50)) + Y=YY[Subechant] + Tps=TpsTps[Subechant] + + V=KBO_L(list(m0=m0, + mm=mm, + pp=p, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_mm=sigma2_mm, + sigma2_m0=sigma2_m0, + sigma2_pp=sigma2_pp, + K=K), + Y,Tps,N,dix) + if (V > Vopt){ + Vopt=V + m0opt=m0 + mmopt=mm + popt=p + } + } + } + } + + # Le fait de prendre des sous-echantillons remarquables + # (ie 50 au lieu de 200) ne divise que par 2 le temps de calculs + # (au lieu de 4 comme imaginé cela vient p.e de la fonction sample()) + Y=YY + Tps=TpsTps + N=NN + print("Optimised parameters: ") + print(c(mmopt,popt,m0opt)) + print("-------:") + + resultat=KBO_known(param=list(mm=mmopt, + pp=popt, + m0=m0opt, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_m0=sigma2_m0, + sigma2_mm=sigma2_mm, + sigma2_pp=sigma2_pp, + K=K), + threshold,Y=Y,Tps=Tps,N=N) + + } else if (N > 50){ + # optim sans sous-echantillonnage + print("-------:") + print("Optimisation of initial parameters - result:") + print("no sub-sampling performed:") + bornem0=quantile(Y[1:N/4], probs = c(.2, .8)) + m0opt=quantile(Y[1:N/4], probs = c(.5)) + mmopt=quantile(Y[(3*N/4):N], probs = c(.5)) + + cat("bornem0: ",bornem0,"\n") + cat("m0opt: ",m0opt,"\n") + cat("mmopt: ",mmopt,"\n") + + popt=0.5 + + Vopt=KBO_L(list(m0=m0opt, + mm=mmopt, + pp=popt, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_mm=sigma2_mm, + sigma2_m0=sigma2_m0, + sigma2_pp=sigma2_pp, + K=K), + Y,Tps,N,dix) + + for (m0 in seq(bornem0[1],bornem0[2],2) ){ + for (mm in seq((m0-5),(m0+20),2) ){ + for (p in seqp){ + V=KBO_L(list(m0=m0, + mm=mm, + pp=p, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_mm=sigma2_mm, + sigma2_m0=sigma2_m0, + sigma2_pp=sigma2_pp, + K=K), + Y,Tps,N,dix) + + if (V > Vopt){ + Vopt=V + m0opt=m0 + mmopt=mm + popt=p + } + } + } + } + + print("Optimised parameters: ") + print(c(mmopt,popt,m0opt)) + print("-------:") + + resultat=KBO_known(param=list(mm=mmopt, + pp=popt, + m0=m0opt, + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_m0=sigma2_m0, + sigma2_mm=sigma2_mm, + sigma2_pp=sigma2_pp, + K=K), + threshold=threshold,Y=Y,Tps=Tps,N=N) + + } else { + # N petit, si trop petit on ne fait rien + if (N > 5) { + # Not enough data - no optim + if (is.null(param[["m0"]])){ + X<-c(trunc(quantile(Y[1:N/4], probs = .2)), 0.5, + round(quantile(Y[(3*N/4):N], probs = 0.8))) + } else { + X<-c(m0,pp,mm) + } + print("-------:") + print("Optimisation of initial parameters - result:") + print("Not enough data => so no optimisation performed:") + print("Used parameters: ") + print(X) + print("-------:") + resultat=KBO_known(param=list(m0=X[[1]], + pp=X[[2]], + mm=X[[3]], + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_m0=sigma2_m0, + sigma2_mm=sigma2_mm, + sigma2_pp=sigma2_pp, + K=K), + threshold=threshold,Y=Y,Tps=Tps,N=N) + } else { + warning("Not enough data between expert knowledge. The algorithm is not performed.") + resultat<-NULL + } + } + } else { + # Pas d'optimisation et test si N petit - si trop petit on ne fait rien + if (N > 5) { + # No optimisation on initial parameters + if (is.null(param[["m0"]])){ + X<-c(trunc(quantile(Y[1:N/4], probs = .2)), 0.5, + round(quantile(Y[(3*N/4):N], probs = 0.8))) + } else { + X<-c(m0,pp,mm) + } + print("-------:") + print("No optimisation of initial parameters:") + print("Used parameters: ") + print(X) + resultat=KBO_known(param=list(m0=X[[1]], + pp=X[[2]], + mm=X[[3]], + aa=aa, + expertMin=expertMin, + expertMax=expertMax, + sigma2_m0=sigma2_m0, + sigma2_mm=sigma2_mm, + sigma2_pp=sigma2_pp, + K=K), + threshold=threshold,Y=Y,Tps=Tps,N=N) + } else { + warning("Not enough data between expert knowledge. The algorithm is not performed.") + resultat<-NULL + } + } + + #--------------------------------------------------------------------------- + # Formatting output results + #--------------------------------------------------------------------------- + # If resultat NULL then create output with datain and 2 NULL objects + # else create output with detectoutlier, PredictionOK and kafino.results + # useful for the kafino_plot() function + #--------------------------------------------------------------------------- + if (is.null(resultat)){ + dt.out<-datain %>% mutate(flag=.data$flag1) + dt.pred<-NULL + resultat<-NULL + + mylist<-list(dt.out,dt.pred,resultat) + names(mylist)<-c("detectOutlier","PredictionOK","kafino.results") + class(mylist) = c("kfino") + return(invisible(mylist)) + } else { + prediction=na.omit(resultat$prediction) + label_pred=round(na.omit(resultat$label),2) + lwr=na.omit(resultat$lwr) + upr=na.omit(resultat$upr) + flag=na.omit(resultat$flag) + + dt.pred=cbind.data.frame(select(tp.dt,.data$rowNum), + prediction,label_pred,lwr,upr,flag) + + # join dt.pred with the initial datain containing all the observations + dt.out<-left_join(datain,dt.pred,by="rowNum") + dt.out<-mutate(dt.out,flag=if_else(.data$flag1 == "Bad", .data$flag1, .data$flag)) + dt.out<-arrange(dt.out,.data$rowNum) + + #-------------------------------------- + # return a S3 list object + #-------------------------------------- + # 1. a whole dataset with the detected outliers flagged and prediction + # 2. a dataset with the prediction on possible values + # 3. optimisation results (a list of vectors) + mylist<-list(dt.out,dt.pred,resultat) + names(mylist)<-c("detectOutlier","PredictionOK","kfino.results") + class(mylist) = c("kfino") + return(invisible(mylist)) + } +} + + + +#-------------------------------------- End of file -------------------------- diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000000000000000000000000000000000000..a59db8f024c0b29b0c2abec6ba2abcdd3fd6751b --- /dev/null +++ b/R/utils.R @@ -0,0 +1,310 @@ +#' KBO_known +#' +#' @param param list, a list of 10 input parameters for mm, pp and m0 +#' @param threshold numeric, threshold for CI +#' @param Y num +#' @param Tps num +#' @param N num +#' +#' +#' @details uses the same input parameter list than the main function +#' @return a list +#' @export +#' +#' @examples +#' # not run +KBO_known<-function(param,threshold,Y,Tps,N){ + # load objects + mm=param[["mm"]] + pp=param[["pp"]] + m0=param[["m0"]] + aa=param[["aa"]] + expertMin=param[["expertMin"]] + expertMax=param[["expertMax"]] + sigma2_m0=param[["sigma2_m0"]] + sigma2_mm=param[["sigma2_mm"]] + sigma2_pp=param[["sigma2_pp"]] + K=param[["K"]] + # paramètre de troncature + kappa <-10 + + #--- function defining an outlier distribution (Surface of a trapezium) + # uses expertMin and expertMax + loi_outlier<-function(y){ + 2/((K+1)*(expertMax - expertMin)) + + (2*(K-1)/(K+1))*((y - expertMin)/((expertMax - expertMin)^2)) + } + + # initialisation (1.1.1) + #-------------------- + m1= (sigma2_pp*m0 + Y[1]*sigma2_m0)/(sigma2_m0+sigma2_pp) + sigma1=(sigma2_m0*sigma2_pp)/(sigma2_m0+sigma2_pp) + + l0<- loi_outlier(Y[1]) + loinorm1<-dnorm(Y[1],m0, sqrt(sigma2_m0+sigma2_pp)) + + p0= ((1-pp)*l0) / (pp*loinorm1 + (1-pp)*l0) + p1=1-p0 + + m=c(m0,m1) + p=c(p0,p1) + sigma2=c(sigma2_m0,sigma1) + L=c(l0,loinorm1) + + Xmean=c(p0*m0 + p1*m1) + Pmean=c(p1) + q=c(1,1) + sigma2_new=c(sigma2_m0) + + #iteration (1.1.2) + #----------------------- + #// Pour l'instant, je fais comme si kappa<N-1 mettre un if sinon + #avant troncature + for (k in 1:(kappa-1)){ + # je crée le vecteur des nouveaux m, ##pour plus tard: au lieu de les + # faire vide, je prend direct m_{u0} + mnew=rep(0,2^(k+1)) + sigma2new=rep(0 ,2^(k+1)) + pnew=rep(0 ,2^(k+1)) + Lnew=rep(0 ,2^(k+1)) + # CR = constante de renormalisation qui intervient dans le dénominateur des pu + qnew=rep(0 ,2^(k+1)) + diffTps<-Tps[k+1] - Tps[k] + #--- numérateur de pu0 + tpbeta<-loi_outlier(Y[k+1]) + + pnew[1:(2^k)]=p[1:(2^k)]*(1-pp)*tpbeta + Lnew[1:(2^k)]=L[1:(2^k)]*tpbeta + mnew[1:(2^k)]= m[1:(2^k)]*exp(-aa*diffTps) + mm*(1-exp(-aa*diffTps)) #m_u0 + + sigma2new[1:(2^k)] = sigma2[1:(2^k)]*exp(-2*aa*(diffTps)) + + (1-exp(-2*aa*(diffTps)))*sigma2_mm/(2*aa) + qnew[1:(2^k)] <- q[1:(2^k)]*(1-pp) + qnew[(1+2^k):2^(k+1)] <- q[1:(2^k)]*pp + sommevar=sigma2new[1:(2^k)] + sigma2_pp + tpnorm<-dnorm(Y[k+1], mnew[1:(2^k)], sqrt(sommevar)) + pnew[(1+2^k):2^(k+1)]=p[1:(2^k)]*pp*tpnorm + Lnew[(1+2^k):2^(k+1)]=L[1:(2^k)]*tpnorm + mnew[(1+2^k):2^(k+1)] = (sigma2new[1:(2^k)]*Y[k+1] + + mnew[1:(2^k)]*sigma2_pp)/sommevar + sigma2new[(1+2^k):2^(k+1)] = sigma2new[1:(2^k)] * sigma2_pp/sommevar + + CR<-sum(pnew) + Proba1<-sum(pnew[(1+2^k):2^(k+1)]) + SommeMasse<-sum(pnew*mnew) + + m=mnew + sigma2=sigma2new + p=pnew/CR + Xmean=c(Xmean,SommeMasse/CR) + Pmean=c(Pmean,Proba1/CR) + L=Lnew + q=qnew + sigma2_new<-c(sigma2_new,sum(p*sigma2) + sum(p*m^2) - (sum(p*m))^2) + } + + #apres troncature + #---------------------- + for (k in kappa:(N-1)){ + # je cree le vecteur des nouveaux m, + # pour plus tard: au lieu de les faire vide, je prend direct m_{u0} + mnew=rep(0,2^(kappa+1)) + sigma2new=rep(0 ,2^(kappa+1)) + pnew=rep(0 ,2^(kappa+1)) + Lnew=rep(0 ,2^(kappa+1)) + # CR = constante de renormalisation qui intervient dans le dénominateur des pu + qnew=rep(0 ,2^(kappa+1)) + diffTps<-Tps[k+1] - Tps[k] + + #--- numérateur de pu0 + tpbeta<-loi_outlier(Y[k+1]) + + pnew[1:(2^kappa)]=p[1:(2^kappa)]*(1-pp)*tpbeta + Lnew[1:(2^kappa)]=L[1:(2^kappa)]*tpbeta + mnew[1:(2^kappa)]= m[1:(2^kappa)]*exp(-aa*diffTps) + mm*(1-exp(-aa*diffTps)) #m_u0 + sigma2new[1:(2^kappa)] = sigma2[1:(2^kappa)]*exp(-2*aa*(diffTps)) + + (1-exp(-2*aa*(diffTps)))*sigma2_mm/(2*aa) + qnew[1:(2^kappa)]=q[1:(2^kappa)]*(1-pp) + qnew[(1+2^kappa):2^(kappa+1)]=q[1:(2^kappa)]*pp + sommevar=sigma2new[1:(2^kappa)] + sigma2_pp + tpnorm<-dnorm(Y[k+1], mnew[1:(2^kappa)], sqrt(sommevar)) + pnew[(1+2^kappa):2^(kappa+1)]=p[1:(2^kappa)]*pp*tpnorm + Lnew[(1+2^kappa):2^(kappa+1)]=L[1:(2^kappa)]*tpnorm + mnew[(1+2^kappa):2^(kappa+1)] = (sigma2new[1:(2^kappa)]*Y[k+1] + + mnew[1:(2^kappa)]*sigma2_pp)/sommevar #m_u1 + sigma2new[(1+2^kappa):2^(kappa+1)] = sigma2new[1:(2^kappa)] * sigma2_pp/sommevar + + CR<-sum(pnew) + Proba1<-sum(pnew[(1+2^kappa):2^(kappa+1)]) + SommeMasse<-sum(pnew*mnew) + + selection=order(pnew, decreasing=T)[1:2^kappa] + + m=mnew[selection] + sigma2=sigma2new[selection] + p=pnew[selection]/sum(pnew[selection]) + Xmean=c(Xmean,SommeMasse/CR) + Pmean=c(Pmean,Proba1/CR) + L=Lnew[selection] + q=qnew[selection] + sigma2_new<-c(sigma2_new,sum(p*sigma2) + sum(p*m^2) - (sum(p*m))^2) + } + + #---------------------------------------------- + # Ajout Intervalle de confiance des estimations + # Variance (see section 1.2.1) + # IC est approximé... + #---------------------------------------------- + lwr <- Xmean - 1.96*sqrt(sigma2_new) + upr <- Xmean + 1.96*sqrt(sigma2_new) + # flag + flag<-if_else(Pmean > threshold,"OK","KO") + + #--- Calcul des derniers éléments + Vraisemblance=L%*%q + + resultat=list("prediction"=Xmean, + "label"=Pmean, + "likelihood"=Vraisemblance, + "lwr"=lwr, + "upr"=upr, + "flag"=flag) + return(resultat) +} + +#------------------------------------------------------------------------ +#' KBO_L a function to calculate a likelihood on optimized parameters +#' +#' @param param a list of 10 input parameters mm, pp and m0 +#' @param Y num +#' @param Tps num +#' @param N num +#' @param dix num +#' +#' @details uses the same input parameter list than the main function +#' @return a likelihood +#' @export +#' +#' @examples +#' # not run +KBO_L<-function(param,Y,Tps,N,dix){ + # load objects + mm=param[["mm"]] + pp=param[["pp"]] + m0=param[["m0"]] + aa=param[["aa"]] + expertMin=param[["expertMin"]] + expertMax=param[["expertMax"]] + sigma2_m0<-param[["sigma2_m0"]] + sigma2_mm<-param[["sigma2_mm"]] + sigma2_pp<-param[["sigma2_pp"]] + K<-param[["K"]] + + #--- function defining an outlier distribution (Surface of a trapezium) + # uses expertMin and expertMax + loi_outlier<-function(y){ + 2/((K+1)*(expertMax - expertMin)) + + (2*(K-1)/(K+1))*((y - expertMin)/((expertMax - expertMin)^2)) + } + + #---- paramètre de troncature + # Ici je met kappa =7, ca me fais retirer les proba <0.01 au lieu de 0.001 + # comme qd kappa=10 mais ca divise par 10 le temps de calcul + kappa=7 + + # initialisation (1.1.1) + #-------------------- + m1= (sigma2_pp*m0 + Y[1]*sigma2_m0)/(sigma2_m0+sigma2_pp) + sigma1=(sigma2_m0*sigma2_pp)/(sigma2_m0+sigma2_pp) + + l0<- loi_outlier(Y[1]) + loinorm1<-dnorm(Y[1],m0, sqrt(sigma2_m0+sigma2_pp)) + + p0= ((1-pp)*l0) / (pp*loinorm1 + (1-pp)*l0) + p1=1-p0 + + m=c(m0,m1) + p=c(p0,p1) + sigma2=c(sigma2_m0,sigma1) + L=dix*c(l0,loinorm1) #Attention *2 pr le grandir + q=c(1,1) #attention + + #iteration (1.1.2) + #----------------------- + #// Pour l'instant, je fais comme si kappa<N-1 mettre un if sinon + # avant troncature + for (k in 1:(kappa-1)){ + mnew=rep(0,2^(k+1)) + sigma2new=rep(0 ,2^(k+1)) + pnew=rep(0 ,2^(k+1)) + Lnew=rep(0 ,2^(k+1)) + # CR = constante de renormalisation qui intervient dans le dénominateur des pu + qnew=rep(0 ,2^(k+1)) + diffTps<-Tps[k+1] - Tps[k] + #--- numérateur de pu0 + tpbeta<-loi_outlier(Y[k+1]) + pnew[1:(2^k)]=p[1:(2^k)]*(1-pp)*tpbeta + Lnew[1:(2^k)]=L[1:(2^k)]*tpbeta + mnew[1:(2^k)]= m[1:(2^k)]*exp(-aa*diffTps) + mm*(1-exp(-aa*diffTps)) #m_u0 + sigma2new[1:(2^k)] = sigma2[1:(2^k)]*exp(-2*aa*(diffTps)) + + (1-exp(-2*aa*(diffTps)))*sigma2_mm/(2*aa) + qnew[1:(2^k)] <- q[1:(2^k)]*(1-pp) + qnew[(1+2^k):2^(k+1)] <- q[1:(2^k)]*pp + sommevar=sigma2new[1:(2^k)] + sigma2_pp + tpnorm<-dnorm(Y[k+1], mnew[1:(2^k)], sqrt(sommevar)) + pnew[(1+2^k):2^(k+1)]=p[1:(2^k)]*pp*tpnorm + Lnew[(1+2^k):2^(k+1)]=L[1:(2^k)]*tpnorm + mnew[(1+2^k):2^(k+1)] = (sigma2new[1:(2^k)]*Y[k+1] + mnew[1:(2^k)]*sigma2_pp)/sommevar + sigma2new[(1+2^k):2^(k+1)] = sigma2new[1:(2^k)] * sigma2_pp/sommevar + + m=mnew + sigma2=sigma2new + p=pnew/sum(pnew) + + L=dix*Lnew # fois 2 pr le grandir + q=dix*qnew + } + + # apres troncature + #---------------------- + for (k in kappa:(N-1)){ + # je cree le vecteur des nouveaux m, + mnew=rep(0,2^(kappa+1)) + sigma2new=rep(0 ,2^(kappa+1)) + pnew=rep(0 ,2^(kappa+1)) + Lnew=rep(0 ,2^(kappa+1)) + # CR = constante de renormalisation qui intervient dans le dénominateur des pu + qnew=rep(0 ,2^(kappa+1)) + diffTps<-Tps[k+1] - Tps[k] + #--- numérateur de pu0 + tpbeta<-loi_outlier(Y[k+1]) + pnew[1:(2^kappa)]=p[1:(2^kappa)]*(1-pp)*tpbeta + Lnew[1:(2^kappa)]=L[1:(2^kappa)]*tpbeta + mnew[1:(2^kappa)]= m[1:(2^kappa)]*exp(-aa*diffTps) + mm*(1-exp(-aa*diffTps)) #m_u0 + sigma2new[1:(2^kappa)] = sigma2[1:(2^kappa)]*exp(-2*aa*(diffTps)) + + (1-exp(-2*aa*(diffTps)))*sigma2_mm/(2*aa) + qnew[1:(2^kappa)]=q[1:(2^kappa)]*(1-pp) + qnew[(1+2^kappa):2^(kappa+1)]=q[1:(2^kappa)]*pp + sommevar=sigma2new[1:(2^kappa)] + sigma2_pp + tpnorm<-dnorm(Y[k+1], mnew[1:(2^kappa)], sqrt(sommevar)) + pnew[(1+2^kappa):2^(kappa+1)]=p[1:(2^kappa)]*pp*tpnorm + Lnew[(1+2^kappa):2^(kappa+1)]=L[1:(2^kappa)]*tpnorm + mnew[(1+2^kappa):2^(kappa+1)] = (sigma2new[1:(2^kappa)]*Y[k+1] + + mnew[1:(2^kappa)]*sigma2_pp)/sommevar #m_u1 + sigma2new[(1+2^kappa):2^(kappa+1)] = sigma2new[1:(2^kappa)] * sigma2_pp/sommevar + + selection=order(pnew, decreasing=T)[1:2^kappa] + + m=mnew[selection] + sigma2=sigma2new[selection] + p=pnew[selection]/sum(pnew[selection]) + L=dix*Lnew[selection] #fois 2 pr le grandir + q=dix*qnew[selection] + } + + Vraisemblance=L%*%q + + return(Vraisemblance) +} + +#------------------------ End of file ----------------------------------- diff --git a/man/KBO_L.Rd b/man/KBO_L.Rd new file mode 100644 index 0000000000000000000000000000000000000000..2759c2546ba9bc5203172ee91cf616dd92ef980c --- /dev/null +++ b/man/KBO_L.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{KBO_L} +\alias{KBO_L} +\title{KBO_L a function to calculate a likelihood on optimized parameters} +\usage{ +KBO_L(param, Y, Tps, N, dix) +} +\arguments{ +\item{param}{a list of 10 input parameters mm, pp and m0} + +\item{Y}{num} + +\item{Tps}{num} + +\item{N}{num} + +\item{dix}{num} +} +\value{ +a likelihood +} +\description{ +KBO_L a function to calculate a likelihood on optimized parameters +} +\details{ +uses the same input parameter list than the main function +} +\examples{ +# not run +} diff --git a/man/KBO_known.Rd b/man/KBO_known.Rd new file mode 100644 index 0000000000000000000000000000000000000000..d16c04368865909d5bec9109f5357876d98462a6 --- /dev/null +++ b/man/KBO_known.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{KBO_known} +\alias{KBO_known} +\title{KBO_known} +\usage{ +KBO_known(param, threshold, Y, Tps, N) +} +\arguments{ +\item{param}{list, a list of 10 input parameters for mm, pp and m0} + +\item{threshold}{numeric, threshold for CI} + +\item{Y}{num} + +\item{Tps}{num} + +\item{N}{num} +} +\value{ +a list +} +\description{ +KBO_known +} +\details{ +uses the same input parameter list than the main function +} +\examples{ +# not run +} diff --git a/man/kfino_fit2.Rd b/man/kfino_fit2.Rd new file mode 100644 index 0000000000000000000000000000000000000000..f602c533cadb5db5dcc356da2817f43c7acbac67 --- /dev/null +++ b/man/kfino_fit2.Rd @@ -0,0 +1,126 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kfino2.R +\name{kfino_fit2} +\alias{kfino_fit2} +\title{kafino_fit2 a function to detect outlier with a Kalman Filtering approach} +\usage{ +kfino_fit2(datain, Tvar, Yvar, param = NULL, doOptim = TRUE, threshold = 0.5) +} +\arguments{ +\item{datain}{an input data.frame of one time course to study (unique IDE)} + +\item{Tvar}{char, time column name in the data.frame datain, a numeric vector +Tvar can be expressed as a proportion of day in seconds} + +\item{Yvar}{char, name of the variable to predict in the data.frame datain} + +\item{param}{list, a list of initialization parameter} + +\item{doOptim}{logical, if TRUE optimisation of the initial parameters +default TRUE} + +\item{threshold}{numeric, threshold to qualify an observation as outlier +according to the label_pred, default 0.5} +} +\value{ +a S3 list with two data frames and a list of vectors of +kafino results +\describe{ +\item{detectOutlier}{The whole dataset with the detected outliers flagged + and prediction} +\item{PredictionOK}{A dataset with the predictions on possible values} +\item{kafino.results}{kfino results (a list of vectors) on optimized input + parameters or not} +} +} +\description{ +kafino_fit2 a function to detect outlier with a Kalman Filtering approach +} +\details{ +The initialization parameter list param contains: +\describe{ + \item{mm}{target weight, NULL if the user wants to optimize it} + \item{pp}{probability to be correctly weighed, NULL if the user wants to + optimize it} + \item{m0}{Initial weight, NULL if the user wants to optimize it} + \item{aa}{numeric, rate of weight change, default 0.001 } + \item{expertMin}{numeric, the minimal weight expected by the user} + \item{expertMax}{numeric, the maximal weight expected by the user} + \item{sigma2_m0}{variance of m0} + \item{sigma2_mm}{numeric, variance of mm, related to the unit of Tvar, + default 0.05} + \item{sigma2_pp}{numeric, variance of pp, related to the unit of Yvar, + default 5} + \item{K}{numeric, a constant value in the outlier function (trapezoid), + by default K=2 - increasing K, XXX} + \item{seqp}{numeric, minimal pp probability to be correctly weighted.} +} +It can be given by the user following his knowledge of the animal or dataset +or entirely constructed by the function. In the optimisation step, this +vector is initialized according to the input data (between the expert +range) using quantile of the Y distribution (varying between 0.2 and 0.8 for +m0 and 0.5 for mm). pp is a sequence varying between minp to (minp+0.3). A +sub-sampling is performed to speed the algorithm if the number of possible +observations studied is greater than 500. +} +\examples{ +data(spring1) +library(dplyr) + +# --- With Optimisation on initial parameters +t0 <- Sys.time() +param1<-list(m0=NULL, + mm=NULL, + pp=NULL, + aa=0.001, + expertMin=30, + expertMax=75, + sigma2_m0=1, + sigma2_mm=0.05, + sigma2_pp=5, + K=2, + seqp=seq(0.5,0.7,0.1)) + +resu1<-kfino_fit2(datain=spring1, + Tvar="dateNum",Yvar="Poids", + doOptim=TRUE,param=param1) +Sys.time() - t0 +# --- Without Optimisation on initial parameters +t0 <- Sys.time() +param2<-list(m0=41, + mm=45, + pp=0.5, + aa=0.001, + expertMin=30, + expertMax=75, + sigma2_m0=1, + sigma2_mm=0.05, + sigma2_pp=5, + K=2, + seqp=seq(0.5,0.7,0.1)) +resu2<-kfino_fit2(datain=spring1, + Tvar="dateNum",Yvar="Poids", + param=param2, + doOptim=FALSE) +Sys.time() - t0 + +# complex data on merinos2 dataset +data(merinos2) + +t0 <- Sys.time() +param3<-list(m0=NULL, + mm=NULL, + pp=NULL, + aa=0.001, + expertMin=10, + expertMax=45, + sigma2_m0=1, + sigma2_mm=0.05, + sigma2_pp=5, + K=2, + seqp=seq(0.5,0.7,0.1)) +resu3<-kfino_fit2(datain=merinos2, + Tvar="dateNum",Yvar="Poids", + doOptim=TRUE,param=param3) +Sys.time() - t0 +} diff --git a/tests/testthat/test-outputAlgo.R b/tests/testthat/test-outputAlgo.R index 3a9a5a0735ce35790926cd02b96c80da11458534..bceeda3b2219ac4c59440d9fd4a17a4c7fa1224e 100644 --- a/tests/testthat/test-outputAlgo.R +++ b/tests/testthat/test-outputAlgo.R @@ -5,10 +5,10 @@ test_that("Output type", { expertMin=30,expertMax=75, doOptim=FALSE,aa=0.001,sigma2_mm=0.05, K=2,sigma2_pp=5) - - expect_equal(str(resu1),"list") - expect_equal(str(resu1[[1]]),"data.frame") - expect_equal(str(resu1[[2]]),"data.frame") - expect_equal(str(resu1[[3]]),"list") + + expect_equal(length(resu1),3) + expect_s3_class(resu1[[1]],"data.frame") + expect_s3_class(resu1[[2]],"data.frame") + expect_type(resu1[[3]],"list") #expect_true(is_SBM(netA)) -}) \ No newline at end of file +})