Showing posts with label flexsurv. Show all posts
Showing posts with label flexsurv. Show all posts

Friday, September 22, 2017

custom distribution for flexsurvreg

##' ## Custom distribution
##' ## make "dEV" and "pEV" from eha package (if installed)
##' ##   available to the working environment
##' if (require("eha")) {
##' custom.ev <- list(name="EV",
##'                       pars=c("shape","scale"),
##'                       location="scale",
##'                       transforms=c(log, log),
##'                       inv.transforms=c(exp, exp),
##'                       inits=function(t){ c(1, median(t)) })
##' fitev <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian,
##'                     dist=custom.ev)
##' fitev
##' lines(fitev, col="purple", col.ci="purple")
##' }


##'
##' ## Custom distribution: supply the hazard function only
##' hexp2 <- function(x, rate=1){ rate } # exponential distribution
##' hexp2 <- Vectorize(hexp2)
##' custom.exp2 <- list(name="exp2", pars=c("rate"), location="rate",
##'                     transforms=c(log), inv.transforms=c(exp),
##'                     inits=function(t)1/mean(t))
##' flexsurvreg(Surv(futime, fustat) ~ 1, data = ovarian, dist=custom.exp2)
##' flexsurvreg(Surv(futime, fustat) ~ 1, data = ovarian, dist="exp")
##' ## should give same answer

Monday, August 18, 2014

LRT test on Gompertz-Makeham model,

# I used single-fitting as initial values for the double-fitting. It worked. 
# Hessian does seems to be available when lower bounds are specifized in optim()



#20140818 Nested model test on LS1 RLS using the Gompertz-Makehma model
# Conclusions:
# H0: all I, G, M are the same between control and LS1
# H1: M are different between control and LS1
# p = 1.285958e-05, indicating that H1 is significantly different from the hull hypothesis

# H0 I = 1.000000e-05, G= 0.35, M = 0.021
# H1: I = 5.597089e-07, G = 0.45, M1 = 0.0063, M2 = 0.052


rm(list=ls());
source("~/lib/R/lifespan.r")
require(nlme)
require(survival)
require(xlsx)
require(flexsurv)

tb = read.xlsx( "LS1 MD data.xlsx", 1 );
ctl = tb[,2]
ls1 = tb[,3]

fitCtlGom = flexsurvreg(formula = Surv(ctl) ~ 1, dist="gompertz")
fitCtlGom$res

S = calculate.s(ctl)
s= S$s; t=S$t;
g1 = gnls( s ~  exp( (I/G) *(1 - exp(G* t)) - M*t ), start=list( I=0.003, G=0.16, M=0.01) )
retGM = optim ( g1$coef,  llh.GM.single.run, lifespan=ctl,
               lower = c(1E-10, 1E-5, 0), upper = c(0.2, 2, 0.1), method="L-BFGS-B");
retG = optim ( c(3.6E-5, 0.3),  llh.G.single.run, lifespan=ctl,
               lower = c(1E-10, 1E-5), upper = c(0.2, 2 ), method="L-BFGS-B");

fitLS1Gom = flexsurvreg(formula = Surv(ls1) ~ 1, dist="gompertz")
fitLS1Gom$res

S2 = calculate.s(ls1)
s = S2$s; t = S2$t
g2 = gnls( s ~  exp( (I/G) *(1 - exp(G* t)) - M*t ), start=list( I=0.003, G=0.16, M=0.01) )
retGM2 = optim ( g2$coef,  llh.GM.single.run, lifespan=ls1, lower = c(1E-10, 1E-5, 0), upper = c(0.2, 2, 0.1), method="L-BFGS-B");
retG2 = optim ( c(0.0166, 0.1),  llh.G.single.run, lifespan=ls1,
               lower = c(1E-10, 1E-5), upper = c(0.2, 2 ), method="L-BFGS-B");

retGM$par
retGM2$par
initIGM = c(retGM$par, retGM2$par)

#################################
##### LRT to exam whether two data on I,G,M.
#                            rawIGM =c( I1,     G1,          M1,      I2,         G2,     M2 )
H0   <- function( rawIGM ) { IG <- c(rawIGM[1], rawIGM[2], rawIGM[3], rawIGM[1],  rawIGM[2], rawIGM[3]) }  #all the same
H1m  <- function( rawIGM ) { IG <- c(rawIGM[1], rawIGM[2], rawIGM[3], rawIGM[1],  rawIGM[2], rawIGM[6]) } # M different

Hx.llh.GM2sample.single.run <- function( rawIGM, model, lifespan1, lifespan2 ) {
  IGM = model(rawIGM);
  I1 = IGM[1]; G1 = IGM[2]; M1=IGM[3]; I2 = IGM[4]; G2 = IGM[5]; M2=IGM[6];
  my.lh1 = llh.GM.single.run(IGM[1:3], lifespan1)
  my.lh2 = llh.GM.single.run(IGM[4:6], lifespan2)
  my.lh = my.lh1 + my.lh2
  print (IGM ); #trace the convergence
  ret = my.lh
}

rawIGM = c(retGM$par, retGM2$par)
llh.H0   = optim( initIGM, Hx.llh.GM2sample.single.run, model=H0,   lifespan1=ctl, lifespan2=ls1,
                  method="L-BFGS-B", lower=c(1E-5,1E-5,1E-5, 1E-5,1E-5,1E-5) );
                 
llh.H1m  = optim( initIGM, Hx.llh.GM2sample.single.run, model=H1m,  lifespan1=ctl, lifespan2=ls1,
                  method="L-BFGS-B", lower=c(1E-7,1E-7,1E-7, 1E-7,1E-7,1E-7) );

rbind( llh.H0$par, llh.H1m$par)
deltaLH = llh.H0$value - rbind( llh.H0$value, llh.H1m$value)

1 - pchisq( 2*deltaLH, df =1 );


#################################
###### End of LRT    ############
#################################

Custom distribution for flexsurv(), example



data(ovarian)
## Compare generalized gamma fit with Weibull
fitg <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian, dist="gengamma")
fitg
fitw <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian, dist="weibull")
fitw
plot(fitg)
lines(fitw, col="blue", lwd.ci=1, lty.ci=1)
## Identical AIC, probably not enough data in this simple example for a
## very flexible model to be worthwhile.

## Custom distribution
library(eha)  ## make "dllogis" and "pllogis" available to the working environment
custom.llogis <- list(name="llogis",
                      pars=c("shape","scale"),
                      location="scale",
                      transforms=c(log, log),
                      inv.transforms=c(exp, exp),
                      inits=function(t){ c(1, median(t)) })
fitl <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian, dist=custom.llogis)
fitl

lines(fitl, col.fit="purple", col.ci="purple")

Saturday, July 12, 2014

flexsurv usage and results




fitGomp = flexsurvreg( formula=Surv(tb$rls) ~ 1, dist='gompertz')

> fitGomp$res
              est        L95%       U95%
shape 0.074024323 0.065724069 0.08232458

rate  0.009551328 0.007533154 0.01211018



fitGomp$coeff['rate'] has to be transformed by exp() function. 

Friday, July 11, 2014

Compare binomial, gompertz, and weibull model, fitting with simulated binomial-aging-model lifespan


# ~/0.network.aging.prj/0a.network.fitting/_binomial_aging20140711.r


#test binomial aging model
rm(list=ls());

source("lifespan.20131221.r")
require(flexsurv)


##### log likelihood function, 3-parameter binomial mortality rate model
# http://hongqinlab.blogspot.com/2013/12/binomial-mortailty-model.html
# m = R ( 1 + t/t0)^(n-1)
# s = exp( (R t0/n)*(1 - (1+t/t0)^n ) )  

llh.binomialMortality.single.run <- function( Rt0n, lifespan ) {
  I = Rt0n[1]; t0 = Rt0n[2]; n=Rt0n[3];
  my.data = lifespan[!is.na(lifespan)];
  log_s = (I * t0 /n )*(1 - (1 + my.data/t0)^n);
  log_m = log(I) +  (n-1) * log(1 + my.data/t0 ); 
  my.lh = sum(log_s)  + sum(log_m);
  print (Rt0n ); #trace the convergence
  ret = - my.lh # because optim seems to maximize 
}

#random number from binomial-aging model
rbinomial.aging <- function ( Rt0n, n){
    x.uniform = runif(n)
    inverse.binomialaging.CDF = function(R,t0,n,y) {t0*((1- n*log(y)/(R*t0))^(1/n) -1) }
    x.binomialaging = inverse.binomialaging.CDF(Rt0n[1],Rt0n[2], Rt0n[3], x.uniform)
    return(x.binomialaging)
}

set.seed(20140711)
x = rbinomial.aging( c(0.001,10, 5), 1000)
fitBinom = optim ( c(0.001,10, 3),  llh.binomialMortality.single.run , lifespan=x, 
              method="L-BFGS-B", lower=c(1E-10,0.05,1E-1), upper=c(10,100,50) );  

fitGomp = optim ( c(0.005, 0.08),  llh.G.single.run , lifespan=x,
                  method="L-BFGS-B", lower=c(1E-10,1E-2), upper=c(10,10) );  

fitWeib2 = flexsurvreg(formula = Surv(x) ~ 1, dist="weibull")
fitGomp2 = flexsurvreg(formula = Surv(x) ~ 1, dist="gompertz")
fitGomp2$coeff

c(-fitGomp$value, fitGomp2$loglik, fitWeib2$loglik, -fitBinom$value)
Flexsurv can over-fit the Gompertz with negative values, and Weibull gave the best value!!! 
Even though the random number are generated by binomial aging model, its fitting gave the lowest likelihood?!

TODO: I should to provide a gradient function for optim(method='L-BFGS-B')
Or try optimx()

For binomial-aging, $n$ is under-estimated by optim(), but $m0$ is over-estimated. When the entire data is fitted with Gompertz model, $G$ is too low. How about fitting within initial stage? 


set.seed(20140711)
x = round( rbinomial.aging( c(0.001,10, 4), 5000), digits=1 )#t0=10, n=4
tb = calculate.mortality.rate(x)
plot( tb$s ~ tb$t)
plot( tb$mortality.rate ~ tb$t )
plot( tb$mortality.rate ~ tb$t, log='y', main='linear for Gompertz' )
plot( tb$mortality.rate ~ tb$t, log='yx', main='linear for Weibull' )





set.seed(20140711)
x = rbinomial.aging( c(0.01,25, 4), 100)  #large m0, t0=25, n=4
x = round( x + 0.5, digits=0)
fitBinom = optim ( c(0.01,25, 3.5),  llh.binomialMortality.single.run , lifespan=x,
              method="L-BFGS-B", lower=c(1E-10,0.05,1E-1), upper=c(10,100,50) );

fitGomp = optim ( c(0.005, 0.08),  llh.G.single.run , lifespan=x,
                  method="L-BFGS-B", lower=c(1E-10,1E-2), upper=c(10,10) );

fitWeib2 = flexsurvreg(formula = Surv(x) ~ 1, dist="weibull")
fitGomp2 = flexsurvreg(formula = Surv(x) ~ 1, dist="gompertz")
fitGomp2$coeff

c(-fitGomp$value, fitGomp2$loglik, fitWeib2$loglik, -fitBinom$value)
Good, binomial aging model gives the best likelihood.
Plot confirmed that Gompertz looks better than Weibull. 





Saturday, December 21, 2013

Use simulated Gompertz random number to test flexsurv Gompertz and Weibull fitting results

Summary: I experimented the sample size. For 50 cells, the Gompertz model may be the better fit 4 out of 5 times. For 100 cells, Gompertz is better than Weibull maybe 9 out of 10 times. For 500 cells, Gompertz is always bettern than the Weibull model.

Conclusion: For most yeast life span assay, it is not always easy to tell whether Gompertz or Weibull is a better fit.

Comments: The actual results probably also depend on R and G, which determine the variance.

code: 20131221.gompertz.simulation.R


# generate gompertz random numbers
# fit with flexsurv Gompertz and weibull models

#inverse of gompertz CDF
# see http://hongqinlab.blogspot.com/2013/06/median-lifespan-of-2-parameter-gompertz.html
inverse.gomp.CDF = function(R,G,y) {  (1/G)*log(1 - (G/R)*log(1-y)  ) }

#see generate random number with a given distribution
# http://hongqinlab.blogspot.com/2013/12/generate-gompertz-random-numbers.html

x.uniform = runif(60)
hist(x.uniform)

x.gompertz = inverse.gomp.CDF(0.001,0.2, x.uniform)
hist(x.gompertz)

summary(x.gompertz)

source("lifespan.r")
tb = calculate.s(x.gompertz)
plot(tb$s ~ tb$t)

require(flexsurv)
require(flexsurv)
lifespan = x.gompertz
lifespanGomp = flexsurvreg(formula = Surv(lifespan) ~ 1, dist = 'gompertz')  
lifespanWeib = flexsurvreg(formula = Surv(lifespan) ~ 1, dist = 'weibull')  

c(lifespanWeib$AIC, lifespanGomp$AIC, lifespanWeib$AIC - lifespanGomp$AIC )

Wednesday, October 16, 2013

flexsurvreg() in package flexsurv

Notes on 20140818: I did not see the specification of gradient function in the call of optim()?!


# package(flexsurv)

> flexsurvreg
function (formula, data, dist, inits, fixedpars = NULL, cl = 0.95, 
    ...) 
{
    call <- match.call() #a useful base function
    indx <- match(c("formula", "data"), names(call), nomatch = 0)
    if (indx[1] == 0) 
        stop("A \"formula\" argument is required")
    temp <- call[c(1, indx)]
    temp[[1]] <- as.name("model.frame")
    m <- eval(temp, parent.frame())
    Y <- model.extract(m, "response")
    if (!inherits(Y, "Surv")) 
        stop("Response must be a survival object")
    Terms <- attr(m, "terms")
    X <- model.matrix(Terms, m)
    dat <- list(Y = Y, X = X[, -1, drop = FALSE], Xraw = m[, 
        -1, drop = FALSE])
    X <- dat$X
    if (missing(dist)) 
        stop("Distribution \"dist\" not specified")
    if (is.character(dist)) {
        match.arg(dist, names(flexsurv.dists))
        dlist <- flexsurv.dists[[dist]]
        dlist$name <- dist
    }
    else if (is.list(dist)) {
        dlist <- check.dlist(dist)
    }
    else stop("\"dist\" should be a string for a built-in distribution, or a list for a custom distribution")
    parnames <- dlist$pars
    ncovs <- ncol(X)
    nbpars <- length(parnames)
    npars <- nbpars + ncovs
    if (!missing(inits) && (!is.numeric(inits) || (length(inits) != 
        npars))) 
        stop("inits must be a numeric vector of length ", npars)
    if (missing(inits) || any(is.na(inits))) 
        default.inits <- c(dlist$inits(Y[, "time"]), rep(0, ncovs))
    if (missing(inits)) 
        inits <- default.inits
    else if (any(is.na(inits))) 
        inits[is.na(inits)] <- default.inits[is.na(inits)]
    for (i in 1:nbpars) inits[i] <- dlist$transforms[[i]](inits[i])
    outofrange <- which(is.nan(inits) | is.infinite(inits))
    if (any(outofrange)) {
        plural <- if (length(outofrange) > 1) 
            "s"
        else ""
        stop("Initial value", plural, " for parameter", plural, 
            " ", paste(outofrange, collapse = ","), " out of range")
    }
    cnames <- if (ncovs == 0) 
        NULL
    else colnames(X)
    names(inits) <- c(parnames, cnames)
    if (!is.null(fixedpars) && !is.logical(fixedpars) && (!is.numeric(fixedpars) || 
        any(!(fixedpars %in% 1:npars)))) {
        dots <- if (npars > 2) 
            "...,"
        else ""
        stop("fixedpars must be TRUE/FALSE or a vector of indices in 1,", 
            dots, npars)
    }
    if ((is.logical(fixedpars) && fixedpars == TRUE) || (is.numeric(fixedpars) && 
        all(fixedpars == 1:npars))) {
        minusloglik <- minusloglik.flexsurv(inits, t = Y[, "time"], 
            dead = Y[, "status"], X = X, dlist = dlist, inits = inits)
        for (i in 1:nbpars) inits[i] <- dlist$inv.transforms[[i]](inits[i])
        res <- matrix(inits, ncol = 1)
        dimnames(res) <- list(names(inits), "est")
        ret <- list(call = call, dlist = dlist, res = res, npars = 0, 
            loglik = -minusloglik, AIC = 2 * minusloglik, data = dat, 
            datameans = colMeans(dat$X), N = nrow(dat$Y), events = sum(dat$Y[, 
                "status"]), trisk = sum(dat$Y[, "time"]))
    }
    else {
        optpars <- inits[setdiff(1:npars, fixedpars)]  #nice trick to optimize a subset of pars
        opt <- optim(optpars, minusloglik.flexsurv, t = Y[, "time"], 
            dead = Y[, "status"], X = X, dlist = dlist, inits = inits, 
            fixedpars = fixedpars, hessian = TRUE, ...) #fixedpars is a useful optim feature
        est <- opt$par
        if (all(!is.na(opt$hessian)) && all(!is.nan(opt$hessian)) && 
            all(is.finite(opt$hessian)) && all(eigen(opt$hessian)$values > 
            0)) {
            cov <- solve(opt$hessian)
            se <- sqrt(diag(cov)) #qin, standard errors, see wiki entry 
            if (!is.numeric(cl) || length(cl) > 1 || !(cl > 0) || 
                !(cl < 1)) 
                stop("cl must be a number in (0,1)")
            lcl <- est - qnorm(1 - (1 - cl)/2) * se
            ucl <- est + qnorm(1 - (1 - cl)/2) * se
        }
        else {
            warning("Could not calculate asymptotic standard errors - Hessian is not positive definite. Optimisation has probably not converged to the maximum likelihood")
            lcl <- ucl <- NA
        }
        res <- cbind(est = inits, lcl = NA, ucl = NA)
        res[setdiff(1:npars, fixedpars), ] <- cbind(est, lcl, 
            ucl)
        colnames(res) <- c("est", paste(c("L", "U"), round(cl * 
            100), "%", sep = ""))
        for (i in 1:nbpars) res[i, ] <- dlist$inv.transforms[[i]](res[i, 
            ])
        ret <- list(call = match.call(), dlist = dlist, res = res, 
            npars = length(est), loglik = -opt$value, AIC = 2 * 
                opt$value + 2 * length(est), cl = cl, opt = opt, 
            data = dat, datameans = colMeans(dat$X), N = nrow(dat$Y), 
            events = sum(dat$Y[, "status"]), trisk = sum(dat$Y[, 
                "time"]))
    }
    class(ret) <- "flexsurvreg"
    ret
}

Tuesday, June 25, 2013

*** Two-parameter Gompertz model

Median lifespan of the 2-parameter

Survivorship

The Gompertz model in 'flexsurv' is

Notice that pdf = mortality rate * (1-s)  
pdf = mortality rate * s (based on mortality definition)
 m = - 1/s ds/dt  , so - ds/dt = pdf = m * s

So, flexsurv-shape a ==G, flexsurv rate b = R. 



Inverse of Gompertz CDF: