Open Notebook: flexsurv

Showing posts with label flexsurv. Show all posts

Friday, September 22, 2017

custom distribution for flexsurvreg

##' ## Custom distribution
##' ## make "dEV" and "pEV" from eha package (if installed)
##' ## available to the working environment
##' if (require("eha")) {
##' custom.ev <- list(name="EV",
##' pars=c("shape","scale"),
##' location="scale",
##' transforms=c(log, log),
##' inv.transforms=c(exp, exp),
##' inits=function(t){ c(1, median(t)) })
##' fitev <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian,
##' dist=custom.ev)
##' fitev
##' lines(fitev, col="purple", col.ci="purple")
##' }

##'
##' ## Custom distribution: supply the hazard function only
##' hexp2 <- function(x, rate=1){ rate } # exponential distribution
##' hexp2 <- Vectorize(hexp2)
##' custom.exp2 <- list(name="exp2", pars=c("rate"), location="rate",
##' transforms=c(log), inv.transforms=c(exp),
##' inits=function(t)1/mean(t))
##' flexsurvreg(Surv(futime, fustat) ~ 1, data = ovarian, dist=custom.exp2)
##' flexsurvreg(Surv(futime, fustat) ~ 1, data = ovarian, dist="exp")
##' ## should give same answer

Monday, August 18, 2014

LRT test on Gompertz-Makeham model,

# I used single-fitting as initial values for the double-fitting. It worked.
# Hessian does seems to be available when lower bounds are specifized in optim()

#20140818 Nested model test on LS1 RLS using the Gompertz-Makehma model
# Conclusions:
# H0: all I, G, M are the same between control and LS1
# H1: M are different between control and LS1
# p = 1.285958e-05, indicating that H1 is significantly different from the hull hypothesis

# H0 I = 1.000000e-05, G= 0.35, M = 0.021
# H1: I = 5.597089e-07, G = 0.45, M1 = 0.0063, M2 = 0.052

rm(list=ls());
source("~/lib/R/lifespan.r")
require(nlme)
require(survival)
require(xlsx)
require(flexsurv)

tb = read.xlsx( "LS1 MD data.xlsx", 1 );
ctl = tb[,2]
ls1 = tb[,3]

fitCtlGom = flexsurvreg(formula = Surv(ctl) ~ 1, dist="gompertz")
fitCtlGom$res

S = calculate.s(ctl)
s= S$s; t=S$t;
g1 = gnls( s ~ exp( (I/G) *(1 - exp(G* t)) - M*t ), start=list( I=0.003, G=0.16, M=0.01) )
retGM = optim ( g1$coef, llh.GM.single.run, lifespan=ctl,
lower = c(1E-10, 1E-5, 0), upper = c(0.2, 2, 0.1), method="L-BFGS-B");
retG = optim ( c(3.6E-5, 0.3), llh.G.single.run, lifespan=ctl,
lower = c(1E-10, 1E-5), upper = c(0.2, 2 ), method="L-BFGS-B");

fitLS1Gom = flexsurvreg(formula = Surv(ls1) ~ 1, dist="gompertz")
fitLS1Gom$res

S2 = calculate.s(ls1)
s = S2$s; t = S2$t
g2 = gnls( s ~ exp( (I/G) *(1 - exp(G* t)) - M*t ), start=list( I=0.003, G=0.16, M=0.01) )
retGM2 = optim ( g2$coef, llh.GM.single.run, lifespan=ls1, lower = c(1E-10, 1E-5, 0), upper = c(0.2, 2, 0.1), method="L-BFGS-B");
retG2 = optim ( c(0.0166, 0.1), llh.G.single.run, lifespan=ls1,
lower = c(1E-10, 1E-5), upper = c(0.2, 2 ), method="L-BFGS-B");

retGM$par
retGM2$par
initIGM = c(retGM$par, retGM2$par)

#################################
##### LRT to exam whether two data on I,G,M.
# rawIGM =c( I1, G1, M1, I2, G2, M2 )
H0 <- function( rawIGM ) { IG <- c(rawIGM[1], rawIGM[2], rawIGM[3], rawIGM[1], rawIGM[2], rawIGM[3]) } #all the same
H1m <- function( rawIGM ) { IG <- c(rawIGM[1], rawIGM[2], rawIGM[3], rawIGM[1], rawIGM[2], rawIGM[6]) } # M different

Hx.llh.GM2sample.single.run <- function( rawIGM, model, lifespan1, lifespan2 ) {
IGM = model(rawIGM);
I1 = IGM[1]; G1 = IGM[2]; M1=IGM[3]; I2 = IGM[4]; G2 = IGM[5]; M2=IGM[6];
my.lh1 = llh.GM.single.run(IGM[1:3], lifespan1)
my.lh2 = llh.GM.single.run(IGM[4:6], lifespan2)
my.lh = my.lh1 + my.lh2
print (IGM ); #trace the convergence
ret = my.lh
}

rawIGM = c(retGM$par, retGM2$par)
llh.H0 = optim( initIGM, Hx.llh.GM2sample.single.run, model=H0, lifespan1=ctl, lifespan2=ls1,
method="L-BFGS-B", lower=c(1E-5,1E-5,1E-5, 1E-5,1E-5,1E-5) );

llh.H1m = optim( initIGM, Hx.llh.GM2sample.single.run, model=H1m, lifespan1=ctl, lifespan2=ls1,
method="L-BFGS-B", lower=c(1E-7,1E-7,1E-7, 1E-7,1E-7,1E-7) );

rbind( llh.H0$par, llh.H1m$par)
deltaLH = llh.H0$value - rbind( llh.H0$value, llh.H1m$value)

1 - pchisq( 2*deltaLH, df =1 );

#################################
###### End of LRT ############
#################################

Custom distribution for flexsurv(), example

data(ovarian)

## Compare generalized gamma fit with Weibull

fitg <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian, dist="gengamma")

fitg

fitw <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian, dist="weibull")

fitw

plot(fitg)

lines(fitw, col="blue", lwd.ci=1, lty.ci=1)

## Identical AIC, probably not enough data in this simple example for a

## very flexible model to be worthwhile.

## Custom distribution

library(eha) ## make "dllogis" and "pllogis" available to the working environment

custom.llogis <- list(name="llogis",

pars=c("shape","scale"),

location="scale",

transforms=c(log, log),

inv.transforms=c(exp, exp),

inits=function(t){ c(1, median(t)) })

fitl <- flexsurvreg(formula = Surv(futime, fustat) ~ 1, data = ovarian, dist=custom.llogis)

fitl

lines(fitl, col.fit="purple", col.ci="purple")

Saturday, July 12, 2014

flexsurv usage and results

fitGomp = flexsurvreg( formula=Surv(tb$rls) ~ 1, dist='gompertz')

> fitGomp$res
est L95% U95%
shape 0.074024323 0.065724069 0.08232458

rate 0.009551328 0.007533154 0.01211018

fitGomp$coeff['rate'] has to be transformed by exp() function.

Friday, July 11, 2014

Compare binomial, gompertz, and weibull model, fitting with simulated binomial-aging-model lifespan

# ~/0.network.aging.prj/0a.network.fitting/_binomial_aging20140711.r

#test binomial aging model
rm(list=ls());

source("lifespan.20131221.r")
require(flexsurv)

##### log likelihood function, 3-parameter binomial mortality rate model
# http://hongqinlab.blogspot.com/2013/12/binomial-mortailty-model.html
# m = R ( 1 + t/t0)^(n-1)
# s = exp( (R t0/n)*(1 - (1+t/t0)^n ) )

llh.binomialMortality.single.run <- function( Rt0n, lifespan ) {
I = Rt0n[1]; t0 = Rt0n[2]; n=Rt0n[3];
my.data = lifespan[!is.na(lifespan)];
log_s = (I * t0 /n )*(1 - (1 + my.data/t0)^n);
log_m = log(I) + (n-1) * log(1 + my.data/t0 );
my.lh = sum(log_s) + sum(log_m);
print (Rt0n ); #trace the convergence
ret = - my.lh # because optim seems to maximize
}

#random number from binomial-aging model
rbinomial.aging <- function ( Rt0n, n){
x.uniform = runif(n)
inverse.binomialaging.CDF = function(R,t0,n,y) {t0*((1- n*log(y)/(R*t0))^(1/n) -1) }
x.binomialaging = inverse.binomialaging.CDF(Rt0n[1],Rt0n[2], Rt0n[3], x.uniform)
return(x.binomialaging)
}

set.seed(20140711)
x = rbinomial.aging( c(0.001,10, 5), 1000)
fitBinom = optim ( c(0.001,10, 3), llh.binomialMortality.single.run , lifespan=x,
method="L-BFGS-B", lower=c(1E-10,0.05,1E-1), upper=c(10,100,50) );

fitGomp = optim ( c(0.005, 0.08), llh.G.single.run , lifespan=x,
method="L-BFGS-B", lower=c(1E-10,1E-2), upper=c(10,10) );

fitWeib2 = flexsurvreg(formula = Surv(x) ~ 1, dist="weibull")
fitGomp2 = flexsurvreg(formula = Surv(x) ~ 1, dist="gompertz")
fitGomp2$coeff

c(-fitGomp$value, fitGomp2$loglik, fitWeib2$loglik, -fitBinom$value)

Flexsurv can over-fit the Gompertz with negative values, and Weibull gave the best value!!!
Even though the random number are generated by binomial aging model, its fitting gave the lowest likelihood?!

TODO: I should to provide a gradient function for optim(method='L-BFGS-B')
Or try optimx()

For binomial-aging, $n$ is under-estimated by optim(), but $m0$ is over-estimated. When the entire data is fitted with Gompertz model, $G$ is too low. How about fitting within initial stage?

set.seed(20140711)

x = round( rbinomial.aging( c(0.001,10, 4), 5000), digits=1 )#t0=10, n=4

tb = calculate.mortality.rate(x)

plot( tb$s ~ tb$t)

plot( tb$mortality.rate ~ tb$t )

plot( tb$mortality.rate ~ tb$t, log='y', main='linear for Gompertz' )

plot( tb$mortality.rate ~ tb$t, log='yx', main='linear for Weibull' )

set.seed(20140711)
x = rbinomial.aging( c(0.01,25, 4), 100) #large m0, t0=25, n=4
x = round( x + 0.5, digits=0)
fitBinom = optim ( c(0.01,25, 3.5), llh.binomialMortality.single.run , lifespan=x,
method="L-BFGS-B", lower=c(1E-10,0.05,1E-1), upper=c(10,100,50) );

fitGomp = optim ( c(0.005, 0.08), llh.G.single.run , lifespan=x,
method="L-BFGS-B", lower=c(1E-10,1E-2), upper=c(10,10) );

fitWeib2 = flexsurvreg(formula = Surv(x) ~ 1, dist="weibull")
fitGomp2 = flexsurvreg(formula = Surv(x) ~ 1, dist="gompertz")
fitGomp2$coeff

c(-fitGomp$value, fitGomp2$loglik, fitWeib2$loglik, -fitBinom$value)

Good, binomial aging model gives the best likelihood.

Plot confirmed that Gompertz looks better than Weibull.

Saturday, December 21, 2013

Use simulated Gompertz random number to test flexsurv Gompertz and Weibull fitting results

Summary: I experimented the sample size. For 50 cells, the Gompertz model may be the better fit 4 out of 5 times. For 100 cells, Gompertz is better than Weibull maybe 9 out of 10 times. For 500 cells, Gompertz is always bettern than the Weibull model.

Conclusion: For most yeast life span assay, it is not always easy to tell whether Gompertz or Weibull is a better fit.

Comments: The actual results probably also depend on R and G, which determine the variance.

code: 20131221.gompertz.simulation.R

# generate gompertz random numbers
# fit with flexsurv Gompertz and weibull models

#inverse of gompertz CDF
# see http://hongqinlab.blogspot.com/2013/06/median-lifespan-of-2-parameter-gompertz.html
inverse.gomp.CDF = function(R,G,y) { (1/G)*log(1 - (G/R)*log(1-y) ) }

#see generate random number with a given distribution
# http://hongqinlab.blogspot.com/2013/12/generate-gompertz-random-numbers.html

x.uniform = runif(60)
hist(x.uniform)

x.gompertz = inverse.gomp.CDF(0.001,0.2, x.uniform)
hist(x.gompertz)

summary(x.gompertz)

source("lifespan.r")
tb = calculate.s(x.gompertz)
plot(tb$s ~ tb$t)

require(flexsurv)
require(flexsurv)
lifespan = x.gompertz
lifespanGomp = flexsurvreg(formula = Surv(lifespan) ~ 1, dist = 'gompertz')
lifespanWeib = flexsurvreg(formula = Surv(lifespan) ~ 1, dist = 'weibull')

c(lifespanWeib$AIC, lifespanGomp$AIC, lifespanWeib$AIC - lifespanGomp$AIC )

Wednesday, October 16, 2013

flexsurvreg() in package flexsurv

Notes on 20140818: I did not see the specification of gradient function in the call of optim()?!

# package(flexsurv)

> flexsurvreg
function (formula, data, dist, inits, fixedpars = NULL, cl = 0.95,
...)
{
call <- match.call() #a useful base function
indx <- match(c("formula", "data"), names(call), nomatch = 0)
if (indx[1] == 0)
stop("A \"formula\" argument is required")
temp <- call[c(1, indx)]
temp[[1]] <- as.name("model.frame")
m <- eval(temp, parent.frame())
Y <- model.extract(m, "response")
if (!inherits(Y, "Surv"))
stop("Response must be a survival object")
Terms <- attr(m, "terms")
X <- model.matrix(Terms, m)
dat <- list(Y = Y, X = X[, -1, drop = FALSE], Xraw = m[,
-1, drop = FALSE])
X <- dat$X
if (missing(dist))
stop("Distribution \"dist\" not specified")
if (is.character(dist)) {
match.arg(dist, names(flexsurv.dists))
dlist <- flexsurv.dists[[dist]]
dlist$name <- dist
}
else if (is.list(dist)) {
dlist <- check.dlist(dist)
}
else stop("\"dist\" should be a string for a built-in distribution, or a list for a custom distribution")
parnames <- dlist$pars
ncovs <- ncol(X)
nbpars <- length(parnames)
npars <- nbpars + ncovs
if (!missing(inits) && (!is.numeric(inits) || (length(inits) !=
npars)))
stop("inits must be a numeric vector of length ", npars)
if (missing(inits) || any(is.na(inits)))
default.inits <- c(dlist$inits(Y[, "time"]), rep(0, ncovs))
if (missing(inits))
inits <- default.inits
else if (any(is.na(inits)))
inits[is.na(inits)] <- default.inits[is.na(inits)]
for (i in 1:nbpars) inits[i] <- dlist$transforms[[i]](inits[i])
outofrange <- which(is.nan(inits) | is.infinite(inits))
if (any(outofrange)) {
plural <- if (length(outofrange) > 1)
"s"
else ""
stop("Initial value", plural, " for parameter", plural,
" ", paste(outofrange, collapse = ","), " out of range")
}
cnames <- if (ncovs == 0)
NULL
else colnames(X)
names(inits) <- c(parnames, cnames)
if (!is.null(fixedpars) && !is.logical(fixedpars) && (!is.numeric(fixedpars) ||
any(!(fixedpars %in% 1:npars)))) {
dots <- if (npars > 2)
"...,"
else ""
stop("fixedpars must be TRUE/FALSE or a vector of indices in 1,",
dots, npars)
}
if ((is.logical(fixedpars) && fixedpars == TRUE) || (is.numeric(fixedpars) &&
all(fixedpars == 1:npars))) {
minusloglik <- minusloglik.flexsurv(inits, t = Y[, "time"],
dead = Y[, "status"], X = X, dlist = dlist, inits = inits)
for (i in 1:nbpars) inits[i] <- dlist$inv.transforms[[i]](inits[i])
res <- matrix(inits, ncol = 1)
dimnames(res) <- list(names(inits), "est")
ret <- list(call = call, dlist = dlist, res = res, npars = 0,
loglik = -minusloglik, AIC = 2 * minusloglik, data = dat,
datameans = colMeans(dat$X), N = nrow(dat$Y), events = sum(dat$Y[,
"status"]), trisk = sum(dat$Y[, "time"]))
}
else {
optpars <- inits[setdiff(1:npars, fixedpars)] #nice trick to optimize a subset of pars
opt <- optim(optpars, minusloglik.flexsurv, t = Y[, "time"],
dead = Y[, "status"], X = X, dlist = dlist, inits = inits,
fixedpars = fixedpars, hessian = TRUE, ...) #fixedpars is a useful optim feature
est <- opt$par
if (all(!is.na(opt$hessian)) && all(!is.nan(opt$hessian)) &&
all(is.finite(opt$hessian)) && all(eigen(opt$hessian)$values >
0)) {
cov <- solve(opt$hessian)
se <- sqrt(diag(cov)) #qin, standard errors, see wiki entry
if (!is.numeric(cl) || length(cl) > 1 || !(cl > 0) ||
!(cl < 1))
stop("cl must be a number in (0,1)")
lcl <- est - qnorm(1 - (1 - cl)/2) * se
ucl <- est + qnorm(1 - (1 - cl)/2) * se
}
else {
warning("Could not calculate asymptotic standard errors - Hessian is not positive definite. Optimisation has probably not converged to the maximum likelihood")
lcl <- ucl <- NA
}
res <- cbind(est = inits, lcl = NA, ucl = NA)
res[setdiff(1:npars, fixedpars), ] <- cbind(est, lcl,
ucl)
colnames(res) <- c("est", paste(c("L", "U"), round(cl *
100), "%", sep = ""))
for (i in 1:nbpars) res[i, ] <- dlist$inv.transforms[[i]](res[i,
])
ret <- list(call = match.call(), dlist = dlist, res = res,
npars = length(est), loglik = -opt$value, AIC = 2 *
opt$value + 2 * length(est), cl = cl, opt = opt,
data = dat, datameans = colMeans(dat$X), N = nrow(dat$Y),
events = sum(dat$Y[, "status"]), trisk = sum(dat$Y[,
"time"]))
}
class(ret) <- "flexsurvreg"
ret
}

Tuesday, June 25, 2013

*** Two-parameter Gompertz model

Median lifespan of the 2-parameter

Survivorship

The Gompertz model in 'flexsurv' is

Notice that ~~pdf = mortality rate * (1-s)~~
pdf = mortality rate * s (based on mortality definition)
m = - 1/s ds/dt , so - ds/dt = pdf = m * s

So, flexsurv-shape a ==G, flexsurv rate b = R.

Inverse of Gompertz CDF: