# ~/9.ken/Survival Analysis Gomp Weib_20140705qin.r
#####################################################################################################
#################install and declare needed packaqes for survival analysis
#install.packages('survival')
#install.packages('KMsurv')
#install.packages('data.table')
#install.packages('RSQLite')
library('survival')
library('KMsurv')
library('data.table')
library('RSQLite')
library('flexsurv')
library('muhaz')
#########################################################################################################
#########################################################################################################
#####################Declare names of files and folders. Move working directory to desired folder. Establish connection to database
#folderLinearHazard = '//udrive.uw.edu/udrive/Kaeberlein Lab/RLS Survival Analysis 2/Linear Hazard'
Folder = "~/projects/0.network.aging.prj/9.ken"
File = 'rls.db'
setwd(Folder)
drv <- SQLite()
con <- dbConnect(drv, dbname = File)
##########################################################################################################
##########################################################################################################
######################Create a complete table of conditions that have data in the RLS database.
######################(Each row in this table will be a unique combination of genotype, mating type, media, and temperature
###get unique conditions from "set" columns (experimental treatments/mutations)
conditions1 = dbGetQuery(con, "
SELECT DISTINCT set_genotype as genotype, set_mating_type as mat, set_media as media, set_temperature as temp
FROM result
WHERE pooled_by = 'file'
")
### rows in the result table of the database are not mutually exclusive. In some rows, data has sometimes been pooled by genotype, background strain, etc.
###get unique conditions from "reference" columns (these columns are the control conditions/lifespan results for each row of experimental lifespan results. Rows are not mutually exclusive (1 control to many experimental conditions)
conditions2 = dbGetQuery(con, "
SELECT DISTINCT ref_genotype as genotype, ref_mating_type as mat, ref_media as media, ref_temperature as temp
FROM result
WHERE pooled_by = 'file'
")
####combine and take unique conditions from these two
conditions = rbind(conditions1, conditions2)
conditions = unique(conditions)
conditions = conditions[complete.cases(conditions),]
row.names(conditions) = NULL ###renumber the rows. important because future processes will refer to a unique condition by its row number in this table
controlConditions = conditions[conditions$genotype %in% c('BY4741', 'BY4742', 'BY4743'),] ###create a table of conditions that have WT genotypes
###Add columns to the conditions data frame. These columns will be filled in by their respective variable: ie. gompertz shape/rate of the lifespan data associated with a given conditions (genotype, mating type, media, temp)
conditions$n = apply(conditions, 1, function(row) 0)
conditions$avgLS = apply(conditions, 1, function(row) 0)
conditions$StddevLS = apply(conditions, 1, function(row) 0)
conditions$medianLS = apply(conditions, 1, function(row) 0)
conditions$gompShape = apply(conditions, 1, function(row) 0)
conditions$gompRate = apply(conditions, 1, function(row) 0)
conditions$gompLogLik = apply(conditions, 1, function(row) 0)
conditions$gompAIC = apply(conditions, 1, function(row) 0)
conditions$weibShape = apply(conditions, 1, function(row) 0)
conditions$weibScale = apply(conditions, 1, function(row) 0)
conditions$weibLogLik = apply(conditions, 1, function(row) 0)
conditions$weibAIC = apply(conditions, 1, function(row) 0)
#############################################################################################################
#############################################################################################################
#########################################Loop through the conditions to get lifespan data
for (r in 1:length(conditions$genotype)) {
genotypeTemp = conditions$genotype[r]
mediaTemp = conditions$media[r]
temperatureTemp = conditions$temp[r]
matTemp = conditions$mat[r]
conditionName = apply(conditions[r,1:4], 1, paste, collapse=" ") #### create a string to name a possible output file
conditionName = gsub("[[:punct:]]", "", conditionName) #### remove special characters from the name (ie. quotations marks, slashes, etc.)
genotypeTemp = gsub("'", "''", genotypeTemp)
genotypeTemp = gsub('"', '""', genotypeTemp)
##### Query the database to take data (including lifespan data) from every mutually exclusive row (pooled by file, not genotype, not background, etc).
##### There will often be multiple rows (representing different experiments) for each unique condition. This analysis pools lifespan data from multiple experiments if the conditions are all the same
queryStatementSet = paste(
"SELECT * ",
"FROM result ",
"WHERE pooled_by = 'file' AND set_genotype = '", genotypeTemp, "' AND set_mating_type = '", matTemp, "' AND set_media = '", mediaTemp, "' AND set_temperature = '", temperatureTemp, "'",
sep = ""
)
queryStatementRef = paste( ### both the reference and set columns will be searched for matching conditions
"SELECT * ",
"FROM result ",
"WHERE pooled_by = 'file' AND ref_genotype = '", genotypeTemp, "' AND ref_mating_type = '", matTemp, "' AND ref_media = '", mediaTemp, "' AND ref_temperature = '", temperatureTemp, "'",
sep = ""
)
dataListSet = dbGetQuery(con, queryStatementSet)
dataListRef = dbGetQuery(con, queryStatementRef)
lifespansChar = unique(c(dataListSet$set_lifespans, dataListRef$ref_lifespans)) ### combine lifespan values for a given condition into a single data structure. (problems of having non-mutually exclusive rows in the ref_lifespans column are overcome by only taking unique groups of lifespans. The assumption is that no two experiments produced identical lifespan curvs)
##### Database codes the lifespan data for each experiment as a string. So, lifespanChar is a vector of strings
##### Convert lifespanChar into a single vector of integers lifespansTemp
lifespansTemp = c()
if (length(lifespansChar) > 0) {
for (s in 1:length(lifespansChar)) {
lifespansNum = as.numeric(unlist(strsplit(lifespansChar[s], ",")))
lifespansNum = lifespansNum[!is.na(lifespansNum)]
if (length(lifespansNum) > 0) {
lifespansTemp = c(lifespansTemp, lifespansNum)
}
}
}
conditions$n[r] = length(lifespansTemp) #### record number of individuals
##### Now that there is a single vector of lifespans for a condition. Do the analysis
if (length(lifespansTemp) > 4) {
lifespanGomp = flexsurvreg(formula = Surv(lifespansTemp) ~ 1, dist = 'gompertz') ### Use the flexsurvreg package to fit lifespan data to gompertz or weibull distribution
lifespanWeib = flexsurvreg(formula = Surv(lifespansTemp) ~ 1, dist = 'weibull')
### Fill in added columns in the conditions table with data from gompertz and weibull fitting. Columns are named according to respective variables
conditions$avgLS[r] = mean(lifespansTemp)
conditions$StddevLS[r] = sd(lifespansTemp)
conditions$medianLS[r] = median(lifespansTemp)
conditions$gompShape[r] = lifespanGomp$res[1,1]
conditions$gompRate[r] = lifespanGomp$res[2,1]
conditions$gompLogLik[r] = lifespanGomp$loglik
conditions$gompAIC[r] = lifespanGomp$AIC
conditions$weibShape[r] = lifespanWeib$res[1,1]
conditions$weibScale[r] = lifespanWeib$res[2,1]
conditions$weibLogLik[r] = lifespanWeib$loglik
conditions$weibAIC[r] = lifespanWeib$AIC
}
#### plot survival/hazard/linearized hazard curves. For gompertz fitting data, ln(hazard) should be linear with time (generations). For weibull fitting data ln(hazard) should be linear with ln(generations).
if(exists('lifespanWeib')) {
#setwd(folderSurvivalCurves)
setwd('/tmp/')
jpeg(paste(conditionName, 'Survival Curve Weibull.jpg'))
plot(lifespanWeib, xlab = 'generations', ylab = 'S', main = paste(conditionName, 'Survival Curve Weibull'))
dev.off()
#setwd(folderCumulativeHazard)
setwd('/tmp/')
jpeg(paste(conditionName, 'Cumulative Hazard Weibull.jpg'))
plot(lifespanGomp, type = 'cumhaz', xlab = 'generations', ylab = 'H', main = paste(conditionName, 'Cumulative Hazard Weibull'))
dev.off()
#setwd(folderLinearHazard)
setwd('/tmp/')
jpeg(paste(conditionName, 'Linear Hazard Weibull.jpg'))
hazard = kphaz.fit(lifespansTemp, rep(1, length(lifespansTemp)))
plot(log(hazard$time), log(hazard$haz), xlab = 'log(generations)', ylab = 'log(h)', main = paste(conditionName, 'Linear Hazard Weibull'))
a = lifespanWeib$res[1,1]
b = lifespanWeib$res[2,1]
intercept = log(a/b) - (a-1)*log(b)
slope = a-1
abline(intercept, slope)
dev.off()
rm(lifespanWeib)
}
}
setwd(Folder)
write.csv(conditions, 'conditionsWeibRedo_qin.csv') ##### output conditions table to a CSV file
No comments:
Post a Comment