Sunday, July 6, 2014

Stddev of 'rls.db'

# ~/9.ken/Survival Analysis Gomp Weib_20140705qin.r

#####################################################################################################
#################install and declare needed packaqes for survival analysis

#install.packages('survival') 
#install.packages('KMsurv')
#install.packages('data.table')
#install.packages('RSQLite')

library('survival')
library('KMsurv')
library('data.table')
library('RSQLite')
library('flexsurv')
library('muhaz')

#########################################################################################################
#########################################################################################################

#####################Declare names of files and folders. Move working directory to desired folder. Establish connection to database
#folderLinearHazard = '//udrive.uw.edu/udrive/Kaeberlein Lab/RLS Survival Analysis 2/Linear Hazard'
Folder = "~/projects/0.network.aging.prj/9.ken"
File = 'rls.db'
setwd(Folder)
drv <- SQLite()
con <- dbConnect(drv, dbname = File)

##########################################################################################################
##########################################################################################################

######################Create a complete table of conditions that have data in the RLS database. 
######################(Each row in this table will be a unique combination of genotype, mating type, media, and temperature

###get unique conditions from "set" columns (experimental treatments/mutations)
conditions1 = dbGetQuery(con, "
  SELECT DISTINCT set_genotype as genotype, set_mating_type as mat, set_media as media, set_temperature as temp
  FROM result
  WHERE pooled_by = 'file'
  ")
### rows in the result table of the database are not mutually exclusive. In some rows, data has sometimes been pooled by genotype, background strain, etc.   

###get unique conditions from "reference" columns (these columns are the control conditions/lifespan results for each row of experimental lifespan results. Rows are not mutually exclusive (1 control to many experimental conditions)
conditions2 = dbGetQuery(con, "
  SELECT DISTINCT ref_genotype as genotype, ref_mating_type as mat, ref_media as media, ref_temperature as temp
  FROM result 
  WHERE pooled_by = 'file'
  ")

####combine and take unique conditions from these two
conditions = rbind(conditions1, conditions2)
conditions = unique(conditions)
conditions = conditions[complete.cases(conditions),]
row.names(conditions) = NULL ###renumber the rows. important because future processes will refer to a unique condition by its row number in this table
controlConditions = conditions[conditions$genotype %in% c('BY4741', 'BY4742', 'BY4743'),] ###create a table of conditions that have WT genotypes

###Add columns to the conditions data frame. These columns will be filled in by their respective variable: ie. gompertz shape/rate of the lifespan data associated with a given conditions (genotype, mating type, media, temp)
conditions$n = apply(conditions, 1, function(row) 0)
conditions$avgLS = apply(conditions, 1, function(row) 0)
conditions$StddevLS = apply(conditions, 1, function(row) 0)
conditions$medianLS = apply(conditions, 1, function(row) 0)
conditions$gompShape = apply(conditions, 1, function(row) 0)
conditions$gompRate = apply(conditions, 1, function(row) 0)
conditions$gompLogLik = apply(conditions, 1, function(row) 0)
conditions$gompAIC = apply(conditions, 1, function(row) 0)
conditions$weibShape = apply(conditions, 1, function(row) 0)
conditions$weibScale = apply(conditions, 1, function(row) 0)
conditions$weibLogLik = apply(conditions, 1, function(row) 0)
conditions$weibAIC = apply(conditions, 1, function(row) 0)

#############################################################################################################
#############################################################################################################

#########################################Loop through the conditions to get lifespan data 
for (r in 1:length(conditions$genotype)) {

  genotypeTemp = conditions$genotype[r]
  mediaTemp = conditions$media[r]
  temperatureTemp = conditions$temp[r]
  matTemp = conditions$mat[r]
  
  conditionName = apply(conditions[r,1:4], 1, paste, collapse=" ") #### create a string to name a possible output file
  conditionName = gsub("[[:punct:]]", "", conditionName) #### remove special characters from the name (ie. quotations marks, slashes, etc.)

  genotypeTemp = gsub("'", "''", genotypeTemp)
  genotypeTemp = gsub('"', '""', genotypeTemp)
  
  ##### Query the database to take data (including lifespan data) from every mutually exclusive row (pooled by file, not genotype, not background, etc). 
  ##### There will often be multiple rows (representing different experiments) for each unique condition. This analysis pools lifespan data from multiple experiments if the conditions are all the same
  queryStatementSet = paste(
    "SELECT * ",
    "FROM result ",
    "WHERE pooled_by = 'file' AND set_genotype = '", genotypeTemp, "' AND set_mating_type = '", matTemp, "' AND set_media = '", mediaTemp, "' AND set_temperature = '", temperatureTemp, "'", 
    sep = ""
  )
  
  queryStatementRef = paste( ### both the reference and set columns will be searched for matching conditions
    "SELECT * ",
    "FROM result ",
    "WHERE pooled_by = 'file' AND ref_genotype = '", genotypeTemp, "' AND ref_mating_type = '", matTemp, "' AND ref_media = '", mediaTemp, "' AND ref_temperature = '", temperatureTemp, "'", 
    sep = ""
  )
  
  dataListSet = dbGetQuery(con, queryStatementSet)
  dataListRef = dbGetQuery(con, queryStatementRef)
  lifespansChar = unique(c(dataListSet$set_lifespans, dataListRef$ref_lifespans)) ### combine lifespan values for a given condition into a single data structure. (problems of having non-mutually exclusive rows in the ref_lifespans column are overcome by only taking unique groups of lifespans. The assumption is that no two experiments produced identical lifespan curvs)
  
  ##### Database codes the lifespan data for each experiment as a string. So, lifespanChar is a vector of strings
  ##### Convert lifespanChar into a single vector of integers lifespansTemp
  lifespansTemp = c()
  if (length(lifespansChar) > 0) {
    for (s in 1:length(lifespansChar)) {
      lifespansNum = as.numeric(unlist(strsplit(lifespansChar[s], ",")))
      lifespansNum = lifespansNum[!is.na(lifespansNum)]
      if (length(lifespansNum) > 0) {
        lifespansTemp = c(lifespansTemp, lifespansNum)
      } 
    }
  }
    
  conditions$n[r] = length(lifespansTemp) #### record number of individuals 
  

  ##### Now that there is a single vector of lifespans for a condition. Do the analysis
  if (length(lifespansTemp) > 4) {
    
    lifespanGomp = flexsurvreg(formula = Surv(lifespansTemp) ~ 1, dist = 'gompertz') ### Use the flexsurvreg package to fit lifespan data to gompertz or weibull distribution
    lifespanWeib = flexsurvreg(formula = Surv(lifespansTemp) ~ 1, dist = 'weibull')  
  
    ### Fill in added columns in the conditions table with data from gompertz and weibull fitting. Columns are named according to respective variables
    conditions$avgLS[r] = mean(lifespansTemp)
    conditions$StddevLS[r] = sd(lifespansTemp)
    conditions$medianLS[r] = median(lifespansTemp)
    conditions$gompShape[r] = lifespanGomp$res[1,1]
    conditions$gompRate[r] = lifespanGomp$res[2,1]
    conditions$gompLogLik[r] = lifespanGomp$loglik
    conditions$gompAIC[r] = lifespanGomp$AIC

    conditions$weibShape[r] = lifespanWeib$res[1,1]
    conditions$weibScale[r] = lifespanWeib$res[2,1]
    conditions$weibLogLik[r] = lifespanWeib$loglik
    conditions$weibAIC[r] = lifespanWeib$AIC    
    }
  

  #### plot survival/hazard/linearized hazard curves. For gompertz fitting data, ln(hazard) should be linear with time (generations). For weibull fitting data ln(hazard) should be linear with ln(generations).
  if(exists('lifespanWeib')) {
    #setwd(folderSurvivalCurves)
    setwd('/tmp/')
    jpeg(paste(conditionName, 'Survival Curve Weibull.jpg'))
    plot(lifespanWeib, xlab = 'generations', ylab = 'S', main = paste(conditionName, 'Survival Curve Weibull'))
    dev.off()
    
    #setwd(folderCumulativeHazard)
    setwd('/tmp/')
    jpeg(paste(conditionName, 'Cumulative Hazard Weibull.jpg'))
    plot(lifespanGomp, type = 'cumhaz', xlab = 'generations', ylab = 'H', main = paste(conditionName, 'Cumulative Hazard Weibull'))
    dev.off()
    
    #setwd(folderLinearHazard)
    setwd('/tmp/')
    jpeg(paste(conditionName, 'Linear Hazard Weibull.jpg'))
    hazard = kphaz.fit(lifespansTemp, rep(1, length(lifespansTemp)))
    plot(log(hazard$time), log(hazard$haz), xlab = 'log(generations)', ylab = 'log(h)', main = paste(conditionName, 'Linear Hazard Weibull')) 
    a = lifespanWeib$res[1,1]
    b = lifespanWeib$res[2,1]
    intercept = log(a/b) - (a-1)*log(b)
    slope = a-1
    abline(intercept, slope)  
    dev.off()
    
    
    rm(lifespanWeib)
    
  }
  
}

setwd(Folder) write.csv(conditions, 'conditionsWeibRedo_qin.csv') ##### output conditions table to a CSV file



No comments:

Post a Comment