Wednesday, July 9, 2014

extract rls in csv format from rls.db



548 rls*.csv files are extracted with sample size > 100.


file "_export_rls20140709.r"

#export rls to individual files. I need them for nrmca fittings.

#####################################################################################################
#################install and declare needed packaqes for survival analysis
rm(list=ls())


#require('survival')
#library('KMsurv')
library('data.table')
library('RSQLite')
#library('flexsurv')
#library('muhaz')

#########################################################################################################
#########################################################################################################

#####################Declare names of files and folders. Move working directory to desired folder. Establish connection to database
#folderLinearHazard = '//udrive.uw.edu/udrive/Kaeberlein Lab/RLS Survival Analysis 2/Linear Hazard'
Folder = "~/projects/0.network.aging.prj/9.ken"
File = 'rls.db'
setwd(Folder)
drv <- SQLite()
con <- dbConnect(drv, dbname = File)

##########################################################################################################
##########################################################################################################

######################Create a complete table of conditions that have data in the RLS database.
######################(Each row in this table will be a unique combination of genotype, mating type, media, and temperature

###get unique conditions from "set" columns (experimental treatments/mutations)
conditions1 = dbGetQuery(con, "
  SELECT DISTINCT set_genotype as genotype, set_mating_type as mat, set_media as media, set_temperature as temp
  FROM result
  WHERE pooled_by = 'file'
  ")
### rows in the result table of the database are not mutually exclusive. In some rows, data has sometimes been pooled by genotype, background strain, etc.

###get unique conditions from "reference" columns (these columns are the control conditions/lifespan results for each row of experimental lifespan results. Rows are not mutually exclusive (1 control to many experimental conditions)
conditions2 = dbGetQuery(con, "
  SELECT DISTINCT ref_genotype as genotype, ref_mating_type as mat, ref_media as media, ref_temperature as temp
  FROM result
  WHERE pooled_by = 'file'
  ")

####combine and take unique conditions from these two
conditions = rbind(conditions1, conditions2)
conditions = unique(conditions)
conditions = conditions[complete.cases(conditions),]
row.names(conditions) = NULL ###renumber the rows. important because future processes will refer to a unique condition by its row number in this table
controlConditions = conditions[conditions$genotype %in% c('BY4741', 'BY4742', 'BY4743'),] ###create a table of conditions that have WT genotypes

###Add columns to the conditions data frame. These columns will be filled in by their respective variable: ie. gompertz shape/rate of the lifespan data associated with a given conditions (genotype, mating type, media, temp)
conditions$n = apply(conditions, 1, function(row) 0)
conditions$avgLS = apply(conditions, 1, function(row) 0)
conditions$StddevLS = apply(conditions, 1, function(row) 0)
conditions$medianLS = apply(conditions, 1, function(row) 0)
conditions$gompShape = apply(conditions, 1, function(row) 0)
conditions$gompRate = apply(conditions, 1, function(row) 0)
conditions$gompLogLik = apply(conditions, 1, function(row) 0)
conditions$gompAIC = apply(conditions, 1, function(row) 0)
conditions$weibShape = apply(conditions, 1, function(row) 0)
conditions$weibScale = apply(conditions, 1, function(row) 0)
conditions$weibLogLik = apply(conditions, 1, function(row) 0)
conditions$weibAIC = apply(conditions, 1, function(row) 0)

#############################################################################################################
#############################################################################################################

#########################################Loop through the conditions to get lifespan data
# r =1

for (r in 1:length(conditions$genotype)) {
  genotypeTemp = conditions$genotype[r]
  mediaTemp = conditions$media[r]
  temperatureTemp = conditions$temp[r]
  matTemp = conditions$mat[r]

  conditionName = apply(conditions[r,1:4], 1, paste, collapse=" ") #### create a string to name a possible output file
  conditionName = gsub("[[:punct:]]", "", conditionName) #### remove special characters from the name (ie. quotations marks, slashes, etc.)

  genotypeTemp = gsub("'", "''", genotypeTemp)
  genotypeTemp = gsub('"', '""', genotypeTemp)

  ##### Query the database to take data (including lifespan data) from every mutually exclusive row (pooled by file, not genotype, not background, etc).
  ##### There will often be multiple rows (representing different experiments) for each unique condition. This analysis pools lifespan data from multiple experiments if the conditions are all the same
  queryStatementSet = paste(
    "SELECT * ",
    "FROM result ",
    "WHERE pooled_by = 'file' AND set_genotype = '", genotypeTemp, "' AND set_mating_type = '", matTemp, "' AND set_media = '", mediaTemp, "' AND set_temperature = '", temperatureTemp, "'",
    sep = ""
  )

  queryStatementRef = paste( ### both the reference and set columns will be searched for matching conditions
    "SELECT * ",
    "FROM result ",
    "WHERE pooled_by = 'file' AND ref_genotype = '", genotypeTemp, "' AND ref_mating_type = '", matTemp, "' AND ref_media = '", mediaTemp, "' AND ref_temperature = '", temperatureTemp, "'",
    sep = ""
  )

  dataListSet = dbGetQuery(con, queryStatementSet)
  dataListRef = dbGetQuery(con, queryStatementRef)
  lifespansChar = unique(c(dataListSet$set_lifespans, dataListRef$ref_lifespans)) ### combine lifespan values for a given condition into a single data structure. (problems of having non-mutually exclusive rows in the ref_lifespans column are overcome by only taking unique groups of lifespans. The assumption is that no two experiments produced identical lifespan curvs)

  ##### Database codes the lifespan data for each experiment as a string. So, lifespanChar is a vector of strings
  ##### Convert lifespanChar into a single vector of integers lifespansTemp
  lifespansTemp = c()
  if (length(lifespansChar) > 0) {
    for (s in 1:length(lifespansChar)) {
      lifespansNum = as.numeric(unlist(strsplit(lifespansChar[s], ",")))
      lifespansNum = lifespansNum[!is.na(lifespansNum)]
      if (length(lifespansNum) > 0) {
        lifespansTemp = c(lifespansTemp, lifespansNum)
      }
    }
  }

  if (length(lifespansTemp) > 100) {
    require(stringr)
    conditions$media[r] = str_replace( conditions$media[r], "\\/", "")
    conditions$genotype[r] = str_replace( conditions$genotype[r], "\\/", "-")
    conditions$n[r] = length(lifespansTemp)   #### record number of individuals
    filename = paste( 'rls_csv/',conditions$genotype[r], '_', conditions$mat[r],'_temp', conditions$temp[r], '_n',conditions$n[r],
                      '_media_', conditions$media[r], '.csv',sep='')
    out = data.frame(lifespansTemp)
    names(out) = c("rls")
      write.csv(out, filename, quote=F, row.names=F)
  }

}#outer loop
 
 

No comments:

Post a Comment