Open Notebook: extract rls in csv format from rls.db

548 rls*.csv files are extracted with sample size > 100.

file "_export_rls20140709.r"

#export rls to individual files. I need them for nrmca fittings.

#####################################################################################################
#################install and declare needed packaqes for survival analysis
rm(list=ls())

#require('survival')
#library('KMsurv')
library('data.table')
library('RSQLite')
#library('flexsurv')
#library('muhaz')

#########################################################################################################
#########################################################################################################

#####################Declare names of files and folders. Move working directory to desired folder. Establish connection to database
#folderLinearHazard = '//udrive.uw.edu/udrive/Kaeberlein Lab/RLS Survival Analysis 2/Linear Hazard'
Folder = "~/projects/0.network.aging.prj/9.ken"
File = 'rls.db'
setwd(Folder)
drv <- SQLite()
con <- dbConnect(drv, dbname = File)

##########################################################################################################
##########################################################################################################

######################Create a complete table of conditions that have data in the RLS database.
######################(Each row in this table will be a unique combination of genotype, mating type, media, and temperature

###get unique conditions from "set" columns (experimental treatments/mutations)
conditions1 = dbGetQuery(con, "
SELECT DISTINCT set_genotype as genotype, set_mating_type as mat, set_media as media, set_temperature as temp
FROM result
WHERE pooled_by = 'file'
")
### rows in the result table of the database are not mutually exclusive. In some rows, data has sometimes been pooled by genotype, background strain, etc.

###get unique conditions from "reference" columns (these columns are the control conditions/lifespan results for each row of experimental lifespan results. Rows are not mutually exclusive (1 control to many experimental conditions)
conditions2 = dbGetQuery(con, "
SELECT DISTINCT ref_genotype as genotype, ref_mating_type as mat, ref_media as media, ref_temperature as temp
FROM result
WHERE pooled_by = 'file'
")

####combine and take unique conditions from these two
conditions = rbind(conditions1, conditions2)
conditions = unique(conditions)
conditions = conditions[complete.cases(conditions),]
row.names(conditions) = NULL ###renumber the rows. important because future processes will refer to a unique condition by its row number in this table
controlConditions = conditions[conditions$genotype %in% c('BY4741', 'BY4742', 'BY4743'),] ###create a table of conditions that have WT genotypes

###Add columns to the conditions data frame. These columns will be filled in by their respective variable: ie. gompertz shape/rate of the lifespan data associated with a given conditions (genotype, mating type, media, temp)
conditions$n = apply(conditions, 1, function(row) 0)
conditions$avgLS = apply(conditions, 1, function(row) 0)
conditions$StddevLS = apply(conditions, 1, function(row) 0)
conditions$medianLS = apply(conditions, 1, function(row) 0)
conditions$gompShape = apply(conditions, 1, function(row) 0)
conditions$gompRate = apply(conditions, 1, function(row) 0)
conditions$gompLogLik = apply(conditions, 1, function(row) 0)
conditions$gompAIC = apply(conditions, 1, function(row) 0)
conditions$weibShape = apply(conditions, 1, function(row) 0)
conditions$weibScale = apply(conditions, 1, function(row) 0)
conditions$weibLogLik = apply(conditions, 1, function(row) 0)
conditions$weibAIC = apply(conditions, 1, function(row) 0)

#############################################################################################################
#############################################################################################################

#########################################Loop through the conditions to get lifespan data
# r =1

for (r in 1:length(conditions$genotype)) {
genotypeTemp = conditions$genotype[r]
mediaTemp = conditions$media[r]
temperatureTemp = conditions$temp[r]
matTemp = conditions$mat[r]

conditionName = apply(conditions[r,1:4], 1, paste, collapse=" ") #### create a string to name a possible output file
conditionName = gsub("[[:punct:]]", "", conditionName) #### remove special characters from the name (ie. quotations marks, slashes, etc.)

genotypeTemp = gsub("'", "''", genotypeTemp)
genotypeTemp = gsub('"', '""', genotypeTemp)

##### Query the database to take data (including lifespan data) from every mutually exclusive row (pooled by file, not genotype, not background, etc).
##### There will often be multiple rows (representing different experiments) for each unique condition. This analysis pools lifespan data from multiple experiments if the conditions are all the same
queryStatementSet = paste(
"SELECT * ",
"FROM result ",
"WHERE pooled_by = 'file' AND set_genotype = '", genotypeTemp, "' AND set_mating_type = '", matTemp, "' AND set_media = '", mediaTemp, "' AND set_temperature = '", temperatureTemp, "'",
sep = ""
)

queryStatementRef = paste( ### both the reference and set columns will be searched for matching conditions
"SELECT * ",
"FROM result ",
"WHERE pooled_by = 'file' AND ref_genotype = '", genotypeTemp, "' AND ref_mating_type = '", matTemp, "' AND ref_media = '", mediaTemp, "' AND ref_temperature = '", temperatureTemp, "'",
sep = ""
)

dataListSet = dbGetQuery(con, queryStatementSet)
dataListRef = dbGetQuery(con, queryStatementRef)
lifespansChar = unique(c(dataListSet$set_lifespans, dataListRef$ref_lifespans)) ### combine lifespan values for a given condition into a single data structure. (problems of having non-mutually exclusive rows in the ref_lifespans column are overcome by only taking unique groups of lifespans. The assumption is that no two experiments produced identical lifespan curvs)

##### Database codes the lifespan data for each experiment as a string. So, lifespanChar is a vector of strings
##### Convert lifespanChar into a single vector of integers lifespansTemp
lifespansTemp = c()
if (length(lifespansChar) > 0) {
for (s in 1:length(lifespansChar)) {
lifespansNum = as.numeric(unlist(strsplit(lifespansChar[s], ",")))
lifespansNum = lifespansNum[!is.na(lifespansNum)]
if (length(lifespansNum) > 0) {
lifespansTemp = c(lifespansTemp, lifespansNum)
}
}
}

if (length(lifespansTemp) > 100) {
require(stringr)
conditions$media[r] = str_replace( conditions$media[r], "\\/", "")
conditions$genotype[r] = str_replace( conditions$genotype[r], "\\/", "-")
conditions$n[r] = length(lifespansTemp) #### record number of individuals
filename = paste( 'rls_csv/',conditions$genotype[r], '_', conditions$mat[r],'_temp', conditions$temp[r], '_n',conditions$n[r],
'_media_', conditions$media[r], '.csv',sep='')
out = data.frame(lifespansTemp)
names(out) = c("rls")
write.csv(out, filename, quote=F, row.names=F)
}

}#outer loop

Open Notebook

Wednesday, July 9, 2014

extract rls in csv format from rls.db

No comments:

Post a Comment