Tuesday, January 12, 2016

GEO, Ovarian cancer, Cisplatin

Analysis of ovarian cancer under treatment of cisplatin GEO Data: http://www.ncbi.nlm.nih.gov/geo/geo2r/?acc=GSE47856 Use limma package for differnetial expression analysis.
################################################################
#   Differential expression analysis with limma
library(Biobase)
library(GEOquery)
library(limma)
Load series and platform data from GEO. After the first run, save the image for working offline.
# gset <- getGEO("GSE47856", GSEMatrix =TRUE)
# save.image("GSE47856.RData")
# rm(list=ls())
Load the image.
Study the structure of gset
# study the meta data
experimental_design = gset@phenoData@data
# gset@phenoData@varMetadata
# experimental_design[, "source_name_ch1"][1:10]
experimental_design[1:10,  c("characteristics_ch1", "characteristics_ch1.2")]
           characteristics_ch1 characteristics_ch1.2
GSM1160723    cell line: HeyA8       treatment: none
GSM1160724    cell line: HeyA8       treatment: none
GSM1160725    cell line: HeyA8       treatment: none
GSM1160726    cell line: HeyA8       treatment: none
GSM1160727    cell line: HeyA8       treatment: none
GSM1160728    cell line: HeyA8  treatment: Cisplatin
GSM1160729    cell line: HeyA8  treatment: Cisplatin
GSM1160730    cell line: HeyC2       treatment: none
GSM1160731    cell line: HeyC2       treatment: none
GSM1160732    cell line: HeyC2       treatment: none
unique( experimental_design$characteristics_ch1 )
 [1] cell line: HeyA8     cell line: HeyC2     cell line: A2780    
 [4] cell line: OVCA420   cell line: OVCA429   cell line: PA-1     
 [7] cell line: TYK-nu    cell line: CH1       cell line: OV90     
[10] cell line: FU-OV-1   cell line: A2008     cell line: DOV13    
[13] cell line: OVCA433   cell line: OVCAR-10  cell line: DOV13B   
[16] cell line: C13       cell line: IGROV-1   cell line: OVCAR-8  
[19] cell line: M41       cell line: A2780cisR cell line: ovary1847
[22] cell line: Caov-2    cell line: Caov-3    cell line: Hey      
[25] cell line: JHOS-2    cell line: JHOS-3    cell line: OAW28    
[28] cell line: OAW42     cell line: DOV13A    cell line: OV56     
[31] cell line: OVCAR-3   cell line: OVCA432   cell line: OVCAR-2  
[34] cell line: OVCAR-5   cell line: OVK-18    cell line: PEO1     
[37] cell line: RMG-I     cell line: RMG-II    cell line: SKOV-4   
[40] cell line: SKOV-3    cell line: SKOV-6    cell line: SKOV-8   
[43] cell line: TAYA      cell line: TOV-112D  cell line: TOV-21G  
[46] cell line: UWB1.289 
46 Levels: cell line: A2008 cell line: A2780 ... cell line: UWB1.289
Pick one cell line for test.
sel = grep("HeyC2", experimental_design$characteristics_ch1)
gset = gset[ ,sel]
Do a histogram of the expression data
ex <- exprs(gset)
hist(ex[,1])

hist(log2(ex[,1]))

boxplot(ex)

Log2 transform. Not sure about LogC
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
exprs(gset) <- log2(ex) }
Are the microarray data normalized? The sum of each array are similar.
head(ex)
        GSM1160730 GSM1160731 GSM1160732 GSM1160733 GSM1160734 GSM1160735
7892501    7.34145    5.99586    6.36227    7.71416    6.96127    7.66464
7892502    4.18187    3.83990    3.27792    4.86781    3.55629    4.10709
7892503    4.13694    3.35185    3.05373    3.59022    3.33752    3.33640
7892504    9.62438    9.01348    8.63508    9.06869    9.11466    9.04583
7892505    2.30237    2.23863    2.68872    4.01024    2.68134    2.19076
7892506    4.21606    2.84229    3.93290    3.51904    3.54838    3.37970
sums = apply(ex,2, sum)
sums / max(sums)
GSM1160730 GSM1160731 GSM1160732 GSM1160733 GSM1160734 GSM1160735 
 0.9921900  0.9946481  0.9945619  1.0000000  0.9930194  0.9935355 
medians = apply(ex, 2, median)
medians
GSM1160730 GSM1160731 GSM1160732 GSM1160733 GSM1160734 GSM1160735 
   5.76975    5.85706    5.81737    5.78378    5.85064    5.76896 
means = apply(ex, 2, mean)
means
GSM1160730 GSM1160731 GSM1160732 GSM1160733 GSM1160734 GSM1160735 
  6.091404   6.106495   6.105966   6.139352   6.096496   6.099664 
Set up the data and proceed with analysis. Modified by GEO sample script.
gset@phenoData@data[,  c("characteristics_ch1", "characteristics_ch1.2")]
           characteristics_ch1 characteristics_ch1.2
GSM1160730    cell line: HeyC2       treatment: none
GSM1160731    cell line: HeyC2       treatment: none
GSM1160732    cell line: HeyC2       treatment: none
GSM1160733    cell line: HeyC2  treatment: Cisplatin
GSM1160734    cell line: HeyC2  treatment: Cisplatin
GSM1160735    cell line: HeyC2  treatment: Cisplatin
gset@phenoData@data$characteristics_ch1.2
                  V9                  V10                  V11 
     treatment: none      treatment: none      treatment: none 
                 V12                  V13                  V14 
treatment: Cisplatin treatment: Cisplatin treatment: Cisplatin 
Levels: treatment: Cisplatin treatment: none
treatments <- make.names(gset@phenoData@data$characteristics_ch1.2)
fl.names = unique(treatments)
fl = as.factor( LETTERS[1:length(fl.names)]) 
names(fl) = fl.names
gset$description <- fl
design <- model.matrix(~ description + 0, gset)
colnames(design) <- levels(fl)
fit <- lmFit(gset, design)
cont.matrix <- makeContrasts( A-B, levels=fl)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=50)

No comments:

Post a Comment