Thursday, October 8, 2015

bio233, 20151008Thu Ecoli genome investigation in R

2:30-3:20pm. short class. Using R code to analyze E coli K12 genome.
Ask students to work in pairs. Several students said, "We like this exercise."

Youtube:  https://youtu.be/UeUVSxnK8qo

R Code:
source("http://bioconductor.org/biocLite.R")
biocLite('seqinr')
require(seqinr)

# Change working direcotry to the current folder
list.files()

seqs = read.fasta(file="EcoliK12.fna")
# seqs = seqs[[1:3]]
str(seqs);

# look at the first sequence
seq1 = seqs[[789]]
getName(seqs[[789]])
table( seq1 ); #nucleotide composition
GC(seq1);  # GC content
  
# now, try to find name and GC content for 865th gene.

# a loop for all sequences 
num = 1:length(seqs); #these are storages for later use
gc  = 1:length(seqs); # gc = num; 
out = data.frame( cbind( num, gc ) );

for( i in 1:length(seqs) ) {
  out$gene_name = getName( seqs[[i]] )
  out$gc[i] = GC( seqs[[i]] );
}
head(out)

write.csv(out, "gc.csv", row.names=F) # output the results

#### now, calculate the length of each gene
num = 1:length(seqs);
out2 = data.frame( cbind( num) );
for( i in 1:length(seqs) ) {
  out2$name[i] = getName( seqs[[i]] )
  out2$len[i] = length( seqs[[i]] );  
  out2$gc[i] = GC( seqs[[i]] );
}
out2;

write.csv(out2, "gc2.csv", row.names=F) # output the results

prot = translate( seqs[[999 ]])
length(prot)
prot
table(prot)

# now, find the protein length  for 1005th gene

#### now, translate the ORF into proteins
num = 1:length(seqs);
out3 = data.frame( cbind( num) );
for( i in 1:length(seqs) ) {
  out3$name[i] = getName( seqs[[i]] )
  out3$ORFLen[i] = length( seqs[[i]] );  
  prot = translate( seqs[[i ]])
  length(prot)
  out3$ProtLen[i] = length( prot)  
}
out3;
write.csv( out3, "gc-prot.csv")

No comments:

Post a Comment