2:30-3:20pm. short class. Using R code to analyze E coli K12 genome.
Ask students to work in pairs. Several students said, "We like this exercise."
Youtube: https://youtu.be/UeUVSxnK8qo
R Code:
source("http://bioconductor.org/biocLite.R")
biocLite('seqinr')
require(seqinr)
# Change working direcotry to the current folder
list.files()
seqs = read.fasta(file="EcoliK12.fna")
# seqs = seqs[[1:3]]
str(seqs);
# look at the first sequence
seq1 = seqs[[789]]
getName(seqs[[789]])
table( seq1 ); #nucleotide composition
GC(seq1); # GC content
# now, try to find name and GC content for 865th gene.
# a loop for all sequences
num = 1:length(seqs); #these are storages for later use
gc = 1:length(seqs); # gc = num;
out = data.frame( cbind( num, gc ) );
for( i in 1:length(seqs) ) {
out$gene_name = getName( seqs[[i]] )
out$gc[i] = GC( seqs[[i]] );
}
head(out)
write.csv(out, "gc.csv", row.names=F) # output the results
#### now, calculate the length of each gene
num = 1:length(seqs);
out2 = data.frame( cbind( num) );
for( i in 1:length(seqs) ) {
out2$name[i] = getName( seqs[[i]] )
out2$len[i] = length( seqs[[i]] );
out2$gc[i] = GC( seqs[[i]] );
}
out2;
write.csv(out2, "gc2.csv", row.names=F) # output the results
prot = translate( seqs[[999 ]])
length(prot)
prot
table(prot)
# now, find the protein length for 1005th gene
#### now, translate the ORF into proteins
num = 1:length(seqs);
out3 = data.frame( cbind( num) );
for( i in 1:length(seqs) ) {
out3$name[i] = getName( seqs[[i]] )
out3$ORFLen[i] = length( seqs[[i]] );
prot = translate( seqs[[i ]])
length(prot)
out3$ProtLen[i] = length( prot)
}
out3;
write.csv( out3, "gc-prot.csv")
No comments:
Post a Comment