Showing posts with label seqinr. Show all posts
Showing posts with label seqinr. Show all posts

Sunday, February 1, 2015

bio125 R: sequence exercise, DNA words, RE sites

Files nucleotid_word_RE_demo.R

# Exercise to study how occurence of DNA words are influenced by their length.
# What are the occurence of 1-letter, 2-letter, 3-letter, ... 8-letter DNA words? 
# Learning outcome: longer words should have less occurrence in DNA
# by Hong Qin, Jan 30, 2015, for Bio125 @ Spelman College

library("seqinr");

#reverse complementary DNA in R
c2s( rev(comp(s2c("atg")) ))
c2s( rev(comp(s2c("atcg")) ))
c2s( rev(comp(s2c("gaattc")) ))

# Try on your own
# read in some bacterial 16s rDNA sequences
# seqs = read.fasta( "http://www.bioinformatics.org/ctls/download/data/16srDNA.fasta",seqtype="DNA");

# Use plasmid pMSH2 sequence
seqs = read.fasta( "pMSH2.seq")

# look at the first sequence
seq1 = seqs[[1]]
seq1    #sequence in array
seq1RC = rev(comp(seq1))  #reverse complement
  
c2s(seq1)  #print out sequence in a long string
count(seq1, 1) #nucleotide composition
count(seq1RC, 1) #nucleotide composition
mean( count(seq1, 1) )

count(seq1, 2) # occurence of two-letter DNA words
count(seq1RC, 2)
mean( count(seq1, 2) )


count(seq1, 3) # occurence of 3-letter DNA words
mean( count(seq1, 3) )
results = count(seq1, 3)
results[c('att','aat')]

results = count(seq1RC, 3)
results[c('att','aat')]

# ?? # occurence 4-letter words?

results = count(seq1, 4)
results['agct']

results = count(seq1RC, 4)
results['agct']



# ?  # occurence of 5-letter DNA words

# ? # occurence of 6-letter DNA words
# results = count(seq1, ?? )

# EcoRI site is GAATTC
# How many GAATCC in pMSH2? 

results = count(seq1, 6)
results['gaattc']

results = count(seq1RC, 6)
results['gaattc']


count(seq1, 8) # occurence of 8-letter DNA words
mean( count(seq1, 8) )
median( count(seq1, 8) )
max( count(seq1, 8) )
hist(count(seq1, 8), br=30)

results = count(seq1, 8)
#NotI site is GCGCCGC
results['gcggccgc']

results[results==1]










Monday, September 29, 2014

reverse complementary sequences in R seqinr

##
## Show that comp() does *not* return the reverve complementary strand:
##
c2s(comp(s2c("aaaattttggggcccc")))
##
## Show how to get the reverse complementary strand:
##
c2s(rev(comp(s2c("aaaattttggggcccc"))))
##
## Show what happens with non allowed values:
##
c2s(rev(comp(s2c("aaaaXttttYggggZcccc"))))
##
## Show what happens with ambiguous bases:
##
allbases <- s2c("abcdghkmstvwn")
comp(allbases) # NA are produced

comp(allbases, ambiguous = TRUE) # No more NA


my code: _get_double_strand.R

setwd("~/github/ctls/sequences")
install.packages("seqinr")
library(seqinr);
list.files()

seqs = read.fasta("panda16srDNA.fasta")
str(seqs);

seqs[[1]]
length(seqs[[1]])
c2s( rev( comp(seqs[[1]]) ) )

c2s(seqs[[1]][1:120])
c2s(comp(seqs[[1]][1:120]))