Monday, November 12, 2012

botseq_hong_variant_calling.mk

# Run freebayes on bam files
#
.DELETE_ON_ERROR:

ROOT_DIR?=/ndata/botlabseq/seqshare
include ${ROOT_DIR}/src/config.mk
include ${ROOT_DIR}/src/hong_analysis/botseq_hong_mapping.mk

REFERENCE_INDEX?=${ROOT_DIR}/reference_genomes/saccharomyces_cerevisiae_s288c_saccer3/saccharomyces_cerevisiae_s288c_saccer3.fasta

HONG_VCF_DIR=${ROOT_DIR}/hong_analysis/vcf

HONG_VCF_FILES=$(addprefix ${HONG_VCF_DIR}/, $(patsubst %.bam,%.vcf,$(notdir ${HONG_SORTEDBAM_FILES})))

HONG_VCF_GZ_FILES=$(addsuffix .gz, ${HONG_VCF_FILES})
HONG_VCF_INDEX_FILES=$(addsuffix .tbi, ${HONG_VCF_GZ_FILES})

.DEFAULT_GOAL:=hongvcall
hongvcall: ${HONG_VCF_INDEX_FILES}

${HONG_VCF_DIR}:
        mkdir ${HONG_VCF_DIR}

.SECONDARY: ${HONG_VCF_FILES}
${HONG_VCF_FILES}: ${HONG_VCF_DIR}/%.vcf: ${HONG_MAPPED_BAM_DIR}/%.bam | ${HONG_VCF_DIR}
        ${FREEBAYES} --fasta ${REFERENCE_INDEX} --pooled $< > $@

${HONG_VCF_GZ_FILES}: ${HONG_VCF_DIR}/%.gz: ${HONG_VCF_DIR}/% 
        ${BGZIP} $<

${HONG_VCF_INDEX_FILES}: ${HONG_VCF_DIR}/%.tbi: ${HONG_VCF_DIR}/% 
        ${TABIX} $<
hongvctest:
        @echo ${HONG_VCF_INDEX_FILES}

Friday, November 9, 2012

Parse genetic variants from NGS data, sequenced qin-lab natural isolates


These shell script were written by Lance at the Princeton genome center.


batch code are in src/
mkdir -p /Volumes/BotlabSeq/hong_analysis/mapped_bam 
bwa aln -t 2 /Volumes/BotlabSeq/reference_genomes/saccharomyces_cerevisiae_s288c_saccer3/saccharomyces_cerevisiae_s288c_saccer3.fasta /Volumes/BotlabSeq/fastq/3-001_M1-2_lib1_read-1.fastq.gz > /Volumes/BotlabSeq/hong_analysis/mapped_bam/3-001_M1-2_lib1_read-1_aln_sa.sai
... ... 


#!/bin/bash

DIRECTORY=$(cd `dirname $0` && pwd)

ROOT_DIR=$(cd "${DIRECTORY}/../../../" && pwd)
STRAINS_FILE=${ROOT_DIR}/Sequencing_Consortium_Strains-MASTER.txt
FASTQ_DIR=${ROOT_DIR}/fastq
ORIGINAL_BAM_DIR=${ROOT_DIR}/original_bam

function process_batch {
    HONG_FASTQ_FILES=""
    ORIGINAL_BAM_FILES=""
    for strain in "${strain_batch[@]}"; do
        HONG_FASTQ_FILES+=`find "${FASTQ_DIR}" -name "$strain*.fastq.gz" | xargs echo`
        HONG_FASTQ_FILES+=" "
        HONG_ORIGINAL_BAM_FILES+=`find "${ORIGINAL_BAM_DIR}" -name "$strain*.bam" | xargs echo`
        HONG_ORIGINAL_BAM_FILES+=" "
    done
    export HONG_FASTQ_FILES
    echo $HONG_FASTQ_FILES
    export HONG_ORIGINAL_BAM_FILES
    echo $HONG_ORIGINAL_BAM_FILES
    echo "make -n -f ${DIRECTORY}/botseq_hong_mapping.mk -j 5"
    echo ""
}

strain_list=()
while IFS=$'\t' read -r -a fields
do
    genus_species=${fields[1]}
    individual_name=${fields[4]}
    number=${fields[11]}
    if [[ $number == 3-* ]]; then
        echo "$number $individual_name $genus_species - SELECTED"
        strain_list+=("$number")
    fi
done < $STRAINS_FILE

strain_batch=()
i=0
for strain in "${strain_list[@]}"; do
    strain_batch+=("$strain")
    i=$i+1
    if [[ $i -eq 10 ]]; then
        echo ${strain_batch[@]}
        process_batch
        strain_batch=()
        i=0
    fi
done
echo ${strain_batch[@]}
process_batch

exit

#${DIRECTORY}/botseq_lte_merge_bam.py > ${DIRECTORY}/botseq_combine_bam.mk
#make -f ${DIRECTORY}/botseq_combine_bam.mk -j 4




The mapped variant are in "vcf" files.

References
http://www.broadinstitute.org/igv/book/export/html/16
  File formats

http://vcftools.sourceforge.net/perl_module.html
  VCF perl tools

http://sourceforge.net/projects/samtools/files/tabix/


Monday, November 5, 2012