#### example scripts for BIOS555 lab4, updated 9/22/2020 ############################################## ## using Rsubread for sequence alignment ############################################## library(Rsubread) ## build reference buildindex(basename="phage",reference="phage-ref.fa") ## alignment align.stat <- align(index="phage",readfile1="phage-reads.fa", output_file="phage-reads.bam",type='dna') align.stat # check the mapping results ############################################## ## Joint analysis for RNA- and ChIP-seq ############################################## library(GenomicRanges) library(GenomicFeatures) library(Rsamtools) ## first get gene information from UCSC. This will take 2 minutes or so. ## Or you can download knowngene.hg18.sqlite from class site and do ## gene.hg18=loadDb("knowngene.hg18.sqlite") gene.hg18 = makeTxDbFromUCSC(genom="hg18",tablename="knownGene") allgenes = genes(gene.hg18) allgenes.chr22 = allgenes[seqnames(allgenes)=="chr22"] ## genes on chr 22 ## Read in RNA-seq bam file what = c("rname", "strand", "pos", "qwidth") TSS.counts = NULL param = ScanBamParam(what = what) bam = scanBam("K562_RNAseq_chr22.bam", param=param)[[1]] IRange.reads = GRanges(seqnames=Rle(bam$rname), ranges=IRanges(bam$pos, width=bam$qwidth)) ## get RNA-seq counts within gene bodies - ## should obtain exon counts and sum up. I'm approximating the process here. counts.RNAseq = countOverlaps(allgenes.chr22, IRange.reads) ## get Cmyc binding around TSS ## first get region around TSS (+/- 1000 bp). This is a little tricky, be careful in strands directions. strands = strand(allgenes.chr22) ix.gene.minus = which(strands=="-") TSS = start(allgenes.chr22) TSS[ix.gene.minus] = end(allgenes.chr22)[ix.gene.minus] ext = 1000 TSS.GRanges = GRanges(seqnames=Rle("chr22", length(allgenes.chr22)), ranges=IRanges(start=TSS-ext, end=TSS+ext)) bam = scanBam("K562Cmyc_chr22.bam", param=param)[[1]] IRange.reads = GRanges(seqnames=Rle(bam$rname), ranges=IRanges(bam$pos, width=bam$qwidth)) counts.Cmyc = countOverlaps(TSS.GRanges, IRange.reads) ## see if there's any correlation of gene expression and Cmyc binding. cor(log(counts.Cmyc+1), log(counts.RNAseq+1)) ## pretty good correlation in log scale plot(counts.Cmyc, counts.RNAseq, log="xy", xlab="Cmyc binding", ylab="Gene expression") ## Another practice: get window counts for Cmyc binding: counts in 50 bp windows around TSS ## This can show the peak shapes at binding sites ## take TSS +/- 1000 bp and cut into 50bp windows ext = 1000 winsize = 50 allss = sapply(TSS, function(x) seq(x-ext, x+ext-1, by=winsize)) ## create GRanges allchrs = as.character(seqnames(allgenes.chr22)) allseq = Rle(allchrs,rep(ext/winsize*2,length(allchrs))) allrange = IRanges(start=allss, end=allss+winsize-1) TSS.GRanges = GRanges(seqnames=allseq, ranges=allrange) tmp = countOverlaps(TSS.GRanges, IRange.reads) counts.Cmyc.win = matrix(tmp, ncol=40, byrow=TRUE) plot(colMeans(counts.Cmyc.win), type="b", xaxt="n", xlab="position", ylab="Average Cmyc binding") axis(1, c(1, 10,20, 30,40), labels=c("-500", "-200", "TSS", "250", "500")) #################################################################### ## DE test for RNA-seq data #################################################################### library(DESeq2) library(edgeR) load("X.rda") nreps=ncol(X)/2 ## DESeq2 library(DESeq2) condition <- c(rep(0,nreps),rep(1,nreps)) dds <- DESeqDataSetFromMatrix(X, DataFrame(condition), ~ as.factor(condition) ) dds <- DESeq(dds) res <- results(dds) pval.DEseq <- res$pvalue ## edgeR library(edgeR) d=DGEList(counts=X, group=condition) d <- calcNormFactors(d) d=estimateCommonDisp(d) d=estimateTagwiseDisp(d) fit.edgeR <- exactTest(d) pval.edgeR <- fit.edgeR$table$PValue ## DSS library(DSS) seqData=newSeqCountSet(X, condition) seqData=estNormFactors(seqData) seqData=estDispersion(seqData) result=waldTest(seqData, 0, 1, equal.var=FALSE) pval.DSS=result$pval pval.DSS[result$geneIndex]=result$pval ## compare pvalues plot(pval.DEseq, pval.edgeR, pch=".", xlab="DEseq", ylab="edgeR") plot(pval.DEseq, pval.DSS, pch=".", xlab="DEseq", ylab="DSS") plot(pval.edgeR, pval.DSS, pch=".", xlab="edgeR", ylab="DSS") ## look at ROC curve - almost identical library(ROCR) ROC.DE <- function(DE.gs, pval) { pred = prediction(1-pval, DE.gs) perf = performance(pred,"tpr","fpr") perf } roc.DEseq=ROC.DE(flag, pval.DEseq) roc.edgeR=ROC.DE(flag, pval.edgeR) roc.DSS=ROC.DE(flag, pval.DSS) xlim=c(0,1) plot(roc.DEseq, xlim=xlim) plot(roc.edgeR, add=TRUE, col="red", lty=1) plot(roc.DSS, add=TRUE, col="blue", lty=1) legend("bottomright", legend=c("DEseq2", "edgeR", "DSS"),col=c("black","red", "blue"), lty=1)