#E. coli rep all rm(list=ls()) library(ggplot2) library(grid) #calculate FDR getFDRDataFrame = function(df){ cntDecoy = 1.0 cntTarget = 0.0 numberAccepted = 0 fdr = data.frame(matrix(ncol=2,nrow=0)) colnames(fdr) = c("fdr","accepted") if(df[1,]$target=="target"){ cntTarget = cntTarget + 1.0 numberAccepted = numberAccepted + 1 } else { cntDecoy = cntDecoy + 1.0 } fdr = rbind(fdr,c(cntDecoy/cntTarget,numberAccepted)) for(i in 2:nrow(df)) { if(df[i,]$target=="target"){ cntTarget = cntTarget + 1.0 numberAccepted = numberAccepted + 1 } else { cntDecoy = cntDecoy + 1.0 } fdr = rbind(fdr,c(cntDecoy/cntTarget,numberAccepted)) } names(fdr) = c("fdr","accepted") for(i in (nrow(fdr)-1):1) { if (fdr[i+1,1] < fdr[i,1]){ fdr[i,1] = fdr[i+1,1] } } return(fdr) } runMSGF = function(msgfFile) { msgfFile1 = msgfFile[sample(nrow(msgfFile)),] #randomize order to better deal with ties msgfFile2 = msgfFile1[order(msgfFile1$SpecEValue),] msgfFile3 = msgfFile2[1:8000,] msgfFile4 = subset(msgfFile3,select=c(scan,SpecEValue,MSGFPeptide,MSGFTargetOrDecoy)) msgfFile4$target = msgfFile4$MSGFTargetOrDecoy fdr = getFDRDataFrame(msgfFile4) msgfFinalFile = cbind(msgfFile4,fdr) msgfFinalFile1 = msgfFinalFile[msgfFinalFile$target=="target",] return(msgfFinalFile1) } runCombined = function(tideFile) { tideFile1 = tideFile[sample(nrow(tideFile)),] #randomize order to better deal with ties tideFile2 = tideFile1[order(tideFile1$combined.p.value),] tideFile3 = tideFile2[1:7000,] tideFile4 = subset(tideFile3,select=c(scan,combined.p.value,combinedPeptide, combinedTargetOrDecoy)) tideFile4$target = tideFile4$combinedTargetOrDecoy fdr = getFDRDataFrame(tideFile4) combinedFinalFile = cbind(tideFile4,fdr) combinedFinalFile2 = combinedFinalFile[combinedFinalFile$target=="target",] return(combinedFinalFile2) } runResEv = function(tideFile) { tideFile1 = tideFile[sample(nrow(tideFile)),] #randomize order to better deal with ties tideFile2 = tideFile1[order(tideFile1$res.ev.p.value),] tideFile3 = tideFile2[1:7000,] tideFile4 = subset(tideFile3,select=c(scan,res.ev.p.value,resEvPeptide, resEvTargetOrDecoy)) tideFile4$target = tideFile4$resEvTargetOrDecoy fdr = getFDRDataFrame(tideFile4) combinedFinalFile = cbind(tideFile4,fdr) combinedFinalFile2 = combinedFinalFile[combinedFinalFile$target=="target",] return(combinedFinalFile2) } runResEvRaw = function(tideFile) { tideFile1 = tideFile[sample(nrow(tideFile)),] #randomize order to better deal with ties tideFile2 = tideFile1[order(tideFile1$res.ev.score,decreasing=T),] tideFile3 = tideFile2[1:7000,] tideFile4 = subset(tideFile3,select=c(scan,res.ev.score,resEvRawPeptide, resEvRawTargetOrDecoy)) tideFile4$target = tideFile4$resEvRawTargetOrDecoy fdr = getFDRDataFrame(tideFile4) combinedFinalFile = cbind(tideFile4,fdr) combinedFinalFile2 = combinedFinalFile[combinedFinalFile$target=="target",] return(combinedFinalFile2) } runXcorr = function(tideFile) { tideFile1 = tideFile[sample(nrow(tideFile)),] #randomize order to better deal with ties tideFile2 = tideFile1[order(tideFile1$exact.p.value),] tideFile3 = tideFile2[1:7000,] tideFile4 = subset(tideFile3,select=c(scan,exact.p.value,xcorrPeptide, xcorrTargetOrDecoy)) tideFile4$target = tideFile4$xcorrTargetOrDecoy fdr = getFDRDataFrame(tideFile4) combinedFinalFile = cbind(tideFile4,fdr) combinedFinalFile2 = combinedFinalFile[combinedFinalFile$target=="target",] return(combinedFinalFile2) } runAmanda = function(amandaFile) { amandaFile1 = amandaFile[sample(nrow(amandaFile)),] #randomize order to better deal with ties amandaFile2 = amandaFile1[order(amandaFile1$Weighted.Probability),] amandaFile3 = amandaFile2[1:8000,] amandaFile4 = subset(amandaFile3,select=c(scan,Weighted.Probability,amandaPeptide, amandaTargetOrDecoy)) amandaFile4$target = amandaFile4$amandaTargetOrDecoy fdr = getFDRDataFrame(amandaFile4) amandaFinal = cbind(amandaFile4,fdr) amandaFinal1 = amandaFinal[amandaFinal$target=="target",] return(amandaFinal1) } runMorpheus = function(morpheusFile) { morpheusFile1 = morpheusFile[sample(nrow(morpheusFile)),] #randomize order to better deal with ties morpheusFile2 = morpheusFile1[order(morpheusFile1$Morpheus.Score,decreasing=T),] morpheusFile3 = morpheusFile2[1:7000,] morpheusFile4 = subset(morpheusFile3,select=c(scan,Morpheus.Score,morpheusPeptide, morpheusTargetOrDecoy)) morpheusFile4$target = morpheusFile4$morpheusTargetOrDecoy fdr = getFDRDataFrame(morpheusFile4) morpheusFinal = cbind(morpheusFile4,fdr) morpheusFinal1 = morpheusFinal[morpheusFinal$target=="target",] return(morpheusFinal1) } #set working directory setwd("~/projects/winterRotation/results/paperResults4/ecoli/") #read input file inputFile = read.csv(file='finalFile.txt',sep='\t',header=T,stringsAsFactors=F) #calculate FDR for each score function msgfFile = runMSGF(inputFile) combinedFile = runCombined(inputFile) resEvFile = runResEv(inputFile) xcorrFile = runXcorr(inputFile) amandaFile = runAmanda(inputFile) morpheusFile = runMorpheus(inputFile) # Number of PSMs detected at a 1% FDR nrow(xcorrFile[xcorrFile$fdr<.01,]) nrow(resEvFile[resEvFile$fdr<.01,]) nrow(combinedFile[combinedFile$fdr<.01,]) nrow(msgfFile[msgfFile$fdr<.01,]) nrow(amandaFile[amandaFile$fdr<.01,]) nrow(morpheusFile[morpheusFile$fdr<.01,]) # Calculate the target match percentage for each search engine inputFile1 = na.omit(inputFile) tar = nrow(inputFile1[inputFile1$combinedTargetOrDecoy=='target',]) dec = nrow(inputFile1[inputFile1$combinedTargetOrDecoy=='decoy',]) tar/(tar+dec) tar = nrow(inputFile1[inputFile1$resEvTargetOrDecoy=='target',]) dec = nrow(inputFile1[inputFile1$resEvTargetOrDecoy=='decoy',]) tar/(tar+dec) tar = nrow(inputFile1[inputFile1$xcorrTargetOrDecoy=='target',]) dec = nrow(inputFile1[inputFile1$xcorrTargetOrDecoy=='decoy',]) tar/(tar+dec) tar = nrow(inputFile1[inputFile1$MSGFTargetOrDecoy=='target',]) dec = nrow(inputFile1[inputFile1$MSGFTargetOrDecoy=='decoy',]) tar/(tar+dec) tar = nrow(inputFile1[inputFile1$morpheusTargetOrDecoy=='target',]) dec = nrow(inputFile1[inputFile1$morpheusTargetOrDecoy=='decoy',]) tar/(tar+dec) tar = nrow(inputFile1[inputFile1$amandaTargetOrDecoy=='target',]) dec = nrow(inputFile1[inputFile1$amandaTargetOrDecoy=='decoy',]) tar/(tar+dec) #scatter plot of res-ev p-value against XCprr p-values pdf("~/projects/winterRotation/2015exact-pvalue-hires/doc/paper/scatterPlots/ecoli_scatterPlot.pdf", width=3,height=3) ggplot(inputFile,aes(x=log(res.ev.p.value),y=log(exact.p.value))) + stat_binhex(bin=45) + labs(x="log res-ev p-value",y="log XCorr p-value") + theme_bw() + theme(legend.position = c(.8,.20), axis.title=element_text(size=15), axis.text=element_text(size=15), legend.key.height=unit(.30,'cm')) + geom_abline(slope=1,col='red') dev.off() #comparision between combined p-value and other search engines pdf("~/projects/winterRotation/2015exact-pvalue-hires/doc/paper/otherScoreFxnRankCurve/ecoliAllRankCurve.pdf", width=3,height=3) par(mar=c(4,4,0.5,0.5)) plot(x=combinedFile$fdr,y=combinedFile$accepted,xlim=c(0,.1),ylim=c(0000,6500), xlab="",ylab="",col="black", type='l',lwd=2,cex.axis=.75,cex.lab=1.25) title(xlab="false discovery rate",line=2.5,cex.lab=1.25) title(ylab="# accepted PSMs",line=2.5,cex.lab=1.25) axis(side=1,at=c(0,0.02,0.04,0.06,0.08,.10),cex.axis=.75) axis(side=1,at=c(0,.01,.03,.05,.07,.09),labels=F) lines(x=msgfFile$fdr,y=msgfFile$accepted,col="blue",lwd=2) lines(x=morpheusFile$fdr,y=morpheusFile$accepted,col="green",lwd=2) lines(x=amandaFile$fdr,y=amandaFile$accepted,col="orange",lwd=2) grid() legend(x=.01,y=2300,legend=c("MSGF+","combined p-value", "Morpheus",'MS Amanda'), lty=c(1,1),lwd=c(2,2),col=c("blue","black",'green','orange'),bty='n',cex=.8,y.intersp=1) dev.off() #comparision between combined p-value and #res-ev p-value xcorr p-value pdf("~/projects/winterRotation/2015exact-pvalue-hires/doc/paper/rankingCurves/ecoliCombinedRankCurve.pdf", width=3,height=3) par(mar=c(4,4,0.5,0.5)) plot(x=combinedFile$fdr,y=combinedFile$accepted,xlim=c(0,.1),ylim=c(0000,6500), xlab="",ylab="",col="black", type='l',lwd=2,cex.axis=1,cex.lab=1.25) title(xlab="false discovery rate",line=2.5,cex.lab=1.25) title(ylab="# accepted PSMs",line=2.5,cex.lab=1.25) axis(side=1,at=c(0,0.02,0.04,0.06,0.08,.10),cex.axis=1) axis(side=1,at=c(0,.01,.03,.05,.07,.09),labels=F) lines(x=resEvFile$fdr,y=resEvFile$accepted,col="red",lwd=2) #lines(x=resEvRaw1$fdr,y=resEvRaw1$accepted,col="purple",lwd=4) lines(x=xcorrFile$fdr,y=xcorrFile$accepted,col="gray",lwd=2) grid() legend(x=.01,y=2000,legend=c("combined p-value","res-ev p-value","XCorr p-value"), lty=c(1,1),lwd=c(2,2),col=c("black","red","gray"),bty='n',cex=.8,y.intersp=1) dev.off()