Analyzing Proteomics UPS1 Spike-in Experiments (Example Ramus 2016 Dataset)

Wolfgang Raffelsberger

2021-03-10

Introduction

This vignette shows how UPS1 spike-in experiments may be analyzed using the packages wrProteo, wrMisc and wrGraph, all these packages are available on CRAN.

Furthermore, the Bioconductor package limma will be used internally for it’s moderated statistical testing.

# This is R code, you can run this to redo all analysis presented here.
# If not already installed, you'll have to install wrMisc and wrProteo first.
install.packages("wrMisc")
install.packages("wrProteo")

# Installation of limma from Bioconductor
if(!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager")
BiocManager::install("limma")

# The package wrGraph is recommended for better graphics
install.packages("wrGraph")

# You cat start the vignettes for this package by typing :
browseVignettes("wrProteo")    #  ... and the select the html output

Now let’s load the packages we’ll use :

library(knitr)
library(wrMisc)
library(wrGraph)
library(wrProteo)

# Version number for wrProteo :
packageVersion("wrProteo")
#> [1] '1.4.0'

There you can find also a more general introductory vignette for this package.

Experimental Setup For Benchmark Tests

The main aim of the experimental setup using heterologous spike-in experiments is to provide a framework to test identification and quantitation procedures in proteomics.

By mixing known amounts of a collection of human proteins (UPS1) in various concentrations on top of a constant level yeast total protein extract, one expects to find only the spiked human UPS1 proteins varying between samples. In terms of ROC curves (see also ROC on Wikipedia) the spike-in proteins are expected to show up as true positives (TP). In contrast, all yeast proteins were added in the same quantity to same samples and should thus be observed as constant, ie as true negatives (TN) when looking for proteins changing abundance.

The Ramus Data-Set

The data used in this vignette were published with the article : Ramus et al 2016 “Benchmarking quantitative label-free LC-MS data processing workflows using a complex spiked proteomic standard dataset” in J Proteomics 2016 Jan 30;132:51-62.

This dataset is available on PRIDE as PXD001819 (and on ProteomeXchange).

Briefly, this experiment aims to evaluate and compare various quantification appoaches of the heterologous spike-in UPS1 in yeast protein extracts as constant matrix. 9 different concentrations of the heterologous spike-in (UPS1) were run in triplicates. The proteins were initially digested by Trypsin and then analyzed by LC-MS/MS in DDA mode.

Key Elements and Additional Functions

## A few elements and functions we'll need lateron
UPSconc <- c(50,125,250,500,2500,5000,12500,25000,50000)  

methNa <- c("ProteomeDiscoverer","MaxQuant","Proline")
names(methNa) <- c("PD","MQ","PL")

## The accession numbers for the UPS1 proteins
UPS1ac <- c("P00915", "P00918", "P01031", "P69905", "P68871", "P41159", "P02768", "P62988",
  "P04040", "P00167", "P01133", "P02144", "P15559", "P62937", "Q06830", "P63165",
  "P00709", "P06732", "P12081", "P61626", "Q15843", "P02753", "P16083", "P63279",
  "P01008", "P61769", "P55957", "O76070", "P08263", "P01344", "P01127", "P10599",
  "P99999", "P06396", "P09211", "P01112", "P01579", "P02787", "O00762", "P51965",
  "P08758", "P02741", "P05413", "P10145", "P02788", "P10636-8", "P00441", "P01375" ) 

replSpecType <- function(x, annCol="SpecType", replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")), silent=TRUE) {
  ## rename $annot[,"SpecType"] to more specific names
  chCol <- annCol[1] %in% colnames(x$annot)
  if(chCol) { chCol <- which(colnames(x$annot)==annCol[1])
    chIt <- replBy[,1] %in% unique(x$annot[,chCol])    # check items to replace if present
    if(any(chIt)) for(i in which(chIt)) {useLi <- which(x$annot[,chCol] %in% replBy[i,1]); cat("useLi",head(useLi),"\n"); x$annot[useLi,chCol] <- replBy[i,2]}
  } else if(!silent) message(" replSpecType: 'annCol' not found in x$annot !")
  x }

plotConcHist <- function(mat, ref, refColumn=3:4, matCluNa="cluNo", lev=NULL, ylab=NULL, tit=NULL) {
  ## plot histogram like counts of UPS1 concentrations
  if(is.null(tit)) tit <- "Frequency of UPS1 Concentrations Appearing in Cluster"
  gr <- unique(mat[,matCluNa])
  ref <- ref[,refColumn]
  if(length(lev) <2) lev <- sort(unique(as.numeric(as.matrix(ref))))
  if(length(ylab) !=1) ylab <- "Frequency"
  tbl <- table(factor( as.numeric(ref[which(rownames(ref) %in% rownames(mat)),]), levels=lev))
  graphics::barplot(tbl, las=1, beside=TRUE, main=paste(tit,gr), col=grDevices::gray(0.8), ylab=ylab)
}

plotMultRegrPar <- function(dat, methInd, tit=NULL, useColumn=c("logp","slope","medAbund","startFr"), lineGuide=list(v=c(-12,-10),h=c(0.7,0.75),col="grey"), xlim=NULL,ylim=NULL,subTit=NULL) {
  ## scatter plot logp (x) vs slope (y) for all UPS proteins, symbol by useColumn[4], color by hist of useColumn[3] 
  ## dat (array) UPS1 data
  ## useColumn (character) 1st as 'logp', 2nd as 'slope', 3rd as median abundance, 4th as starting best regression from this point 
  fxNa <- "plotMultRegrPar"
   #fxNa <- wrMisc::.composeCallName(callFrom,newNa="plotMultRegrPar")
  if(length(dim(dat)) !=3) stop("invalid input, expecting as 'dat' array with 3 dimensions (proteins,Softw,regrPar)")
  if(any(length(methInd) >1, methInd > dim(dat)[2], !is.numeric(methInd))) stop("invalid 'methInd'")
  chCol <- useColumn %in% dimnames(dat)[[3]]
  if(any(!chCol)) stop("argument 'useColumn' does not fit to 3rd dim dimnames of 'dat'")
  useCol <- colorAccording2(dat[,methInd,useColumn[3]], gradTy="rainbow", revCol=TRUE, nEndOmit=14)
  graphics::plot(dat[,methInd,useColumn[1:2]], main=tit, type="n",xlim=xlim,ylim=ylim)   #col=1, bg.col=useCol, pch=20+lmPDsum[,"startFr"],
  graphics::points(dat[,methInd,useColumn[1:2]], col=1, bg=useCol, pch=20+dat[,methInd,useColumn[4]],)
  graphics::legend("topright",paste("best starting from ",1:5), text.col=1, pch=21:25, col=1, pt.bg="white", cex=0.9, xjust=0.5, yjust=0.5)
  if(length(subTit)==1) graphics::mtext(subTit,cex=0.9)
  if(is.list(lineGuide) & length(lineGuide) >0) {if(length(lineGuide$v) >0) graphics::abline(v=lineGuide$v,lty=2,col=lineGuide$col) 
    if(length(lineGuide$h) >0) graphics::abline(h=lineGuide$h,lty=2,col=lineGuide$col)}
  hi1 <- graphics::hist(dat[,methInd,useColumn[3]], plot=FALSE)
  wrGraph::legendHist(sort(dat[,methInd,useColumn[3]]), colRamp=useCol[order(dat[,methInd,useColumn[3]])][cumsum(hi1$counts)], location="bottomleft", legTit="median raw abundance")  #
}

Protein Identification and Initial Quantification

Multiple algorithms and software implementations have been developed for quantitation label-free proteomics experiments, in particular for extracted ion chromatograms (XIC). Here, the use of the output for 3 such implementations is shown. They were run individually using equivalent settings, ie identifcation based on the same fasta-database, starting at a single peptide with 1% FDR, MS mass tolerance for ion precursors at 0.7 ppm, oxidation of methionins and N-terminal acetylation as fixed as well as carbamidomethylation of cysteins as variable modifications.

Since in this context it is crucial to recognize all UPS1 proteins as such, the import-functions make use of the specPref argument, allowing to define custom tags. Most additional arguments to the various import-functions have been kept common for conventient use and for generating output structured the same way. Indeed, simply separating proteins by their species origin is not sufficient since common contaminants like human keratin might get considered by error as UPS1.

MaxQuant

MaxQuant is free software provided by the Max-Planck-Institute, see also Tyanova et al 2016. Typically MaxQuant exports by default quantitation data on level of consensus-proteins to a folder called txt with a file called “proteinGroups.txt” . So in a standard case (when the file name has not been changed manually) it is sufficient to provide the path to this file. Of course, you can explicitely point to a specific file, as shown below. With the data presented here MaxQuant version 1.6.10 was run. Files compressed as .gz can be read, too (like in the example below).

path1 <- system.file("extdata", package="wrProteo")
fiNaMQ <- "proteinGroups.txt.gz"
specPrefMQ <- list(conta="CON_|LYSC_CHICK", mainSpecies="OS=Saccharomyces cerevisiae", spike=UPS1ac)

dataMQ <- readMaxQuantFile(path1, file=fiNaMQ, specPref=specPrefMQ, refLi="mainSpe")
#>  -> readMaxQuantFile : Note: Found 11 proteins marked as 'REV_' (reverse peptide identification) - Removing
#>  -> readMaxQuantFile : Display 2845(9.5%) initial '0' values as 'NA'
#>  -> readMaxQuantFile :  Note: 6 proteins with unknown species
#>     by species : Homo sapiens: 49,  Mus musculus: 1,  Saccharomyces cerevisiae: 1047,  Sus scrofa: 1,
#>  -> readMaxQuantFile : Found 75 composite accession-numbers, truncating (eg P00761;CON__P00761)
#>  -> readMaxQuantFile : Use column 'Accession' as identifyer (has fewest, ie 0 duplicated entries) as rownames
#>  -> readMaxQuantFile :  normalize using subset of 1047

The data were imported and median-normalized, the protein annotation was parsed to automatically extract IDs, protein-names and species information.

## the number of lines and colums
dim(dataMQ$quant)
#> [1] 1104   27
## a summary of the quantitation data
summary(dataMQ$quant[,1:8])                # the first 8 cols
#>   12500amol_1     12500amol_2     12500amol_3      125amol_1    
#>  Min.   :17.52   Min.   :15.64   Min.   :14.90   Min.   :15.19  
#>  1st Qu.:22.50   1st Qu.:22.49   1st Qu.:22.50   1st Qu.:22.39  
#>  Median :23.46   Median :23.46   Median :23.46   Median :23.43  
#>  Mean   :23.69   Mean   :23.65   Mean   :23.67   Mean   :23.60  
#>  3rd Qu.:24.82   3rd Qu.:24.76   3rd Qu.:24.78   3rd Qu.:24.82  
#>  Max.   :30.31   Max.   :30.29   Max.   :30.35   Max.   :30.25  
#>  NA's   :98      NA's   :100     NA's   :104     NA's   :114    
#>    125amol_2       125amol_3      25000amol_1     25000amol_2   
#>  Min.   :14.86   Min.   :14.91   Min.   :15.78   Min.   :18.42  
#>  1st Qu.:22.36   1st Qu.:22.40   1st Qu.:22.52   1st Qu.:22.54  
#>  Median :23.42   Median :23.44   Median :23.53   Median :23.53  
#>  Mean   :23.59   Mean   :23.62   Mean   :23.74   Mean   :23.73  
#>  3rd Qu.:24.81   3rd Qu.:24.79   3rd Qu.:24.95   3rd Qu.:24.89  
#>  Max.   :30.25   Max.   :30.30   Max.   :30.31   Max.   :30.13  
#>  NA's   :106     NA's   :114     NA's   :109     NA's   :115

Now we can summarize the presence of UPS1 proteins after treatment by MaxQuant : In sum 48 UPS1 proteins were found, 0 are missing. Later in this document data from MaxQuant will by frequently abbreviated as MQ.

ProteomeDiscoverer

ProteomeDiscoverer is commercial software from ThermoFisher (www.thermofisher.com). With the data used here, the identification was performed using the XCalibur module of ProteomeDiscoverer version 2.4 . Quantitation data at the level of consensus-proteins can be exported to tabulated text files, which can be treated by the function shown below. The resultant data were export in tablulated format and the file automatically named ‘all_Proteins.txt’ (the option R-headers was checked, however this option is not mandatory). Files compressed as .gz can be read, too (like in the example below).

path1 <- system.file("extdata", package="wrProteo")
fiNaPd <- "pxd001819_PD24_Proteins.txt.gz"

## Note: data exported from ProteomeDiscoverer frequently do not have proper column-names, providing names here
sampNa <- paste(rep(UPSconc, each=3),"amol_",rep(1:3,length(UPSconc)),sep="") 
specPrefPD <- list(conta="Bos tauris|Gallus", mainSpecies="OS=Saccharomyces cerevisiae", spike=UPS1ac)

dataPD <- readProtDiscovFile(file=fiNaPd, path=path1, sampleNames=sampNa, refLi="mainSpe", specPref=specPrefPD)
#>  -> readProtDiscovFile :  setting 'annotCol' to export of 'R-friendly' colnames
#>  -> readProtDiscovFile :  Note: 10 (out of 1297) unrecognized species
#>  -> readProtDiscovFile : Count by 'specPref' : Gallus gallus: 1 ;  Homo sapiens: 47 ;  Saccharomyces cerevisiae: 1239 ;
#>  -> readProtDiscovFile :  normalize using subset of 1239

The data were imported and median-normalized, the protein annotation was parsed to atomatically extract IDs, protein-names and species information.

Note, that the original column-heads in the file exported from ProteomeDiscoverer has simply increasing numbers as names. Thus, much care should be taken on the order when preparing the vector sampleNames with the names to use instead.

## the number of lines and colums
dim(dataPD$quant)
#> [1] 1296   27
## a summary of the quantitation data
summary(dataPD$quant[,1:8])        # the first 8 cols
#>     50amol_1        50amol_2        50amol_3        125amol_1    
#>  Min.   :13.68   Min.   :11.60   Min.   : 9.609   Min.   :10.92  
#>  1st Qu.:18.15   1st Qu.:18.19   1st Qu.:18.237   1st Qu.:18.23  
#>  Median :19.36   Median :19.37   Median :19.380   Median :19.37  
#>  Mean   :19.52   Mean   :19.53   Mean   :19.539   Mean   :19.52  
#>  3rd Qu.:20.83   3rd Qu.:20.88   3rd Qu.:20.895   3rd Qu.:20.84  
#>  Max.   :26.33   Max.   :26.38   Max.   :26.418   Max.   :26.41  
#>  NA's   :92      NA's   :93      NA's   :87       NA's   :88     
#>    125amol_2       125amol_3       250amol_1       250amol_2    
#>  Min.   :10.51   Min.   :11.15   Min.   :10.04   Min.   :10.21  
#>  1st Qu.:18.21   1st Qu.:18.20   1st Qu.:18.15   1st Qu.:18.17  
#>  Median :19.40   Median :19.40   Median :19.40   Median :19.37  
#>  Mean   :19.51   Mean   :19.53   Mean   :19.50   Mean   :19.47  
#>  3rd Qu.:20.82   3rd Qu.:20.83   3rd Qu.:20.78   3rd Qu.:20.78  
#>  Max.   :26.37   Max.   :26.47   Max.   :26.50   Max.   :26.45  
#>  NA's   :99      NA's   :86      NA's   :87      NA's   :77

# there are proteins where the 'OS='-tag won't be visible as Species (orig Fasta-header and Protein-name not accessible) :
which(is.na(dataPD$annot[,"Species"]) & dataPD$annot[,"SpecType"]=="species2")    # is NA as Species
#> P02768 
#>     25

Confirming the presence of UPS1 proteins by ProteomeDiscoverer:

Now we can summarize the presence of UPS1 proteins after treatment by ProteomeDiscoverer : In sum 48 UPS1 proteins were found, 0 are missing. Later in this document data from ProteomeDiscoverer will by frequently abbreviated as PD.

Proline

Proline is open-source software provided by the Profi-consortium (see also proline-core on github), published by Bouyssié et al 2020.

Protein identification in Proline gets performed by SearchGUI, see also Vaudel et al 2015. In this case X!Tandem (see also Duncan et al 2005) was used as search engine.

Quantitation data at the level of consensus-proteins can be exported from Proline as .xlsx or tabulated text files, both formats can be treated by the import-functions shown below. Here, Proline version 1.6.1 was used.

path1 <- system.file("extdata", package="wrProteo")
fiNaPl <- "pxd001819_PL.xlsx"

specPrefPL <- c(conta="_conta", mainSpecies="OS=Saccharomyces cerevisiae", spike="_ups")  
dataPL <- readProlineFile(file.path(path1,fiNaPl), specPref=specPrefPL, normalizeMeth="median", refLi="mainSpe")
#>  -> readProlineFile : Found sheets 'Search settings and infos', 'Import and filters', 'Quant config' and 'Protein sets'
#> New names:
#> * `` -> ...2
#>  -> readProlineFile :  normalize using subset of 1137

In addition, we need to correct the quantification column-heads (like ‘Levure2ug+ UPS1-100amol-1’) and bring them to a simpler version :

head(colnames(dataPL$raw),8)
#> [1] "Levure2ug+ UPS1-100amol-1" "Levure2ug+ UPS1-100amol-2"
#> [3] "Levure2ug+ UPS1-100amol-3" "Levure2ug+ UPS1-250amol-1"
#> [5] "Levure2ug+ UPS1-250amol-2" "Levure2ug+ UPS1-250amol-3"
#> [7] "Levure2ug+ UPS1-500amol-1" "Levure2ug+ UPS1-500amol-2"
sub1 <- cbind(ini=paste0("-",c("100a","250a","500a","1f","5f","10f","25f","50f","100f"),"mol-"),
  paste0(fin=c("50a","125a","250a","500a","2500a","5000a","12500a","25000a","50000a"),"mol_"))
dataPL <- cleanListCoNames(dataPL, rem=c("abundance_","Levure2ug+ UPS1"), subst=sub1)
## let's check the result
head(colnames(dataPL$raw),8)
#> [1] "50amol_1"  "50amol_2"  "50amol_3"  "125amol_1" "125amol_2" "125amol_3"
#> [7] "250amol_1" "250amol_2"
## the number of lines and colums
dim(dataPL$quant)
#> [1] 1186   27
## a summary of the quantitation data
summary(dataPL$quant[,1:8])        # the first 8 cols
#>     50amol_1        50amol_2        50amol_3       125amol_1    
#>  Min.   :14.29   Min.   :14.53   Min.   :13.77   Min.   :14.41  
#>  1st Qu.:19.64   1st Qu.:19.66   1st Qu.:19.70   1st Qu.:19.75  
#>  Median :21.64   Median :21.55   Median :21.65   Median :21.65  
#>  Mean   :21.66   Mean   :21.66   Mean   :21.71   Mean   :21.75  
#>  3rd Qu.:23.64   3rd Qu.:23.61   3rd Qu.:23.63   3rd Qu.:23.70  
#>  Max.   :29.52   Max.   :29.42   Max.   :29.44   Max.   :29.50  
#>  NA's   :44      NA's   :40      NA's   :40      NA's   :46     
#>    125amol_2       125amol_3       250amol_1       250amol_2    
#>  Min.   :13.53   Min.   :14.76   Min.   :14.55   Min.   :14.74  
#>  1st Qu.:19.76   1st Qu.:19.76   1st Qu.:19.74   1st Qu.:19.72  
#>  Median :21.64   Median :21.65   Median :21.65   Median :21.61  
#>  Mean   :21.73   Mean   :21.73   Mean   :21.71   Mean   :21.69  
#>  3rd Qu.:23.69   3rd Qu.:23.65   3rd Qu.:23.62   3rd Qu.:23.63  
#>  Max.   :29.49   Max.   :29.43   Max.   :29.42   Max.   :29.40  
#>  NA's   :46      NA's   :48      NA's   :54      NA's   :51

Now we can summarize the presence of UPS1 proteins after treatment by Proline : In sum 48 UPS1 proteins were found, 0 are missing. Later in this document data from Proline will by frequently abbreviated as PL.

Uniform Re-Arranging of Data

For easy and proper comparisons we need to make sure all columns are in the same order.

# get all results (MaxQuant,ProteomeDiscoverer, ...) in same order
grp9 <- paste0(rep(UPSconc,each=3),"amol") 

## it is more convenient to re-order columns this way in each project
dataPD <- corColumnOrder(dataPD, sampNames=sampNa)          # already in good order
#>  -> corColumnOrder : Order already correct
dataMQ <- corColumnOrder(dataMQ, sampNames=sampNa) 
dataPL <- corColumnOrder(dataPL, sampNames=sampNa) 
#>  -> corColumnOrder : Order already correct

At import we made use of the argument specPref (specifying _‘mainSpecies’, ‘conta’ and ‘spike’) which allows to build categories based on searching keywords based on the initial annotation. In turn we obtain the labels : ‘main Spe’ for yeast (ie matrix), ‘species2’ for the UPS1 (ie spike) ‘conta’ for contaminants. Let’s replace the first two generic terms by more specific ones (ie ‘Yeast’ and ‘UPS1’) :

## Need to rename $annot[,"SpecType"]  
dataPD <- replSpecType(dataPD, replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")))
#> useLi 1 2 3 4 5 6 
#> useLi 24 25 36 37 45 70
dataMQ <- replSpecType(dataMQ, replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")))
#> useLi 1 12 13 14 15 16 
#> useLi 4 11 21 27 39 47
dataPL <- replSpecType(dataPL, replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")))
#> useLi 1 2 3 4 5 6 
#> useLi 20 22 48 53 54 71

## Need to address missing ProteinNames (UPS1) due to missing tags in Fasta
dataPD <- replMissingProtNames(dataPD) 
#>  -> replreplMissingProtNames :  ..trying to replace 1296 'EntryName'
dataMQ <- replMissingProtNames(dataMQ) 
#>  -> replreplMissingProtNames :  ..trying to replace 5 'EntryName'
dataPL <- replMissingProtNames(dataPL) 
## extract names of quantified UPS1-proteins
NamesUpsPD <- dataPD$annot[which(dataPD$annot[,"SpecType"]=="UPS1"),"Accession"]
NamesUpsMQ <- dataMQ$annot[which(dataMQ$annot[,"SpecType"]=="UPS1"),"Accession"]
NamesUpsPL <- dataPL$annot[which(dataPL$annot[,"SpecType"]=="UPS1"),"Accession"]
tabS <- mergeVectors(PD=table(dataPD$annot[,"SpecType"]), MQ=table(dataMQ$annot[,"SpecType"]), PL=table(dataPL$annot[,"SpecType"]))  
knitr::kable(tabS, caption="Number of proteins identified, by custom tags and software")
Number of proteins identified, by custom tags and software
UPS1 Yeast conta
PD 48 1239 1
MQ 48 1047 1
PL 48 1137 1
tabT <- mergeVectors(PD=table(dataPD$annot[,"Species"]), MQ=table(dataMQ$annot[,"Species"]), PL=table(dataPL$annot[,"Species"]))  
knitr::kable(tabT, caption="Number of proteins identified, by species and software")
Number of proteins identified, by species and software
Gallus gallus Homo sapiens Mus musculus Saccharomyces cerevisiae Sus scrofa
PD 1 47 NA 1239 NA
MQ NA 49 1 1047 1
PL NA 48 NA 1137 1

The initial fasta file also contained the yeast strain number, this has been stripped off when using default parameters.


Basic Data Treatment

Normalization

No additional normalization is needed, all data were already median normalized to the host proteins (ie Saccharomyces cerevisiae) after importing the initial quantification-output using ‘readMaxQuantFile()’, ‘readProlineFile()’ and ‘readProtDiscovFile()’.

Presence of NA-values

As mentioned in the general vignette of this package ‘wrProteoVignette1’, it is important to investigate the nature of NA-values. In particular, checking the hypothesis that NA-values originate from very low abundance instances is very important for deciding how to treat NA-values furtheron.

## Let's inspect NA values from ProteomeDiscoverer as graphic
matrixNAinspect(dataPD$quant, gr=grp9, tit="ProteomeDiscoverer")  
#>  -> stableMode :  method='density',  length of x =976, 'bandw' has been set to 44

## Let's inspect NA values from MaxQuant as graphic
matrixNAinspect(dataMQ$quant, gr=gl(length(UPSconc),3), tit="MaxQuant") 
#>  -> stableMode :  method='density',  length of x =1142, 'bandw' has been set to 47

## Let's inspect NA values from Proline as graphic
matrixNAinspect(dataPL$quant, gr=grp9, tit="Proline") 
#>  -> stableMode :  method='density',  length of x =413, 'bandw' has been set to 28

A key element to understand the nature of NA-value is to investigate their NA-neighbours. If a given protein has for just ond of the 3 replicates an NA, the other two valid quantifications can be considered as NA-neighbours. In the figures above all NA-neighbours are shown in the histogram and their mode is marked by an arrow. One can see, that NA-neighbours are predominantely (but not exclusively) part of the lower quantitation values. This supports the hypothesis that NAs occur most frequently with low abundance proteins.

NA-Imputation and Statistical Testing for Changes in Abundance

NA-values represent a challange for statistical testing. In addition, techniques like PCA don’t allow NAs, neither.

The number of NAs varies between samples : Indeed, very low concentrations of UPS1 are difficult to get detected and contribute largely to the NAs (as we will see later in more detail). Since the amout of yeast proteins (ie the matrix in this setup) stays constant across all samples, yeast proteins should always get detected the same way.

## Let's look at the number of NAs. Is there an accumulated number in lower UPS1 samples ?
tabSumNA <- rbind(PD=sumNAperGroup(dataPD$raw, grp9), MQ=sumNAperGroup(dataMQ$raw, grp9), PL=sumNAperGroup(dataPL$raw, grp9) )
knitr::kable(tabSumNA, caption="Number of NAs per group of samples", align="r")
Number of NAs per group of samples
50amol 125amol 250amol 500amol 2500amol 5000amol 12500amol 25000amol 50000amol
PD 272 273 257 234 205 207 195 209 220
MQ 318 334 323 337 282 297 302 330 322
PL 124 140 157 140 137 139 131 141 131

In the section above we investigated the circumstances of NA-instances and provided evidence that NA-values typically represent proteins with low abundance which frequently ended up as non-detectable (NA). Thus, we hypothesize that (in most cases) NA-values might also have been detected in quantities like their NA-neighbours. In consequence, we will model a normal distribution based on the NA-neighbours and use for substituting.

The function testRobustToNAimputation() from this package (wrProteo) allows to perform NA-imputation and subsequent statistical testing (after repeated imputation) between all groups of samples (see also the general vignette). One of the advantages of this implementation, is that multiple rounds of imputation are run, so that final results (including pair-wise testing) get stabilized to (rare) stochastic effects. For this reason one may also speak of stabilized NA-imputations.

The statistical tests used underneith make use of the shrinkage-procedure provided from the empirical Bayes procedure as implemented to the Bioconductor package limma, see also Ritchie et al 2015. In addition, various formats of multiple testing correction can be added to the results : Benjamini-Hochberg FDR (lateron referred to as BH or BH-FDR, see FDR on Wikipedia, see also Benjamini and Hochberg 1995), local false discovery rate (lfdr, using the package fdrtool, see Strimmer 2008), or modified testing by ROTS, etc … In this vignette we will make use of the BH-FDRs.

We are ready to launch the NA-imputation and testing for data from ProteomeDiscoverer :

testPD <- testRobustToNAimputation(dataPD, gr=grp9)     # ProteomeDiscoverer
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  invalid entry for argument 'seedNo', it may be single integer or NULL, setting to NULL
#> -> testRobustToNAimputation -> matrixNAneighbourImpute -> stableMode :  method='density',  length of x =976, 'bandw' has been set to 44
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  n.woNA= 32920 , n.NA = 2072
#>     impute based on 'mode2' using mean= 17.34 and sd= 0.5
#> 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at presenceFilt:   1259 1252 1245 1248 1234 1235 1247 1259 1253 1251 1263 1235 1247 1238 1264 1255 1257 1253 1255 1254 1259 1246 1257 1252 1252 1248 1262 1255 1247 1257 1251 1249 1245 1261 1251 1248   out of  1296 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at abundanceFilt:  1246 1240 1227 1235 1214 1215 1236 1244 1241 1238 1252 1221 1235 1224 1250 1245 1245 1242 1242 1240 1247 1236 1246 1242 1241 1235 1250 1243 1236 1247 1242 1237 1234 1250 1241 1239
#> -> testRobustToNAimputation -> combineMultFilterNAimput :    at NA> mean:   1175, 1172, 1175, 1173, 1176, 1183, 1168, 1177, 1182, 1194, 1171, 1175, 1184, 1185, 1208, 1176, 1182, 1187, 1200, 1210, 1210, 1167, 1178, 1184, 1196, 1208, 1207, 1214, 1161, 1176, 1177, 1189, 1197, 1197, 1207 and 1201

Then for MaxQuant …

testMQ <- testRobustToNAimputation(dataMQ, gr=grp9)      # MaxQuant
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  invalid entry for argument 'seedNo', it may be single integer or NULL, setting to NULL
#> -> testRobustToNAimputation -> matrixNAneighbourImpute -> stableMode :  method='density',  length of x =1142, 'bandw' has been set to 47
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  n.woNA= 26963 , n.NA = 2845
#>     impute based on 'mode2' using mean= 21.46 and sd= 0.5
#> 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at presenceFilt:   1051 1043 1022 1051 1017 1019 1027 1051 1046 1048 1067 1022 1032 1028 1061 1040 1050 1050 1056 1030 1061 1032 1047 1040 1045 1037 1060 1049 1027 1050 1047 1051 1027 1066 1034 1039   out of  1104 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at abundanceFilt:  1039 1034 1008 1035 1002 1000 1018 1038 1034 1029 1064 1016 1025 1019 1052 1038 1046 1044 1049 1021 1055 1030 1042 1033 1039 1031 1054 1043 1024 1045 1044 1048 1024 1061 1031 1035
#> -> testRobustToNAimputation -> combineMultFilterNAimput :    at NA> mean:   967, 960, 966, 955, 956, 966, 943, 955, 959, 958, 943, 944, 947, 945, 978, 944, 950, 952, 954, 972, 976, 934, 938, 942, 946, 975, 969, 979, 927, 938, 938, 941, 961, 962, 974 and 964

And finally for Proline :

testPL <- testRobustToNAimputation(dataPL, gr=grp9)      # Proline
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  invalid entry for argument 'seedNo', it may be single integer or NULL, setting to NULL
#> -> testRobustToNAimputation -> matrixNAneighbourImpute -> stableMode :  method='density',  length of x =413, 'bandw' has been set to 28
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  n.woNA= 30782 , n.NA = 1240
#>     impute based on 'mode2' using mean= 17.74 and sd= 0.5
#>     note mean for impuation is  below 0.05  quantile !!
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at presenceFilt:   1156 1154 1148 1155 1142 1145 1149 1156 1154 1153 1167 1161 1163 1159 1167 1150 1158 1155 1156 1150 1169 1147 1155 1153 1154 1148 1167 1153 1149 1156 1154 1154 1148 1167 1151 1149   out of  1186 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at abundanceFilt:  1139 1140 1127 1141 1125 1128 1134 1138 1137 1136 1155 1145 1147 1142 1150 1138 1145 1141 1143 1136 1154 1135 1141 1137 1139 1133 1152 1140 1139 1144 1142 1141 1136 1154 1139 1137
#> -> testRobustToNAimputation -> combineMultFilterNAimput :    at NA> mean:   1113, 1108, 1110, 1115, 1116, 1112, 1108, 1110, 1104, 1112, 1113, 1110, 1104, 1109, 1120, 1113, 1116, 1110, 1118, 1122, 1123, 1109, 1112, 1105, 1114, 1121, 1120, 1126, 1112, 1113, 1108, 1115, 1120, 1122, 1125 and 1123

From these results we’ll use i) the NA-imputed version of our datasets for plotting principal components (PCA) and ii) the (stabilized) testing results for counting TP, FP, etc and to construct ROC curves.

Let’s add the NA-imputed data to our main object :

dataPD$datImp <- testPD$datImp       # recuperate imputeded data to main data-object
dataMQ$datImp <- testMQ$datImp
dataPL$datImp <- testPL$datImp

Analysis Using All Proteins Identified (Matrix + UPS1)

In this section all proteins identified and quantified are compared in a pair-wise fashion based on the t-tests already run in the previous section. As mentioned, the experimental setup is very special, since all proteins that are truly changing are known in advance (the UPS1 spike-in proteins). Tables get constructed by counting based on various thresholds for considering given protein abundances as differential or not.
A traditional 5 percent FDR cut-off is used for Volcano-plots, while ROC-curves allow inspecting the entire range of potential cut-off values.

Pairwise Testing Summary

A very universal and simple way to analyze data is by checking as several pairwise comparisons, in particular, if the experimental setup does not include complete multifactorial plans.

This UPS1 spike-in experiment has 27 samples organized (according to meta-information) as 9 groups. Thus, one obtains in total 36 pair-wise comparisons which will make comparisons very crowded. The publication by Ramus et al 2016 focussed on 3 pairwise comparisons only. In this vignette it is shown how all of them can get considered.

Now, we’ll construct a table showing all possible pairwise-comparisons. Using the function numPairDeColNames() we can easily extract the UPS1 concentrations as numeric content and show the (log-)ratio of the pairwise comparisons (column ‘log2rat’), the final concentrations (columns ‘conc1’ and ‘conc2’, in amol) and the number of differentially abundant proteins passing 5% FDR (using classical Benjamini-Hochberg FDR (columns ‘sig.xx.BH’) or lfdr (Strimmer 2008, columns ‘sig._xx_.lfdr’ ).

## The number of differentially abundant proteins passing 5% FDR (ProteomeDiscoverer and MaxQuant) 
signCount <- cbind( sig.PD.BH=colSums(testPD$BH < 0.05, na.rm=TRUE), sig.PD.lfdr=if("lfdr" %in% names(testPD)) colSums(testPD$lfdr < 0.05, na.rm=TRUE),
  sig.MQ.BH=colSums(testMQ$BH < 0.05, na.rm=TRUE), sig.MQ.lfdr=if("lfdr" %in% names(testMQ)) colSums(testMQ$lfdr < 0.05, na.rm=TRUE),
  sig.PL.BH=colSums(testPL$BH < 0.05, na.rm=TRUE), sig.PL.lfdr=if("lfdr" %in% names(testPL)) colSums(testPL$lfdr < 0.05, na.rm=TRUE)  )

table1 <- numPairDeColNames(testPD$BH, stripTxt="amol", sortByAbsRatio=TRUE)
table1 <- cbind(table1, signCount[table1[,1],])
knitr::kable(table1, caption="All pairwise comparisons and number of significant proteins", align="c")
All pairwise comparisons and number of significant proteins
index log2rat conc1 conc2 sig.PD.BH sig.PD.lfdr sig.MQ.BH sig.MQ.lfdr sig.PL.BH sig.PL.lfdr
50000amol-50amol 34 9.966 50 50000 572 498 376 305 570 486
25000amol-50amol 31 8.966 50 25000 582 498 367 316 544 478
125amol-50000amol 12 8.644 125 50000 354 292 232 160 412 352
12500amol-50amol 29 7.966 50 12500 505 429 290 231 484 401
125amol-25000amol 3 7.644 125 25000 328 258 179 143 307 264
250amol-50000amol 15 7.644 250 50000 381 306 314 265 324 261
12500amol-125amol 1 6.644 125 12500 222 154 79 50 235 190
25000amol-250amol 9 6.644 250 25000 232 178 205 167 200 149
50000amol-500amol 27 6.644 500 50000 357 320 252 184 366 287
5000amol-50amol 35 6.644 50 5000 582 562 345 308 523 456
12500amol-250amol 7 5.644 250 12500 131 87 50 45 128 98
25000amol-500amol 24 5.644 500 25000 241 175 141 121 198 142
2500amol-50amol 32 5.644 50 2500 527 499 295 235 489 436
125amol-5000amol 17 5.322 125 5000 270 259 120 86 201 148
12500amol-500amol 22 4.644 500 12500 151 110 59 50 128 84
125amol-2500amol 5 4.322 125 2500 195 164 76 50 183 147
2500amol-50000amol 14 4.322 2500 50000 284 230 170 152 254 190
250amol-5000amol 20 4.322 250 5000 178 118 135 97 135 112
25000amol-2500amol 6 3.322 2500 25000 45 34 14 10 69 49
2500amol-250amol 10 3.322 250 2500 100 74 23 21 91 68
50000amol-5000amol 21 3.322 5000 50000 331 276 152 126 323 281
5000amol-500amol 28 3.322 500 5000 153 120 61 37 125 95
500amol-50amol 36 3.322 50 500 420 378 232 182 378 337
12500amol-2500amol 4 2.322 2500 12500 17 12 5 4 41 31
25000amol-5000amol 18 2.322 5000 25000 42 28 20 15 71 53
2500amol-500amol 25 2.322 500 2500 107 77 34 24 96 61
250amol-50amol 33 2.322 50 250 355 265 186 133 311 249
12500amol-50000amol 11 2.000 12500 50000 225 147 109 118 164 107
125amol-500amol 23 2.000 125 500 22 18 6 2 30 18
12500amol-5000amol 16 1.322 5000 12500 18 10 2 1 26 18
125amol-50amol 30 1.322 50 125 278 262 128 95 216 174
12500amol-25000amol 2 1.000 12500 25000 10 6 1 0 28 15
125amol-250amol 8 1.000 125 250 15 12 0 0 2 1
25000amol-50000amol 13 1.000 25000 50000 123 71 56 46 93 63
2500amol-5000amol 19 1.000 2500 5000 4 3 1 0 13 10
250amol-500amol 26 1.000 250 500 0 0 2 0 2 1

You can see that in numerous cases much more than the 48 UPS1 proteins showed up significant, ie yeast proteins supposed to remain constant also showed up in part as ‘sigificantly changing’. However, some proteins with enthousiastic FDR values have very low log-FC amplitude and will be removed by filtering in the following steps.

par(mar=c(6.2, 4.7, 4, 1))   
imageW(table1[,c("sig.PD.BH","sig.MQ.BH","sig.PL.BH" )], tit="Number of BH.FDR signif proteins by the quantification approaches")
mtext("red for high number signif proteins", cex=0.7)

In the original Ramus et al 2016 et al paper only 3 pairwise comparisons were further analyzed :

## Selection in Ramus paper 
knitr::kable(table1[which(rownames(table1) %in% colnames(testPD$BH)[c(2,21,27)]),], caption="Selected pairwise comparisons (as in Ramus et al)", align="c")
Selected pairwise comparisons (as in Ramus et al)
index log2rat conc1 conc2 sig.PD.BH sig.PD.lfdr sig.MQ.BH sig.MQ.lfdr sig.PL.BH sig.PL.lfdr
50000amol-500amol 27 6.644 500 50000 357 320 252 184 366 287
50000amol-5000amol 21 3.322 5000 50000 331 276 152 126 323 281
12500amol-25000amol 2 1.000 12500 25000 10 6 1 0 28 15

Volcano Plots

Volcano-plots offer additional insight in how statistical test results relate to log-fold-change of pair-wise comparisons. In addition, we can mark the different protein-groups (or species) by different symbols, see also the general vignette ‘wrProteoVignette1’ (from this package) and the vignette to the package wrGraph. Counting the number of proteins passing a classical threshold for differential expression combined with a filter for minimum log-fold-change is a good way to start.

As mentioned, the dataset from Ramus et al 2016 contains 9 different levels of UPS1 concentrations, in consequence 36 pair-wise comparisons are possible. Again, plotting all possible Volcano plots would make way too crowded plots, instead we’ll try to summarize (see ROC curves), cluster into groups and finally plot only a few representative ones.

ROC for Multiple Pairs

Receiver Operator Curves (ROC) curves display sensitivity (True Positive Rate) versus 1-Specificity (False Positive Rate). They are typically used as illustrate and compare the discriminiative capacity of a yes/no decision system (here: differential abundance or not), see eg also the original publication Hand and Till 2001.

The data get constructed by sliding through a panel of threshold-values for the statistical tests instead of just using 0.05. Due to the experimental setup we know that all yeast proteins should stay constant and only UPS1 proteins are expected to change. For each of these threshold values one counts the number of true positives (TP), false positives (FP) etc, allowing then to calculate sensitivity and specificity.

In the case of bechmarking quantitation efforts, ROC curves are used to judge how well heterologous spikes UPS1 proteins can be recognized as differentially abundant while constant yeast matrix proteins should not get classified as differential. Finally, ROC curves let us also gain some additional insights in the question which cutoff may be optimal or if the commonly used 5-percent FDR threshld cutoff allows getting the best out of the testing system.

Below, these calculations of summarizeForROC() are run in batch.

layout(1)
rocPD <- lapply(table1[,1], function(x) summarizeForROC(testPD, useComp=x, annotCol="SpecType", spec=c("Yeast","UPS1"), tyThr="BH", plotROC=FALSE))
rocMQ <- lapply(table1[,1], function(x) summarizeForROC(testMQ, useComp=x, annotCol="SpecType", spec=c("Yeast","UPS1"), tyThr="BH", plotROC=FALSE))
rocPL <- lapply(table1[,1], function(x) summarizeForROC(testPL, useComp=x, annotCol="SpecType", spec=c("Yeast","UPS1"), tyThr="BH", plotROC=FALSE))

# we still need to add the names for the pair-wise groups:
names(rocPD) <- colnames(testPD$BH)
names(rocMQ) <- colnames(testMQ$BH)
names(rocPL) <- colnames(testPL$BH)

The next step consists in calculating the area under the curve (AUC) for the individual profiles of each pairwise comparison.

## calulate  AUC for each ROC 
AucAll <- cbind(ind=table1[match(names(rocPD),rownames(table1)),"index"], clu=NA, 
  PD=sapply(rocPD, AucROC), MQ=sapply(rocMQ, AucROC), PL=sapply(rocPL, AucROC) )

To provide a quick overview, the clustered AUC values are displayed as PCA :

biplot(prcomp(AucAll[,names(methNa)]), cex=0.7, main="PCA of AUC from ROC Curves")   

On this PCA one can see a rather disperse/distinct group of low concentration-pairs (separated by PC1, mostly containg at least one 50 or 500fmol), low/med conc pairs (containing 2500, ev 5000) and the rest (med+high to any, high degree of text-superposition). Furthermore, we can see that AUC values from MaxQuant correlate somehow less to Proline and ProteomeDiscoverer (red arrows).

Now we are ready to inspect the 5 clusters in detail :

Grouping of ROC Curves to Display Representative Ones

As mentioned, there are too many pair-wise combinations available for plotting and inspecting all ROC-curves. So we can try to group similar pairwise comparison AUC values into clusters and then easily display representative examples for each cluster/group. Again, we (pre)define that we want to obtain 5 groups (like customer-ratings from 5 to 1 stars), a k-Means clustering approach was chosen.

## number of groups for clustering
nGr <- 5
## K-Means clustering
kMAx <- stats::kmeans(standardW(AucAll[,c("PD","MQ","PL")]), nGr)$cluster  
   table(kMAx)
#> kMAx
#>  1  2  3  4  5 
#>  6  1  3 21  5
AucAll[,"clu"] <- kMAx
AucAll <- reorgByCluNo(AucAll, cluNo=kMAx, useColumn=c("PD","MQ","PL"))
AucAll <- cbind(AucAll, iniInd=table1[match(rownames(AucAll), rownames(table1)), "index"])
colnames(AucAll)[1:(which(colnames(AucAll)=="index")-1)] <- paste("Auc",colnames(AucAll)[1:(which(colnames(AucAll)=="index")-1)], sep=".")
AucAll[,"cluNo"] <- rep(nGr:1, table(AucAll[,"cluNo"]))        # make cluNo descencing

kMAx <- AucAll[,"cluNo"]      # update
  table(AucAll[,"cluNo"])
#> 
#>  1  2  3  4  5 
#>  6  1  3  5 21
 ## note : column 'index' is relative to table1, iniInd to ordering inside objects from clustering 

To graphically summarize the AUC values, the clustered AUC values are plotted accompagnied by the geometric mean:

profileAsClu(AucAll[,c(1:length(methNa),(length(methNa)+2:3))], clu="cluNo", meanD="geoMean", tit="Pairwise Comparisons as Clustered AUC from ROC Curves", xlab="Comparison number", ylab="AUC", meLty=1, meLwd=3)

From this figure we can see clearly that there are some pairwise comparisons where all initial analysis-software results yield high AUC values, while other pairwise comparisons less discriminative power.

Again, now we can select a representative pairwise-comparison for each cluster (from the center of each cluster):

AucRep <- table(AucAll[,"cluNo"])[rank(unique(AucAll[,"cluNo"]))]   # representative for each cluster
AucRep <- round(cumsum(AucRep) -AucRep/2 +0.1) 

## select representative for each cluster
knitr::kable(round(AucAll[AucRep,c("Auc.PD","Auc.MQ","Auc.PL","cluNo")],3), caption="Selected representative for each cluster ", align="c")  
Selected representative for each cluster
Auc.PD Auc.MQ Auc.PL cluNo
2500amol-500amol 0.925 0.984 0.977 5
50000amol-50amol 0.705 0.765 0.921 4
2500amol-250amol 0.638 0.487 0.882 3
5000amol-50amol 0.384 0.767 0.905 2
25000amol-50amol 0.527 0.535 0.445 1

Now we can check if some experimental UPS1 log-fold-change have a bias for some clusters.

ratTab <- sapply(5:1, function(x) { y <- table1[match(rownames(AucAll),rownames(table1)),]
  table(factor(signif(y[which(AucAll[,"cluNo"]==x),"log2rat"],1), levels=unique(signif(table1[,"log2rat"],1))) )}) 
colnames(ratTab) <- paste0("clu",5:1,"\nn=",rev(table(kMAx)))
layout(1)
imageW(ratTab, tit="Frequency of log2FC in the 5 clusters", xLab="log2FC")

We can see, that the cluster of best ROC-curves (cluster 5) covers practically all UPS1 log-ratios from this experiment without being restricted just to the high ratios.

Plotting ROC Curves for the Best Cluster (the ‘+++++’)

colPanel <- 2:5
gr <- 5 
j <- match(rownames(AucAll)[AucRep[6-gr]], colnames(testPD$t)) 

## table of all proteins in cluster
useLi <- which(AucAll[,"cluNo"]==gr)
tmp <- cbind(round(as.data.frame(AucAll)[useLi,c("cluNo","Auc.PD","Auc.MQ","Auc.PL")],3), 
  as.data.frame(table1)[match(names(useLi),rownames(table1)),c(2,5,7,9)])
knitr::kable(tmp, caption="AUC details for best pairwise-comparisons ", align="c")  
AUC details for best pairwise-comparisons
cluNo Auc.PD Auc.MQ Auc.PL log2rat sig.PD.BH sig.MQ.BH sig.PL.BH
125amol-250amol 5 0.976 0.990 0.997 1.000 15 0 2
125amol-25000amol 5 0.972 0.988 0.995 7.644 328 179 307
25000amol-250amol 5 0.973 0.985 0.994 6.644 232 205 200
125amol-50000amol 5 0.972 0.985 0.994 8.644 354 232 412
2500amol-5000amol 5 0.953 0.997 0.993 1.000 4 1 13
12500amol-50000amol 5 0.970 0.974 0.993 2.000 225 109 164
25000amol-500amol 5 0.963 0.992 0.982 5.644 241 141 198
25000amol-2500amol 5 0.969 0.971 0.993 3.322 45 14 69
250amol-50000amol 5 0.964 0.971 0.991 7.644 381 314 324
125amol-5000amol 5 0.947 0.984 0.992 5.322 270 120 201
2500amol-500amol 5 0.925 0.984 0.977 2.322 107 34 96
125amol-2500amol 5 0.961 0.924 0.995 4.322 195 76 183
50000amol-5000amol 5 0.919 0.973 0.981 3.322 331 152 323
12500amol-125amol 5 0.935 0.948 0.987 6.644 222 79 235
12500amol-2500amol 5 0.914 0.951 0.981 2.322 17 5 41
12500amol-25000amol 5 0.929 0.915 0.990 1.000 10 1 28
5000amol-500amol 5 0.888 0.912 0.967 3.322 153 61 125
12500amol-250amol 5 0.882 0.889 0.992 5.644 131 50 128
125amol-50amol 5 0.889 0.896 0.922 1.322 278 128 216
2500amol-50amol 5 0.852 0.851 0.935 5.644 527 295 489
2500amol-50000amol 5 0.848 0.829 0.961 4.322 284 170 254
## frequent concentrations :
layout(matrix(1:2), heights=c(1,2.5)) 
plotConcHist(mat=tmp, ref=table1)
    
## representative ROC    
plotROC(rocPD[[j]],rocMQ[[j]],rocPL[[j]], col=colPanel, methNames=methNa, pointSi=0.8, xlim=c(0,0.45),
  txtLoc=c(0.12,0.1,0.033), tit=paste("Cluster",gr," Example: ",names(rocPD)[j]), legCex=1)

layout(matrix(1:4,ncol=2))
VolcanoPlotW2(testPD, useComp=j, FCthrs=1.5, FdrThrs=0.05, annColor=c(4,2,3), ProjNa=methNa[1], silent=TRUE)
VolcanoPlotW2(testMQ, useComp=j, FCthrs=1.5, FdrThrs=0.05, annColor=c(4,2,3), ProjNa=methNa[2], silent=TRUE)
VolcanoPlotW2(testPL, useComp=j, FCthrs=1.5, FdrThrs=0.05, annColor=c(4,2,3), ProjNa=methNa[3], silent=TRUE)

ROC Curves for 2nd Best Cluster (the ‘++++’)

gr <- 4
j <- match(rownames(AucAll)[AucRep[6-gr]], colnames(testPD$t)) 

## table of all proteins in cluster
useLi <- which(AucAll[,"cluNo"]==gr)
tmp <- cbind(round(as.data.frame(AucAll)[useLi,c("cluNo","Auc.PD","Auc.MQ","Auc.PL")],3), 
  as.data.frame(table1)[match(names(useLi),rownames(table1)),c(2,5,7,9)])
knitr::kable(tmp, caption="AUC details for best pairwise-comparisons ", align="c")  
AUC details for best pairwise-comparisons
cluNo Auc.PD Auc.MQ Auc.PL log2rat sig.PD.BH sig.MQ.BH sig.PL.BH
250amol-500amol 4 0.841 0.659 0.956 1.000 0 2 2
12500amol-500amol 4 0.862 0.653 0.940 4.644 151 59 128
50000amol-50amol 4 0.705 0.765 0.921 9.966 572 376 570
25000amol-5000amol 4 0.841 0.588 0.927 2.322 42 20 71
250amol-5000amol 4 0.810 0.587 0.942 4.322 178 135 135
## frequent concentrations :
layout(matrix(1:2), heights=c(1,2.5)) 
plotConcHist(mat=tmp, ref=table1)
    
## representative ROC    
plotROC(rocPD[[j]], rocMQ[[j]],rocPL[[j]], col=colPanel, methNames=methNa, pointSi=0.8, xlim=c(0,0.45),
  txtLoc=c(0.12,0.1,0.033), tit=paste("Cluster",gr," Example: ",names(rocPD)[j]), legCex=1)

layout(matrix(1:4, ncol=2)) 
VolcanoPlotW2(testPD, useComp=j, FCthrs=1.5, FdrThrs=0.05, annColor=c(4,2,3), ProjNa=methNa[1], silent=TRUE)
VolcanoPlotW2(testMQ, useComp=j, FCthrs=1.5, FdrThrs=0.05, annColor=c(4,2,3), ProjNa=methNa[2], silent=TRUE)
VolcanoPlotW2(testPL, useComp=j, FCthrs=1.5, FdrThrs=0.05, annColor=c(4,2,3), ProjNa=methNa[3], silent=TRUE)