Analyzing UPS1 Spike-in Experiments (Example Ramus 2016 Dataset)

Wolfgang Raffelsberger

2020-11-27

Introduction

This vignete shows how UPS1 spike-in experiments may be analyzed using the packages wrProteo, wrMisc and wrGraph, all are available on CRAN.

Furthermore, the Bioconductor package limma will be used internally for it’s robust statistical testing.

# If not already installed, you'll have to install this package and wrMisc first.
install.packages("wrMisc")
install.packages("wrProteo")

# The package wrGraph is recommended for better graphics
install.packages("wrGraph")

# You cat start the vignettes for this package by typing :
browseVignettes("wrProteo")    #  ... and the select the html output

Now let’s load the packages needed :

library(knitr)
library(wrMisc)
library(wrGraph)
library(wrProteo)

# Version number for wrProteo :
packageVersion("wrProteo")
#> [1] '1.3.0'

Benchmark Tests Experimental Setup

The main aim of the experimental setup in UPS1 spike-in experiments is to provide a framework to test identification and quantitation procedures in proteomics. By mixing known amounts of a collection of human proteins (UPS1) in various concentrations into a yeast protein extract, one expects to find only human proteins varying between samples. In terms of ROC curves the human proteins are expected to show up as true positives (TP). In contrast, all yeast proteins were always added in the same quantity and should thus be observed constant, ie as true negatives (TN).

The Ramus Data-Set

The data were published with the article : Ramus et al 2016 “Benchmarking quantitative label-free LC-MS data processing workflows using a complex spiked proteomic standard dataset” in J Proteomics 2016 Jan 30;132:51-62. PMID: 26585461 doi: 10.1016/j.jprot.2015.11.011

This dataset is available on PRIDE as PXD001819 (and/or on ProteomeXchange).

Briefly, this experiment aims to compare quantification of the heterologous spike-in UPS1 in yeast protein extracts as constant matrix. 9 different concentrations of the heterologous spike-in were run in triplicates.

Additional Functions

## A few functions we'll need lateron
mergeVectors <- function(...,callFrom=NULL,silent=FALSE) {
  ## merge for simple named vectors (each element needs to be named)
  namesXYZ <- c(deparse(substitute(...)))
  fxNa <- wrMisc::.composeCallName(callFrom, newNa="mergeVectors")
  inpL <- list(...)
  chNa <- sapply(inpL, function(x) length(unique(names(x)))==length(x))
   cat("yy\n"); yy <<- list(inpL=inpL,chNa=chNa,namesXYZ=namesXYZ)
  if(any(!chNa)) {if(!silent) message(fxNa," Vectors must have names on each element for merging; omit vectors ")
    inpL <- inpL }
  if(length(names(inpL)) <1) { names(inpL) <- 1:length(inpL)}
  if(length(inpL) >0) {
    spe <- sort(unique(unlist(lapply(inpL,names))))
    ta3 <- matrix(0, nrow=length(inpL), ncol=length(spe), dimnames=list(names(inpL),spe))
    for(i in 1:length(inpL)) ta3[i, match(names(inpL[[i]]),spe)] <- inpL[[i]]
    ta3 
  } else NULL }

replSpecType <- function(x, annCol="SpecType", replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")), silent=TRUE) {
  ## rename $annot[,"SpecType"] to more specific names
  chCol <- annCol[1] %in% colnames(x$annot)
  if(chCol) { chCol <- which(colnames(x$annot)==annCol[1])
    chIt <- replBy[,1] %in% unique(x$annot[,chCol])    # check items to replace if present
    if(any(chIt)) for(i in which(chIt)) {useLi <- which(x$annot[,chCol] %in% replBy[i,1]); cat("useLi",head(useLi),"\n"); x$annot[useLi,chCol] <- replBy[i,2]}
  } else if(!silent) message(" replSpecType: 'annCol' not found in x$annot !")
  x }
  
replNAProtNames <- function(x,annCol=c("EntryName","Accession","SpecType"), silent=FALSE) {
  ## replace in $annot missing EntryNames by concatenating Accession + SpecType (ie 2nd & 3rd of annCol)
  chCol <- annCol %in% colnames(x$annot)
  if(all(chCol)) {
    chNA <- is.na(x$annot[,annCol[1]])
    if(any(chNA)) { if(!silent) message(" ..replacing ",sum(chNA)," entry-names")
      x$annot[which(chNA),annCol[1]] <- paste(x$annot[which(chNA),annCol[2]],x$annot[which(chNA),annCol[3]],sep="_")}
  } else message(" replNAProtNames: some of the columnnames from 'annCol' found in x$annot !")
  x }

standEntMatr <- function(mat,useCol=NULL) {
  ## standardize selected content of entire matrix (ie not column-wise, relative differences in row get thus conserved), return selected columns only 
  if(is.null(useCol)) useCol <- 1:ncol(mat)
  out <- as.matrix(mat[,useCol])
  (out - mean(out,na.rm=TRUE))/sd(out,na.rm=TRUE)
}

reorgByCluNo <- function(mat, cluNo, useMeth=1:3,useCol="sco", cluCol="cluNo", retList=FALSE, silent=FALSE, callFrom=NULL) {
  ## reorganize input matrix as sorted by cluster numbers (and geometric mean) according to vector with cluster names, and index for sorting per cluster and per geometric mean
  ## mat (matrix or data.frame) main input
  ## cluNo (integer) initial cluster numbers for each line of 'mat' (obtained by separate clustering or other segmentation)
  ## useMeth (character or integer) the columns to use from mat
  ## useCol (character or integer) the column to use for buidling geometrix mean
  ## retList (character or integer) the culumn of 'mat' which will be used for sorting the clusters
  ## retList (logical) decide if return list of clusters with data from 'cluNo' or matrix in order of input 'mat' with index,cluNo,geoMean
  fxNa <- wrMisc::.composeCallName(callFrom, newNa="reorgByCluNo")
  iniCla <- class(mat)
  chDim <- dim(mat)
  dataOK <- FALSE
  if(length(chDim) >1) if(all(chDim >1)) dataOK <- TRUE
  if(!silent & !dataOK) message(fxNa," invalid input ... (returning entry)")
  ## main
  if(dataOK){
    mat1 <- if(length(chDim) ==3) mat[,useMeth,useCol] else mat[,useMeth]
    nClu <- length(unique(wrMisc::naOmit(cluNo)))
    ## construct geometric mean for sorting
    mat1 <- cbind(mat1, index=1:nrow(mat), geoMean=apply(mat1,1, function(x) prod(x,na.rm=TRUE)^(1/sum(!is.na(x)))))    
    ## 1: split in list, determine clu median, overall score & sort clusters  
    cluL <- by(mat1, cluNo, as.matrix)
    clTab <- table(cluNo)                # already sorted by cluNo
    if(length(clTab) < max(cluNo) & !silent) message(fxNa," Note: Some cluster-names seem to be absent (no-consecutive numbers for names) !")
    ## need to correct when single occurance
    if(any(clTab ==1)) for(i in which(clTab ==1)) cluL[[i]] <- matrix(cluL[[i]],nrow=1,dimnames=list(rownames(mat1)[which(cluNo==i)] ,colnames(mat1)))
    ## sort intra
    cluL <- lapply(cluL, function(x) if(nrow(x) >1) x[order(x[,ncol(x)], decreasing=TRUE),] else x)
    ## sort inter
    cluL <- cluL[order(sapply(cluL, function(x) median(x[,ncol(x)],na.rm=TRUE)),  decreasing=TRUE)]        
    names(cluL) <- 1:length(cluL)
    nByClu  <- sapply(cluL,nrow)
    if(retList) { out <- cluL
      for(i in 1:nClu) out[[i]] <- cbind(out[[i]], cluNo=1)
       if("data.frame" %in% iniCla) out <- lapply(out,wrMisc::convMatr2df, addIniNa=FALSE, silent=silent,callFrom=fxNa) 
    } else {
      out <- cbind(wrMisc::lrbind(cluL), cluNo=rep(1:nClu,nByClu)) }   # in order of input
  } else {out <- NULL; if(!silent) message(fxNa," invalid input, return NULL")}
  out }


methNa <- c("ProteomeDiscoverer","MaxQuant","Proline" )

## The accession numbers for the UPS1 proteins
UPS1ac <- c("P00915", "P00918", "P01031", "P69905", "P68871", "P41159", "P02768", "P62988",
  "P04040", "P00167", "P01133", "P02144", "P15559", "P62937", "Q06830", "P63165",
  "P00709", "P06732", "P12081", "P61626", "Q15843", "P02753", "P16083", "P63279",
  "P01008", "P61769", "P55957", "O76070", "P08263", "P01344", "P01127", "P10599",
  "P99999", "P06396", "P09211", "P01112", "P01579", "P02787", "O00762", "P51965",
  "P08758", "P02741", "P05413", "P10145", "P02788", "P10636-8", "P00441", "P01375" ) 

Protein Identification and Initial Quantification

MaxQuant

MaxQuant is free software provided by the Max-Planck-Insutute, see Tyanova et al 2016. Typically MaxQuant exports by default quantitation data on level of consensus-proteins as a folder called txt with a file called proteinGroups.txt . So in a standard case one needs only to provide the path to this file.

path1 <- system.file("extdata", package="wrProteo")
fiNaMQ <- "proteinGroups.txt.gz"
specPrefMQ <- list(conta="CON_|LYSC_CHICK", mainSpecies="OS=Saccharomyces cerevisiae", spike=UPS1ac)

dataMQ <- readMaxQuantFile(path1, file=fiNaMQ, specPref=specPrefMQ, refLi="mainSpe")
#>  -> readMaxQuantFile :  Note: Found 11 proteins marked as 'REV_' (reverse peptide identification) - Removing
#>  -> readMaxQuantFile :  Note: 8 proteins with unknown species
#>    by species : conta: 1  mainSpe: 1047  species2: 48
#>  -> readMaxQuantFile : Found 1 species name(s) appearing inside other ones, assume as truncated (eg  Saccharomyces cerevis)
#>  -> readMaxQuantFile : Found 75 composite accession-numbers, truncating (eg P00761;CON__P00761)
#>  -> readMaxQuantFile : Use column 'Accession' (has fewest, ie 0 duplicated entries) as rownames
#>  -> readMaxQuantFile :  normalize using subset of 1047

The data were imported and median-normalized, the protein annotation was parsed to automatically extract IDs, protein-names and species information.

In addition we need to correct the accesion-names for attached ‘ups’-tags (originating from the fasta-file) :

## the number of lines and colums
dim(dataMQ$quant)
#> [1] 1104   27
## a summary of the quantitation data
summary(dataMQ$quant[,1:8])                # the first 8 cols
#>   12500amol_1     12500amol_2     12500amol_3      125amol_1    
#>  Min.   :17.52   Min.   :15.64   Min.   :14.90   Min.   :15.19  
#>  1st Qu.:22.50   1st Qu.:22.49   1st Qu.:22.50   1st Qu.:22.39  
#>  Median :23.46   Median :23.46   Median :23.46   Median :23.43  
#>  Mean   :23.69   Mean   :23.65   Mean   :23.67   Mean   :23.60  
#>  3rd Qu.:24.82   3rd Qu.:24.76   3rd Qu.:24.78   3rd Qu.:24.82  
#>  Max.   :30.31   Max.   :30.29   Max.   :30.35   Max.   :30.25  
#>  NA's   :98      NA's   :100     NA's   :104     NA's   :114    
#>    125amol_2       125amol_3      25000amol_1     25000amol_2   
#>  Min.   :14.86   Min.   :14.91   Min.   :15.78   Min.   :18.42  
#>  1st Qu.:22.36   1st Qu.:22.40   1st Qu.:22.52   1st Qu.:22.54  
#>  Median :23.42   Median :23.44   Median :23.53   Median :23.53  
#>  Mean   :23.59   Mean   :23.62   Mean   :23.74   Mean   :23.73  
#>  3rd Qu.:24.81   3rd Qu.:24.79   3rd Qu.:24.95   3rd Qu.:24.89  
#>  Max.   :30.25   Max.   :30.30   Max.   :30.31   Max.   :30.13  
#>  NA's   :106     NA's   :114     NA's   :109     NA's   :115

Confirming the presence of UPS1 proteins by MaxQuant: In sum 48 UPS1 proteins were found, 0 are missing.

ProteomeDiscoverer

ProteomeDiscoverer is commercial software from ThermoFisher (www.thermofisher.com). In this case, the identification was performed using the XCalibur module of ProteomeDiscoverer. In ProteomeDiscoverer quantitation data on level of consensus-proteins should be exported to tabulated text files, which can be treated by this function. The resultant data were export as ‘Proteins’ in tablulated format (the option R-headers was checked, however data can also be read when this option was not chosen).

path1 <- system.file("extdata", package="wrProteo")
fiNaPd <- "pxd001819_PD24_Proteins.txt.gz"

## Note: data exported from ProteomeDiscoverer do not have proper column-names, providing names here
UPSconc <- c(50,125,250,500,2500,5000,12500,25000,50000)  
sampNa <- paste(rep(UPSconc, each=3),"amol_",rep(1:3,length(UPSconc)),sep="") 
specPrefPD <- list(conta="Bos tauris|Gallus", mainSpecies="OS=Saccharomyces cerevisiae", spike=UPS1ac)

dataPD <- readPDExport(file=fiNaPd, path=path1, sampleNames=sampNa, refLi="mainSpe", specPref=specPrefPD)
#>  -> readPDExport :  Trouble ahead, expecting tabulated text file (this file might not be right format) !!
#>  -> readPDExport :  setting 'annotCol' to export of 'R-friendly' colnames
#>  -> readPDExport :  Note: 10 (out of 1297) unrecognized species
#>  -> readPDExport : Count by 'specPref' : Gallus gallus: 1 ;  Homo sapiens: 47 ;  Saccharomyces cerevisiae: 1239 ;
#>  -> readPDExport :  normalize using subset of 1239

The data were imported and median-normalized, the protein annotation was parsed to atomatically extract IDs, protein-names and species information.

Note, that the original column-heads in the file exported from ProteomeDiscoverer has simply increasing numbers as names, much care should be taken on the order when preparing the vector with the names to use instead.

## the number of lines and colums
dim(dataPD$quant)
#> [1] 1296   27
## a summary of the quantitation data
summary(dataPD$quant[,1:8])        # the first 8 cols
#>     50amol_1        50amol_2        50amol_3        125amol_1    
#>  Min.   :13.68   Min.   :11.60   Min.   : 9.609   Min.   :10.92  
#>  1st Qu.:18.15   1st Qu.:18.19   1st Qu.:18.237   1st Qu.:18.23  
#>  Median :19.36   Median :19.37   Median :19.380   Median :19.37  
#>  Mean   :19.52   Mean   :19.53   Mean   :19.539   Mean   :19.52  
#>  3rd Qu.:20.83   3rd Qu.:20.88   3rd Qu.:20.895   3rd Qu.:20.84  
#>  Max.   :26.33   Max.   :26.38   Max.   :26.418   Max.   :26.41  
#>  NA's   :92      NA's   :93      NA's   :87       NA's   :88     
#>    125amol_2       125amol_3       250amol_1       250amol_2    
#>  Min.   :10.51   Min.   :11.15   Min.   :10.04   Min.   :10.21  
#>  1st Qu.:18.21   1st Qu.:18.20   1st Qu.:18.15   1st Qu.:18.17  
#>  Median :19.40   Median :19.40   Median :19.40   Median :19.37  
#>  Mean   :19.51   Mean   :19.53   Mean   :19.50   Mean   :19.47  
#>  3rd Qu.:20.82   3rd Qu.:20.83   3rd Qu.:20.78   3rd Qu.:20.78  
#>  Max.   :26.37   Max.   :26.47   Max.   :26.50   Max.   :26.45  
#>  NA's   :99      NA's   :86      NA's   :87      NA's   :77

# there are proteins where the 'OS='-tag won't be visible as Species (orig Fasta-header and Protein-name not accessible) :
which(is.na(dataPD$annot[,"Species"]) & dataPD$annot[,"SpecType"]=="species2")    # is NA as Species
#> P02768 
#>     25

Confirming the presence of UPS1 proteins by ProteomeDiscoverer: In sum 48 UPS1 proteins were found, 0 are missing.

Proline

Proline is free software provided by the Profi-consortium, see Ramus et al 2016 and Bouyssié et al 2020 (Proline: an efficient and user-friendly software suite for large-scale proteomics. Bioinformatics. 2020, PMID: 32096818, DOI: 10.1093/bioinformatics/btaa118).

Underneith identifications in Proline are performed by SearchGUI, see also Vaudel et al 2015. In this case X!Tandem (see also Duncan et al 2005) was used as search engine.

In Proline quantitation data at the level of consensus-proteins can be exported as xlsx or tabulated text files, both formats can be treated by the functions of this package.

## shifted for not printing
path1 <- system.file("extdata", package="wrProteo")
fiNaPl <- "pxd001819_PL.xlsx"

specPrefPL <- c(conta="_conta", mainSpecies="OS=Saccharomyces cerevisiae", spike="_ups")  
dataPL <- readProlineFile(file.path(path1,fiNaPl), specPref=specPrefPL, normalizeMeth="median", refLi="mainSpe")
#>  -> readProlineFile : Found sheets 'Search settings and infos', 'Import and filters', 'Quant config' and 'Protein sets'
#> New names:
#> * `` -> ...2
#>  -> readProlineFile :  normalize using subset of 1137

In addition, we need to correct the quantification column-heads (like ‘Levure2ug+ UPS1-100amol-1’ ) and bring them to a simpler version :

head(colnames(dataPL$raw),8)
#> [1] "Levure2ug+ UPS1-100amol-1" "Levure2ug+ UPS1-100amol-2"
#> [3] "Levure2ug+ UPS1-100amol-3" "Levure2ug+ UPS1-250amol-1"
#> [5] "Levure2ug+ UPS1-250amol-2" "Levure2ug+ UPS1-250amol-3"
#> [7] "Levure2ug+ UPS1-500amol-1" "Levure2ug+ UPS1-500amol-2"
sub1 <- cbind(ini=paste0("-",c("100a","250a","500a","1f","5f","10f","25f","50f","100f"),"mol-"),
  paste0(fin=c("50a","125a","250a","500a","2500a","5000a","12500a","25000a","50000a"),"mol_"))
dataPL <- cleanListCoNames(dataPL, rem=c("abundance_","Levure2ug+ UPS1"), subst=sub1)
## the number of lines and colums
dim(dataPL$quant)
#> [1] 1186   27
## a summary of the quantitation data
summary(dataPL$quant[,1:8])        # the first 8 cols
#>     50amol_1        50amol_2        50amol_3       125amol_1    
#>  Min.   :14.29   Min.   :14.53   Min.   :13.77   Min.   :14.41  
#>  1st Qu.:19.64   1st Qu.:19.66   1st Qu.:19.70   1st Qu.:19.75  
#>  Median :21.64   Median :21.55   Median :21.65   Median :21.65  
#>  Mean   :21.66   Mean   :21.66   Mean   :21.71   Mean   :21.75  
#>  3rd Qu.:23.64   3rd Qu.:23.61   3rd Qu.:23.63   3rd Qu.:23.70  
#>  Max.   :29.52   Max.   :29.42   Max.   :29.44   Max.   :29.50  
#>  NA's   :44      NA's   :40      NA's   :40      NA's   :46     
#>    125amol_2       125amol_3       250amol_1       250amol_2    
#>  Min.   :13.53   Min.   :14.76   Min.   :14.55   Min.   :14.74  
#>  1st Qu.:19.76   1st Qu.:19.76   1st Qu.:19.74   1st Qu.:19.72  
#>  Median :21.64   Median :21.65   Median :21.65   Median :21.61  
#>  Mean   :21.73   Mean   :21.73   Mean   :21.71   Mean   :21.69  
#>  3rd Qu.:23.69   3rd Qu.:23.65   3rd Qu.:23.62   3rd Qu.:23.63  
#>  Max.   :29.49   Max.   :29.43   Max.   :29.42   Max.   :29.40  
#>  NA's   :46      NA's   :48      NA's   :54      NA's   :51

Confirming the presence of UPS1 proteins by Proline : In sum 48 UPS1 proteins were found, 0 are missing.

Uniform Re-Arranging of Data

In order for easy and proper comparisons we need to make sure all columns are in the same order.

# get all results (MaxQuant,ProteomeDiscoverer, ...) in same order
grp9 <- paste0(rep(UPSconc,each=3),"amol") 

## it is more convenient to re-order columns this way in each project
dataPD <- corColumnOrder(dataPD,sampNames=sampNa)          # already in good order
#>  -> corColumnOrder : Order already correct
dataMQ <- corColumnOrder(dataMQ,sampNames=sampNa) 
dataPL <- corColumnOrder(dataPL,sampNames=sampNa) 
#>  -> corColumnOrder : Order already correct

The from the protein annotation the membership to 3 groups was extracted : yeast (matrix) as “‘main Spe’”, UPS1 (spike) as ‘species2’ and other contaminants (‘conta’). The first two terms will be replace by more specific ones (‘Yeast’ and ‘UPS1’) :

## Need to rename $annot[,"SpecType"]  
dataPD <- replSpecType(dataPD, replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")))
#> useLi 1 2 3 4 5 6 
#> useLi 24 25 36 37 45 70
dataMQ <- replSpecType(dataMQ, replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")))
#> useLi 1 12 13 14 15 16 
#> useLi 4 11 21 27 39 47
dataPL <- replSpecType(dataPL, replBy=cbind(old=c("mainSpe","species2"), new=c("Yeast","UPS1")))
#> useLi 1 2 3 4 5 6 
#> useLi 20 22 48 53 54 71

## Need to addres missing ProteinNames (UPS1) due to missing tags in Fasta
dataPD <- replNAProtNames(dataPD) 
#>  ..replacing 1296 entry-names
dataMQ <- replNAProtNames(dataMQ) 
#>  ..replacing 1104 entry-names
dataPL <- replNAProtNames(dataPL) 
## extract names of quantified UPS1-proteins
NamesUpsPD <- dataPD$annot[which(dataPD$annot[,"SpecType"]=="UPS1"),"Accession"]
NamesUpsMQ <- dataMQ$annot[which(dataMQ$annot[,"SpecType"]=="UPS1"),"Accession"]
NamesUpsPL <- dataPL$annot[which(dataPL$annot[,"SpecType"]=="UPS1"),"Accession"]
tabS <- mergeVectors(PD=table(dataPD$annot[,"SpecType"]), MQ=table(dataMQ$annot[,"SpecType"]), PL=table(dataPL$annot[,"SpecType"]))  
#> yy
knitr::kable(tabS, caption="Number of proteins identified, by custom tags and software")
Number of proteins identified, by custom tags and software
UPS1 Yeast conta
PD 48 1239 1
MQ 48 1047 1
PL 48 1137 1
tabT <- mergeVectors(PD=table(dataPD$annot[,"Species"]), MQ=table(dataMQ$annot[,"Species"]), PL=table(dataPL$annot[,"Species"]))  
#> yy
knitr::kable(tabT, caption="Number of proteins identified, by species and software")
Number of proteins identified, by species and software
Gallus gallus Homo sapiens Mus musculus Saccharomyces cerevisiae Saccharomyces cerevisiae (strain ATCC 204508 / S288c) Sus scrofa
PD 1 47 0 1239 0 0
MQ 1 49 1 1047 0 1
PL 0 48 0 0 1137 1

Data Treatment

Normalization

No additional normalization is needed, all data were already median normalized to the host proteins (ie Saccaromyces cerevisiae) after importing the initial quantification-output using ‘readMaxQuantFile()’, ‘readProlineFile()’ and ‘readPDExport()’.

Presence of NA-values

As mentioned in the (general) vignette ‘wrProteoVignette1’ (of this package) it is important to investigate the nature of NA-values. In particular the hypothesis that NA-values originate from very low abundance instances is important for deciding how to treat NA-values furtheron.

## Let's inspect NA values as graphic
matrixNAinspect(dataPD$quant, gr=grp9, tit="ProteomeDiscoverer")  

## Let's inspect NA values as graphic
matrixNAinspect(dataMQ$quant, gr=gl(length(UPSconc),3), tit="MaxQuant") 

## Let's inspect NA values as graphic
matrixNAinspect(dataPL$quant, gr=as.factor(substr(colnames(dataPL$quant),1,1)), tit="Proline") 

NA-Imputation and Statistical Testing for Changes in Abundance

NA-values represent a challange for statistical testing. In addition, techniques like PCA don’t allow NAs neither. In the sections above we investigate and provided evidence that NA-values typically represent proteins with very low protein abundance that finally ended as non-detectable (NA). Thus, we hypothesize that NA-values might by chance as well (in most cases) get reported as (very) low borderline values. The number of NAs varies between samples : Very low concentrations of UPS1 are difficult to get detected and thus contribute largely to the NAs. Since the amout if yeast proteins stays constant they should always get detected the way in all samples.

## Let's look at the number of NAs. Is there an accumulated number in lower UPS1 samples ?
tabSumNA <- rbind(PD=sumNAperGroup(dataPD$raw, grp9), MQ=sumNAperGroup(dataMQ$raw, grp9), PL=sumNAperGroup(dataPL$raw, grp9) )
knitr::kable(tabSumNA, caption="Number of NAs per group of samples", align="r")
Number of NAs per group of samples
50amol 125amol 250amol 500amol 2500amol 5000amol 12500amol 25000amol 50000amol
PD 272 273 257 234 205 207 195 209 220
MQ 318 334 323 337 282 297 302 330 322
PL 124 140 157 140 137 139 131 141 131

The function testRobustToNAimputation() from this package (wrProteo) performs NA-imputation and subsequent statistical testing (after repeated imputation) between all groups of samples the same time (as it would be inefficient to separate these two tasks), see also the general vignette. The tests underneith apply shrinkage from the empirical Bayes procedure from the bioconductor package limma.
In addition, various formats of multiple test correction can be directly added to the results : Benjamini-Hochberg FDR, local false discovery rate (lfdr, using the package fdrtool, see Strimmer 2008 doi: 10.1093/bioinformatics/btn209), or modified testing by ROTS, etc … We will make also use of the testing results later in this vignette.

One of the advantages of this implementation, is that multiple rounds of imputation are run, so that final results (including pair-wise testing) gets stabilized to (rare) stochastic effects without bias due to low variances. For this reason one may also speak of stabilized NA-imputations.

We are ready to launch the NA-imputation and testing for data from ProteomeDiscoverer :

testPD <- testRobustToNAimputation(dataPD, gr=grp9, lfdrInclude=TRUE)     # ProteomeDiscoverer
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  n.woNA= 32920  n.NA = 2072
#>     model 10 %-tile of (min 1 NA/grp) 766 NA-neighbour values
#>     imputation: mean= 14.7   sd= 0.8
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at presenceFilt:   1259 1252 1245 1248 1234 1235 1247 1259 1253 1251 1263 1235 1247 1238 1264 1255 1257 1253 1255 1254 1259 1246 1257 1252 1252 1248 1262 1255 1247 1257 1251 1249 1245 1261 1251 1248   out of  1296 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at abundanceFilt:  1203 1200 1201 1196 1200 1195 1223 1234 1229 1225 1250 1211 1221 1207 1227 1238 1235 1233 1230 1229 1229 1225 1235 1231 1230 1228 1222 1227 1228 1238 1236 1232 1228 1239 1237 1230
#> -> testRobustToNAimputation -> combineMultFilterNAimput :    at NA> mean:   1172, 1170, 1172, 1170, 1174, 1180, 1166, 1175, 1182, 1191, 1171, 1173, 1184, 1184, 1207, 1175, 1180, 1186, 1197, 1208, 1209, 1165, 1175, 1182, 1193, 1204, 1205, 1212, 1160, 1174, 1175, 1188, 1195, 1196, 1204 and 1200

Then for MaxQuant …

testMQ <- testRobustToNAimputation(dataMQ, gr=grp9, lfdrInclude=TRUE)      # MaxQuant
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  n.woNA= 26963  n.NA = 2845
#>     model 10 %-tile of (min 1 NA/grp) 850 NA-neighbour values
#>     imputation: mean= 20.2   sd= 0.83
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at presenceFilt:   1051 1043 1022 1051 1017 1019 1027 1051 1046 1048 1067 1022 1032 1028 1061 1040 1050 1050 1056 1030 1061 1032 1047 1040 1045 1037 1060 1049 1027 1050 1047 1051 1027 1066 1034 1039   out of  1104 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at abundanceFilt:  1010 1006 997 1005 995 991 1007 1025 1017 1013 1056 1002 1006 1000 1033 1031 1037 1033 1039 1012 1030 1026 1038 1026 1033 1023 1030 1031 1020 1040 1037 1037 1014 1044 1023 1025
#> -> testRobustToNAimputation -> combineMultFilterNAimput :    at NA> mean:   966, 958, 965, 954, 955, 965, 943, 952, 956, 958, 943, 944, 947, 945, 976, 942, 948, 951, 952, 969, 975, 934, 938, 942, 946, 974, 968, 978, 926, 937, 937, 940, 958, 962, 971 and 963

And finally for Proline

testPL <- testRobustToNAimputation(dataPL, gr=grp9, lfdrInclude=TRUE)      # Proline
#> -> testRobustToNAimputation -> matrixNAneighbourImpute :  n.woNA= 30782  n.NA = 1240
#>     model 10 %-tile of (min 1 NA/grp) 310 NA-neighbour values
#>     imputation: mean= 16.1   sd= 0.86
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at presenceFilt:   1156 1154 1148 1155 1142 1145 1149 1156 1154 1153 1167 1161 1163 1159 1167 1150 1158 1155 1156 1150 1169 1147 1155 1153 1154 1148 1167 1153 1149 1156 1154 1154 1148 1167 1151 1149   out of  1186 
#> -> testRobustToNAimputation -> combineMultFilterNAimput :     at abundanceFilt:  1127 1127 1118 1132 1119 1117 1131 1132 1129 1126 1151 1134 1128 1126 1129 1136 1141 1133 1134 1127 1136 1134 1138 1131 1131 1126 1130 1131 1137 1141 1136 1134 1132 1139 1137 1131
#> -> testRobustToNAimputation -> combineMultFilterNAimput :    at NA> mean:   1110, 1106, 1108, 1113, 1114, 1109, 1106, 1108, 1102, 1111, 1113, 1110, 1103, 1109, 1119, 1112, 1114, 1108, 1116, 1119, 1122, 1109, 1111, 1104, 1113, 1120, 1118, 1124, 1111, 1112, 1107, 1115, 1120, 1122, 1124 and 1121

From these results we’ll use i) the NA-imputed version of our datasets for plotting principal components (PCA) and ii) the (stabilized) testing results for counting TP, FP, etc.

Let’s add the NA-imputed data to our main object :

dataPD$datImp <- testPD$datImp       # recuperate imputeded data to main data-object
dataMQ$datImp <- testMQ$datImp
dataPL$datImp <- testPL$datImp

Similarity by PCA (UPS1 Proteins Only)

Plotting the principal components typically allow to gain an overview on how samples are related to each other. This experiment is particular for the fact that the majority of proteins is expected to remain constant (yeast matrix), while only the UPS1 proteins vary. Since we are primarily intereseted in the UPS1 proteins, the regular plots of PCA are not shown here, but PCA of the lines identified as UPS1.

Principal component analysis (PCA) cannot handle NA-values. Either all lines with any NAs have to be excluded, or data after NA-imputation have to be used. Here, the option of plotting data after NA-imputation was chosen (in the context of filtering UPS1 lines only one whould loose too many lines, ie proteins). Plots are be made using the plotPCAw function from the package wrGraph.

# limit to UPS1 
plotPCAw(testPD$datImp[which(testPD$annot[,"SpecType"]=="UPS1"),], sampleGrp=grp9, tit="PCA on ProteomeDiscoverer, UPS1 only (NAs imputed)",rowTyName="proteins", useSymb2=0)

plotPCAw(testMQ$datImp[which(testMQ$annot[,"SpecType"]=="UPS1"),], sampleGrp=grp9, tit="PCA on MaxQuant, UPS1 only (NAs imputed)",rowTyName="proteins", useSymb2=0)

plotPCAw(testPL$datImp[which(testPL$annot[,"SpecType"]=="UPS1"),], sampleGrp=grp9, tit="PCA on Proline, UPS1 only (NAs imputed)",rowTyName="proteins", useSymb2=0)

Based on PCA one can see that the comparison with concentrations >= 250 aMol may actually be better to detect differences, as also confirmed by ROC part later.


Analysis Using All Proteins Identified (Matrix + UPS1)

In this section all proteins identified and quantified are compared in a pair-wise fashion based on the t-tests already run before. As mentioned, the experimental setup is interesting since all proteins that are truly changing are known in advance, the UPS-1 spike-in proteins. Counting tables get constructed based on various thresholds for considering protein abundance as differential.
For Volcano-plots a traditional 5 percent FDR cut-off is used, while ROC-curves allow inspecting the entire range of potential cut-offs.

Pairwise Testing Summary

A very universal and simple way to analyze data is by checking on several pairwise comparisons, in particular if the experimental setup does not include complete multifactorial plans.

This UPS1 spike-in experiment has 27 samples organized (according to meta-information) as 9 groups. Thus one obtains in total 36 comparisons which will make comparisons very crowded. The publication by Ramus et al 2016 focussed on 3 pairwise comparisons only. Here we’ll consider all of them.

The graphical comparisons were restricted to three comparisons presented in the original publication plus two additional ones. The distribution of intra-group CV-values showed (without major surprise) that the highest UPS1 concentrations replicated best. In consequence comparisons using this group are expected to have a decent chance to rather specifically reveil a high number of UPS1 proteins.

Now, we’ll construct a table showing all possible pairwise-comparisons. Using the function numPairDeColNames() we can easily extract the UPS1 concentrations as numeric content and show the (log-)ratio of the pairwise comparisons (column ‘log2rat’), the final concentrations (columns ‘conc1’ and ‘conc2’, in amol) and the number of differentially abundant proteins passing 5% FDR (using classical Benjamini-Hochberg FDR (columns ‘sig.xx.BH’) or lfdr Strimmer 2008 (columns ‘sig.xx.lfdr’ ).

## The number of differentially abundant proteins passing 5% FDR (ProteomeDiscoverer and MaxQuant) 
signCount <- cbind( sig.PD.BH=colSums(testPD$BH < 0.05, na.rm=TRUE), sig.PD.lfdr=if("lfdr" %in% names(testPD)) colSums(testPD$lfdr < 0.05, na.rm=TRUE),
  sig.MQ.BH=colSums(testMQ$BH < 0.05, na.rm=TRUE), sig.MQ.lfdr=if("lfdr" %in% names(testMQ)) colSums(testMQ$lfdr < 0.05, na.rm=TRUE),
  sig.PL.BH=colSums(testPL$BH < 0.05, na.rm=TRUE), sig.PL.lfdr=if("lfdr" %in% names(testPL)) colSums(testPL$lfdr < 0.05, na.rm=TRUE)  )

table1 <- numPairDeColNames(testPD$BH, stripTxt="amol", sortByAbsRatio=TRUE)
table1 <- cbind(table1, signCount[table1[,1],])
knitr::kable(table1, caption="All pairwise comparisons and number of significant proteins", align="c")
All pairwise comparisons and number of significant proteins
index log2rat conc1 conc2 sig.PD.BH sig.PD.lfdr sig.MQ.BH sig.MQ.lfdr sig.PL.BH sig.PL.lfdr
50000amol-50amol 34 9.966 50 50000 563 484 370 303 569 492
25000amol-50amol 31 8.966 50 25000 580 490 369 302 544 479
125amol-50000amol 12 8.644 125 50000 352 292 229 165 417 354
12500amol-50amol 29 7.966 50 12500 480 408 284 216 479 416
125amol-25000amol 3 7.644 125 25000 316 262 187 149 314 253
250amol-50000amol 15 7.644 250 50000 378 302 312 254 329 262
12500amol-125amol 1 6.644 125 12500 214 148 86 55 239 185
25000amol-250amol 9 6.644 250 25000 225 152 213 167 200 148
50000amol-500amol 27 6.644 500 50000 364 304 252 186 364 290
5000amol-50amol 35 6.644 50 5000 587 523 355 314 527 457
12500amol-250amol 7 5.644 250 12500 125 84 62 49 134 99
25000amol-500amol 24 5.644 500 25000 242 175 141 124 197 142
2500amol-50amol 32 5.644 50 2500 539 460 288 230 491 452
125amol-5000amol 17 5.322 125 5000 274 188 122 88 204 149
12500amol-500amol 22 4.644 500 12500 153 114 61 32 131 86
125amol-2500amol 5 4.322 125 2500 190 158 75 49 186 142
2500amol-50000amol 14 4.322 2500 50000 290 225 166 152 260 183
250amol-5000amol 20 4.322 250 5000 197 135 142 114 145 121
25000amol-2500amol 6 3.322 2500 25000 45 31 15 11 68 49
2500amol-250amol 10 3.322 250 2500 112 83 24 22 98 73
50000amol-5000amol 21 3.322 5000 50000 324 279 157 112 324 280
5000amol-500amol 28 3.322 500 5000 169 138 78 43 132 88
500amol-50amol 36 3.322 50 500 412 336 218 167 380 337
12500amol-2500amol 4 2.322 2500 12500 18 14 4 2 40 28
25000amol-5000amol 18 2.322 5000 25000 53 35 20 15 70 50
2500amol-500amol 25 2.322 500 2500 112 80 34 21 100 63
250amol-50amol 33 2.322 50 250 349 262 179 124 316 252
12500amol-50000amol 11 2.000 12500 50000 238 170 111 110 166 114
125amol-500amol 23 2.000 125 500 17 15 6 2 37 36
12500amol-5000amol 16 1.322 5000 12500 24 17 2 1 30 20
125amol-50amol 30 1.322 50 125 277 217 126 93 224 178
12500amol-25000amol 2 1.000 12500 25000 11 8 0 0 29 15
125amol-250amol 8 1.000 125 250 15 12 0 0 2 1
25000amol-50000amol 13 1.000 25000 50000 135 85 58 49 95 64
2500amol-5000amol 19 1.000 2500 5000 6 3 2 1 14 10
250amol-500amol 26 1.000 250 500 2 1 2 1 3 2

You can see that in numerous cases much more than the 48 UPS1 proteins showed up significant, ie yeast proteins supposed to remain constant also showed up in part as ‘sigificantly changing’.

par(mar=c(6.2, 4.7, 4, 1))   
imageW(table1[,c("sig.PD.BH","sig.MQ.BH","sig.PL.BH" )], tit="Number of BH.FDR signif proteins by the quantification approaches")
mtext("red for high number signif proteins", cex=0.7)

In the original Ramus et al 2016 et al paper only 3 pairwise comparisons were further analyzed :

## Selection in Ramus paper 
knitr::kable(table1[which(rownames(table1) %in% colnames(testPD$BH)[c(2,21,27)]),], caption="Selected pairwise comparisons (as in Ramus et al)", align="c")
Selected pairwise comparisons (as in Ramus et al)
index log2rat conc1 conc2 sig.PD.BH sig.PD.lfdr sig.MQ.BH sig.MQ.lfdr sig.PL.BH sig.PL.lfdr
50000amol-500amol 27 6.644 500 50000 364 304 252 186 364 290
50000amol-5000amol 21 3.322 5000 50000 324 279 157 112 324 280
12500amol-25000amol 2 1.000 12500 25000 11 8 0 0 29 15

Volcano Plots

Volcano-plots offer more insight in how statistical test results vary in respect to p-values of pair-wise copparisons. In addition we can mark the different protein-groups (or species), see also vignette to the package wrGraph.

The PCA plots already told us graphically how strong the differences appear in the various (pairwise) comparisons. Counting the number of proteins passing a classical threshold for differential expression is a good way to start.

The dataset from Ramus et al 2016 contains 9 different levels of UPS1 concentrations, in consequence 36 pair-wise comparisons are possible. Plotting all these pair-wise comparisons would make way too crowded plots.

ROC for Multiple Pairs

Receiver Operator Curves (ROC) curves display sensitivity (True Positive Rate) versus 1-Specificity (False Positive Rate). They are typically used as illustrate and compare the discriminiative capacity of a yes/no decision system (here: differential bundance or not), see eg also ROC on Wikipedia or the original publication Hand and Till 2001.
In this case ROC curves are used to judge how well heterologous human UPS1 proteins can be recognized as differential abundant while constant yeast matrix proteins should not get classified as differential. Finally, ROC curves let us also gain some additional insights if the commonly used 5-percent FDR threshld cutoff allows getting the best out of the testing system.

The Ramus et al 2016-dataset contains 9 different levels of UPS1 concentrations, in consequence 36 pair-wise comparisons are possible. Plotting all these pair-wise comparisons would make way too crowded plots.

Thus, the graphical comparisons were restricted to three comparisons presented in the original publication by Ramus et al 2016 plus two additional ones. The distribution of intra-group CV-values showed (without major surprise) that the highest UPS1 concentrations replicated best. In consequence comparisons using this group are expected to have a decent chance to rather specifically reveil a high number of UPS1 proteins.

Initially a ROC-curve cat get calculated for each pair-wise comparison where it is known which proteins should be found differential (ie human UPS1 proteins).

However, since we’re treating a larger data-set this can be done in batch. Now we are ready to extract all counts of each UPS1 for constructing ROC-curves.

layout(1)
rocPD <- lapply(table1[,1], function(x) summarizeForROC(testPD, annotCol="SpecType", spec=c("Yeast","UPS1"), columnTest=x, tyThr="BH", plotROC=FALSE))
rocMQ <- lapply(table1[,1], function(x) summarizeForROC(testMQ, annotCol="SpecType", spec=c("Yeast","UPS1"), columnTest=x, tyThr="BH", plotROC=FALSE))
rocPL <- lapply(table1[,1], function(x) summarizeForROC(testPL, annotCol="SpecType", spec=c("Yeast","UPS1"), columnTest=x, tyThr="BH", plotROC=FALSE))

names(rocPD) <- colnames(testPD$BH)
names(rocMQ) <- colnames(testMQ$BH)
names(rocPL) <- colnames(testPL$BH)

## calulate  AUC for each ROC 
AucAll <- cbind(ind=table1[match(names(rocPD),rownames(table1)),"index"], comb=NA, clu=NA, 
  PD=sapply(rocPD,AucROC), MQ=sapply(rocMQ,AucROC), PL=sapply(rocPL,AucROC) )

Grouping of ROC Curves to Display Representative Ones

Now, we can try to group the pairwise comparison AUC values into groups and easily display representative examples for each group. Again, we (pre)define that we want to obtain 5 groups (like ratings from 1 -5 starts), a k-Means clustering approach was chosen.

## number of groups for clustering
nGr <- 5
## K-Means clustering
kMAx <- stats::kmeans(standEntMatr(AucAll[,c("PD","MQ","PL")]), nGr)$cluster  
   table(kMAx)
#> kMAx
#>  1  2  3  4  5 
#>  5 18  2  5  6
AucAll[,"clu"] <- kMAx
AucAll <- reorgByCluNo(AucAll,cluNo=kMAx,useMeth=c("PD","MQ","PL"))
AucAll <- cbind(AucAll, iniInd=table1[match(rownames(AucAll), rownames(table1)), "index"])
colnames(AucAll)[1:(which(colnames(AucAll)=="index")-1)] <- paste("Auc",colnames(AucAll)[1:(which(colnames(AucAll)=="index")-1)], sep=".")

kMAx <- AucAll[,"cluNo"]   # update
  table(AucAll[,"cluNo"])
#> 
#>  1  2  3  4  5 
#> 18  5  5  2  6
 ## note : column 'index' is relative to table1, iniInd to ordering inside objects from clustering 

To provide a quick overview, the clustered AUC values are plotted :

layout(1)
plot(AucAll[,"geoMean"], type="l", col=grey(0.7), lty=2, las=1, ylab="AUC",
  main="Pairwise Comparisons as Clustered AUC from ROC Curves", xlab="Comparison number")
abline(v=cumsum(table(AucAll[,"cluNo"])[-length(unique(AucAll[,"cluNo"]))])+0.5 ,lty=4,col=grey(0.8))      # clu borders
points(1:nrow(AucAll), AucAll[,"Auc.PD"], pch=AucAll[,"cluNo"], col=1)
points(1:nrow(AucAll), AucAll[,"Auc.MQ"], pch=AucAll[,"cluNo"], col=2)
points(1:nrow(AucAll), AucAll[,"Auc.PL"], pch=AucAll[,"cluNo"], col=3)
legend("bottomleft",c("PD","MQ","PL","geomMean"), text.col=c(1:3,1),pch=c(8,8,8,NA),lty=c(NA,NA,NA,2),lwd=c(NA,NA,NA,1.5),col=c(1:3,1),cex=0.85,xjust=0.5,yjust=0.5)

Again, now we can select a representative pairwise-compariso for each cluster :

AucRep <- table(AucAll[,"cluNo"])[rank(unique(AucAll[,"cluNo"]))]   # representative for each cluster
AucRep <- round(cumsum(AucRep) -AucRep/2 +0.1) 

## select representative for each cluster
knitr::kable(round(AucAll[AucRep,c("Auc.PD","Auc.MQ","Auc.PL","cluNo")],3), caption="Selected representative for each cluster ", align="c")
Selected representative for each cluster
Auc.PD Auc.MQ Auc.PL cluNo
12500amol-125amol 0.967 0.977 0.989 1
2500amol-50amol 0.880 0.831 0.926 2
250amol-500amol 0.896 0.774 0.957 3
12500amol-5000amol 0.700 0.570 0.933 4
250amol-50amol 0.521 0.489 0.500 5

To provide a quick overview, the clustered AUC values are displayed as PCA :

## appendix : what characterizes a good or bad auc ?
biplot(prcomp(AucAll[,1:3]), cex=0.7, main="PCA of AUC from ROC Curves")   

On this PCA one can see a rather distinct group of low concentration-pairs (mostly containg a 50-250), low/med conc pairs (containing 2500, ev 5000) & rest (med+high to any)

Plotting ROC Curves for the Best Cluster (the ‘+++++’)

gr <- 1 
colPanel <- 2:5
layout(1)

j <- match(rownames(AucAll)[AucRep[gr]],names(rocPD)) 

## table of all of best cluster
useLi <- which(AucAll[,"cluNo"]==gr)
knitr::kable(cbind(round(AucAll[useLi,c("cluNo","Auc.PD","Auc.MQ","Auc.PL")],3), 
  table1[match(names(which(AucAll[,"cluNo"]==gr)),rownames(table1)),c(2,5,7,9)]), caption="AUC details for best pairwise-comparisons ", align="c")  
AUC details for best pairwise-comparisons
cluNo Auc.PD Auc.MQ Auc.PL log2rat sig.PD.BH sig.MQ.BH sig.PL.BH
125amol-250amol 1 0.973 0.997 0.996 1.000 15 0 2
25000amol-250amol 1 0.976 0.996 0.993 6.644 225 213 200
25000amol-2500amol 1 0.978 0.993 0.994 3.322 45 15 68
12500amol-50000amol 1 0.975 0.991 0.997 2.000 238 111 166
125amol-50000amol 1 0.970 0.995 0.993 8.644 352 229 417
125amol-25000amol 1 0.961 0.997 0.993 7.644 316 187 314
250amol-50000amol 1 0.969 0.984 0.992 7.644 378 312 329
125amol-5000amol 1 0.953 0.993 0.990 5.322 274 122 204
12500amol-125amol 1 0.967 0.977 0.989 6.644 214 86 239
12500amol-25000amol 1 0.973 0.970 0.986 1.000 11 0 29
2500amol-5000amol 1 0.934 0.999 0.991 1.000 6 2 14
25000amol-500amol 1 0.939 0.997 0.979 5.644 242 141 197
50000amol-5000amol 1 0.954 0.978 0.982 3.322 324 157 324
2500amol-500amol 1 0.943 0.993 0.976 2.322 112 34 100
125amol-2500amol 1 0.945 0.952 0.992 4.322 190 75 186
12500amol-2500amol 1 0.923 0.974 0.985 2.322 18 4 40
5000amol-500amol 1 0.929 0.941 0.973 3.322 169 78 132
125amol-50amol 1 0.911 0.932 0.930 1.322 277 126 224
## frequent concentrations :
layout(matrix(1:2), heights=c(1,2.5)) 
tbl <- table(table1[match(names(which(AucAll[,"cluNo"]==gr)), rownames(table1)),c(3:4)])  # with(mydata, table(Species, Depth))
barplot(tbl, las=1, beside = TRUE, main=paste("Frequency of UPS-1 Concentrations Appearing in Cluster",gr))
    
## representative ROC    
plotROC(rocPD[[j]],rocMQ[[j]],rocPL[[j]], col=colPanel, methNames=methNa, pointSi=0.8, xlim=c(0,0.45),
  txtLoc=c(0.12,0.1,0.03), tit=paste("Cluster",gr," Example: ",rownames(AucAll)[AucRep[gr]]), legCex=1)