# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # The aromaRMA function # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - require(aroma.affymetrix); require(R.utils); require(affy) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # The AGRONOMICS1_RMA function # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - AGRONOMICS1_RMA <- function(experimentName, Filter = FALSE, force = TRUE, Save = FALSE, TAIR = 9) { cntrlCdf <- "agronomics1_cntrl" dataChipType <- "agronomics1" if (TAIR==9) { analysisChipType <- "agronomics1_TAIR9_gene" } else { analysisChipType <- "agronomics1_allprobes" } resultsPath <- paste("reports", experimentName, dataChipType, sep="/"); mkdirs(resultsPath); ## remove bad probes if(Filter) { ### identify the non working probes try( clearCache(paste(getCachePath(), "aroma.affymetrix", analysisChipType, sep="/"), prompt=FALSE),silent=TRUE) cdf <- AffymetrixCdfFile$byChipType(analysisChipType); badProbeList = getBadProbeList(experimentName, dataChipType, cdf, verbose=verbose) assign("badProbeList", badProbeList, globalenv()) ### change the CDF environment try( clearCache(paste(getCachePath(), "aroma.affymetrix", analysisChipType, sep="/"), prompt=FALSE),silent=TRUE) cdfFiltered <- AffymetrixCdfFile$byChipType(analysisChipType); setName(cdfFiltered, paste(getName(cdfFiltered), "Filtered", sep="_")) cdfFiltered$setRestructor(removeBadProbesFromUnitList) #How many probes were flagged? nFlaggedProbes <- length(getCellIndices(cdf, useNames=FALSE, unlist=TRUE)) - length(getCellIndices(cdfFiltered, useNames=FALSE, unlist=TRUE)) cdf <- cdfFiltered force <- TRUE if (Save) writeGoodProbesCdf(newCdfName=paste(analysisChipType, experimentName,sep="_"), oldCdfName=analysisChipType, badProbeList, stratifyBy="nothing") } else { try( clearCache(paste(getCachePath(), "aroma.affymetrix", analysisChipType, sep="/"), prompt=FALSE),silent=TRUE) nFlaggedProbes <- NULL cdf <- AffymetrixCdfFile$byChipType(analysisChipType); } celSet <- AffymetrixCelSet$byName(experimentName, checkChipType=FALSE, chipType = dataChipType, cdf=cdf); print(celSet) # ---------------------------------- # RMA estimates by aroma.affymetrix # ---------------------------------- verbose && enter(verbose, "RMA by aroma.affymetrix", suffix=paste("...", getBuiltinTime.GString())); # RMA background correction bc <- RmaBackgroundCorrection(celSet); if (force & !is.null(getOutputFiles(bc))){ file.remove(getOutputFiles(bc)) } csBC <- process(bc, verbose=verbose, force=force, overwrite=force); #return(csBC$getUnitIntensities(1, stratifyBy="pm")[[1]][[1]][[1]][,1]) # RMA quantile normalization qn <- QuantileNormalization(csBC, typesToUpdate="pm"); if (force) file.remove(getTargetDistributionPathname(qn)) clearCache(qn) if (force & !is.null(getOutputFiles(qn))){ file.remove(getOutputFiles(qn)) } csN <- process(qn, verbose=verbose, force=force); plm <- RmaPlm(csN); fit(plm, unit=NULL, verbose=verbose, force=force); df = extractDataFrame(getChipEffectSet(plm), addUgcMap=FALSE, addNames=TRUE) rownames(df) = df$unitName df$unitName = NULL df$groupName = NULL theta = log2(df) if (!is.null(cntrlCdf)){ cntrlCdf <- AffymetrixCdfFile$byChipType(cntrlCdf); csN$setCdf(cntrlCdf) cntrlPlm <- RmaPlm(csN); fit(cntrlPlm, verbose=verbose); df = extractDataFrame(getChipEffectSet(cntrlPlm), addUgcMap=FALSE, addNames=TRUE) rownames(df) = df$unitName df$unitName = NULL df$groupName = NULL cntrlTheta = log2(df) print(paste(getBuiltinTime.GString(), "RMA control probe summarization completed")); cntrlCutoffs <- apply(cntrlTheta, 2, quantile, probs=0.95) Calls = theta >= rep(cntrlCutoffs, each=nrow(theta)) } else { Calls = NULL cntrlTheta = NULL } save(theta, Calls, cntrlTheta, nFlaggedProbes, file = paste(resultsPath, paste(getName(cdf), "RMA.RData", sep="_"), sep="\\")); print(paste("Result file: ", paste(resultsPath, paste(getName(cdf), "RMA.Rdata", sep="_"), sep="\\"))) verbose && exit(verbose, suffix=paste("...done", getBuiltinTime.GString())); return(invisible(list(theta=theta, Calls=Calls, cntrlTheta=cntrlTheta, nFlaggedProbes=nFlaggedProbes))) } getBadProbeList = function(experimentName, dataChipType, cdf, force=TRUE, verbose=verbose, sigThresh=5, minPresentValues=1, minProbeCount=3){ celSet <- AffymetrixCelSet$byName(experimentName, checkChipType=FALSE, chipType = dataChipType, cdf=cdf); message("indices: ", readUnits(getCdf(celSet), units=NULL)[[1]]$groups[[1]]$indices) message("x: ", readUnits(getCdf(celSet), units=NULL)[[1]]$groups[[1]]$x) #uiN = celSet$getUnitIntensities(stratifyBy="pm") bc <- RmaBackgroundCorrection(celSet); csBC <- process(bc, verbose=verbose, force=force, overwrite=force); # RMA quantile normalization qn <- QuantileNormalization(csBC, typesToUpdate="pm"); file.remove(getTargetDistributionPathname(qn)) clearCache(qn) csN <- process(qn, verbose=verbose, force=force); message("csN indices: ", readUnits(getCdf(csN), units=NULL)[[1]]$groups[[1]]$indices) message("csN x: ", readUnits(getCdf(csN), units=NULL)[[1]]$groups[[1]]$x) uiN = csN$getUnitIntensities(stratifyBy="pm", force=force, verbose=TRUE, units=readUnits(getCdf(celSet), units = NULL, readIndices=TRUE)) uiN = uiN[ -grep("^AFFX", names(uiN))] badProbeList = lapply(uiN, flagBadProbes, sigThresh=sigThresh, minPresentValues=minPresentValues, minProbeCount=minProbeCount) return(badProbeList) } ## computes a flag indicating that a probe does not work flagBadProbes = function(unit, sigThresh=NA, minPresentValues=NA, minProbeCount=NA){ x = log2(unit[[1]]$intensities) if (nrow(x) <= minProbeCount){ return(rep(FALSE, nrow(x))) } isGood = rowSums(x > sigThresh) >= minPresentValues flagsToUndo = minProbeCount - sum(isGood) if (flagsToUndo > 0){ ## keep those with the highest mean values meanValues = rowMeans(x) meanValues[isGood] = NA ## idx = order(meanValues, decreasing=TRUE)[1:flagsToUndo] isGood[idx] = TRUE } return(!isGood) } removeBadProbesFromUnitList = function(inList){ message("removeBadProbesFromUnitList start") if (exists("badProbeList", where=globalenv())){ badProbeList = get("badProbeList", pos=globalenv()) } else { stop("no bad probe information found in environment") } #message(length(badProbeList)) #message(length(inList)) outList = inList doFilter = names(inList) %in% names(badProbeList) if (any(doFilter)){ idx = match(names(inList)[doFilter], names(badProbeList)) outList[doFilter] = mapply(removeBadProbesFromUnit, inList[doFilter], badProbeList[idx]) #message(str(inList[doFilter][1])) #message(str(outList[doFilter][1])) } return(outList) } removeBadProbesFromUnit = function(unit, isBadProbe){ if (is.null(unit$groups)){ probe = names(unit) if (length(unit[[probe]]) == 2 * length(isBadProbe)){ isBadProbe = as.vector(rbind(isBadProbe, TRUE)) } if (length(unit[[probe]]) != length(isBadProbe)){ message(str(unit)) stop("Bad lengths: ", length(unit[[probe]]), " != ", length(isBadProbe)) } unit[[probe]] = unit[[probe]][!isBadProbe] } else { probe = names(unit$groups); if (length(unit[["groups"]][[probe]][[1]]) == 2 * length(isBadProbe)){ isBadProbe = as.vector(rbind(isBadProbe, TRUE)) } if (length(unit[["groups"]][[probe]][[1]]) != length(isBadProbe)){ message(str(unit)) stop("Bad lengths: ", length(unit[["groups"]][[probe]][[1]]), " != ", length(isBadProbe)) } for (nm in names(unit[["groups"]][[probe]])){ unit[["groups"]][[probe]][[nm]] = unit[["groups"]][[probe]][[nm]][!isBadProbe]; } } return(list(unit)) } writeGoodProbesCdf = function(newCdfName, oldCdfName, badProbeList, stratifyBy="nothing"){ require(affxparser) cdfFile = paste("annotationData/chipTypes/", oldCdfName, "/", oldCdfName, ".CDF", sep="") newCdfDir = paste("annotationData/chipTypes/", newCdfName, sep="") mkdirs(newCdfDir) newCdfFile = paste(newCdfDir , "/", newCdfName, ".CDF", sep="") newMonocellCdfFile = paste(newCdfDir , "/", newCdfName, ",monocell.CDF", sep="") if (file.exists(newMonocellCdfFile)) file.remove(newMonocellCdfFile) cdf = readCdf(cdfFile, stratifyBy=stratifyBy) cdfMod = cdf for (i in 1:length(cdfMod)){ if (!(names(cdfMod)[i] %in% names(badProbeList))) next; isGood = !badProbeList[[ names(cdfMod)[i]]] ncells = sum(isGood) natoms = ncells cdfMod[[i]]$ncells = ncells cdfMod[[i]]$natoms = natoms cdfMod[[i]]$groups[[1]]$indexpos = 0:(natoms-1) cdfMod[[i]]$groups[[1]]$atom = cdfMod[[i]]$groups[[1]]$indexpos cdfMod[[i]]$groups[[1]]$pbase = cdfMod[[i]]$groups[[1]]$pbase[isGood] cdfMod[[i]]$groups[[1]]$tbase = cdfMod[[i]]$groups[[1]]$tbase[isGood] cdfMod[[i]]$groups[[1]]$x = cdfMod[[i]]$groups[[1]]$x[isGood] cdfMod[[i]]$groups[[1]]$y = cdfMod[[i]]$groups[[1]]$y[isGood] cdfMod[[i]]$groups[[1]]$natoms = natoms } cdfHeader = readCdfHeader(cdfFile) cdfqc = readCdfQc(cdfFile) writeCdf(newCdfFile, cdfHeader, cdfMod, cdfqc, overwrite=TRUE) } # Calculate RMA signals for any array or cdf # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # The aromaRMA function # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - aromaRMA <- function(experimentName, dataChipType, analysisChipType, force=FALSE) { print(paste(getBuiltinTime.GString(), "aromaRMA started")); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Setup data set # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if(force) try( clearCache(paste(getCachePath(), "aroma.affymetrix", analysisChipType, sep="/"), prompt=FALSE),silent=TRUE) cs <- AffymetrixCelSet$byName(experimentName, checkChipType=FALSE, chipType = dataChipType); print(cs); cdf <- AffymetrixCdfFile$byChipType(analysisChipType); cs$setCdf(cdf) ResultsPath <- paste("reports", experimentName, sep="\\"); ResultsPath <- paste(ResultsPath, dataChipType, sep="\\"); mkdirs(ResultsPath); # ---------------------------------- # RMA estimates by aroma.affymetrix # ---------------------------------- verbose && enter(verbose, "RMA by aroma.affymetrix"); # RMA background correction print(paste(getBuiltinTime.GString(), "RMA background correction started")); bc <- RmaBackgroundCorrection(cs); csBC <- process(bc, verbose=verbose); print(paste(getBuiltinTime.GString(), "RMA background correction completed")); # RMA quantile normalization print(paste(getBuiltinTime.GString(), "RMA quantile normalization started")); qn <- QuantileNormalization(csBC, typesToUpdate="pm"); csN <- process(qn, verbose=verbose); print(paste(getBuiltinTime.GString(), "RMA quantile normalization completed")); # RMA probe summarization print(paste(getBuiltinTime.GString(), "RMA probe summarization started")); plm <- RmaPlm(csN); fit(plm, verbose=verbose); print(paste(getBuiltinTime.GString(), "RMA probe summarization completed")); # ---------------------------------- # Extract chip effects on the log2 scale # ---------------------------------- print(paste(getBuiltinTime.GString(), "Extraction and saving of chip effects started")) ces <- getChipEffectSet(plm); theta <- extractMatrix(ces, returnUgcMap=TRUE); theta <- log2(theta); ugcMap <- attr(theta, "unitGroupCellMap"); rownames(theta) <- getUnitNames(getCdf(ces), ugcMap[,"unit"]); print(paste(getBuiltinTime.GString(), "Extraction and saving of chip effects completed")); save(theta, file = paste(ResultsPath,paste(analysisChipType,"RMA.Rdata", sep="_"),sep="\\")); print(paste(getBuiltinTime.GString(), "aromaRMA completed")); verbose && exit(verbose); return(TRUE); }