From 032efec6125b9318239e1a543830bb8c2401e7f7 Mon Sep 17 00:00:00 2001 From: Nathanael I Lichti Date: Wed, 28 Jan 2026 10:43:41 -0500 Subject: [PATCH] Initial R script upload --- Beckett_et_al_2026_fsr_functions.R | 3415 ++++++++++++++++++++++++++++ Beckett_et_al_2026_fsr_run.R | 41 + 2 files changed, 3456 insertions(+) create mode 100644 Beckett_et_al_2026_fsr_functions.R create mode 100644 Beckett_et_al_2026_fsr_run.R diff --git a/Beckett_et_al_2026_fsr_functions.R b/Beckett_et_al_2026_fsr_functions.R new file mode 100644 index 0000000..17955aa --- /dev/null +++ b/Beckett_et_al_2026_fsr_functions.R @@ -0,0 +1,3415 @@ +## Top-level function: maxQuantFsr --------------------------------------------- + +#' Proteome-wide estimation of fractional synthesis rates (FSR) +#' using deuterium labeling and MaxQuant output files +#' +#' @description +#' `MaxQuantFsr` provides a user front-end to perform inference on protein +#' fractional synthesis rates, including estimation, confidence interval +#' calculation, and hypothesis testing for specified contrasts. The function +#' is designed for an untargeted proteomics workflow based on initial +#' processing by MaxQuant, but it will work with any appropriately formatted +#' input files. +#' +#' @details +#' Data processing reads the allPeptides.txt and msms.txt files from a folder +#' containing maxQuant outputs, and joins them by the raw file name and MSMS +#' scan number. Sample data, including body water enrichment (BWE) and optional +#' experimental treatment variables should be provided in an additional csv, +#' txt, or xlsx file. +#' +#' After reading the data in, optional quality control filters are applied, +#' potentially limiting estimation to peptides that appear in a given proportion +#' of samples, peptides whose isotope patterns are reasonably correlated +#' between scans within samples, and proteins that are uniquely associated +#' with a given number of peptides. Additionally, peptides represented in only +#' one scan (thus preventing correlation assessment) and peptides associated +#' with maxQuant protein groups (as opposed to unique proteins), may also be +#' excluded. +#' +#' Estimation uses a mixture model. Briefly, the theoretical baseline ($f0$) +#' and asymptotic ($fa$) mass isotopomer distributions for each pepetide are +#' calculated in each sample using given stable isotope abundances, numbers of +#' exchangable hydrogen atoms, and per-sample enriched deuterium levels. +#' Protein fractional synthetic rates (FSRs) are assumed to follow a +#' single-compartment exponential model (a the standard assumption in MIDA +#' analyses), and peptides are assumed to represent noisy observations from +#' their parent proteins. For a given protein $j$, the rate parameter $k_j$ is +#' estimated through a log-link linear model, and the proportion of the peptide +#' pool for sample $i$ that originated after labeling is defined as +#' $w_{ij} = 1 - exp(-k_j * t_i)$, where $t_i$ is the observation time for +#' sample $i$. This implies that the observed mass isotopomer distribution is +#' given by $f_{ijt} = w_{ij} * fa_{ij} + (1 - w_{ij}) * f0_{ij}$. +#' +#' The model is fit by nonlinear least squares using the Levenberg–Marquardt +#' algorithm with standard errors calculated by the delta method. By default, +#' p-values are calculated for all pairwise comparisons among groups and then +#' adjusted over all proteins using the FDR adjustment (WARNING: this has not +#' been tested with more than two groups). +#' +#' @param mq_folder character value identifying the path for a folder where +#' the maxquant results files are located. If NULL, the user will be prompted +#' to select a folder. +#' @param sample_data a file path or a data frame containing sample times and +#' BWE estimates for each sample (i.e., each MaxQuant Raw file). If NULL, the +#' sample data are assumed to be in `file.path(mq_folder, "sampleData.csv")`. +#' @param formula an optional right-hand model formula relating fractional +#' synthesis rate to experimental variables in `sample_data`. Note that this +#' formula defines the study design, \strong{not} the functional form of the +#' FSR model to be fit. +#' @param model the FSR model to be fit (`"e1c"` = exponential one-compartment, +#' currently the only option). +#' @param contrasts a list providing a contrast specification (defaults to all +#' pairwise comparisons among unique experimental conditions defined by +#' `formula`). +#' @param time_unit the unit on which time is recorded. +#' @param protein_aliases an optional file or data frame containing two columns, +#' `Alias` and `Name`, which identify proteins that should be combined or +#' renamed. The value in the`Name` column will be retained. +#' @param msms_count a numeric value giving the minimum proportion of samples +#' with an isotope pattern for a given sequence (for quality control) +#' @param scan_corr a numeric value giving the minimum within-sample correlation +#' of isotope patterns for a given sequence when > 1 pattern is present (for +#' quality control) +#' @param corr_method passed to `.isotopePatternCorrelation` and then to +#' `stats::cor` for quality control checks. +#' @param use_single_scans logical; should peptides be used for estimation if +#' they only appear in one scan, so that correlations cannot be calculated? +#' @param minSeq minimum number of uniquely associated peptides required to do +#' FSR calculation on a protein +#' @param unique_only logical; should estimation be restricted to uniquely +#' peptides that are uniquely associated with a single protein? +#' @param mMax maximum number of isotopic peaks to consider +#' @param p_adjustment method of p-value adjustment; see "?p.adjust" +#' @param plot logical; should distribution plots be drawn? +#' @param save logical; should results be saved to disk +#' @param ncore number of of processing threads to use +#' @param seed an optional random number seed. By default, set to +#' `as.integer(Sys.time())` +#' @param .source the path to source file for the fsr analysis functions +#' (i.e., this file). This is used to read the files into child processes +#' during parallel processing if `ncore > 1`. Defaults to the name of the +#' published file in current working directory. +#' @param ... optional additional arguments passed to internal functions +#' +#' @result +#' A list of data frames containing rate estimates for each protein in each +#' group, tests comparing the rates in groups, and quality control information. +#' These can be automatically saved to an Excel workbook by setting +#' `save = "xlsx"`. +#' +#' @export +maxQuantFsr <<- function( + mq_folder = NULL, + sample_data = NULL, + formula = ~ 1, + model = "e1c", + contrasts = list(), + time_unit = "hour", + protein_aliases = NULL, + msms_count = 0.5, + scan_corr = 0.8, + corr_method = "spearman", + use_single_scans = FALSE, + minSeq = 1, + unique_only = TRUE, + mMax = 10, + p_adjustment = "fdr", + plot = FALSE, + save = FALSE, + ncore = 1, + seed = NULL, + .source = "./Beckett_et_al_2026_fsr_functions.R", + ... +){ + # Using the dplyr package + if(!require(dplyr, quietly = TRUE)){ + message("Installing required package: dplyr") + install.packages("dplyr") + } + library(dplyr) + + if(is.null(seed)) seed <- as.integer(Sys.time()) + + # Set up I/O files + timestamp <- Sys.Date() + if(is.null(mq_folder)){ + mq_folder <- choose.dir(caption = "Please select a MaxQuant results folder") + } + results_folder <- file.path(mq_folder, "fsr_estimation") + if(!file.exists(results_folder)){ + dir.create(results_folder) + } + # qcfile <- file.path( # now included with rest of output + # results_folder, + # paste0(paste("fsr_qc_report", timestamp, sep = "_"), ".txt") + # ) + if(!any(isFALSE(save), save == c("csv","txt","xlsx"))){ + stop("save must be FALSE or one of: \"csv\", \"txt\", or \"xlsx\"") + } + resfile <- file.path( + results_folder, + paste0(paste("fsr_results", timestamp, sep = "_"), ".", save) + ) + plotfile <- file.path( + results_folder, + paste0(paste("plots", timestamp, sep = "_")) + ) + if(is.null(sample_data)){ + sample_data <- file.path(mq_folder,"sampleData.csv") + } + + # Read data + # allPeptides.txt has isotope patterns, and msms.txt has sequences and + # protein associations. They are linked by the raw file (i.e., sample) + # name and MSMS scan number. + message(paste0(round(Sys.time()), ": Starting analysis")) + message(paste0( + round(Sys.time()), ": Reading data files", + if(file.size(file.path(mq_folder, "allPeptides.txt")) > 1E8){ + " (this may take a moment)" + } + )) + apep <- read.delim(file.path(mq_folder, "allPeptides.txt")) + apep <- select(apep, Raw.file, MSMS.Scan.Numbers, Isotope.pattern) %>% + rowwise() %>% + reframe( + Raw.file = Raw.file, + Scan.number = strsplit(MSMS.Scan.Numbers, ";")[[1]], + Isotope.pattern = Isotope.pattern + ) + msms <- read.delim(file.path(mq_folder, "msms.txt")) + msms <- select(msms, Raw.file, Scan.number, Sequence, Proteins, PEP) %>% + mutate(Scan.number = as.character(Scan.number)) + smpl <- readDataFile(sample_data) + for(v in all.vars(formula)){ + if(is.character(smpl[[v]])) smpl[[v]] <- as.factor(smpl[[v]]) + } + + # Join tables and clean up detritus + pep <- left_join(msms, apep, by = c("Raw.file", "Scan.number")) %>% + filter(!is.na(Isotope.pattern)) # remove IP NAs? + rm(apep, msms); gc() + + # Number of biological replicates and MSMS count cutoff + sample_sizes <- groupedSampleSize(smpl, formula) + # nsample <- sample_sizes[1] #length(unique(pep$Raw.file)) + msms_count <- if(length(sample_sizes) == 1){ + ceiling(msms_count * sample_sizes[1]) + } else { + ceiling(msms_count * max(sample_sizes[-1])) + } + + # Resolve protein annotations + protein_aliases <- readDataFile(protein_aliases) + pep <- dealiasProteins(pep, protein_aliases) + + # Run quality control checks and filter data + mq_proteins <- unique(pep$Proteins) + pep <- .checkMsmsDataQuality(pep, corr_method, scan_corr, + use_single_scans, msms_count, + smpl, formula, sample_sizes[1]) + qcreport <- attr(pep, "qc") + attr(pep, "qc") <- NULL + + # Select best isotope pattern (from scan with lowest PEP score / sequence) + # and join the sample data to the MSMS data + pep <- suppressMessages( + pep %>% group_by(Raw.file, Sequence) %>% summarize( + Proteins = Proteins[which.min(PEP)], + Isotope.pattern = Isotope.pattern[which.min(PEP)] + ) %>% ungroup() #%>% + ) + + # Split into groups of related proteins/peptides + groups <- with( + suppressMessages( + summarize(group_by(pep, Sequence), Proteins = unique(Proteins)) + ), + findProteinFamilies(Sequence, Proteins, ret_index = TRUE) + ) + pep <- full_join(pep, groups, by = c(Sequence = "peptide")) +browser() + pep <- split(pep, f = factor(pep$group)) + + message(paste0(round(Sys.time()), ": Estimating fractional synthesis rates")) + cl <- setupCluster(ncore, seed) # set up parallel processing & rn seed + pbo <- pbapply::pboptions(use_lb = TRUE) # use load balancing + on.exit(pbapply::pboptions(pbo), add = TRUE) # restore default options on exit + + if(!is.null(cl)){ + parallel::clusterCall(cl, source, file = .source) + parallel::clusterCall(cl, library, package = "dplyr", character.only = TRUE) + } + + results <- pbapply::pblapply(pep, .doMaxQuantFsr, formula, smpl, + minseq = minSeq, mMax = mMax, model= model, + contrasts = contrasts, plot = plot, + unique_only = unique_only, ..., + cl = cl) + plots <- .getFsrPlots(results) + + results <- .segregateProteinGroups( + aggregateDataFrameList(results, exclude = "plot") + ) + if(!is.null(results$tests)){ + results$tests$p.adj <- p.adjust(results$tests$p.value, p_adjustment) + } + + notrun <- list(`not_run` = setdiff(mq_proteins, results$rates$Protein)) + qcreport <- c(qcreport, .writeQcAddendum(results, notrun)) + + message(paste0(round(Sys.time()), ": Analysis complete")) + if(!isFALSE(save)){ + if(save == "xlsx"){ + names(qcreport)[1] <- "quality control" + names(notrun)[1] <- "not run" + } + writeAnalysisResults( + c( + .writeFsrReadme(time_unit, unique_only, p_adjustment, ...), + results, + qcreport[1], + notrun, + list(session = .buildSessionInfo(ncore, seed)) + ), + resfile + ) + ext <- paste0(".", save) + if(ext == ".xlsx"){ + writeAnalysisResults(qcreport[-1], resfile, append = TRUE, + sheet = "quality control") + } else { + resfile <- sub(ext, paste0("_quality_control", ext), resfile) + writeAnalysisResults(qcreport[-1], resfile, append = TRUE, + col.names = FALSE) + } + resfile <- dirname(resfile) + savePlotList(plots, file.path(resfile, "plots"), scale = 3) + message(paste0("Results are in ", resfile)) + } + if(length(plots) > 0) results$plots <- plots + return(results) +} + +## Assorted utility functions -------------------------------------------------- + +#' Read standard data file formats +#' +#' @description +#' A convenience function to read data files into R from a variety of +#' standard formats, including space, tab, and comma-delimited text, as +#' well as Excel. +#' +#' @details +#' If `x` is a character vector with only one element, it is interpreted as +#' a file path and the corresponding file is read into R based on its file +#' extension (\code{\link{readxl::read_xlsx}} is used for `".xlsx"`, +#' \code{\link{read.csv}} for `".csv"`, and \code{\link{read.delim}} for +#' `".txt"`). If `x` contains a second element, it will be interpreted as +#' a tab name or tab number if the first element points to an Excel workbook, +#' and ignored with a warning otherwise. If `x` is (strictly, inherits from) +#' a data frame, it will simply be returned as-is. +#' +#' @param x a character vector or a data frame; see Details. +#' +#' @result +#' A data frame. +#' +#' @seealso +#' \code{\link{readxl::read_xlsx}}, \code{\link{read.csv}}, and +#' \code{\link{read.delim}} +#' +readDataFile <- function(x){ + if(inherits(x, "data.frame") || is.null(x)) return(x) + if(!is.character(x)) + stop("Data file x must be a character vector or a data frame.") + f <- x[1] + if(grepl(".xlsx", f)){ + if(length(x) > 1) tb <- x[2] else(tb <- 1) + if(!is.na(suppressWarnings(as.numeric(tb)))) tb <- as.numeric(tb) + readxl::read_xlsx(f, tb) + } else { + if(length(x) > 1) + warning("Only the first file name in data file vector will be used.") + if(grepl(".csv", f)){ + read.csv(f) + } else { + if(grepl(".txt", f)){ + read.delim(x) + } else { + stop("Data file should be one of: data frame, .xlsx, .csv, .txt") + } + } + } +} + +#' Write the results of an analysis to a csv, txt, or xlsx file +#' @param results a matrix, data frame, or list (possibly with > 1 data frame) +#' @param file path to the output file, including a .csv, .txt, or .xlsx extension +#' @param append,row.names,col.names logical >> self explanatory +#' @param sheet passed to openxlsx::write.xlsx +writeAnalysisResults <<- function(results, file, append = FALSE, sheet = NULL, + row.names = FALSE, col.names = TRUE){ + if(inherits(results, c("matrix","data.frame")) || + (is.vector(results) & !is.list(results))){ + results <- list(result = results) + } + if(!is.list(results) || is.null(names(results))){ + stop("results should be a named list; maybe try list(result = results)?") + } + if(!is.character(file)){ + stop("file must be a character value") + } + file <- file[1] + fileext <- tools::file_ext(file) + if(!append){ + switch( + fileext, + csv = { + file <- gsub(".csv", "", file, fixed = TRUE) + lapply(names(results), function(i){ + write.table( + results[[i]], + file = paste0(file, "_", i, ".csv"), + sep = ",", row.names = row.names, col.names = col.names + ) + }) + }, + txt = { + file <- gsub(".txt", "", file, fixed = TRUE) + lapply(names(results), function(i){ + write.table( + results[[i]], + file = paste0(file, "_", i, ".txt"), + sep = "\t", row.names = row.names, col.names = col.names + ) + }) + }, + xlsx = openxlsx::write.xlsx(results, file = file), + stop(paste0( + fileext, " is not a recognized file extension.\n", + "Please use .txt (tab-delimited), .csv (comma-delimited), or ", + ".xlsx (Excel)." + )) + ) + } else { + switch( + fileext, + csv = suppressWarnings( + lapply(results, write.table, file = file, row.names = row.names, + col.names = col.names, append = TRUE, sep = ",") + ), + txt = suppressWarnings( + lapply(results, write.table, file = file, row.names = row.names, + col.names = col.names, append = TRUE, sep = "\t") + ), + xlsx = { + wb <- openxlsx::loadWorkbook(file) + if(!is.null(sheet)){ + if(!any(openxlsx::sheets(wb) == sheet)){ + openxlsx::addWorksheet(wb, sheet) + } + for(i in 1:length(results)){ + sr <- .get_xlsx_dims(wb, sheet)[1] + 1 + openxlsx::writeData(wb, sheet, results[[i]], startRow = sr) + } + } else { + for(i in names(results)){ + openxlsx::addWorksheet(wb, i) + openxlsx::writeData(wb, sheet = i, results[[i]]) + } + } + openxlsx::saveWorkbook(wb, file, overwrite = TRUE) + }, + stop(paste0( + fileext, " is not a recognized file extension.\n", + "Please use .txt (tab-delimited), .csv (comma-delimited), or ", + ".xlsx (Excel)." + )) + ) + } + invisible(TRUE) +} + +#' Calculate theoretical mass isotopomer distributions for peptides +#' based on their amino acid sequences +#' +#' @description +#' Generates a list of theoretical isotope distributions with a baseline +#' and asymptotic, enriched distribution for each input peptide sequence. +#' +#' @details +#' `peptidesToIsotopePatterns` is a flexible front-end for calculating +#' theoretical mass isotopomer distributions (MIDs, aka isotope patterns or +#' isotope distributions) for peptides. At minimum, users must supply a +#' character vector (`x`) containing the target peptide sequences. Optionally, +#' users can also set the maximum number of output peaks (`nMax`, defaults to +#' 10) and can supply enriched molar fractions for one or more heavy isotopes. +#' These are supplied as a named list to the `label` argument. For example, +#' `label = list(H2 = 0.02)` will generate a pattern with the assumption that +#' deuterium has been enriched to 2%. If a `label` is supplied, the function +#' will return a baseline (unlabeled) MID as well as a labeled MID for each +#' sequence. Otherwise, only the baseline MID is returned. +#' +#' MIDs are calculated based on the information contained in `aatable` and +#' `fractions`. The first of these data frames lists the amino acids. It must +#' include a column named `formula` that provides the molecular formula for +#' each amino acid, and must have `row.names` property that identifies the +#' amino acid by its standard letter symbol. It can also include columns named +#' for elements (i.e., "C", "H", etc.) that provide the number of atoms from +#' each element that are accessible to the label (if the value is `NA`, all +#' atoms are assumed to be accessible). The default table, +#' \code{\link{amino_acids}}, only provides numeric values for hydrogen. +#' +#' The `fractions` data frame lists heavy isotopes and (at a minimum) the +#' molar fractions to be used in calculating the baseline MID. It must include +#' a `row.names` attributute that identifies the isotopeby elemental symbol and +#' mass number (e.g., "C13"), and must include at least one column containing +#' molar fraction data for each isotope. Only heavy isotopes should be listed. +#' Monoisotopes are assumed to make up the remaining potion for each element. +#' The name of the variable containing the molar fractions should be supplied +#' to the argument `fraction_column`. By default, `fractions = stable_isotopes` +#' and `fraction_column = "nist"`. See \code{\link{stable_isotopes}} for +#' potential alternatives (currently, only `"iupac"`). If they have good data +#' available, users are encouraged to provide their own, study-specific molar +#' fractions, as this will generally lead to more accurate MIDs and improved +#' rate estimates. +#' +#' By default, all of the isotopes listed in `fractions` will be considered in +#' MID calculations. Specifically, the molar fractions listed in the `fractions` +#' table are used to generate the baseline distribution. Then if a `label` is +#' provided, it is used to replace the relevant isotopic fraction and the MID +#' is recalculated, with the number of label-accessible atoms determined by +#' `aatable`. If desired, the user can use the `isotopes` argument to limit the +#' set of isotopes included in the MID calculations (e.g., setting +#' `isotopes = c("C13","H2")` would only include carbon-13 and deuterium in the +#' calculations). This is not encouraged, but may be useful for comparisons +#' with other software. +#' +#' @param x character vector of amino acid sequences +#' @param nMax the maximum number of peaks to retain in the isotope pattern +#' (i.e., the maximum nominal mass difference of interest + 1 for the +#' monoisotopic peak) +#' @param label a named list giving enrichment values for isotopic labels +#' (e.g. `list(H2 = 0.02)` for a 2% deuterium label) +#' @param aatable a data frame of amino acid properties, passed to +#' \code{\link{.atomsToPattern}}. See \code{\link{amino_acids}}. +#' @param fractions data frame of isotope properties, passed to +#' \code{\link{.atomsToPattern}}. See \code{\link{stable_isotopes}}. +#' @param fraction_column name of the variable in `fractions` that contains the +#' molar fractions of different isotopes (defaults to `"nist"`). +#' @param isotopes character vector of stable, heavy isotopes to be considered +#' in the isotopomer pattern calculations. Defaults to all isotopes in +#' `fractions`. +#' +#' @result +#' A named list with one component for each input peptide. Each component is +#' also a named list with `$baseline` and (if a `label` is specified) +#' `$asymptotic` components, each of which are numeric vectors of length `nMax`, +#' normalized to sum to 1. +#' +peptidesToIsotopePatterns <- function(x, nMax = 10, label = list(), + aatable = amino_acids, + fractions = stable_isotopes, + fraction_column = "nist", + isotopes = rownames(fractions)){ + m <- 0:(nMax - 1) + x <- setNames(x,x) + x <- strsplit(x, "") + fractions[["molar_fraction"]] <- fractions[[fraction_column]] + lapply(x, .atomsToPattern, m, label, isotopes, aatable, fractions) +} + +#' Extract an isotope distribution from a list returned by +#' \code{peptidesToIsotopePatterns} +#' +#' @description +#' A utility to make it easier to access the isotope patterns for a +#' specific peptide sequence +#' +#' @details +#' This is a convenience function to access specific theoretical isotope +#' distributions after they have been generated. It is used internally (with +#' \code{\link{mapply}}) to package the theoretical distributions to be passed +#' to the estimation functions. +#' +#' @param x a named list containing isotope patterns for one or more peptide +#' sequences, as returned by \code{\link{peptidesToIsotopePatterns}} +#' @param sequence a character value specifying the sequence to be extracted +#' (should match the name of an element in `x`); scalar numeric indices are +#' also accepted. +#' @param sample the name or index of the sample whose pattern is to be +#' extracted +#' @param pattern character value identifying the type of pattern to be +#' extracted +#' +#' @result +#' a numeric vector containing the isotope distribution +#' +#' @seealso +#' \code{\link{peptidesToIsotopePatterns}} +#' +getPattern <- function(x, sequence, sample, pattern = c("baseline","asymptotic")){ + x[[sequence]][[sample]][[pattern]] +} + +#' Count the number of atoms in a compound, by element +#' +#' @description +#' Converts chemical formulas into a table of atomic counts +#' +#' @details +#' `x` is a character vector of chemical formulas of the form "C4H6N1O2" (note +#' that all elements include a count in the formula). `countAtoms` parses the +#' formulas and returns a matrix with one row per compound and one column for +#' each of the elements C, H, O, N, P, S, F, Cl, Br, and I (other elements may +#' be added in future versions). Values in the matrix are atomic counts. +#' If `x` is named, the compound names will be preserved as row names in +#' the output. +#' +#' @param x character vector of molecular formulas +#' +#' @result +#' matrix of atomic counts (see Details) +#' +countAtoms <- function(x){ + elements <- c("C","H","O","N","P","S","F","Cl","Br","I") + pos <- sapply(elements, stringr::str_locate_all, string = x) + if(is.vector(pos)) pos <- t(pos) + cnt <- outer(1:nrow(pos), 1:ncol(pos), .getElementCount, pos = pos, x = x) + dimnames(cnt) <- list(names(x), elements) + cnt +} + +#' Calculate the theoretical isotope pattern generated by +#' a given element in a compound +#' +#' @description +#' Calculates an isotope distribution for a given element based on the number +#' of atoms present in a compound (n), a given set of stable heavy isotope +#' fractions (p), and the nominal mass differences from the monoisotope for +#' those heavy isotopes (d). +#' +#' @details +#' The elemental isotope distribution is calculated as Multinom(n, p). Note +#' that this function uses the vectorized \code{\link{dmultinomial}} function, +#' not \code{\link{stats::dmultinom}}. +#' +#' @param n numeric vector giving the number of atoms for each element +#' @param p numeric vector giving isotope fractions for each element +#' @param d numeric vector giving nominal mass differences for each isotope +#' +#' @result +#' A numeric vector describing the isotope distribution. +#' +elementalIsotopomerPattern <- function(n, p, d){ + stopifnot(length(p) == length(d)) + if(length(p) == 1){ + m <- 0:n + setNames(dbinom(m, n, p), d * m) + } else { + p <- c(1 - sum(p), p) + d <- c(0, d) + M <- eval(parse( + text = paste0( + "expand.grid(", paste(rep("0:n", length(p)), collapse = ","), ")" + ) + )) + M <- as.matrix(M)[rowSums(M) == n,] + D <- colSums(t(M) * d) + P <- aggregate(dmultinomial(M, n, p), list(D), sum) + setNames(P[,2], P[,1]) + } +} + +#' Dealias proteins +#' +#' @description +#' Rename proteins in a dataset to resolve aliased annotations +#' +#' @details +#' To be filled in... +#' +#' @param x +#' @param alias_table +#' @param col +#' +dealiasProteins <<- function(x, alias_table = NULL, col = "Proteins"){ + if(is.null(alias_table)) return(x) + pp <- x[[col]] + pp <- strsplit(pp, ";") + truenames <- with(alias_table, setNames(Name, Alias)) + for(i in 1:nrow(alias_table)){ + pp <- lapply(pp, function(p){ + p[p %in% names(truenames)] <- truenames[p %in% names(truenames)] + paste(unique(p), collapse = ";") + }) + } + x[[col]] <- do.call(c, pp) + x +} + +#' Identify peptide-protein families +#' +#' @description +#' +#' @details +#' This function decomposes a vector of unique peptide sequences and a +#' corresponding vector of associated proteins into a list of subvectors, +#' where each list-element describes a family of related proteins and peptides. +#' This means that each protein in the group is linked to at least one other +#' protein by a shared peptide. Peptides that are unique to a protein are +#' connected to the group through the protein's other associated peptides. +#' Peptides or proteins that appear in different list elements have no links +#' and can be treated independently of one another. +findProteinFamilies <- function(peptides, proteins, sep = ";", + ret_index = FALSE){ + stopifnot(length(proteins) == length(peptides)) + stopifnot(is.character(peptides)) + stopifnot(is.character(proteins)) + if(any(table(peptides) > 1)){ + stop("peptides cannot include duplicate values") + } + proteins <- setNames(proteins, peptides) + if(ret_index) refpep <- peptides + out <- list() + i = 1 + while(length(peptides) > 0){ + pepi <- peptides[1] + proti <- strsplit(proteins[1], sep)[[1]] + j <- 1 + while(j <= length(proti)){ + add <- grep(proti[j], proteins) + pepi <- unique(c(pepi, peptides[add])) + proti <- unique(c(proti, unlist(sapply(proteins[add], strsplit, sep)))) + j = j + 1 + } + if(ret_index){ + out[[i]] <- cbind( + peptide = pepi, #match(pepi, refpep) + group = rep(i, length(pepi)) + ) + } else { + out[[i]] <- list(peptide = pepi, proteins = proteins[pepi]) + } + proteins <- proteins[!peptides %in% pepi] + peptides <- peptides[!peptides %in% pepi] + i <- i + 1 + } + if(ret_index){ + out <- as.data.frame(do.call(rbind, out)) + # out[order(out[,"row"]), "group"] + } #else { + out + # } +} + +#' Construct a peptide-protein association matrix +#' +#' @description +#' +#' @details +#' Given a vector of unique peptide sequences and a corresponding vector that +#' identifies the proteins associated with each peptide, this function builds +#' a binary matrix that indicates which peptides are associated with which +#' proteins. The peptide vector should not contain duplicate values. +#' +#' If a peptide is associated with multiple proteins, they should all be listed +#' in the corresponding element of `proteins`, separated by the `sep` character. +#' The defaults are set up to accept the `Sequence` and `Proteins` columns found +#' in MaxQuant output tables. +#' +#' \strong{Caution}: running `pepToProt` on a large dataset is very +#' memory-inefficient. Use \code{\link{findProteinFamilies}} first to identify +#' subsets of related peptides and proteins, then run `pepToProt` separately +#' on each family. +#' +#' TODO: Sparsify this thing! +#' +#' @param peptides a chracter vector of unique peptide sequences +#' @param proteins a character vector giving the proteins associated with each +#' peptide, seperated within each element by `sep` if >1 protein association +#' exists. +#' @param sep a character separating protein names within elements of `proteins` +#' +pepToProt <<- function(peptides, proteins, sep = ";"){ + stopifnot(length(proteins) == length(peptides)) + stopifnot(is.character(peptides)) + stopifnot(is.character(proteins)) + if(any(table(peptides) > 1)){ + stop("peptides cannot include duplicate values") + } + if(!isFALSE(sep)){ + proteins <- setNames(strsplit(proteins, sep), peptides) + } else { + proteins <- setNames(proteins, peptides) + } + uprot <- sort(unique(unlist(proteins))) + .P2PMAT <- matrix(0L, length(peptides), length(uprot), + dimnames = list(peptides, uprot)) + # .P2PMAT <- SparseM::as.matrix.csr(.P2PMAT) # this does not help much + sapply(peptides, function(i) { + .P2PMAT[i, proteins[[i]]] <<- 1L + invisible() + }) + # SparseM::as.matrix.csr(.P2PMAT) + .P2PMAT +} + +#' A vectorized version of the multinomial probability mass function +#' +#' @description +#' `dmultinomial` provides a vectorized version of `stats::dmulitnom` +#' +#' @details +#' `x` is vector-valued, so that n observations each with k possible outcomes +#' should be represented as an n x k matrix. `size` sets the number of draws +#' per row in `x` (recycled if necessary). `prob` is a vector of length k (or +#' a matrix of length-k row vectors) giving probabilities of different outcomes +#' (rows recycled if necessary). +#' +#' Trials with different n and k values can be run by ensuring that +#' `ncol(x) = max(k)` and that `sum(x[i,]) == n[i]` for all `i`, and setting +#' `prob[i, not_possible] = 0`. +#' +#' @param x an n x k numeric matrix giving the number of observed outcomes +#' from k classes in n multinomial trials. +#' @param size a numeric vector giving the number of draws per trial +#' @param prob a numeric vector of length k or a (.) x k matrix giving the +#' probability of each outcome (or the probabilities in different trials, if +#' `prob` is a matrix) +#' @param log logical. Should the result be reported on the natural log scale? +#' +#' @result +#' A vector of (log) probabilities for each trial. +#' +#' @seealso +#' \code{\link{dmultinom}} for a non-vectorized version. + +dmultinomial <- function(x, size, prob, log = FALSE){ + if(is.vector(x)) x <- t(x) + if(is.vector(prob)) prob <- t(prob) + lout <- max(c(length(size), nrow(x), nrow(prob))) + if(length(size) < lout){ + size <- rep(size, len = lout) + } + stopifnot(all(rowSums(x) == size)) + stopifnot(ncol(x) == ncol(prob)) + stopifnot(all(prob >= 0)) + if(any(rowSums(prob) != 1)){ + warning("Some sum(prob) != 1; they will be normalized.") + prob <- prob/rowSums(prob) + } + if(nrow(prob) < lout){ + prob <- prob[rep(1:nrow(prob), len = lout),] + } + if(nrow(x) < lout){ + x <- x[rep(1:nrow(x), len = lout),] + } + lprob <- lgamma(size+1) - rowSums(lgamma(x+1)) + rowSums(x * log(prob)) + if(log) lprob else (exp(lprob)) +} + +# Restructure a list of data frames +aggregateDataFrameList <<- function(x, label = NULL, add_row_names = FALSE, + exclude = NULL){ + if(!is.null(label)){ + arn <- add_row_names + add_row_names <- TRUE + } + full <- sapply(x, length) + nms <- setdiff(names(x[[which.max(full)]]), exclude) + out <- lapply(setNames(nms, nms), function(i){ + xi <- lapply(x, "[[", i) + do.call(rbind, append(xi, list(make.row.names = add_row_names))) + }) + if(!is.null(label)){ + lapply(out, function(i){ + i <- cbind(setNames(list(rownames(i)), label[1]), i) + if(!arn) rownames(i) <- NULL + i + }) + } else {out} +} + +# identify experimental groups for samples in data frame x, using variables +# in a formula, and get the sample sizes of the groups +getDesignGroups <<- function(x, formula, ...){ + vars <- all.vars(formula) + if(length(vars) == 0){ + return(c(Total = nrow(x))) + } + formula <- formula(paste0("~", paste(vars, collapse = "+"))) + mf <- model.frame(formula = formula, data = x, ...) + for(i in names(mf)){ mf[,i] <- paste0(i, ":", mf[,i])} + apply(mf, 1, paste, collapse = ";") +} + +groupedSampleSize <<- function(x, formula, ...){ + cases <- getDesignGroups(x, formula, ...) + c(Total = length(cases), table(cases)) +} + +# save a list of plots to a directory +savePlotList <<- function(plots, directory = tempdir(), format = "png", + verbose = FALSE, ...){ + nms <- names(plots) + if(length(nms) > 0 && !dir.exists(directory)) dir.create(directory) + if(length(nms) == 0 && length(plots) > 0){ + nms <- paste0("Rplot", seq_along(plots)) + names(plots) <- nms + } + expr <- " + lapply(nms, function(i){ + ggplot2::ggsave( + plots[[i]], + file = file.path(directory, paste0(i, '.', format)), + ... + ) + }) + " + if(isTRUE(verbose)){ + eval(parse(text = expr)) + } else { + suppressMessages(suppressWarnings(eval(parse(text = expr)))) + } + invisible(TRUE) +} + +# set up a parallel processing cluster +setupCluster <<- function(ncore = 1, seed = NULL, + exports = NULL, evals = NULL, calls = NULL, + cleanup = TRUE, add = FALSE, after = TRUE, + ...){ + if(is.null(seed)) seed <- as.integer(Sys.time()) + if(ncore > 1){ + cl <- parallel::makeCluster(ncore) + parallel::clusterSetRNGStream(cl, iseed = seed) + if(!is.null(exports)){ + parallel::clusterExport(cl, exports, envir = parent.frame()) + } + if(!is.null(evals)){ + for(i in evals) parallel::clusterEvalQ(cl, i) + } + if(!is.null(calls)){ + for(i in calls) parallel::clusterCall(cl, i, ...) + } + if(!identical(parent.frame(), globalenv()) && cleanup[1]){ + do.call( + on.exit, + list(expr = "parallel::stopCluster(cl)", add = add, after = after) + ) + } + } else { + cl <- NULL + if(!identical(parent.frame(), globalenv()) && cleanup[1]){ + assign("..rnseed", .Random.seed, envir = parent.frame()) + do.call( + on.exit, + list(expr = "set.seed(..rnseed)", add = add, after = after) + ) + } + set.seed(seed) + } + return(cl) +} + +## Internal functions called by maxQuantFsr ------------------------------------ + +#' Fit an FSR model and obtain results for each protein (or protein group) in +#' an untargeted proteomics dataset (internal function). +#' +#' @description +#' Perform inference on FSR for all of the proteins and study conditions +#' listed in a dataset. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' The function generates a correspondence table that links peptides (rows) +#' to proteins (or protein groups; columns), and then performs inference for +#' each protein that has at least `minseq` uniquely associated peptides (if +#' no proteins meet this requirement, a value of `NULL` is returned). +#' +#' Processing proceeds as follows: +#' \enumerate{ +#' \item{Theoretical baseline and asymptotic distributions are generated once +#' for each combination of peptide sequence and enrichment value and then +#' called as needed during model fitting.} +#' \item{For each protein (group), \code{\link{.packageFsrData}} is called to +#' synthesize an input list that contains the relevant data, theoretical +#' isotope distributions, and model specifications. +#' } +#' \item{The input package is passed to \code{\link{.randStart}}, which +#' fits the specified model.} +#' \item{The fitted model object, input package, and specifications for +#' `contrasts` are passed to \code{\link{.formatFsrResult}}, which returns +#' a list of inference results.} +#' \item{If requested, a plot is generated for the protein and appended to +#' the result list.} +#' \item{Results for all proteins are aggregated into tables organized by +#' inference type (see \strong{Results})}. +#' } +#' +#' @param x a data frame containing (at minimum) columns titled `Sequence`, +#' `Proteins`, `Isotope.pattern`, and `Raw.file` +#' @param formula a RHS model formula referencing variables in `sample_data` +#' (can be `~ 1` for intercept-only models) +#' @param sample_data a data frame with annotation information for each +#' specimen; minimally, it must include columns for `File`, `BWE`, and `Time` +#' @param minseq numeric value > 0; the minimum number of uniquely associated +#' peptides required to obtain results for a protein. +#' @param mMax numeric value >= 1; maximum number of isotopic peaks to include +#' in calculations. Non-integer values will be rounded up. +#' @param model character value identifying the FSR model to be used for +#' rate estimation. +#' @param contrasts a list specifying contrasts to be obtained (see +#' \code{\link{.buildContrastMatrix}} for specification details) +#' @param plot logical; should a plot be obtained for the result? +#' @param unique_only logical; if `TRUE`, rates will only be estimated for +#' proteins that are uniquely associated with at least `minseq` peptide +#' sequences. If `FALSE`, rates will also be estimated for protein groups. +#' THE RELIABILITY AND INTERPRETATION OF GROUP-LEVEL ESTIMATES ARE NOT CLEAR. +#' EXERCISE CAUTION! +#' @param ... additional arguments passed to other functions +#' +#' @result +#' A list with either 2 or three components: +#' \describe{ +#' \item{$rates} a data frame providing the estimated fractional synthesis +#' rates for each protein (group) in each treatment condition +#' \item{$contrasts} a data frame providing the estimated contrasts between +#' specified treatment conditions for each protein (group). This component +#' is missing if no contrasts were requested. +#' \item{$metadata} a data frame containing metadata about each protein +#' (group), including its name, the algorithm used to fit the model, whether +#' it converged, and the number of time points, specimens, peptide sequences, +#' isotope peaks, and OIDs used to fit the model. +#' } +#' +#' @seealso +#' \code{\link{peptidesToIsotopePatterns}} for calculation of theoretical +#' isotope patterns, \code{\link{.checkFsrInputs}} for data requirements, +#' \code{\link{.packageFsrData}}, \code{\link{.randStart}}, +#' \code{\link{.formatFsrResult}}, and \code{\link{aggregateDataFrameList}} +#' for details on specific processing steps. +#' +.doMaxQuantFsr <<- function(x, formula, sample_data, minseq, mMax, model, + contrasts, plot, unique_only = TRUE, unit = "hour", + ...){ + sep <- if(unique_only) ";" else(FALSE) + p2p <- with( + dplyr::summarize(dplyr::group_by(x, Sequence), Proteins = unique(Proteins)), + pepToProt(Sequence, Proteins, sep = sep) + ) + uniqueToProtein <- rowSums(p2p) == 1 + nUniquePep <- colSums(p2p[uniqueToProtein,, drop = FALSE]) + if(!any(nUniquePep >= minseq)) return(NULL) + + theory <- peptidesToIsotopePatterns( + unique(x$Sequence), + label = list(H2 = with(sample_data, setNames(BWE, File))), # fix indexing of BWE + nMax = ceiling(mMax) + ) + + if(minseq > 0){ + RESULT <- list() + p2p <- p2p[uniqueToProtein, nUniquePep >= minseq, drop = FALSE] + for(i in colnames(p2p)){ + peps <- rownames(p2p)[p2p[,i] == 1] + + X <- .checkFsrInputs( + dplyr::left_join( + dplyr::filter(x, x$Sequence %in% peps), sample_data, + by = c(Raw.file = "File") + ), + formula, + contrasts + ) + + DATA <- .packageFsrData( + x = X, + theory = theory[peps], + nMax = mMax, + formula = formula, + model = model, + ... + ) + + FIT <- .randStart(data = DATA, ...) + + RESULT[[i]] <- .formatFsrResult(object = FIT, data = DATA, + contrasts = contrasts, + ...) + + if(plot) RESULT[[i]]$plot <- .plotFsr2(DATA) + } + + RESULT <- aggregateDataFrameList(RESULT) + + } else { + stop("minseq must be >= 1.") + } + RESULT +} + +#' Retrieve functions for a specified model +#' +#' @description +#' `.setFsrModel` returns a list containing the `objective` function and +#' `parameter` function for a specified FSR model. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' `setFsrModel` can accept (1) a character value, (2) function, or (3) list +#' as input. If a character vector is given, only the first element will be +#' used, with a warning. If a character value is given, `.setFsrModel` calls +#' \code{\link{base::switch}} to run the predefined model specification +#' function requested. Unrecognized function names will return an error. +#' +#' If a function is provided, it will be called with default arguments, and the +#' result will be checked to ensure that it meets the basic requirements for a +#' model specification before it is returned. A similar check is performed on +#' user-supplied list objects. See \code{\link{buildFsrModel}} for information +#' on custom model specification. +#' +#' @param model one of: a character value specifying a predefined model for +#' FSR estimation, a function that generates such a model, or a list object +#' that provides functions to define the model. +#' +#' @return +#' A S3 list with two named elements, `objective` and `parameter`, each +#' containing a function. See \code{\link{buildFsrModel}} for details on the +#' contents and structure of these functions. +#' +#' @seealso +#' \code\link{.domaxQuantFsr}}, which calls this function and +#' \code{\link{.checkFsrModel}}, which enforces model structure. In addition, +#' \code{\link{buildFsrModel}} can be used to specify a custom model. +#' +.setFsrModel <<- function(model){ + if(is.character(model)){ + if(length(model) > 1){ + warning("length(model) is > 1: only the first value will be used.") + model <- model[1] + } + .func <- switch( + model, + e1c = .ue1cGivenp(), + e1cp = { + stop("Sorry. Model e1cp has not been written yet.") + }, + stop(paste0( + "model should be one of:\n", + " \"e1c\" for one-compartment exponential with given enrichment\n", + " \"e1cp\" for one-compartment exponential with estimated enrichment" + )) + ) + } + if(is.function(model)) model <- model() + if(is.list(model)){ + .func <- .checkFsrModel(model) + } + return(.func) +} + +#' Check the integrity of an FSR model list +#' +#' @description +#' `.checkFsrModel` performs basic sanity checks on the structure, arguments, +#' and results generated by an FSR model list. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' `.checkFsrModel` checks its input to confirm that it is a list with (at +#' least) four elements, named `description`, `objective`, `rate`, and +#' `difference`, that all elements have the correct form (a 1-element character +#' for `description` and functions for the other three), and that the functions +#' have the correct arguments. An error is returned if any of the checks fail. +#' Otherwise, the input is returned unchanged. +#' +#' @param x an R object +#' +#' @return +#' The input model list, `x`, or an error. +#' +#' @sealso +#' \code{\link{buildFsrModel}} or model specification guidelines. +#' +.checkFsrModel <- function(x){ + OK <- TRUE + for(i in 1:10){ + if(!OK){ + stop(paste0( + "\nThe specified model does not meet formatting requirements.\n", + "Please see help(\"buildFsrModel\") for assistance." + )) + } + OK <- switch( + i, + is.list(x), + all(c("description","objective","rate","difference") %in% names(x)), + is.character(x$description), + length(x$description) == 1, + is.function(x$objective), + is.function(x$rate), + is.function(x$difference), + all(names(formals(x$objective)) == c("pars","data","estimator","...")), + all(names(formals(x$rate)) == c("X","pars")), + all(names(formals(x$difference)) == c("X","pars","contrast")) + ) + } + return(x) +} + +#' Check the integrity of inputs supplied to \code{\link{.doMaxQuantFsr}} +#' +#' @description +#' `.checkFsrInputs` runs a series of checks on the data supplied to the +#' estimation functions and returns specific error messages if it detects +#' any problems. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' The current list of checks ensures that the inputs inherit from data.frame +#' and that all required variables are present and have the correct data type. +#' Variables referenced in the model formula are included in the check (and +#' the formula itself is checked to make sure it really is a model formula). +#' +#' @param x a data frame +#' @param formula a model formula +#' @param contrasts a list specifying model contrasts (not currently checked) +#' +#' @return +#' The input data, `x`, or an error. +#' +#' @seealso +#' +.checkFsrInputs <<- function(x, formula = ~ 1, contrasts = NULL){ + if(!inherits(x, "data.frame")) stop("x must be a data frame.") + if(!is(formula, "formula")) stop("formula must be a RHS model formula.") + vars <- all.vars(formula) + missingvars <- vars[!vars %in% names(x)] + if(length(missingvars) > 0){ + stop(paste0( + "\nThe following variables are missing from the sample data:\n", + paste(shQuote(missingvars), collapse = ", ") + )) + } + if(!is.character(x[["Raw.file"]])){ + stop(paste0( + "\nx must include a character vector named \"Raw.file\" that\n", + "identifies the mass spectrometry sample run (i.e., MS data file)\n)", + "associated with each isotope pattern." + )) + } + if(!is.character(x[["Isotope.pattern"]])){ + stop(paste0( + "\nx must include a character vector named \"Isotope.pattern\"\n", + "that provides the observed isotope distribution with intensities\n", + "separated by semicolons (;)." + )) + } + if(!is.character(x[["Proteins"]])){ + stop(paste0( + "\nx must include a character vector named \"Proteins\"\n", + "that provides the protein or protein group associated with\n", + "each isotope pattern." + )) + } + if(!is.character(x[["Sequence"]])){ + stop(paste0( + "\nx must include a character vector named \"Sequence\"\n", + "that provides the peptide sequence for each isotope pattern." + )) + } + if(!is.numeric(x[["BWE"]])){ + stop(paste0( + "\nx must include a numeric vector named \"BWE\"\n", + "that provides the body water enrichment for each isotope pattern." + )) + } + if(!is.numeric(x[["Time"]])){ + stop(paste0( + "\nx must include a numeric vector named \"Time\"\n", + "that provides the sample collection time (post label exposure)\n", + "for each isotope pattern." + )) + } + return(x) +} + +#' Package FSR inputs for analysis +#' +#' @description +#' `.packageFsrData` aggregates various inputs into a single, consistently +#' formatted data object +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' `.packageFsrData` compiles sample data, observed isotope patterns, and +#' theoretical isotope patterns into a single list object. In the process, it +#' parses the observed isotope patterns (passed as semi-colon separated text +#' values) and optionally calculates theoretical baseline and asymptotic +#' isotope patterns. +#' +#' @param x a data frame +#' @param theory a list of theoretical isotope patterns +#' @param nMax maximum number of isotopic peaks to consider in analysis +#' @param formula a RHS model formula +#' @param ... additional arguments (not currently used) +#' +#' @result +#' An S3 list with the following components: +#' \describe{ +#' \item{`i`}{Numeric sample (row) index. Technically, each row is a peptide +#' sequence within a physical sample.} +#' \item{`y`}{Observed isotope patterns as a numeric matrix with +#' sample-peptides as rows and peaks as columns. Rows are normalized to sum +#' to 1.} +#' \item{`mask`}{A logical matrix with dimensions matching `y`, indicating +#' whether each value (peak in a sample) was observed (y > 0) or not.} +#' `.$design %*% pars`.} +#' \item{`f0`}{A numeric matrix with dimensions matching `y`, giving the +#' theoretical baseline isotope distribution for each sample-peptide.} +#' \item{`fa`}{A numeric matrix with dimensions matching `y`, giving the +#' theoretical asymptotic isotope distribution for each sample-peptide.} +#' \item{`time`}{A numeric vector giving the time at which each sample was +#' collected, relative to the beginning of label exposure.} +#' \item{`sample`}{A character vector identifying the physical sample for +#' each observation.} +#' \item{`sequence`}{A character vector identifying the peptide sequence for +#' each observation.} +#' \item{`protein`}{A character vector identifying the protein (or protein +#' group) for each observation.} +#' \item{`model`}{A model frame returned by `model.frame(formula, data = x)` +#' and used to map sample-peptides to fitted parameters. +#' \item{`objective`}{An R function that defines the objective to be +#' optimized during parameter estimation (see \code{\link{buildFsrModel}}).} +#' \item{`rate`}{An R function that calculates rates based on a parameter +#' vector and the model frame in the `model` component.} +#' \item{`difference`}{An R function that calculates rate differences +#' using a parameter vector, the `model` component, and a contrast matrix +#' returned by \code{\link{.buildContrastMatrix}}.} +#' } +#' +#' @seealso +#' +.packageFsrData <<- function(x, theory = NULL, nMax = 10, formula = ~1, + model = "e1c", ...){ + isopattern <- lapply(strsplit(x$Isotope.pattern, ";"), as.numeric) + y <- matrix(0, length(isopattern), nMax) + for(i in 1:length(isopattern)){ + cols <- 1:min(nMax, length(isopattern[[i]])) + tryCatch( + y[i, cols] <- isopattern[[i]][cols], + error = function(e) browser() + ) + } + if(is.null(theory)){ + theory = peptidesToIsotopePatterns( + x$Sequence, + label = list(H2 = x$BWE), # fix indexing of BWE + nMax = nMax + ) + } + .data <- list( + i = 1:nrow(y), + y = y/rowSums(y, na.rm = TRUE), + mask = y > 0, + f0 = t(mapply(getPattern, sequence = x$Sequence, sample = x$Raw.file, #Sample, + MoreArgs = list(x=theory, pattern = "baseline"))[1:nMax,]), # sapply(theory, "[[", "baseline"), + fa = t(mapply(getPattern, sequence = x$Sequence, sample = x$Raw.file, #Sample, + MoreArgs = list(x=theory, pattern = "asymptotic"))[1:nMax,]), # sapply(theory, "[[", "asymptotic"), + time = x$Time, + sample = x$Raw.file, #, #Sample + sequence = x$Sequence, + protein = x$Proteins, + formula = formula, + # unit = unit, + model = model.frame(formula, data = x) #design = model.matrix(object = formula, data = x), + ) + .data <- append(.data, .setFsrModel(model)) + return(.data) +} + +#' Initialize parameters for optimization +#' +#' @description +#' `.initializeParameters` reads the parameters from a design matrix, `x`, and +#' returns a list with starting values, lower, and upper limits for +#' optimization. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' `.initializeParameters` uses the column names from a design matrix to +#' construct vectors of starting values and lower and upper optimization +#' bounds. By default, these are set to `0`, `-Inf`, and `Inf`, respectively. +#' However, any of these vectors can be reset to user-supplied values. +#' +#' @param x a model frame (returned by \code{model.frame}) +#' @param start `NULL`, or a numeric vector of initial parameter values +#' @param lower `NULL`, or a numeric vector of lower optimization bounds +#' @param upper `NULL`, or a numeric vector of upper optimization bounds +#' @param ... additional arguments (not currently used) +#' +#' @result +#' A list containing 3 elements, `start`, `lower`, and `upper`, each of which +#' is a numeric vector of length `ncol(x)`. +#' +.initializeParameters <<- function(x, start = NULL, lower = NULL, upper = NULL, + ...){ + x <- #tryCatch( + model.matrix(formula(x), x)#, + # error = function(e){ + # msg <- "contrasts can be applied only to factors with 2 or more levels" + # if(grepl(msg, e)){ + # vals <- sapply(x, function(i) unique(i)) + # bad <- sapply(vals, length) == 1 + # vals <- vals[bad] + # model.matrix(update(formula(x), ~ -substitute(names(vals))), x) + # } + # } + # ) + pars <- colnames(x) + if(is.null(lower)) lower <- setNames(rep(-Inf, ncol(x)), pars) + if(is.null(upper)) upper <- setNames(rep(Inf, ncol(x)), pars) + if(is.null(start)) start <- setNames(rep(0, length(pars)), pars) + if(any( + names(lower) != pars, length(lower) != length(pars), !is.numeric(lower), + names(upper) != pars, length(upper) != length(pars), !is.numeric(upper), + names(start) != pars, length(start) != length(pars), !is.numeric(start) + )){ + stop(paste0( + "\n\"upper\", \"lower\", and \"start\" must be numeric vectors with\n", + "names and lengths that match the parameter vector\n", + "generated by the model formula.\n\n", + "Parameter names are:\n", paste(shQuote(pars), sep = ", ") + )) + } + .pars <- list(start = start, lower = lower, upper = upper) + return(.pars) +} + +#' Format and output the results from fitting an FSR model +#' +#' @description +#' `.formatFsrResult` obtains, formats, and outputs statistics and metadata +#' for a fitted model +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' The fitted model object is used to obtain estimates and confidence intervals +#' for rates and optionally for contrasts. If requested, contrasts also include +#' raw (i.e., not corrected for multiple comparisons) t-test results for the +#' null hypothesis that rates are equal. The results are organized into a list, +#' along with a data frame providing metadata about the analysis. +#' +#' @param object a fitted model object returned by \code{\link{.randStart}} +#' @param data a data object returned by \code{\link{.packageFsrData}} +#' @param contrasts a list specifying contrasts +#' @param ... additional arguments passed to \code{\link{.doInference}} +#' +#' @seealso +#' \code{\link{.buildContrastMatrix}} for implementation of contrasts, +#' \code{\link{.doInference}} for inferential statistics +#' +.formatFsrResult <<- function(object, data, contrasts, ...){ + npeaks <- rowSums(data$mask) + mats <- .buildContrastMatrix(data, contrasts, ...) # this is modified + ci <- .doInference(object, data, mats, type = "estimate", ...) # this is modified + contrasts <- .doInference(object, data, mats, type = "contrast", ...) # this is modified + + .res <- list( + rates = cbind(Protein = unique(data$protein), ci), + metadata = data.frame( + Protein = unique(data$protein), + Model = data$description, + Class = class(object), + Convergence = .checkConvergence(object), + `Time points` = length(unique(data$time)), + Specimens = length(unique(data$sample)), + Sequences = length(unique(data$sequence)), + `Max peaks` = max(npeaks), + `Average peaks` = mean(npeaks), + Measurements = length(data$time), + check.names = FALSE + ) + ) + if(!is.null(contrasts)){ + .res <- append( + .res, + list(tests = cbind(Protein = unique(data$protein), contrasts)), + after = 1 + ) + } + return(.res) +} + +#' Sort results into uniquely identified proteins versus protein groups +#' +.segregateProteinGroups <<- function(x, sep = ";"){ + for(i in c("rates","metadata","tests")){ + if(is.null(x[[i]])) next + is_group <- grepl(sep, x[[i]]$Protein, fixed = TRUE) + ord <- 1:nrow(x[[i]]) + x[[i]] <- list2DF(append(x[[i]], list(Group = is_group), after = 1)) + x[[i]] <- x[[i]][order(is_group, ord), , drop = FALSE] + } + return(x) +} + +#' Write a README file to annotate a collection of FSR results +#' +.writeFsrReadme <<- function(unit, unique_only, p_adjustment, ...){ + inferargs <- sapply( + .getArgs(.doInference, ..., + args = c("imethod","estimator","nsim")), + function(i) eval(i)[1] + ) + list( + readme = c( + "PROTEOME-WIDE ESTIMATION OF FRACTIONAL SYNTHESIS RATES", + paste0("Analysis run on ", Sys.Date()), + paste0("Units: percent turnover / ", unit), + paste0("Interval estimation methodology: ", inferargs["imethod"]), + if(inferargs["imethod"] == "simulation"){ + paste0("Number of parametric bootstrap simulations:", inferargs["nsim"]) + }, + paste0("FSR estimation methodology: ", inferargs["estimator"]), + paste0("P-value adjustment: ", p_adjustment), "", + + if(!unique_only){ + c( + paste0("WARNING: Rate estimates for protein groups may represent ", + "averaged results over mixtures "), + "of proteins with unknown propotions.", + paste0("They are likely to be inaccurate. Use caution when ", + "interpreting them, especially if "), + "a group includes proteins that are not closely related.", + "", + paste0("Groups are identified by a \"TRUE\" value for \"Group\" ", + "in the rates, tests, and metadata tables."), + "" + ) + } else {NULL}, + + "Table of contents:", + paste0(" - rates: point estimates, standard errors, and confidence ", + "intervals for each protein at"), + paste0(" each experimental condition. If a condition is missing ", + "for a particular protein, no data"), + " were available for the protein under the missing condition.", + " or protein group, by experimental condition if appropriate.", + paste0(" - tests: point estimates, standard errors, confidence ", + "intervals, and "), + paste0(" p-values for requested contrasts. Missing contrasts could ", + "not be estimated due to missing data."), + paste0(" - metadata: supplementary information about each protein ", + "(or protein group)"), + " and the model fit, including:", + " > A verbal description of the model that was used to estimate FSR", + " > The class of the optimization algorithm used to fit the model", + " > Algorithm convergence status (see below)", + " > The number of unique time points for which data were available", + " > The number of unique specimens for which data were available", + " > The number of peptide sequences used for estimation", + " > The average number of isotopic peaks per sequence", + " > The maximum number of isotopic peaks per sequence", + " > The total number of observations used for estimation ", + " (max 1 observation per time point per specimen per sequence)", + paste0(" - quality control: information about the results of quality ", + "control filters"), + " imposed on the raw data:", + " > Filter settings", + paste0(" > Counts of observations, peptides, proteins, and ", + "protein groups that remained"), + " in the analysis after filters were imposed", + paste0(" > Counts of proteins flagged as contaminants or bad ", + "reads by MaxQuant"), + paste0(" > Information on isotope pattern peak counts and ", + "inter-scan correlations, by raw file"), + paste0(" > A frequency distribution for the proportion of ", + "peptides that appeared in"), + " only 1 sample, in 2 samples, etc.", + paste0(" > An alphabetized list of all proteins that are ", + "mentioned in the input files"), + " for which rates were not estimated", + paste0(" - not run: a list of proteins and protein groups that were ", + "present in"), + " the input files but did not meet QC standards", + paste0(" - session: technical information about the computing ", + "environment,"), + " needed to replicate the results", + "", + "Interpretation notes:","", + "Confidence intervals and p-values are not corrected for multiple testing.", + "Corrections are coming soon....", "", + "If Convergence is FALSE, the estimation algorithm failed to reach a solution ", + "and no confidence intervals or p-values will be provided. Nonconvergence ", + "can occur because (a) one or more peptide sequences was misidentified, ", + "(b) because the time interval used for data collection was not appropriate ", + "for the true FSR of the protein (i.e., tunover is very near 0% or 100%), or ", + "(c) because the estimation algorithm was started at bad initial values. By ", + "default, 100 random restarts are used to minimize occurrence of case (c). ", + "To change this, use max_restarts = ." + ) + ) +} + +#' Construct a contrast matrix +#' +#' @description +#' `.buildContrastMatrix` constructs matrices for the evaluation of contrasts +#' or trends. It also outputs a matrix for case-wise point estimates. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. However, user-level functions do pass contrast specifications +#' to this function. +#' +#' Contrasts are specified by the `contrasts` argument, which accepts a named +#' list with up to two components, `$groups` and `$at`. These specify the factor +#' levels and covariate values for cases of interest, respectively. Both +#' components can be specified as a character vector or a list, and the +#' function's behavior changes depending on these specifications: +#' +#' \describe{ +#' \item{If `$groups` is a character vector}{Values should be the names of +#' one or more categorical variables in the experimental design. Cases will +#' be defined using all possible combinations of unique values from these +#' variables, and all pairwise combinations of cases will be constructed.} +#' \item{If `$groups` is a list}{The list must be named, with list names +#' corresponding to variable names, and each component of the list should +#' be a character vector giving the factor levels to be included in the +#' contrast matrix.} +#' \item{If `$at` is a charcater vector and `$groups` is `NULL`}{`$at` +#' should specify the names of one or more numeric design variables. Cases +#' will be defined based on the quantiles specified in the `probs` +#' argument.} +#' \item{If `$at` is a charcater vector and `$groups` is not `NULL`}{`$at` +#' should specify the names of one or more numeric design variables. Cases +#' will be evaluated at the mean value for each `$at` variable.} +#' \item{If `$at` is a list}{The list must be named, with list names +#' corresponding to numeric variable names. Each component of the list should +#' be a numeric vector giving the values at which cases should be evaluated.} +#' } +#' +#' Currently, contrasts are defined over all pairwise comparisons of cases. +#' +#' NOTE: this function has had only limited testing. Unexpected behaviors are +#' possible. It may be replaced in the future by a function utilizing the +#' \code{\link{emmeans}} package. +#' +#' @param data a data object returned by \code{\link{.packageFsrData}} +#' @param contrasts a list specifying contrasts +#' @param probs quantile probabilities at which to evaluate numeric covariates +#' @param ... additional arguments (not currently used) +#' +#' @result +#' A list with two entries: +#' \describe{ +#' \item{`$pmat`}{a matrix with one row for each unique case class (i.e., +#' design matrix cell), used to generate estimates for rates in different +#' groups} +#' \item{`$cmat`}{a matrix with one row for each requested contrast} +#' } +#' +#' @seealso +#' \code{\link{.formatFsrResult}} and \code{\link{.doInference}} for usage +#' context. The \code{\link{emmeans}} package for a similar (but much more +#' mature) implementation. +#' +.buildContrastMatrix <<- function(data, contrasts, probs = c(0,0.25,0.5,0.75,1), + ...){ + formula <- formula(data$model) + vars <- all.vars(formula) + if(is.character(contrasts$groups)){ + names(contrasts$groups) <- contrasts$groups + if(any(!names(contrasts$groups) %in% vars)) + stop("\"groups\" must be variables in \"formula\"") + if(any(!names(contrasts$at) %in% vars)) + stop("\"at\" must be variables in \"formula\"") + } + vals <- lapply( + setNames(vars, vars), + function(i){ + v <- data$model[[i]] + if(is.numeric(v)){ + if(i %in% names(contrasts$groups)) + stop("use \"at = ...\" to specify numeric variables in \"contrasts\"") + if(i %in% names(contrasts$at)){ + contrasts$at[[i]] + } else { + if(is.null(contrasts$groups)){ + quantile(v, probs) + } else { + mean(v) + } + } + } else{ + if(is.character(contrasts$groups) | !i %in% names(contrasts$groups)){ + unique(v) + } else { + v[v %in% contrasts$groups[[i]]] + } + } + } + ) + + vals <- lapply(vals, sort) + + cases <- do.call(expand.grid, vals) + if(length(vars) > 0){ + X <- model.matrix(formula, cases) + } else { + X <- model.matrix(formula, data$model[1,]) + } + labels <- lapply(names(cases), function(i) paste0(i, ".", cases[[i]])) + if(length(labels) == 0) labels <- list("rate") + rownames(X) <- apply(as.data.frame(labels), 1, paste, collapse = ":") + if(all(length(vars) > 0, !isFALSE(contrasts), nrow(X) >= ncol(X))){ + pairs <- gtools::combinations(nrow(X), 2) + rownames(pairs) <- paste(rownames(X)[pairs[,2]],"-",rownames(X)[pairs[,1]]) + } else { + pairs <- NULL + } + list( + pmat = X, + cmat = pairs + ) +} + +#' Obtain inferential statistics for an FSR model +#' +#' @description +#' `.doInference` obtains confidence intervals and hypothesis testing results +#' for an FSR model. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' The fitted model object is used to obtain estimates and confidence intervals +#' for rates if `type = "estimate"`, or estimates, intervals, and t-test +#' results if `type = "contrast"`. Confidence intervals and p-values may be +#' calculated using the delta method (`imethod = "delta"`, the default) or by +#' parametric bootstrap `"simulation"` using `nsim` replicates. Estimation can +#' be performed using either a version of the usual MIDA estimator based on the +#' monoisotopic peak (`estimator = "mida"`), or a full mixture model +#' (`"mixture"`). Both estimators are corrected for biases that may occur due +#' to truncation of the observed isotope distribution (OID). They will give +#' identical point estimates, but may give different confidence intervals +#' (I NEED TO CHECK THIS). +#' +#' If `estimator = "mixture"`, the covariance matrix and residual degrees of +#' freedom are adjusted under the assumption that each OID provides only one +#' bit of information (i.e., the peaks within an OID are \strong{NOT} mutually +#' independent). This assumption may be conservative in some cases, but +#' simulations suggest that it yields intervals with empirical coverage that +#' approximates the nominal `level`. +#' +#' @param object a fitted model object returned by \code{\link{.randStart}} +#' @param data a data object returned by \code{\link{.packageFsrData}} +#' @param matrices a list returned by \code{\link{.buildContrastMatix}} +#' containing a case matrix (`$pmat`) and a contrast matrix (`$cmat`) +#' @param type the type of result desired (see Details) +#' @param estimator the estimator used in the optimization (see Details) +#' @param imethod methodology used to calculate confidence intervals and +#' p-values (see Details) +#' @param nsim number of simulations if `imethod = "simulation"` (ignored +#' otherwise) +#' @param level confidence level for interval calculations +#' @param ... additional arguments (currently not used) +#' +#' @result +#' A data frame with the following entries: +#' \describe{ +#' \item{`parameter`}{the name of the estimated quantity (e.g. a treatment or +#' contrast)} +#' \item{`estimate`}{point estimate} +#' \item{`se`}{standard error of the estimate} +#' \item{`lo%`}{lower confidence bound, corresponding to the labeled quantile +#' (e.g., `2.5%`)} +#' \item{`hi%`}{upper confidence bound, corresponding to the labeled quantile +#' (e.g. `97.5%`)} +#' \item{`df`}{residual degrees of freedom used or the t-test (contrasts +#' only)} +#' \item{`t.value`}{t-statistic (contrasts only)} +#' \item{`p.value`}{p-value for the 2-sided test of the null hypothesis that +#' t = 0 (contrasts only)} +#' } +#' If the fitted model `object` has not converged, all entries except the +#' `parameter` name and point `estimate` will be `NA`. +#' +#' @seealso +#' \code{\link{.buildContrastMatrix}} for implementation of contrasts, +#' \code{\link{.formatFsrResult}} for usage context, and +#' \code{\link{.checkConvergence}} for convergence checks. +#' +.doInference <<- function(object, data, matrices, + type = c("estimate","contrast"), + estimator = c("mixture","mida"), + imethod = c("delta","simulation"), + nsim = 10000, level = 0.95, ...){ + a <- c((1 - level)/2, (1 + level)/2) + cinames <- paste0(format(a*100, justify = "none", trim = TRUE), "%") + outnames <- c("parameter","estimate","se", cinames, "df","t.value","p.value") + p <- coef(object) + np <- length(p) + pmat <- matrices$pmat + + switch( + match.arg(type), + estimate = { + cmat <- NULL + pnames <- rownames(pmat) + outnames <- outnames[1:5] + ifun <- data$rate + }, + contrast = { + cmat <- matrices$cmat + if(is.null(cmat)) return(NULL) + pnames <- rownames(cmat) + ifun <- data$difference + } + ) + fit <- as.vector(ifun(pmat, p, cmat)) + + if(.checkConvergence(object)){ + + switch( + match.arg(estimator), + mixture = { + rdf <- df.residual(object) - nrow(data$y) + cvmat <- vcov(object) * df.residual(object)/rdf + }, + mida = , mida_dr = { + rdf <- df.residual(object) + cvmat <- vcov(object) + } + ) + + switch( + match.arg(imethod), + delta = { + dfit <- numericDeriv( + quote(ifun(X, pars, contr)), + theta=c("pars"), + rho = list2env(list(X = pmat, pars = p, contr = cmat)) + ) + grd <- attr(dfit, "gradient") + se <- sqrt(diag(grd %*% cvmat %*% t(grd))) + ci <- fit + se %o% qt(a, rdf) + tv <- fit/se + pv <- 2 * (1 - pt(abs(tv), rdf)) + }, + simulation = { + b <- mvtnorm::rmvt(nsim, cvmat, delta = p, df = rdf) + theta <- ifun(X = pmat, pars = t(b), contr = cmat) + se <- apply(theta, 1, sd) + ci <- t(apply(theta, 1, quantile, a)) + tv <- fit/se + pv <- 2 * min((pv <- sum(y >= 0)/nsim), 1 - pv) + } + ) + + ci <- setNames(as.data.frame(ci), cinames) + + data.frame( + parameter = pnames, + estimate = fit, + se = se, + ci, + df = rdf, + t.value = tv, + p.value = pv, + check.names = FALSE + )[,outnames] + + } else { + + ci <- setNames(as.data.frame(t(rep(NA, length(a)))), cinames) + + data.frame( + parameter = pnames, + estimate = fit, + se = NA, + ci, + df = NA, + t.value = NA, + p.value = NA, + check.names = FALSE + )[,outnames] + + } +} + +#' Check the convergence of a fitted FSR model +#' +#' @describtion +#' Checks to make sure that the fitted model object has actually converged on +#' a useable result. Allows for checking of different model classes in the +#' future. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' `.checkConvergence` is a simple wrapper that evaluates the class of its +#' input `object` and then runs a specified check (or series of checks) on the +#' `object` to make sure it meets minimal convergence criteria. An error is +#' returned if the `object` does not inherit from a recognized class. +#' +#' Currently, the only class recognized by `.checkConvergence` is `"nls.lm"`. +#' Models of this class are required to have an exit code < 4 (see `??nls.lm`) +#' and a non-singular, positive definite (i.e., solvable) Hessian. +#' +#' @param object a fitted model object +#' +#' @result +#' A logical value indicating whether the convergence check was passed, +#' or an error. +#' +.checkConvergence <<- function(object){ + switch( + class(object), + "nls.lm" = all( + object$info < 4, + !isFALSE( + tryCatch( + solve(object$hessian), + error = function(e) FALSE + ) + ) + ), + "\"object\" is not from a recognized class." + ) +} + +#' Parameter optimization with random restarts +#' +#' @describtion +#' `.randStart` repeatedly calls \code{\link{minpack.lm::nls.lm}} with +#' random starting values to build confidence in the stability of the +#' resulting solution. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' \code{\link{minpack.lm::nls.lm}} will occasionally return a local rather +#' than global minimum. To minimize this risk, the optimizer is called up to +#' `max_restarts` times with different, random starting values, and the result +#' with the smallest objective function is returned. To reduce needless +#' computation, the function may quit searching after `nquit` tries fail to +#' improve model deviance. Optionally, it may also quit with an error if more +#' than `nerr` sequential tries fail to produce a model with a sensible Hessian. +#' +#' @param pars a numeric parameter vector (actual values are ignored) +#' @param data a data list returned by \code{\link{.packageFsrData}} +#' @param fn the objective function to be optimized (NOW IN DATA) +#' @param lower a vector of lower optimization bounds with the same length +#' and names as `par` +#' @param upper a vector of upper optimization bounds with the same length +#' and names as `par` +#' @param nquit search will end after `nquit` successive tries without +#' improvement +#' @param max_restarts maximum number of restarts +#' @param tol NO LONGER USED +#' @param nerr search will quit with an error if `nerr` successive tries +#' fail to converge on a solution +#' @param silent logical; messages are suppressed if `TRUE` +#' @param ... additional arguments passed to \code{\link{minpack.lm::nls.lm}} +#' +#' @result +#' A fitted \code{\link{nls.lm}} object, or an error. +#' +#' @seealso +#' \code{\link{minpack.lm::nls.lm}} +#' +.randStart <<- function(data, nquit = 3, max_restarts = 100, tol = 1e-8, + nerr = max_restarts, silent = TRUE, ...){ + FN <- data$objective + PARS <- .initializeParameters(data$model, ...) + tmax <- max(data$time) + # X <- with(data, model.matrix(formula, model)) + # u <- apply(X, 1, paste, collapse="") + # X <- X[match(unique(u), u),] + if("(Intercept)" %in% names(PARS$start)){ + lwr <- c(-2.5, rep(0, length(PARS$start)-1)) + upr <- -1 * lwr + } else { + lwr <- -2.5 + upr <- 2.5 + } + pars <- PARS$start + starts <- replicate(max_restarts, runif(length(pars), lwr, upr)) + if(is.vector(starts)) starts <- t(starts) + dv <- Inf + errors <- 0 + for(i in 1:max_restarts){ + pars[] <- starts[,i] + mod <- try( + minpack.lm::nls.lm(pars, lower = PARS$lower, upper = PARS$upper, + fn = FN, data = data, ...), + silent = TRUE + ) + check <- try(solve(mod$hessian), silent = TRUE) + if(inherits(check, "try-error")){ + errors <- errors + 1 + if(errors > nerr & !is.finite(dv)){ + stop(paste0(nerr, " errors without progress:\n", mod)) + } + } else { + errors <- 0 + } + + if(is.na(dv > deviance(mod))) browser() + + if(dv > deviance(mod)){ + dv <- deviance(mod) + winner <- mod + nbest <- 0 + } else { + nbest <- nbest + 1 + if(nbest == nquit) break + } + } + if(!silent) message(paste0( + "Winner after ", i, " restarts (", nbest, " wih no improvement)." + )) + return(winner) +} + +#' Normalization of observed and theoretical isotope distributions +#' +#' @describtion +#' Provides several normalization methods, including baseline normalization, +#' internal proportional normalization, and internal maximal normalization. +#' WARNING: internal proportional methods (i.e., to self) are not valid for +#' comparisons of distributions truncated at different quantiles. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' Given a data object with observed (OID) and theoretical distributions, this +#' function will normalize the OID for each sample (`data$y`) and the +#' theoretical asymptotic distribution (`data$fa`) to the theoretical baseline +#' distribution (`data$f0`). The default `baseline` method corrects for the +#' potential truncation of the OIDs and allows MIDA to yield unbiased estimates +#' for FSR. +#' +#' Alternative methods are provided for reference only and \strong{will} +#' produce biased results if OIDs are significantly truncated. The `proportion` +#' method normalizes each sample OID, `$fa`, and `$f0` so that the peaks +#' within the pattern sum to 1. The `maximum` method normalizes each pattern +#' relative to its largest peak, so that the maximum is equal to 1. +#' +#' @param data a data list returned by \code{\link{.packageFsrData}} +#' @param prop_new a numeric vector giving the proportion of newly synthesized +#' peptide in the sample(s) (i.e., the mixing fraction for `fa` and `f0`) +#' @param normalization the normalization method to be used. +#' @param ... additional arguments (currently unused) +#' +#' @result +#' A new copy of `data` in which `$y`, `$f0`, and `$fa` are normalized . +#' +#' @seealso +#' \code{\link{.ueicGivenp}} for an example of use context +#' +.normalizeIsotopePatterns <- function(data, prop_new = NULL, + normalization = "baseline", + ...){ + switch( + normalization, + baseline = { + stopifnot(all(prop_new >= 0, prop_new <= 1)) + ftmsk <- with(data, ((1 - prop_new) * f0 + prop_new * fa) * mask) + denom <- with(data, rowSums(f0 * mask)) + data$f0 <- with(data, (f0 * mask)/denom) + data$fa <- with(data, (fa * mask)/denom) + data$y <- with(data, (y * mask)/rowSums(y * mask) * rowSums(ftmsk)/denom) + }, + proportion = { + data$f0 <- with(data, (f0 * mask)/rowSums(f0 * mask)) + data$fa <- with(data, (fa * mask)/rowSums(fa * mask)) + data$y <- with(data, (y * mask)/rowSums(y * mask)) + }, + maximum = { + rowMax <- function(x, na.rm = FALSE) apply(x, 1, max, na.rm = na.rm) + data$f0 <- with(data, (f0 * mask)/rowMax(f0 * mask)) + data$fa <- with(data, (fa * mask)/rowMax(fa * mask)) + data$y <- with(data, (y * mask)/rowMax(y * mask)) + }, + stop(paste0( + shQuote(normalization), " is not a recognized normalization.\n", + " Please use \"baseline\", \"proportion\", or \"maximum\"." + )) + ) + data +} + +#' One-compartment exponential FSR model with given enrichment +#' +#' @describtion +#' Generates objective, rate, and difference functions for fitting and +#' interpreting the one-compartment exponential model. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' DESCRIBE THE MODEL HERE. +#' +#' @result +#' A named list with components `$objective`, `$rate`, and `$difference`, +#' each containing an R function, and `model_name`, containing a single +#' string-value description. `$objective` provides for calculation of the +#' residuals for a given dataset and hyperparameter vector. `$rate` converts +#' hyperparameters into parameters (i.e., rates) for cases based on a given +#' design matrix. `$difference` performs the same conversion, but returns +#' the difference (i.e., contrast) between two specified cases. +#' +#' @seealso +#' \code{\link{buildFsrModel}} or model specification guidelines. +#' +.ue1cGivenp <<- function(){ + list( + description = "exponential, 1 compartment, enrichment given", + objective = { # objective function to be optimized + function(pars, data, estimator = c("mixture","mida"), ...){ + estimator <- match.arg(estimator) + mmat <- with(data, model.matrix(formula, model)) + khat <- with(data, as.vector(rate(mmat, pars))) # linpred k here. + frac_new <- with(data, 1 - exp(-khat * time)) # note the complement! so w is P(new) instead of P(old) + data <- .normalizeIsotopePatterns(data, frac_new, ...) + resid <- with(data, { + switch( + estimator, + mixture = { + yhat <- ((1 - frac_new) * f0 + frac_new * fa) * mask + ((y - yhat)/sqrt(yhat))[mask] # Pearson residual with n = 1 + }, + mida = { + yhat <- fa[,1] + (f0[,1] - fa[,1]) * exp(-khat * time) + (y[,1] - yhat) + }, + mida_dr = { + i <- apply(abs(fa - f0), 1, which.max) + frac_new <- sapply(seq_along(i), function(ii) + (-log(1-(y[,ii] - f0[,ii])/(fa[,ii] - f0[,ii]))) + ) + k <- frac_new/time + (k - khat) + } + ) + }) + return(resid) + } + }, + rate = { # function to obtain target parameters from hyperparameters + function(X, pars, contr = NULL) + exp(X %*% pars) # rates + }, + difference = { # function to obtain contrasts from hyperparameters + function(X, pars, contr = NULL) + exp(X[contr[,2],] %*% pars) - exp(X[contr[,1],] %*% pars) # contrasts + } + ) +} + +#' Use a residual function to build an objective function +#' +#' @description +#' Given an R function that returns a vector of residuals for a model, +#' return a new function with the same formal arguments that returns a +#' scalar objective value. +#' +#' @details +#' `.makeObjectiveFunction` provides a utility to simplify switching between +#' optimizers that expect a residual vector (e.g., +#' \code{\link{minpack.lm::nls.lm}}) and optimizers that expect a scalar +#' objective function (e.g., \code{\link{optim}}). Currently, it only +#' returns the sum-of-squared residuals, but other objective functions +#' can easily be added using a \code{\link{switch}} with an argument. +#' +#' @param resid_fun an R function. Technically, any function will work, but +#' only functions that return numeric vectors will produce usable output. +#' +#' @eamples +#' .makeObjectiveFunction(rnorm) +#' +.makeObjectiveFunction <- function(resid_fun){ + args <- formals(resid_fun) + argstring <- paste( + paste0(names(args), "=", names(args)), + collapse = ", " + ) + resid_fun <- deparse(substitute(resid_fun)) + cmd <- paste0( + "function(){ + err <- ", resid_fun, "(", argstring, ") + sum(err^2) + }" + ) + func <- eval(parse(text = cmd)) + formals(func) <- args + return(func) +} + +#' Plot the normalized isotope patterns for a protein +#' +#' @description +#' Uses \code{\link{ggplot2}} to generate plots showing the normalized +#' observed isotope distributions overlaid against the envelope created by +#' the baseline and asymptotic distributions. The plot is faceted by +#' peptide sequence. Line colors indicate different experimental conditions. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' The envelope between the baseline and asymptotic functions is shown in +#' grey, and the boundaries are labeled f_0 for the baseline and f_a for +#' the asymptotic distribution. Experimental results show one line for +#' each sample, color-coded by experimental condition. Experimental data +#' should fall within the envelope. If they do not, one or more of the +#' following problems may be responsible: +#' \enumerate{ +#' \item{The peptide sequence may have been misidentified.} +#' \item{The molar fractions used to calculate the baseline isotope +#' distribution may not be representative for the problem sample(s).} +#' \item{The body water enrichment used to calculate the asymptotic +#' distribution may not be accurate.} +#' } +#' Note that issue 1 should occur idiosyncratically, while issues 2 and 3 +#' will affect all of the data for one or more samples. +#' +#' Aesthetic note: Although isotope distributions are discrete, `.plotFsr2` +#' represents them with continuous lines. This is a deliberate choice intended +#' to make it easier to distinguish the overall patterns in the data. +#' +#' @param data an input object generated by \code{\link{.packageFsrData}} +#' +#' @result +#' A list containing the gg-object with instructions to draw the plot. +#' +.plotFsr2 <<- function(data){ + data$y[data$y == 0] <- NA + df <- data.frame( + Sample = data$sample, + Sequence = data$sequence, + Condition = if(ncol(data$model) > 0){ + interaction(data$model) + } else { + data$sample + }, + f0 = c(data$f0), + fa = c(data$fa), + y = c(data$y), + m = c(col(data$y)) - 1 + ) + lbls <- dplyr::reframe( + dplyr::group_by(df, Sequence), + x = -0.3, + y = c(max(f0), max(fa)), + label = c("italic(f)[0]", "italic(f)[a]"), + col = c("navyblue", "goldenrod") + ) + plt <- ggplot2::ggplot(df, ggplot2::aes(x=m)) + + ggplot2::geom_ribbon(ggplot2::aes(ymin = f0, ymax = fa), alpha = 0.1) + + ggplot2::geom_line(ggplot2::aes(y=f0), color = "navyblue", alpha=0.4) + + ggplot2::geom_line(ggplot2::aes(y=fa), color = "goldenrod", alpha=0.4) + + ggplot2::geom_line(ggplot2::aes(y=y, color = Condition, group = Sample)) + + ggplot2::labs(x = "Peak index", y = "Normalized intensity") + + ggplot2::scale_x_continuous(breaks = unique(df$m)) + + ggplot2::facet_wrap(.~Sequence) + + ggplot2::geom_text(ggplot2::aes(x=x,y=y,label=label), data=lbls, + parse = TRUE, size = 6, family = "serif", + color = lbls$col) + + ggplot2::theme_bw(base_size = 14) + list(plt) +} +.getFsrPlots <<- function(x){ + pp <- lapply(lapply(x, "[[", "plot"), "[", 1, 1) + nms <- sapply(pp, names) + has_plot <- !sapply(nms, is.null) + setNames( + sapply(pp, function(i) i[[1]])[has_plot], + nms[has_plot] + ) +} + + +#' Perform quality control checks and report results +#' +#' @description +#' Performs quality control checks to make sure that data meet +#' specified standards. +#' +#' @details +#' This is an internal function. It is not intended to be called directly +#' by users. +#' +#' `.checkMsmsDataQuality` checks for (and removes) peptides flagged by +#' MaxQuant as reverse reads or common contaminants. Next, it counts the +#' number of isotope patterns reported for each peptide, and removes peptides +#' with no patterns or peptides whose patterns are insufficiently consistent +#' within a sample (if > 1 scan-level pattern is available). The required +#' level of consistency is set by `mincorr` and evaluated by a correlation +#' `method`. At the sequence level, peptides are required to have pattern data +#' in at least `mincount` samples. Results of the quality control checks +#' are output to a summary `file`. +#' +#' @param pep data frame of peptide data +#' @param method correlation method; see \code{\link{stats::cor}} +#' @param mincorr numeric value giving the minimum required interscan +#' correlation +#' @param mincount numeric value giving the required number of samples with +#' isotope pattern data. +#' @param N total number of samples +#' @param file character value specifying a file path for the quality control +#' summary +#' +#' @result +#' A new version of the `pep` data frame that only contains peptides that +#' passed the quality control checks. As a side effect, a summary of the +#' quality control checks is saved to disk. +#' +.checkMsmsDataQuality <<- function(pep, method, mincorr, use_single_scans, + mincount, sample_data, formula, N){ + message(paste0(round(Sys.time()), ": Running quality control filters")) + msg <- c( + " - Removing MaxQuant-flagged contaminants and reverse reads", + paste0( + " - Minimum required correlation among isotope patterns within ", + "a sample: ", mincorr), + paste0( + " - Minimum required number of samples/design cell with ", + "isotope pattern data for peptide: ", mincount) + ) + message(paste(msg, collapse = "\n")) + + # Basic counts + totals <- .countSampPepProt(pep, "Raw") + + # Reverse read and contaminant removal + bad_qc <- with(pep, c( + contaminants = sum(grepl("CON__", Proteins, fixed = TRUE)), + reverse_reads = sum(Proteins == "") + )) + pep <- dplyr::filter(pep, !grepl("CON__", Proteins, fixed = TRUE), Proteins != "") + totals <- cbind(totals, .countSampPepProt(pep, "- Bad reads")) + + # Sample-sequence quality control checks + # (require at least one pattern, and consistency among scans within a sample) + suppressMessages({ + sampseq_qc <- pep %>% dplyr::group_by(Raw.file, Sequence) %>% + dplyr::summarize( + m = sum(!is.na(Isotope.pattern)), + r = .isotopePatternCorrelation(Isotope.pattern, method) + ) + if(use_single_scans){ + pep <- dplyr::full_join(pep, sampseq_qc, by = c("Raw.file","Sequence")) %>% + dplyr::filter(m == 1 | r >= mincorr) + } else { + pep <- dplyr::full_join(pep, sampseq_qc, by = c("Raw.file","Sequence")) %>% + dplyr::filter(m >= 1, r >= mincorr) + } + totals <- cbind(totals, .countSampPepProt(pep, " - Inconsistent scans")) + }) + # Sequence-level quality control checks + # (require patterns for a minimum percentage of samples) + suppressMessages({ + seq_qc <- pep %>% + dplyr::mutate(..case = .getDesignCases(sample_data, formula, Raw.file)) %>% + dplyr::group_by(Sequence) %>% + dplyr::summarize(n = max(table(..case[match(unique(Raw.file), Raw.file)]))) #length(unique(Raw.file))) + pep <- dplyr::full_join(pep, seq_qc, by = "Sequence") %>% + dplyr::filter(n >= mincount) + totals <- cbind(totals, .countSampPepProt(pep, "- Low pattern count")) + }) + # Write quality control summary and return data + qcrep <- .writeQcSummary(msg, totals, bad_qc, sampseq_qc, seq_qc, N) + attr(pep, "qc") <- qcrep + pep +} + +#' Internal functions called by `.checkMsmsDataQuality` +#' +#' @description +#' These are internal functions that handle repetitive tasks related to +#' quality control reporting. They are not intended to be called by users. +#' +#' @details +#' `.countSampPepProt` counts the number of samples, peptides, proteins, and +#' protein groups in a data frame. `.isotopePatternCorrelation` calculates +#' the correlations between different scans for a given isotope pattern. +#' `.writeQcSummary` formats the quality control summary and writes it to disk. +#' +#' @param x data frame of peptide data (`.countSampPepProt`) or a character +#' vector of delimited isotope patterns (`.isotopePatternCorrelation`) +#' @param name a variable name for the summary line output by +#' `.countSampPepProt` +#' @param method a correlation method passed to \code{\link{stats::cor}} +#' @param sep character separating peak intensities within an isotope pattern +#' @param ret_min logical; if `TRUE` and >2 scans are present, the minimum +#' pairwise correlation is returned. Otherwise the entire matrix is retuned +#' @param file file path for the QC summary +#' @param pars a message describing the quality control parameters used +#' @param totals data frame giving counts at each stage of the procedure +#' @param bad data frame giving counts of reverse reads and contaminants +#' @param sampseq data frame with counts from the sample-sequence level checks +#' @param seq data frame with counts from the sequence level checks +#' @param N total number of samples +#' @param file character value specifying a file path for the quality control +#' summary +#' +#' @seealso +#' \code{\link{.checkMsmsDataQuality}} +#' +.countSampPepProt <- function(x, name){ + x <- with(x, data.frame(c( + samples = length(Raw.file), + peptides = length(unique(Sequence)), + proteins = length(unique(unlist(sapply(Proteins, strsplit, ";")))), + protein_groups = length(unique(Proteins)) + ))) + names(x) = name + x +} +.isotopePatternCorrelation <- function(x, method = "spearman", + sep = ";", ret_min = TRUE){ + if(length(x) == 1) return(NA) + y <- strsplit(na.omit(x), sep) + if(length(y) <= 1) return(NA) + L <- sapply(y, length) + for(i in 1:length(y)){ + y[[i]] <- as.numeric(y[[i]]) + if(L[i] < max(L)) y[[i]][(L[i]+1):max(L)] <- 0 + } + X <- do.call(cbind, y) + R <- cor(X, method = method) + # if(any(is.na(R))) browser() + if(ret_min){ + R[which.min(abs(R))] + } else { + R + } +} +.getDesignCases <<- function(samples, formula, ids){ + getDesignGroups(samples, formula)[match(ids, samples$File)] +} +.getGSS <- function(ids, samples, formula){ + n <- groupedSampleSize(dplyr::filter(samples, File %in% ids), formula) + if(length(n) == 1) n else(max(n[-1])) +} +.writeQcSummary <<- function(pars, totals, bad, sampseq, seq, N){ + totals <- cbind(Level = rownames(totals), totals) + sampseq <- dplyr::summarize( + sampseq, + `m = 0` = sum(m == 0), + `m = 1` = sum(m == 1), + `r < 0` = sum(r < 0, na.rm = TRUE), + `r in (0,0.6]` = sum(r >= 0 & r < 0.6, na.rm = TRUE), + `r in (0.6,0.8]` = sum(r >= 0.6 & r < 0.8, na.rm = TRUE), + `r in (0.8,0.9]` = sum(r >= 0.8 & r < 0.9, na.rm = TRUE), + `r in (0.9,0.99]` = sum(r >= 0.9 & r < 0.99, na.rm = TRUE), + `r >= 0.99` = sum(r >= 0.99, na.rm = TRUE) + ) %>% + dplyr::bind_rows(., dplyr::bind_cols( + data.frame(Raw.file = c("Total (n)", "Total (%)", "Total (% | m > 1)")), + dplyr::bind_rows( + colSums(.[,-1]), + round(100 * colSums(.[,-1])/sum(.[,-1]), 2), + round(100 * colSums(.[,-(1:3)])/sum(.[,-(1:3)]), 2) + ) + )) + seq <- seq %>% count(n, name = "Count") %>% + dplyr::mutate(`%` = round(100 * Count/sum(Count), 2)) + + report <- list( + `quality_control` = c( + paste(rep("-", 80), collapse = ""), "QUALITY CONTROL SUMMARY:", + paste(rep("-", 80), collapse = ""), "", pars, + "","Sample, Peptide, and Protein counts after checks:", + paste(rep("-", 49), collapse = "") + ), + totals = totals, + header2 = c( + "","Counts of bad reads flagged by MaxQuant:", + paste(rep("-", 40), collapse = "") + ), + bad = t(bad), + header3 = c( + "","Sample-sequence level (scans/sequence/sample):", + "[m = number of isotope patterns, r = correlation among patterns if m > 1]", + paste(rep("-", 73), collapse = "") + ), + sampseq = sampseq, + header4 = c( + "", paste0("Sequence level (samples/sequence, of possible ", N, "):"), + "[number of peptides that have patterns in n samples]", + paste(rep("-", 52), collapse = "") + ), + seq + ) + return(report) +} + +.writeQcAddendum <<- function(results, notrun){ + ingroups <- with( + results$metadata, + unique(sapply(strsplit(Protein[Group], ";"), "[[", 1)) + ) + c( + "", + "Final protein counts:", + paste0("FSR estimation was attempted for ", + sum(!results$metadata$Group), " uniquely identified proteins"), + paste0("and ", sum(results$metadata$Group), + " protein groups that include an additional ", length(ingroups), + " possible proteins"), + paste0("(any given protein in a group may or may ", + "not be present in the sample)."), + paste0("Estimation succeeded for ", + with(results$metadata, sum(Convergence & !Group)), + " uniquely identified proteins and ", + with(results$metadata, sum(Convergence & Group)), + " protein groups."), + paste0("Convergence failed in ", sum(!results$metadata$Convergence), + " attempts."), + paste0("FSR estimation was not attempted for ", length(notrun[[1]]), + " proteins or protein groups that appear in the MaxQuant"), + "results but failed to meet quality control standards.", "", + "A list of unestimated proteins is available under \"not_run\"." + ) +} + +#' Internal functions for FSR simulation +#' +#' @description +#' These functions are used to generate simulated datasets (`.genFsrSim`), +#' to analyze them (`.fitFsrSim`), and to evaluate error statistics for the +#' simulation (`.appendErrorStats`). +#' +#' @details +#' These are internal functions. They are not intended to be called directly +#' by users. +#' +#' `.genFsrSim` expects a data frame `X` that includes (at a minimum) columns +#' for `Subject`, `Protein`, `Sequence`, `Rate`, `Delta`, `Time`, `Count`, +#' `Trap_factor`, and `Noise_threshold`, as produced internally by +#' \code{\link{simFsr}}. These columns define the parameters that +#' \code{\link{simulateIsotopePattern}} needs to generate an observed isotope +#' distribution (OID) for each row in `X`. `.genFsrSim` also expects a +#' named parameter list, `pars`, that must include entries for `bwe_mean`, +#' `bwe_sd`, `maxpeaks`, `baserate` and `delta`. The last two entries should +#' correspond to the `Rate` and `Detla` values in `X`. +#' +#' Simulation proceds as follows: +#' \enumerate{ +#' \item{A body water enrichment (BWE) value is drawn for each unique +#' `Subject` (i.e., biological replicate) in `X`, where +#' $BWE \sim N(bwe_mean, bwe_sd)$.} +#' \item{Theoretical baseline and asymptotic distributions are generated for +#' each row in `X` based on the `Sequence` and `BWE` (to reduce computation, +#' theoretical distributions are first generated for each unique combination +#' `Sequence` and `BWE`, and then looked up for each row). Theoretical +#' distributions are stored in `X` as a list column named `Theory`.} +#' \item{An OID is simulated for each row in `X` based on its `Sequence`, +#' theortical distributions, and `BWE`, `Time`, true fractional synthesis rate +#' (` = Rate + Delta`), and observation model parameters (`maxpeaks`, `Count`, +#' `Trap_factor`, and `Noise_threshold`). The simulated OID is appended to +#' `X` as a column named `Isotope.pattern` and stored as a character vector +#' containing semi-colon (;) separated peak values for each row (this mimics +#' the format used by MaxQuant).} +#' \item{A modified version of `X` with simulated OID and theoretical +#' distributions is returned.} +#' } +#' +#' `.fitFsrSim` expects the data frame generated by `.genFsrSim` and parameter +#' list `pars`. It determines whether any `Delta != 0` exist, and generates a +#' model formula as appropriate (if nonzero `Delta`s are present, there must +#' also be a column in `X` called `Treatment` to distinguish "treatment" and +#' "control" groups (TODO: Generate this automatically inside `.fitFsrSim`)). +#' `X` is then segmented into peptide and sample tables, mimicking input from +#' MaxQuant, and then passed to \code{\link{.doMaxQuantFsr}} for model fitting. +#' Finally, `.appendErrorStats` is called to append the true rates (or rate +#' contrasts), and estimation error, along with an indicator reporting whether +#' the true value falls inside the estimated confidence interval. +#' +#' @param X a data frame (see Details) +#' @param pars a named list of simulation parameters (see Details) +#' @param ... additional arguments passed to \code{\link{.doMaxQuantFsr}} +#' @param x a list of estimation results returned by +#' \code{\link{.doMaxQuantFsr}} +#' +#' @result +#' +#' @seealso +#' +.genFsrSim <<- function(X, pars, ...){ + `%>%` <- magrittr::`%>%` + enrichment <- function(id, mean, sd){ + uid <- unique(id) + setNames(rnorm(length(uid), mean, sd), uid)[id] + } + X <- dplyr::mutate(X, BWE = enrichment(Subject, pars$bwe_mean, pars$bwe_sd)) + useq <- unique(X$Sequence) + ubwe <- unique(X$BWE) + THEORY <- peptidesToIsotopePatterns(useq, label = list(H2 = ubwe), + nMax = pars$maxpeaks) + getTheory <- function(sequence, bwe) THEORY[[sequence]][[bwe]] + X <- dplyr::rowwise(X) %>% + dplyr::mutate( + Theory = list(getTheory(Sequence, match(BWE, ubwe))), + Isotope.pattern = simulateIsotopePattern( + theory = Theory, + model = pars$model, + rate = Rate + Delta, + time = Time, + count = Count, + trap_factor = Trap_factor, + noise_threshold = Noise_threshold, + collapse = ";" + ) + ) %>% + dplyr::ungroup() + X +} +.fitFsrSim <<- function(X, pars, ...){ + `%>%` <- magrittr::`%>%` + if(any(X$Delta != 0)){ + formula <- ~Treatment + } else { + formula <- ~1 + } + pep <- dplyr::transmute(X, Raw.file = File, Proteins = Proteins, + Sequence = Sequence, Isotope.pattern = Isotope.pattern) + smpl <- dplyr::select(X, File, Subject, BWE, Time, Treatment) %>% + dplyr::slice(match(unique(File), File)) + res <- .doMaxQuantFsr(x = pep, formula = formula, sample_data = smpl, + minseq = 1, mMax = pars$maxpeaks, model = pars$model, + contrasts = list(), level = 0.95, plot = FALSE, ...) + res <- .appendErrorStats(res, pars) + res +} +.appendErrorStats <<- function(x, pars){ + tables <- c("rates","tests") + for(what in tables){ + if(!what %in% names(x)) next + truth <- switch( + what, + rates = with(pars, baserate + delta), + tests = with(pars, diff(baserate + delta)) + ) + x[[what]]$truth <- truth + x[[what]]$error <- x[[what]]$estimate - truth + x[[what]]$in_ci <- x[[what]]$`2.5%` < truth & truth <= x[[what]]$`97.5%` + } + return(x) +} + +#' Internal functions for isotope pattern calculations +#' +#' @description +#' These functions are used to generate theoretical isotope patterns from +#' peptide sequences and data on isotopic molar fractions. +#' +#' @details +#' These are internal functions. They are not intended to be called directly +#' by users. +#' +#' \describe{ +#' \item{`.atomsToPattern`}{organizes the calculation of baseline and labeled +#' mass isotope patterns for a given peptide sequence.} +#' \item{`.labeledEIP`}{implements the actual calculation of baseline and +#' enriched elemental isotope distributions, and then combines them to return +#' molecular isotope distributions.} +#' \item{`.truncateElementalIsotopePattern`}{calculates the multinomial isotope +#' distribution for a given element based on a given molar fraction and number +#' of atoms, and truncates its tail. The theoretical patterns include n + 1 +#' peaks (n = the number of atoms), but most higher-order peaks are too small +#' to be detected. Given a maximum of `nMax` peaks of interest, the function +#' truncates the higher-order peaks (index > `nMax`). This greatly reduces the +#' sample space dimension for the resulting joint distribution in downstream +#' calculations, while preserving the relative probabilities of the remaining +#' peaks. This is important because `.calcPattern` has complexity on the order +#' of O(n^N), where n is the average number of peaks/element and N is the +#' number of elements.} +#' \item{`.calcPattern`}{calculates the mass isotopomer distribution for a +#' compound with (possibly) labeled and unlabeled components. Given a set of +#' N isotope pattern vectors for N elements in a compound, where vector i +#' describes the probability for each of K nominal mass differences from the +#' monoisotope due to element i, the sample space for potential total mass +#' differences over the compound is described by a pair of tensors. The first +#' (`.MASSPATTERN`) describes the probabilities of disjoint outcomes and +#' results from a cumulative outer product of the pattern vectors. The second +#' (`.MASSDELTA`) describes the corresponding numbers of extra neutrons in +#' the compound, and results from a cumulative outer sum of the pattern +#' labels. Summation of the probabilities over each delta value yields the +#' compound-level mass isotopomer pattern.} +#' \item{`.getElementCount`}{counts the number of atoms of each element in a +#' molecular formula} +#' } +#' +#' +#' @param x a character vector providing an amino acid sequence (one amino acid +#' per element) (`.atomsToPattern`) or a vector of molecular formulas +#' (`.getElementCount`) +#' @param m a numeric vector of peak indices starting at 0 +#' @param label a named list of isotopic labels with the form +#' `list(isotope = enriched_fraction_value)` +#' @param isotopes a character vector identifying isotopes to be used in +#' calculations. Values must occur in `rownames(fractions)`. +#' @param aatable data frame of amino acid properties; see +#' \code{\link{amino_acids}}. Row names must identify the amino acid by +#' symbol ("A" for alanine, "R" for arginine, etc.), and variables must include +#' a `"formula"` providing the molecular composition. +#' @param fractions data frame of stable heavy isotopes; see +#' \code{\link{stable_isotopes}}. Row names must identify the isotopes (e.g., +#' "C13") and variables must include `element` and `molar_fraction`. In +#' standard usage, the second variable is added by the calling wrapper, +#' `peptidesToIsotopePatterns`. +#' @param lbl an element from `label` +#' @param aatable a dta frame of amino acid properties +#' @param elem a character value identifying a target element (e.g., "C") +#' @param n a named numeric vector giving the elemental composition of the +#' target peptide (as illustration, methane would be `c(C = 1, H = 4)`). +#' @param nMax the number of isotopic peaks to include in the calculation +#' (= max(m) + 1) +#' @param unlabeled a named list of numeric vectors providing the element-wise +#' mass isotope distributions for an unlabeled peptide +#' @param labeled a named list of numeric vectors providing the element-wise +#' mass isotope distributions for an isotopically labeled peptide +#' @param pos a matrix-list specifying the text position of a given elemental +#' symbol in a molecular formula +#' @param i row indices for `pos` +#' @param j column indices for `pos` +#' +.atomsToPattern <- function(x, m, label, isotopes, aatable, fractions){ + if(all(length(label)>0, any(!names(label) %in% isotopes))){ + isotopes <- c(label, isotopes) + } + if(any((badiso <- !isotopes %in% rownames(fractions)))){ + stop(paste0( + "The following isotopes are missing from enrichment data: \n", + paste(isotopes[badiso], collapse = ", ") + )) + } + nMax <- max(m)+1 + elements <- unique(gsub("\\d", "", isotopes)) + elements <- setNames(elements, elements) + n0 <- colSums(countAtoms(aatable[x,"formula"])[, elements, drop = FALSE]) + n0 <- n0[n0 > 0] + base_pm0 <- lapply(elements[names(n0)], .truncateElementalIsotopePattern, + n = n0, fractions = fractions, nMax = nMax) + + if(length(label) > 0){ + environment(.labeledEIP) <- environment() + apply(as.data.frame(label), 1, .labeledEIP, aatable=aatable[x,,drop = FALSE]) + } else { + list( + baseline = .calcPattern(m, base_pm0), + asymptotic = NULL + ) + } +} +.labeledEIP <- function(lbl, aatable){ + elems <- unique(gsub("\\d", "", names(lbl))) + elems <- setNames(elems, elems) + nL <- colSums(aatable[x, elems, drop = FALSE]) + n0[elems] <- n0[elems] - nL + pm0 <- lapply(elements[names(n0)], .truncateElementalIsotopePattern, + n = n0, fractions = fractions, nMax = nMax) + isos <- names(lbl) + fractions[isos, "molar_fraction"] <- lbl[] + pmL <- lapply(elems, .truncateElementalIsotopePattern, n = nL, + fractions = fractions, nMax = nMax) + list( + baseline = .calcPattern(m, base_pm0), + asymptotic = .calcPattern(m, pm0, pmL) + ) +} +.truncateElementalIsotopePattern <- function(elem, n, fractions, nMax, + warn = 0.05){ + with(subset(fractions, element == elem), { + pe <- elementalIsotopomerPattern(n[elem], molar_fraction, delta) + # pe[nMax + 1] <- sum(pe[-(1:nMax)]) + if((sumpe <- sum(pe[-(1:nMax)])) > warn){ + + # browser() + + warning(paste0("Total truncated probability for ", elem, " is ", + round(sumpe, 3), ".\n Consider a larger nMax.")) + } + pe[1:nMax] + }) +} +.calcPattern <- function(m, unlabeled = list(), labeled = list()){ + .MASSPATTERN <- NULL + if(length(unlabeled) > 0){ + .MASSPATTERN <- unlabeled[[1]] + .MASSDELTA <- as.numeric(names(unlabeled[[1]])) + if(length(unlabeled) > 1){ + for(i in 2:length(unlabeled)){ + .MASSPATTERN <- outer(.MASSPATTERN, unlabeled[[i]], "*") # use sum(log())? + .MASSDELTA <- outer(.MASSDELTA, as.numeric(names(unlabeled[[i]])), "+") + } + } + } + if(length(labeled) > 0){ + if(length(.MASSPATTERN) == 0){ + .MASSPATTERN <- labeled[[1]] + .MASSDELTA <- as.numeric(names(labeled[[1]])) + } + for(i in 1:length(labeled)){ + .MASSPATTERN <- outer(.MASSPATTERN, labeled[[i]], "*") # use sum(log())? + .MASSDELTA <- outer(.MASSDELTA, as.numeric(names(labeled[[i]])), "+") + } + } + P <- aggregate(c(.MASSPATTERN), list(c(.MASSDELTA)), sum) + Pm <- setNames(P[,2], P[,1])[as.character(m)] + Pm/sum(Pm) +} +.getElementCount <- Vectorize(function(i,j, pos, x){ + ans <- 0 + for(v in pos[i,j][[1]][,2] + 1){ + if(!grepl("\\D", substr(x[i],v, v))){ + for(vv in v+0:4){ + if(grepl("\\D", substr(x[i],v, vv))){ + break + } + } + ans <- as.numeric(substr(x[i], v, vv - 1)) + } else { + next + } + } + ans +}, c("i","j")) + +## Simulation Functions -------------------------------------------------------- +#' Simulate an experiment measuring fractional synthesis rate (FSR) +#' +#' @description +#' Runs `n` replicate simulations of an FSR experiment using a specified +#' experimental design, set of proteins/peptides, and noise parameters. +#' +#' @details +#' Given a user-specified set of protein-peptide pairs, study design, true +#' rate parameters, and noise parameters, `simFsr` generates `n` replicate +#' simulations for the specified experiment and then summarizes the resulting +#' error statistics. Optionally, it can also return the simulated datasets, +#' either with the analysis results or by themselves. +#' +#' \strong{Arguments} +#' +#' Inputs for the simulation are as follows: +#' \describe{ +#' \item{Proteins and peptides}{are defined by `sequences`, which expects a +#' named list of character vectors. Each component is a name-value pair in +#' which the name identifies a protein or protein group, and the value is +#' a vector of peptide sequences associated with that protein.} +#' \item{The experimental design}{is determined by the arguments `nreps`, +#' `ntimes`, `delta`, and `repeated`. The number of time points is set by +#' `ntimes`, with values `1:ntimes`. If `repeated = FALSE`, the simulation +#' uses a simple randomized design with `nreps` cases for each treatment at +#' each time point. Alternatively, if `repeated = TRUE`, measurements are +#' repeated for each subject at each time point. Setting `delta = 0` yields +#' a single-sample design in which FSR is estimated for each protein in a +#' single population. Otherwise, an independent two-sample design is used, +#' and the analysis includes a test for the null hypothesis of no difference +#' between the populations.} +#' \item{True, average rate parameters}{are determined by `baserate` and +#' `delta`, where `baserate` defines the FSR for the control condition and +#' `delta` defines the true effect of the treatment. In other words, the FSR +#' for the treatment condition is defined as `baserate + delta`.} +#' \item{Average body water enrichment (BWE)}{in the study population is +#' controlled by `bwe_mean`} +#' \item{Noise components}{include `bwe_sd`, `rate_sd`, `count`, `trap_factor`, +#' and `noise threshold`. Each unique subject (i.e., biological replicate) has +#' a BWE value drawn from $N(`bwe_mean`, `bwe_sd`)$, and an individual +#' fractional synthesis rate drawn independently for each protein from +#' $N(`mu`, `rate_sd`)$, where `mu = baserate` or `baserate + delta`, +#' depending on treatment. Th arguments `count`, `trap_factor`, and +#' `noise_threshold` parameterize a censored multinomial-(quasi-)Poisson +#' observation model for the observed isotope patterns in the simulated +#' data. See \code{\link{simulateIsotopePattern}} for details.} +#' } +#' In addition to the arguments listed above, `simFsr` accepts several control +#' arguments that modify outputs or influence processing: +#' \describe{ +#' \item{`model`}{determines the rate model used to generate the data. See +#' \code{\link{simulateIsotopePattern}} for additional details.} +#' \item{`maxpeaks`}{determines the number of peaks in the theoretical +#' baseline and asymptotic isotope distributions. It should be large +#' enough that the baseline distribution approaches 100% coverage.} +#' \item{`ncore`}{sets the number of processing cores to be used. If +#' `ncore > 1`, simulation and analysis are parallelized using the +#' \code{\link{parallel}} and \code{\link{pbapply}} packages.} +#' \item{`seed`}{sets an overall random number seed for the simulation.} +#' \item{`analyze`}{determines whether fitting will occur. If analyze is +#' `FALSE`, only the simulated data will be returned (assuming +#' `data = TRUE`).} +#' \item{`data`}{controls whether or not the simulated data themselves are +#' returned.} +#' \item{`...`}{additional arguments may be passed to `.genFsrSim` or +#' `.fitFsrSim`, and from them on to downstream function calls.} +#' } +#' +#' \strong{Processing} +#' +#' Processing proceeds as follows: +#' \enumerate{ +#' \item{`sequences` is parsed to generate vector of protein names} +#' \item{A data frame, `X` is generated that includes one row for each +#' sample-sequence combination, where a "sample" is a physical specimen +#' (i.e., one biological replicate at one time point). The data frame +#' includes all of the information needed to determine treatment values and +#' synthesis rates for each row.} +#' \item{Proteins, treatments ("control" vs "treatment" as determined by +#' `X$Delta` == 0 or `delta`), subject id's, and file names (i.e., imaginary +#' MS data files) are appended to `X`.} +#' \item{Arguments are collected into a list object named `pars` to be passed +#' to `.genFsrSim` and `.fitFsrSim`.} +#' \item{The random number generator and parallel processing culster are +#' initialized, if needed. If `ncore > 1` and `seed = NULL`, `seed` is first +#' set by a call to \code{\link{Sys.time()}}. Then RNG streams on individual +#' processing cores are initialized by a call to +#' \code{\link{parallel::clusterSetRNGStream}}. When `simFsr` exits (i.e., +#' \code{\link{on.exit}}), the parallel processing cluseter is automatically +#' stopped and (if `ncore == 1`) the random number seed is restored to its +#' original value.} +#' \item{Depending on whether `data == TRUE`, `.genFsrSim` and `.fitFsrSim` +#' are called sequentially (via \code{\link{pbapply::pbreplicate}} and +#' \code{\link{pbapply::pbsapply}}) or through a single call to +#' \code{\link{pbapply::pbreplicate}}. The later option discards datasets +#' as they are used, so it minimizes memory requirements and communications +#' between processes.} +#' \item{The results are modified to include a simulation ID number, summary +#' tables, and metadata about the simulation settings, and then returned.} +#' } +#' +#' @param .source the path to source file for the fsr analysis functions +#' (i.e., this file). This is used to read the files into child processes +#' during parallel processing if `ncore > 1`. Defaults to the name of the +#' published file in current working directory. + +#' +#' @result +#' A named list with up to 6 components: +#' \describe{ +#' \item{`$rates`}{rate estimates for each protein in each replicate simulation} +#' \item{`$tests`}{contrast estimates for each protein in each simulation (not +#' included if `delta = 0`)} +#' \item{`$summary`}{summary statistics for each protein, across the set of +#' simulations} +#' \item{`$overall`}{summary statistics aggregated across proteins and +#' simulations} +#' \item{`$metadata`}{information about the simulation parameters, including +#' argument values and the random number seed} +#' \item{`$data`}{if requested, a list of the datasets from all 'n' replicate +#' simulations (excluded if `data = FALSE`)} +#' } +#' +#' @seealso +#' \code{\link{.genFsrSim}} for data simulation details and +#' \code{\link{.fitFsrSim}} for model fitting. +#' +#' @examples +#' \dontrun{ +#' # Run single simulation of one protein with 2 peptides, measured +#' # at a single time point with 3 reps and 2 treatments, assuming +#' # no inter-individual variation in rate or BWE. +#' fsr <- simFsr( +#' n = 1, +#' sequences = list(P07724 = c("CCSGSLVER", "CCTLPEDQR")), +#' nreps = 3, ntimes = 1, baserate = 0.5, delta = 0.2, +#' bwe_mean = 0.02, bwe_sd = 0, rate_sd = 0, model = "e1c", +#' count = 1e6, trap_factor = 1, noise_threshold = 100, +#' repeated = FALSE, maxpeaks = 10, ncore = 1, +#' seed=12345 +#' ) +#' } +#' #' \dontrun{ +#' # Returning only the simulated datasets. +#' fsr <- simFsr( +#' n = 1, +#' sequences = list(P07724 = c("CCSGSLVER", "CCTLPEDQR")), +#' nreps = 3, ntimes = 1, baserate = 0.5, delta = 0.2, +#' bwe_mean = 0.02, bwe_sd = 0, rate_sd = 0, model = "e1c", +#' count = 1e6, trap_factor = 1, noise_threshold = 100, +#' repeated = FALSE, maxpeaks = 10, ncore = 1, +#' analyze = FALSE, data = TRUE, +#' seed=12345 +#' ) +#' } +#' +simFsr <<- function(n, sequences, nreps, ntimes, baserate, delta, bwe_mean, + bwe_sd, rate_sd, count, trap_factor, noise_threshold, + repeated = FALSE, unbalanced = 0, model = "e1c", maxpeaks, + ncore = 1, seed = NULL, analyze = TRUE, data = FALSE, + .source = "./Beckett_et_al_2026_fsr_functions.R", ...){ + `%>%` <- magrittr::`%>%` + proteins <- do.call(c, lapply(names(sequences), function(i){ + setNames(rep(i, length(sequences[[i]])), sequences[[i]]) + })) + delta <- unique(c(0, delta)) + X <- expand.grid( + Sequence = unlist(sequences), + Rep = 1:nreps, + Rate = baserate, + Delta = delta, + Time = 1:ntimes, + Count = count, + Trap_factor = trap_factor, + Noise_threshold = noise_threshold, + stringsAsFactors = FALSE + ) + if(unbalanced > 1 | unbalanced < 0 | !is.numeric(unbalanced)){ + stop("unbalanced should be a proportion of sequence reads to drop.") + } + if(unbalanced > 0){ + X <- X[-sample(nrow(X))[1:(nrow(X)*unbalanced)],] # goes to floor + } + X <- dplyr::mutate( + X, + Proteins = proteins[unlist(Sequence)], + Treatment = c("Control","Treatment")[(Delta != 0) + 1], + Subject = if(repeated){ + paste(Treatment, Rep, sep = "_") + } else { + paste(Treatment, Rep, Time, sep = "_") + }, + File = paste0(Subject, "_", Time) + ) + pars <- list(model = model, baserate = baserate, delta = delta, + bwe_mean = bwe_mean, bwe_sd = bwe_sd, rate_sd, count = count, + trap_factor = trap_factor, noise_threshold = noise_threshold, + maxpeaks = maxpeaks, nreps = nreps, ntimes = ntimes, + repeated = repeated) + if(is.null(ncore)|ncore == 1){ + cl <- NULL + if(!is.null(seed)){ + rs <- .Random.seed + set.seed(seed) + on.exit(.Random.seed <<- rs) + } + } else { + cl <- parallel::makeCluster(ncore) + on.exit(parallel::stopCluster(cl)) + seed <- ifelse(is.null(seed), as.numeric(Sys.time()), seed) + parallel::clusterSetRNGStream(cl, seed) + parallel::clusterCall(cl, source, file = .source) + parallel::clusterCall(cl, library, package = "dplyr", character.only = TRUE) + } + if(data){ + message("Simulating datasets:") + DATA <- pbapply::pbreplicate(n, .genFsrSim(X, pars, ...), + simplify = FALSE, cl = cl) + if(analyze){ + message("Running analyses:") + RES <- pbapply::pbsapply(DATA, .fitFsrSim, pars = pars, ..., cl = cl) + } else { + return(DATA) + } + } else { + if(!analyze){ + warning("analyze = FALSE and DATA = FALSE --> Nothing to do!") + return(invisible()) + } + message("Running simulations:") + DATA = NULL + RES <- pbapply::pbreplicate(n, { + X <- .genFsrSim(X, pars, ...) + .fitFsrSim(X, pars, ...) + }, cl = cl) + } + suppressMessages({ + RES <- apply(RES, 1, do.call, what = rbind) + RES$rates <- dplyr::mutate( + RES$rates, + simId = rep(1:n, each = length(delta) * length(sequences)) + ) %>% + dplyr::select(c(10,1:9)) + S1 <- dplyr::summarize( + dplyr::group_by(RES$rates, Protein, parameter), + type = "estimate", + coverage = sum(in_ci, na.rm = TRUE)/sum(!is.na(in_ci)), + mean = mean(error, na.rm = TRUE), + sd = sd(error, na.rm = TRUE), + as.data.frame(t(quantile(error, c(0,0.1,0.25,0.5,0.75,0.9,1), + na.rm = TRUE))) + ) %>% dplyr::ungroup() + if(!is.null(RES$tests)){ + RES$tests <- dplyr::mutate( + RES$tests, + simId = rep(1:n, each = length(sequences)) + ) %>% + dplyr::select(c(13,1:12)) + S2 <- dplyr::summarize( + dplyr::group_by(RES$tests, Protein, parameter), + type = "contrast", + coverage = sum(in_ci, na.rm = TRUE)/sum(!is.na(in_ci)), + mean = mean(error, na.rm = TRUE), + sd = sd(error, na.rm = TRUE), + as.data.frame(t(quantile(error, c(0,0.1,0.25,0.5,0.75,0.9,1), + na.rm = TRUE))), + reject = sum(p.value <= 0.05, na.rm = TRUE)/sum(!is.na(p.value)) + ) %>% + dplyr::ungroup() + } else { + S2 <- NULL + S1$reject <- NA + } + RES$summary <- dplyr::bind_rows(S1, S2) + RES$overall <- dplyr::summarize( + dplyr::group_by(RES$summary, type), + mean_coverage = mean(coverage), + sd_coverage = sd(coverage), + setNames(as.data.frame(t(range(coverage))), + c("min_coverage","max_coverage")), + mean_reject = mean(reject), + sd_reject = sd(reject), + setNames(as.data.frame(t(range(reject))), + c("min_reject","max_reject")) + ) %>% dplyr::ungroup() + RES$metadata <- list(ncore = ncore, random.seed = seed, + data = data, pars = pars) + RES$datasets <- DATA + }) + return(RES) +} + +# A cluster version of simFsr (WARNING: This has not been extensively tested) +simFsrCluster <- function(run_id, rep_num, N, seed, sequences, nreps, ntimes, + baserate, delta, bwe_mean, bwe_sd, noise, maxpeaks, + repeated = FALSE, model = "e1c"){ + `%>%` <- magrittr::`%>%` + proteins <- do.call(c, lapply(names(sequences), function(i){ + setNames(rep(i, length(sequences[[i]])), sequences[[i]]) + })) + delta <- unique(c(0, delta)) + X <- expand.grid( + Sequence = unlist(sequences), + Rep = 1:nreps, + Rate = baserate, + Delta = delta, + Time = 1:ntimes, + Noise = noise, + stringsAsFactors = FALSE + ) + X <- dplyr::mutate( + X, + Proteins = proteins[unlist(Sequence)], + Treatment = c("Control","Treatment")[(Delta != 0) + 1], + Subject = if(repeated){ + paste(Treatment, Rep, sep = "_") + } else { + paste(Treatment, Rep, Time, sep = "_") + }, + File = paste0(Subject, "_", Time) + ) + pars <- list(run_id = run_id, rep_num = rep_num, seed = seed, model = model, + baserate = baserate, delta = delta, bwe_mean = bwe_mean, + bwe_sd = bwe_sd, noise = noise, maxpeaks = maxpeaks, + nreps = nreps, ntimes = ntimes, repeated = repeated) + set.seed(seed) + set.seed(round(runif(N, -1e9, 1e9))[j]) + X <- .genFsrSim(X, pars) + RES <- .fitFsrSim(X, pars) + OUT <- list(pars = pars, res = RES) + save(OUT, file = paste0("fsr_sim_result_", run_id, "_", rep_num)) + return(0) +} + +#' Simulate observed isotope distributions (OIDs) for a given peptide sequence +#' +#' @description +#' Simulates one collection of OIDs for a set of rates and time points. +#' +#' @details +#' OID peak intensities are simulated by a censored multinomial-quasi-Poisson +#' error process, with the target distribution defined as a mixture of +#' theoretical baseline (without heavy isotope enrichment) and asymptotically +#' enriched isotope patterns. Mixture weights (labeled `frac_new` in the source +#' code) are determined by the synthesis `model`, `rate` parameter, and +#' sampling `time`. +#' +#' @param theory a named list with components `$baseline` and `$asymptotic` +#' each containing an n sample x m peaks matrix of thoretical isotope patterns +#' @param model character value defining the synthesis model to be used (only +#' `"e1c"` is currently recognized, for "exponential-one-compartment") +#' @param rate a numeric vector of fractional synthesis rates (length n) +#' @param time a numeric vector of times (in arbitrary units) +#' @param count peptide abundance in the sample +#' @param trap_factor ion trap effect on Poisson noise in peak height; the +#' standard deviation of the noise is reduced by a factor `1/trap_factor` +#' @param noise_threshold minimum intensity value at which a peak is recognized +#' as a signal; peaks `< noise_threshold` are reset to 0. +#' @param collapse logical; if `TRUE` the result is returned as a character +#' vector with peak intensities separated by semicolons (;). Otherwise, a +#' numeric matrix is returned. +#' +#' @result +#' A vector or matrix of simulated isotope patterns, depending on the value of +#' `collapse`. +#' +#' @seealso +#' \code{\link{simFsr}} for usage context. +#' \code{\link{peptidesToIsotopePatterns}} for the generation of theoretical +#' distributions. +#' +simulateIsotopePattern <<- function(theory, model = "e1c", rate = 0.5, time = 1, + count = 1e6, trap_factor = 1, + noise_threshold = 100, collapse = FALSE){ + frac_new <- switch( + model, + e1c = 1 - exp(-rate %o% time), + paste0(quote(model), " is not a recognized model.") + ) + obsv <- real <- with(theory, (1-frac_new) %x% baseline + frac_new %x% asymptotic) + lambda <- rmultinom(1, count, real) + obsv[] <- (rpois(length(lambda), lambda) - lambda)/trap_factor + lambda + obsv[obsv < noise_threshold] <- 0 + y <- t(obsv)/colSums(obsv) + if(!isFALSE(collapse)){ + apply(y, 1, function(yy) paste(yy[yy > 0], collapse = collapse)) + } else { + y + } +} + +#' Generate an FSR simulation experiment using a Latin hypercube design +#' +#' @description +#' Uses a Latin hypercube design to parameterize a set of simulations that can +#' examine the effects of sample size (`nreps`), the number of time points +#' (`ntimes`), base fractional synthesis rate (`baserate`), treatment effects +#' (`delta`), BWE, protein abundance (`count`), and mass spectrometer +#' performance () on FSR estimation. +#' +#' @details +#' A unit Latin hypercube is generated for `n` simulations using an optimal +#' design (see \code{\link{lhs::optimum}} for details), and then rescaled as +#' determined by the function arguments. For convenience, a random number seed +#' is also generated for each simulation run, so that results can be replicated +#' if desired. +#' +#' @param n number of simulation runs desired +#' @param nreps length 2 numeric vector defining bounds for the number of +#' sample replicates per simulated experiment +#' @param ntimes length 2 numeric vector defining bounds for the number of time +#' points per simulated experiment +#' @param baserate length 2 numeric vector defining bounds for the baseline +#' (control) fractional synthesis rate (FSR) +#' @param delta length 2 numeric vector defining bounds for the treatment +#' effect on FSR +#' @param bwe_mean length 2 numeric vector defining bounds for the mean body +#' water enrichment (BWE) per simulation +#' @param bwe_sd length 2 numeric vector defining bounds for the standard +#' deviation of BWE in each simulated experiment +#' @param count length 2 numeric vector defining bounds for protein abundance +#' @param trap_factor length 2 numeric vector defining bounds for the ion trap +#' effect on peak noise +#' @param seed optional numeric value used to seed the generation of random +#' number seeds. By default, seeds are drawn from a uniform distribution that +#' is seeded based on a call to \code{\link{Sys.time()}}. +#' +#' @result +#' A data frame providing inputs for a simulation experiment. +#' +#' @seealso +#' \code{\link{lhs::optimum}} for more on Latin hypercubes, \code{\link{simFsr}} +#' for details on simulation procedures. +#' +buildHyperCube <- function(n, nreps = c(3, 10), ntimes = c(1, 5), + baserate = c(0.1, 1.9), delta = c(0.5, 2), + bwe_mean = c(0.02, 0.02), bwe_sd = c(0, 0.005), + count = c(10e4, 10e7), trap_factor = c(1,1), + seed = NULL){ + H <- lhs::optimumLHS(n, 8) + colnames(H) <- c("nreps", "ntimes", "baserate", "delta", + "bwe_mean", "bwe_sd", "count", "trap_factor") + H <- as.data.frame(H) + H$nreps <- round(nreps[1] - 0.5 + H$nreps * (diff(nreps) + 1)) + H$ntimes <- round(ntimes[1] - 0.5 + H$ntimes * (diff(ntimes) + 1)) + H$baserate <- baserate[1] + H$baserate * diff(baserate) + H$delta <- delta[1] + H$delta * diff(delta) + H$bwe_mean <- bwe_mean[1] + H$bwe_mean * diff(bwe_mean) + H$bwe_sd <- bwe_sd[1] + H$bwe_sd * diff(bwe_sd) + H$count <- count[1] + H$count * diff(count) + H$trap_factor <- trap_factor[1] + H$trap_factor * diff(trap_factor) + rs <- .Random.seed + on.exit(.Random.seed <<- rs) + seed <- ifelse(is.null(seed), as.numeric(Sys.time()), seed) + set.seed(seed) + H$seed <- round(runif(nrow(H), -1e9, 1e9)) + H +} + +## Data sets ------------------------------------------------------------------- +#' Amino acids +#' +#' A table of the 20 amino acids with their symbols, names, chemical formulae, +#' and number of substitutable hydrogen atoms. Columns are included for other +#' elements as well, but no values have been supplied (this may be changed in +#' future updates). +#' +#' @format A data frame with 20 rows and 13 variables: +#' \describe{ +#' \item{symbol}{symbol for the amino acid} +#' \item{name}{name of the amino acid} +#' \item{formula}{chemical formula} +#' \item{C}{number of substitutable carbon atoms} +#' \item{H}{number of substitutable hydrogen atoms} +#' \item{N}{number of substitutable nitrogen atoms} +#' \item{O}{number of substitutable oxygen atoms} +#' \item{P}{number of substitutable phosphorous atoms} +#' \item{S}{number of substitutable sulfur atoms} +#' \item{F}{number of substitutable florine atoms} +#' \item{Cl}{number of substitutable chlorine atoms} +#' \item{Br}{number of substitutable bromine atoms} +#' \item{I}{number of substitutable iodine atoms} +#' } +#' +amino_acids <- data.frame( + stringsAsFactors = FALSE, + symbol = c("A","R","N","D","C","E","Q","G","H","I", + "L","K","M","F","P","S","T","Y","W","V"), + name = c("Alanine","Arginine","Asparagine","Aspartate", + "Cysteine","Glutamate","Glutamine","Glycine", + "Histidine","Isoleucine","Leucine","Lysine", + "Methionine","Phenylalanine","Proline","Serine", + "Threonine","Tyrosine","Tryptophan","Valine"), + formula = c("C3H7N1O2","C6H14N4O2","C4H8N2O3","C4H7N1O4", + "C3H7N1O2S1","C5H9N1O4","C5H10N2O3","C2H5N1O2", + "C6H9N3O2","C6H13N1O2","C6H13N1O2","C6H14N2O2", + "C5H11N1O2S1","C9H11N1O2","C5H9N1O2","C3H7N1O3", + "C4H9N1O3","C9H11N1O3","C11H12N2O2","C5H11N1O2"), + C = NA, + H = c(4,3,2,2,2,4,4,2,3,1,1,1,1,0,3,3,0,0,0,1), + N = NA, + O = NA, + P = NA, + S = NA, + F = NA, + Cl = NA, + Br = NA, + I = NA, + row.names = c("A","R","N","D","C","E","Q","G","H","I", + "L","K","M","F","P","S","T","Y","W","V") +) + +#' Stable heavy isotopes +#' +#' A table of stable isotopes for common elements in biological compounds, +#' including molar fractions, nominal mass differences (= extra neutron +#' counts), and exact masses. Monoisotopes are \strong{not} included in this +#' table but may be added in the future. +#' +#' Exact mass values follow NIST. Because references differ on the exact values +#' for molar fractions of some isotopes, this table includes values from both +#' NIST and IUPAC. Users should be aware that these numbers represent global +#' averages and may not be representative for the background abundances in a +#' given sample due to geographic variation, changes in isotope abundance over +#' time, and fractionation by biological or industrial processes. If it is +#' feasible to do so, users should apply values specific to their study system +#' instead of values from this table. +#' +#' @format A data frame with 10 rows and 4 variables: +#' \describe{ +#' \item{element}{atomic symbol} +#' \item{number}{mass number} +#' \item{delta}{number of extra neutrons, relative to the monoisotope} +#' \item{mass}{relative atomic mass, where C12 = 12} +#' \item{nist}{average proportional molar abundance reported by NIST} +#' \item{iupac}{average proportional molar abundance reported by IUPAC; +#' midpoints are reported for isotopes where the database contains a range +#' of values} +#' } +#' +#' @references +#' Coursey, J.S., Schwab, D.J., Tsai, J.J., and Dragoset, R.A. +#' \emph{\link[ +#' https://www.nist.gov/pml/atomic-weights-and-isotopic-compositions-relative-atomic-masses +#' ]{Atomic weights and isotopic compositions with relative atomic masses.} +#' U.S. National Institutes of Standards and Technology. (accessed 10 August +#' 2023) +#' +#' CIAAW. \emph{\link[https://www.ciaaw.org/isotopic-abundances.htm]{Isotopic +#' compositions of the elements 2021}. International Union of Pure and Applied +#' Chemistry. (accessed 10 August 2023) +#' +stable_isotopes <- data.frame( + element = c("C","H","N","O","O","S","S","S","Cl","Br"), + number = c(13,2,15,17,18,33,34,36,37,81), + delta = c(1,1,1,1,2,1,2,3,2,2), # nominal mass difference from monoisotope + mass = c( + C = 13.00335483507, # Carbon 13 + H = 2.01410177812, # Hydrogen 2 (Deuterium) + N = 15.00010889888, # Nitrogen 15 + O17 = 16.99913175650, # Oxygen 17 + O18 = 17.99915961286, # Oxygen 18 + # P = 0, # Phosphorous has no stable heavy isotopes + S33 = 32.9714589098, # Sulfur 33 + S34 = 33.967867004, # Sulfur 34 + S36 = 35.96708071, # Sulfur 36 + # F = 0, # Florine has no stable heavy isotopes + Cl = 36.965902602, # Chlorine 37 + Br = 80.9162897 # Bromine 81 + # I = 0 # Iodine has no stable heavy isotopes + ), + nist = c( + C = 0.01070, # 0.011078, # Carbon 13 + H = 0.000115, # 0.00015574, # Hydrogen 2 (Deuterium) + N = 0.00364, # 0.003663, # Nitrogen 15 + O17 = 0.00038, # 0.0003790, # Oxygen 17 + O18 = 0.00205, # 0.0020004, # Oxygen 18 + # P = 0, # Phosphorous has no stable heavy isotopes + S33 = 0.0075, # 0.0419599, # Sulfur 33 + S34 = 0.0425, # 0.0074869, # Sulfur 34 + S36 = 0.0001, # 0.0001458, # Sulfur 36 + # F = 0, # Florine has no stable heavy isotopes + Cl = 0.2424, # Chlorine 37 + Br = 0.4931 # Bromine 81 + # I = 0 # Iodine has no stable heavy isotopes + ), + iupac = c( + C = 0.01060, # 0.011078, # Carbon 13 + H = 0.000145, # 0.00015574, # Hydrogen 2 (Deuterium) + N = 0.003795, # 0.003663, # Nitrogen 15 + O17 = 0.0003835, # 0.0003790, # Oxygen 17 + O18 = 0.002045, # 0.0020004, # Oxygen 18 + # P = 0, # Phosphorous has no stable heavy isotopes + S33 = 0.00763, # 0.0419599, # Sulfur 33 + S34 = 0.04365, # 0.0074869, # Sulfur 34 + S36 = 0.000158, # 0.0001458, # Sulfur 36 + # F = 0, # Florine has no stable heavy isotopes + Cl = 0.242, # Chlorine 37 + Br = 0.4935 # Bromine 81 + # I = 0 # Iodine has no stable heavy isotopes + ), + row.names = paste0( + c("C","H","N","O","O","S","S","S","Cl","Br"), + c(13,2,15,17,18,33,34,36,37,81) + ) +) \ No newline at end of file diff --git a/Beckett_et_al_2026_fsr_run.R b/Beckett_et_al_2026_fsr_run.R new file mode 100644 index 0000000..dbc9838 --- /dev/null +++ b/Beckett_et_al_2026_fsr_run.R @@ -0,0 +1,41 @@ +# Settings used to estimate fractional synthesis rates in Beckett et al. +# (2026) The impact of high fat diet on global protein abundance and +# fractional synthetic rate in liver and mammary gland of peak lactation +# ICR mice. + +source("fsr_functions.R") + +liverfsr <- maxQuantFsr( + mq_folder = "./liver", + sample_data = NULL, + formula = ~ Diet, + msms_count = 0.5, + scan_corr = 0.8, + use_single_scans = TRUE, + corr_method = "spearman", + minSeq = 1, + model = "e1c", + level = 0.95, + plot = FALSE, + save = "xlsx", + ncore = 5, + unique_only = FALSE +) + +mammfsr <- maxQuantFsr( + mq_folder = "./mammary_gland", + sample_data = NULL, + formula = ~ Diet, + msms_count = 0.5, + scan_corr = 0.8, + use_single_scans = TRUE, + corr_method = "spearman", + minSeq = 1, + model = "e1c", + level = 0.95, + plot = FALSE, + save = "xlsx", + ncore = 10, + unique_only = FALSE +) +