From 6d9084fe92969c99a59e9a4d33316b1795bc3578 Mon Sep 17 00:00:00 2001
From: ndwarshuis
Date: Thu, 29 Jul 2021 12:48:13 -0400
Subject: [PATCH] ADD methods to aim 2a

---
 tex/thesis.tex | 256 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 239 insertions(+), 17 deletions(-)

diff --git a/tex/thesis.tex b/tex/thesis.tex
index db987ad..7062876 100644
--- a/tex/thesis.tex
+++ b/tex/thesis.tex
@@ -722,7 +722,7 @@ better retention of memory phenotype compared to current bead-based methods.

 \section{methods}

-\subsection{dms functionalization}
+\subsection{dms functionalization}\label{sec:dms_fab}

 \begin{figure*}[ht!]
 \begingroup
@@ -778,14 +778,6 @@ was then manually counted to obtain a concentration. Surface area for
 \si{\ab\per\um\squared} was calculated using the properties for \gls{cus} and
 \gls{cug} as given by the manufacturer {Table X}.

-%TODO this bit belongs in the next aim
-% In the case of the \gls{doe} experiment where
-% variable mAb surface density was utilized, the anti-CD3/anti-CD28 mAb mixture
-% was further combined with a biotinylated isotype control to reduce the overall
-% fraction of targeted mAbs (for example the 60\% mAb surface density corresponded
-% to 3 mass parts anti-CD3, 3 mass parts anti-CD8, and 4 mass parts isotype
-% control).
-
 \subsection{dms quality control assays}

 Biotin was quantified using the \product{\gls{haba} assay}{Sigma}{H2153-1VL}. In
@@ -848,11 +840,6 @@ depending on media color or a \SI{300}{\mg\per\deci\liter} minimum glucose
 threshold. Media glucose was measured using a \product{GlucCell glucose
 meter}{Chemglass}{CLS-1322-02}.

-% TODO this belongs in aim 2
-% In order to remove \glspl{dms} from
-% culture, collagenase D (Sigma Aldrich) was sterile filtered in culture media and
-% added to a final concentration of \SI{50}{\ug\per\ml} during media addition.
-
 Cells on the \glspl{dms} were visualized by adding \SI{0.5}{\ul}
 \product{\gls{stppe}}{\bl}{405204} and \SI{2}{\ul}
 \product{\acd{45}-\gls{af647}}{\bl}{368538}, incubating for \SI{1}{\hour}, and
@@ -1047,7 +1034,7 @@ These equations were then used analogously to describe the reaction profile of

 % METHOD add the equation governing the washing steps

-\subsection{Luminex Analysis}
+\subsection{Luminex Analysis}\label{sec:luminex_analysis}

 Luminex was performed using a \product{ProcartaPlex kit}{\thermo}{custom} for
 the markers outlined in \cref{tab:luminex_panel} with modifications (note that
 some markers were run in separate panels to allow for proper dilutions).
 Briefly, media supernatants from cells were sampled as desired and immediately
 placed in a \SI{-80}{\degreeCelsius} freezer until use. Before use, samples
 were thawed at \gls{rt} and vortexed to ensure homogeneity. To run the plate,
-\SI{25}{\ul} of magnetic beads were added to the plate and washed 3x using
+\SI{25}{\ul} of magnetic beads were added to the plate and washed 3X using
 \SI{300}{\ul} of wash buffer. \SI{25}{\ul} of samples or standard were added
 to the plate and incubated for \SI{120}{\minute} at \SI{850}{\rpm} and
 \gls{rt} before washing analogously 3X with wash buffer. \SI{12.5}{\ul} of
 detection \glspl{mab} and \SI{25}{\ul} of \gls{stppe} were sequentially
 added, incubated for \SI{30}{\minute} while vortexing, and washed analogously
 to the sample step. Finally, samples were resuspended in \SI{120}{\ul}
 reading buffer and analyzed
-via a Biorad Bioplex 200 plate reader.
+via a Bio-Rad Bio-Plex 200 plate reader. An 8-point \(\log_2\) standard curve
+was used, and all samples were run as single replicates.
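+
+To illustrate the standard-curve arithmetic (a minimal sketch only: the
+top-standard concentration and bead intensities below are hypothetical, and a
+simple log-log fit stands in for the instrument software's five-parameter
+logistic fit), an 8-point \(\log_2\) dilution series can be generated and
+interpolated in R as follows:
+
+\begin{verbatim}
+## Hypothetical 8-point log2 (two-fold serial dilution) standard curve
+## for one cytokine; concentrations (pg/ml) and intensities are invented.
+top_conc <- 10000                 # assumed top-standard concentration (pg/ml)
+conc <- top_conc / 2^(0:7)        # 8-point two-fold dilution series
+mfi <- c(21000, 11800, 6100, 3000, 1500, 760, 380, 190)  # example readings
+
+## simple log-log linear fit in place of the instrument's 5PL fit
+fit <- lm(log2(conc) ~ log2(mfi))
+
+## interpolate the concentration of an unknown sample from its intensity
+2^predict(fit, newdata = data.frame(mfi = 4500))  # estimated pg/ml
+\end{verbatim}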
+
+Luminex data were preprocessed using R for inclusion in downstream analysis
+as follows. Any cytokine level that was over-range (`OOR >' in the output
+spreadsheet) was set to the maximum value of the standard curve for that
+cytokine, and any value that was under-range (`OOR <') was set to the minimum
+value of the standard curve for that cytokine.
+
+By filtering for the highest accuracy (\(R^2 \geq 90\%\) or noise-power) and
+lowest complexity, the top-performing symbolic regression (SR) models were
+identified. Driving variables, variable combinations, and model
+dimensionality tables were generated. The top-performing variable
+combinations were used to generate model ensembles. In this analysis,
+DataModeler's SymbolicRegression function was used to develop explicit
+algebraic (linear and nonlinear) models. The fittest models were analyzed to
+identify the dominant variables using the VariablePresence function, the
+dominant variable combinations using the VariableCombinations function, and
+the model dimensionality (number of unique variables) using the
+ModelDimensionality function. CreateModelEnsemble was used to define
+trustable model ensembles using selected variable combinations; these were
+summarized (model expressions, model phenotype, model tree plot, ensemble
+quality, model quality, variable presence map, ANOVA tables, model prediction
+plot, and exportable model forms) using the ModelSummaryTable function.
+Ensemble prediction and residual performance were assessed via the
+EnsemblePredictionPlot and EnsembleResidualPlot subroutines, respectively.
+Model maxima (ModelMaximum function) and minima (ModelMinimum function) were
+calculated and displayed using the ResponsePlotExplorer function. Trade-off
+performance of multiple responses was explored using the
+MultiTargetResponseExplorer and ResponseComparisonExplorer, with additional
+insights derived from the ResponseContourPlotExplorer. Graphics and tables
+were generated by DataModeler. These model ensembles were used to identify
+predicted response values, potential optima in the responses, and regions of
+parameter values where the predictions diverge the most.
+
+Non-parametric tree-based ensembles were constructed using the randomForest,
+gbm, and cforest regression functions in R for random forest (RF), gradient
+boosted trees (GBM), and conditional inference forest (CIF) models,
+respectively. During training, both random forest and conditional inference
+forest construct multiple decision trees in parallel, randomly choosing a
+subset of features at each decision tree split. Individual random forest
+trees are split using the Gini index, while conditional inference forest uses
+a statistical significance test procedure to select the variables at each
+split, reducing correlation bias. In contrast, gradient boosted trees
+construct regression trees in series through an iterative procedure that
+adapts over the training set, with each new tree correcting the errors of its
+precursors (that is, minimizing the mean squared error). Prediction
+performance was evaluated using leave-one-out cross-validation (LOO)
+\(R^2\), and variable importance was assessed using the permutation-based
+\% increase in mean squared error (MSE), the relative influence based on the
+increase in prediction error, and coefficient values for RF, GBM, and CIF,
+respectively. Partial least squares regression (PLSR) was executed using the
+plsr function from the pls package in R, while LASSO regression was performed
+using the cv.glmnet R package, both with leave-one-out cross-validation.
+Finally, the kernlab R package was used to construct the support vector
+machine (SVM) regression models.
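+
+A minimal sketch of how such models might be fit in R is shown below. The
+data, feature names, and tuning grids are placeholders (not the study's
+dataset), and the grids are abbreviated relative to a full search:
+
+\begin{verbatim}
+## Simulated example: tree ensembles, PLSR, LASSO, and SVM with
+## leave-one-out cross-validation (LOO). All data are placeholders.
+library(caret)         # train(), trainControl()
+library(randomForest)  # backend for method = "rf"
+library(gbm)           # backend for method = "gbm"
+library(party)         # backend for method = "cforest"
+library(pls)           # plsr()
+library(glmnet)        # cv.glmnet()
+library(kernlab)       # ksvm()
+
+set.seed(1)
+n <- 30
+x <- data.frame(matrix(rnorm(n * 6), ncol = 6))
+names(x) <- paste0("feature", 1:6)
+dat <- cbind(x, response = rnorm(n))
+
+loo <- trainControl(method = "LOOCV")
+
+rf_fit  <- train(response ~ ., data = dat, method = "rf",
+                 trControl = loo, tuneGrid = expand.grid(mtry = 2:4),
+                 ntree = 500)          # ntree is fixed, mtry is tuned
+gbm_fit <- train(response ~ ., data = dat, method = "gbm",
+                 trControl = loo, verbose = FALSE)
+cif_fit <- train(response ~ ., data = dat, method = "cforest",
+                 trControl = loo)
+
+## PLSR with built-in LOO validation
+pls_fit <- plsr(response ~ ., data = dat, validation = "LOO")
+
+## LASSO (alpha = 1); nfolds = n gives leave-one-out CV
+lasso_fit <- cv.glmnet(as.matrix(x), dat$response, alpha = 1,
+                       nfolds = n, grouped = FALSE)
+lasso_fit$lambda.min   # best lambda by the minimum-error criterion
+
+## SVM regression with a fixed linear kernel and cost parameter C
+svm_fit <- ksvm(response ~ ., data = dat, kernel = "vanilladot", C = 1)
+\end{verbatim}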
+
+Parameters for all models were tuned via grid search using the train function
+from the caret R package, with LOO-\(R^2\) as the optimization criterion.
+Specifically, the number of features randomly sampled as candidates at each
+split (mtry) and the number of trees to grow (ntree) were tuned for random
+forest and conditional inference forest. In addition, the minimum sum of
+weights in a node for it to be considered for splitting and the minimum sum
+of weights in a terminal node were manually tuned when building the CIF
+models. GBM parameters, including the number of trees to grow, the maximum
+depth of each tree, the learning rate, and the minimum number of observations
+at a terminal node, were likewise tuned for optimal LOO-\(R^2\) performance.
+For PLSR, the optimal number of components in the model was assessed based on
+the standard error of the cross-validation residuals using the selectNcomp
+function from the pls package. LASSO regression was performed using the
+cv.glmnet package with alpha = 1, and the best lambda for each response was
+chosen using the minimum-error criterion. Lastly, a fixed linear kernel
+(svmLinear) was used to build the SVM regression models, with the cost
+parameter tuned for the best LOO-\(R^2\). Prediction performance for all
+models was measured using the final model with the LOO-\(R^2\)-tuned
+parameters. Table M2 shows the parameter values evaluated per model at the
+final stages of results reporting.
+
+\subsection{consensus analysis}
+
+Consensus analysis of the relevant variables extracted from each machine
+learning model was performed to identify consistent predictive features of
+quality at the early stages of manufacturing. First, importance scores for
+all features were measured across all ML models using the varImp function
+from the caret R package, except for SVM, for which the rminer R package was
+used. These importance scores were the percent increase in MSE, the relative
+importance through the average increase in prediction error when a given
+predictor is permuted, permuted coefficient values, absolute coefficient
+values, weighted sums of absolute coefficients, and the relative importance
+from sensitivity analysis for RF, GBM, CIF, LASSO, PLSR, and SVM,
+respectively. Using these scores, key predictive variables were selected for
+RF, GBM, CIF, LASSO, PLSR, and SVM if their importance scores were within the
+80th percentile ranking; for SR, variables present in more than 30\% of the
+top-performing SR models from DataModeler (\(R^2 \geq 90\%\), complexity
+\(\geq 100\)) were chosen to investigate consensus. The exception was the
+day-4 NMR media models, which considered a combination of the top-performing
+models excluding lactate ppms and included those variables present in more
+than 40\% of the best-performing models. Only variables with these
+high-percentile scores were evaluated in terms of their logical relations
+(intersections across ML models) and depicted using a Venn diagram generated
+with the venn R package.
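+
+The consensus step reduces to a percentile filter and a set intersection. A
+minimal sketch (hypothetical feature names and importance scores in place of
+the varImp/rminer output described above) is:
+
+\begin{verbatim}
+## Consensus sketch: keep features at or above the 80th percentile of
+## importance within each model, then intersect across models and plot.
+library(venn)   # venn(): Venn diagram of set overlaps
+
+set.seed(1)
+features <- paste0("feature", 1:20)
+imp <- list(    # placeholder importance scores per model
+  RF    = setNames(runif(20), features),
+  GBM   = setNames(runif(20), features),
+  LASSO = setNames(runif(20), features)
+)
+
+## features within the 80th percentile ranking for each model
+top <- lapply(imp, function(s) names(s)[s >= quantile(s, 0.80)])
+
+Reduce(intersect, top)   # consensus features across all models
+venn(top)                # depict the logical relations
+\end{verbatim}
+
 \section{results}

 \subsection{DOE shows optimal conditions for expanded potent T cells}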