From 4789723e721552c544554228e37931187d7e63ed Mon Sep 17 00:00:00 2001
From: ndwarshuis <ndwar@yavin4.ch>
Date: Wed, 4 Aug 2021 18:24:06 -0400
Subject: [PATCH] ENH proofread aim2a

---
 tables/doe_cd4.tex       |   4 +-
 tables/doe_mem1.tex      |   4 +-
 tables/doe_mem2.tex      |   4 +-
 tables/doe_mem4.tex      |   4 +-
 tables/doe_ratio.tex     |   4 +-
 tables/doe_runs.tex      |   2 +-
 tables/model_results.tex |   2 +-
 tex/references.bib       |  56 ++++
 tex/thesis.tex           | 560 +++++++++++++++++++++------------------
 9 files changed, 372 insertions(+), 268 deletions(-)

diff --git a/tables/doe_cd4.tex b/tables/doe_cd4.tex
index a7035b6..868d9fd 100644
--- a/tables/doe_cd4.tex
+++ b/tables/doe_cd4.tex
@@ -16,8 +16,8 @@
 Observations & 30 \\ 
 R$^{2}$ & 0.888 \\ 
 Adjusted R$^{2}$ & 0.870 \\ 
-Residual Std. Error & 727,042.800 (df = 25) \\ 
-F Statistic & 49.454$^{***}$ (df = 4; 25) \\ 
+% Residual Std. Error & 727,042.800 (df = 25) \\ 
+% F Statistic & 49.454$^{***}$ (df = 4; 25) \\ 
 \hline 
 \hline \\[-1.8ex] 
 \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
diff --git a/tables/doe_mem1.tex b/tables/doe_mem1.tex
index 921168c..5e07427 100644
--- a/tables/doe_mem1.tex
+++ b/tables/doe_mem1.tex
@@ -16,8 +16,8 @@
 Observations & 30 \\ 
 R$^{2}$ & 0.331 \\ 
 Adjusted R$^{2}$ & 0.224 \\ 
-Residual Std. Error & 3,659,501.000 (df = 25) \\ 
-F Statistic & 3.096$^{**}$ (df = 4; 25) \\ 
+% Residual Std. Error & 3,659,501.000 (df = 25) \\ 
+% F Statistic & 3.096$^{**}$ (df = 4; 25) \\ 
 \hline 
 \hline \\[-1.8ex] 
 \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
diff --git a/tables/doe_mem2.tex b/tables/doe_mem2.tex
index 1359154..9421e08 100644
--- a/tables/doe_mem2.tex
+++ b/tables/doe_mem2.tex
@@ -23,8 +23,8 @@
 Observations & 30 \\ 
 R$^{2}$ & 0.741 \\ 
 Adjusted R$^{2}$ & 0.583 \\ 
-Residual Std. Error & 0.228 (df = 18) \\ 
-F Statistic & 4.693$^{***}$ (df = 11; 18) \\ 
+% Residual Std. Error & 0.228 (df = 18) \\ 
+% F Statistic & 4.693$^{***}$ (df = 11; 18) \\ 
 \hline 
 \hline \\[-1.8ex] 
 \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
diff --git a/tables/doe_mem4.tex b/tables/doe_mem4.tex
index e585f81..bb50a28 100644
--- a/tables/doe_mem4.tex
+++ b/tables/doe_mem4.tex
@@ -14,8 +14,8 @@
 Observations & 30 \\ 
 R$^{2}$ & 0.835 \\ 
 Adjusted R$^{2}$ & 0.808 \\ 
-Residual Std. Error & 493,168.700 (df = 25) \\ 
-F Statistic & 31.571$^{***}$ (df = 4; 25) \\ 
+% Residual Std. Error & 493,168.700 (df = 25) \\ 
+% F Statistic & 31.571$^{***}$ (df = 4; 25) \\ 
 \hline 
 \hline \\[-1.8ex] 
 \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
diff --git a/tables/doe_ratio.tex b/tables/doe_ratio.tex
index 10b62ba..8c2e49a 100644
--- a/tables/doe_ratio.tex
+++ b/tables/doe_ratio.tex
@@ -15,8 +15,8 @@
 Observations & 30 \\ 
 R$^{2}$ & 0.879 \\ 
 Adjusted R$^{2}$ & 0.860 \\ 
-Residual Std. Error & 0.039 (df = 25) \\ 
-F Statistic & 45.554$^{***}$ (df = 4; 25) \\ 
+% Residual Std. Error & 0.039 (df = 25) \\ 
+% F Statistic & 45.554$^{***}$ (df = 4; 25) \\ 
 \hline 
 \hline \\[-1.8ex] 
 \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
diff --git a/tables/doe_runs.tex b/tables/doe_runs.tex
index 630bd12..3363c6e 100644
--- a/tables/doe_runs.tex
+++ b/tables/doe_runs.tex
@@ -1,7 +1,7 @@
 \begin{tabular}{ccccc}
 \\[-1.8ex]\hline 
 \hline \\[-1.8ex] 
-\\[-1.8ex] Dataset & Run & IL2 Conc (\si{\IU\per\ml}) & DMS Conc. (\si{\IU\per\ml})
+\\[-1.8ex] Dataset & Run & IL2 Conc (\si{\IU\per\ml}) & DMS Conc. (\si{\dms\per\ml})
   & Functional \gls{mab} (\si{\percent}) \\
 \hline \\[-1.8ex]
 DOE & 1 & 30 & 500 & 100\\
diff --git a/tables/model_results.tex b/tables/model_results.tex
index 4ab1c19..57ad9de 100644
--- a/tables/model_results.tex
+++ b/tables/model_results.tex
@@ -17,7 +17,7 @@ PP+S6 & \SI{98}{\percent} & \SI{71.4}{\percent} & \SI{99.9}{\percent} & \SI{75.0
 PP+S6+N6 & \SI{98}{\percent} & \SI{68.2}{\percent} & \SI{95.6}{\percent} & \SI{74.4}{\percent} & \SI{72.5}{\percent} & \SI{81.7}{\percent} & \SI{77.0}{\percent}\\
 
 \hline \\
-\multicolumn{8}{l}{\ptmemh{} cells} \\ 
+\multicolumn{8}{l}{\ptmemk{} cells} \\ 
 PP+N4 & \SI{93}{\percent} & \SI{4.7}{\percent} & \SI{44.4}{\percent} & \SI{9.2}{\percent} & \SI{1.2}{\percent} & \SI{65.1}{\percent} & \SI{9.1}{\percent}\\
 PP+N6 & \SI{86}{\percent} & \SI{2.0}{\percent} & \SI{29.9}{\percent} & \SI{15.8}{\percent} & \SI{28.5}{\percent} & \SI{63.3}{\percent} & \SI{30.6}{\percent}\\
 PP+S6 & \SI{93}{\percent} & \SI{7.8}{\percent} & \SI{28.0}{\percent} & \SI{15.1}{\percent} & \SI{76.2}{\percent} & \SI{98.4}{\percent} & \SI{49.8}{\percent}\\
diff --git a/tex/references.bib b/tex/references.bib
index 1f9fb3c..5ad2ff7 100644
--- a/tex/references.bib
+++ b/tex/references.bib
@@ -2673,6 +2673,62 @@ CONCLUSIONS: We developed a simplified, semi-closed system for the initial selec
   publisher = {The American Association of Immunologists},
 }
 
+@Article{Delaglio1995,
+  author    = {Frank Delaglio and Stephan Grzesiek and GeertenW. Vuister and Guang Zhu and John Pfeifer and Ad Bax},
+  journal   = {Journal of Biomolecular {NMR}},
+  title     = {{NMRPipe}: A multidimensional spectral processing system based on {UNIX} pipes},
+  year      = {1995},
+  month     = {nov},
+  number    = {3},
+  volume    = {6},
+  doi       = {10.1007/bf00197809},
+  publisher = {Springer Science and Business Media {LLC}},
+}
+
+@Article{Dieterle2006,
+  author    = {Frank Dieterle and Alfred Ross and Götz Schlotterbeck and Hans Senn},
+  journal   = {Analytical Chemistry},
+  title     = {Probabilistic Quotient Normalization as Robust Method to Account for Dilution of Complex Biological Mixtures. Application in1H {NMR} Metabonomics},
+  year      = {2006},
+  month     = {jul},
+  number    = {13},
+  pages     = {4281--4290},
+  volume    = {78},
+  doi       = {10.1021/ac051632c},
+  publisher = {American Chemical Society ({ACS})},
+}
+
+@Article{Dashti2017,
+  author    = {Hesam Dashti and William M. Westler and Marco Tonelli and Jonathan R. Wedell and John L. Markley and Hamid R. Eghbalnia},
+  journal   = {Analytical Chemistry},
+  title     = {Spin System Modeling of Nuclear Magnetic Resonance Spectra for Applications in Metabolomics and Small Molecule Screening},
+  year      = {2017},
+  month     = {nov},
+  number    = {22},
+  pages     = {12201--12208},
+  volume    = {89},
+  doi       = {10.1021/acs.analchem.7b02884},
+  publisher = {American Chemical Society ({ACS})},
+}
+
+@InProceedings{Kordona,
+  author    = {A.K. Kordon and Ching-Tai Lue},
+  booktitle = {Proceedings of the 2004 Congress on Evolutionary Computation ({IEEE} Cat. No.04TH8753)},
+  title     = {Symbolic regression modeling of blown film process effects},
+  year      = {2004},
+  publisher = {{IEEE}},
+  doi       = {10.1109/cec.2004.1330907},
+}
+
+@Book{Koza1992,
+  author    = {Koza, John},
+  publisher = {MIT Press},
+  title     = {Genetic programming : on the programming of computers by means of natural selection},
+  year      = {1992},
+  address   = {Cambridge, Mass},
+  isbn      = {0262111705},
+}
+
 @Comment{jabref-meta: databaseType:bibtex;}
 
 @Comment{jabref-meta: grouping:
diff --git a/tex/thesis.tex b/tex/thesis.tex
index 661df46..7b9de20 100644
--- a/tex/thesis.tex
+++ b/tex/thesis.tex
@@ -103,8 +103,9 @@
 \tcellacronym{tc}{c}{cytotoxic}{}
 \tcellacronym{th1}{h}{type 1 helper}{1}
 \tcellacronym{th2}{h}{type 2 helper}{2}
-% \tcellacronym{th17}{h}{\il{17} helper}{1}
+\tcellacronym{th17}{h}{IL-17 helper}{17}
 
+\newacronym{bcaa}{BCAA}{branched-chain amino acid}
 \newacronym{til}{TIL}{tumor infiltrating lymphocyte}
 \newacronym{tcr}{TCR}{T cell receptor}
 \newacronym{act}{ACT}{adoptive cell therapies}
@@ -207,6 +208,7 @@
 \newacronym{scfv}{scFv}{single-chain fragment variable}
 \newacronym{hepes}{HEPES}{4-(2-hydroxyethyl)-1-piperazineethanesulfonic acid}
 \newacronym{nhs}{NHS}{N-hydroxysulfosuccinimide}
+\newacronym{tocsy}{TOCSY}{total correlation spectroscopy}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % SI units for uber nerds
@@ -282,10 +284,13 @@
 % so I don't need to worry about abbreviating all the different interleukins
 \newcommand{\il}[1]{\gls{il}-#1}
 
-% DOE responses I don't feel like typing ad-nauseam
+% DOE stuff I don't feel like typing ad-nauseam
 \newcommand{\pilII}{\gls{il2} concentration}
 \newcommand{\pdms}{\gls{dms} concentration}
 \newcommand{\pmab}{functional \gls{mab} surface density}
+\newcommand{\rmemh}{total \ptmemh{} cells}
+\newcommand{\rmemk}{total \ptmemk{} cells}
+\newcommand{\rratio}{CD4/CD8 ratio}
 
 % vendor and product stuff I don't feel like typing
 \newcommand{\catnum}[2]{(#1, #2)}
@@ -2683,12 +2688,12 @@ necessary to have a fully-automated manufacturing system.
 
 The \gls{dms} system could be used as a drop in replacement for beads in many of
 current allogeneic therapies. Indeed, given its higher potential for expansion
-(\cref{fig:dms_exp,tab:ci_controlled}, it may work in cases where the beads fail
-(although this would need to be tested by gathering data with many unhealthy
-donors). However, in the autologous setting patients only need a fixed dose, and
-thus any expansion beyond the indicated dose would be wasted. Given this, it
-will be interesting to apply this technology in an allogeneic paradigm where
-this increased expansion potential would be well utilized.
+(\cref{fig:dms_exp,tab:ci_controlled}), it may work in cases where the beads
+fail (although this would need to be tested by gathering data with many
+unhealthy donors). However, in the autologous setting patients only need a fixed
+dose, and thus any expansion beyond the indicated dose would be wasted. Given
+this, it will be interesting to apply this technology in an allogeneic paradigm
+where this increased expansion potential would be well utilized.
 
 Finally, we should note that while we demonstrated a method providing superior
 performance compared to bead-based expansion, the cell manufacturing field would
@@ -2702,20 +2707,20 @@ manufacturing companies.
 
 \section{introduction}
 
-The purpose of this sub-aim was to develop computational methods to identify novel
-\glspl{cqa} and \glspl{cpp} that could be used for release criteria, process
-control, and process optimization for the \gls{dms} platform. We hypothesized
-that T cells grown using the \gls{dms} system would produce detectable
-biological signatures in the media supernatent which corresponded to clinically
-relevent responses such as the fold expansion of the T cells or the resulting
-phenotype. We tested this hypothesis by activating T cells under a variety of
-conditions using a \gls{doe}, sampling the media at intermediate timepoints, and
-creating models to predict the outcome of the cultures. We should stress that
-the specific \glspl{cpp} and \glspl{cqa} determined by this aim are not
-necessarily universal, as this was not performed with equipment that would
-normally be used at scale. However, the process outlined here is one that can
-easily be adaptable to any system, and the specific findings themselves offer
-interesting insights that warrant further study.
+The purpose of this sub-aim was to develop computational methods to identify
+novel \glspl{cqa} and \glspl{cpp} that could be used for release criteria,
+process control, and process optimization for the \gls{dms} platform. We
+hypothesized that T cells grown using the \gls{dms} system would produce
+detectable biological signatures in the media supernatent which corresponded to
+clinically relevent responses such as fold expansion or phenotype. We tested
+this hypothesis by activating T cells under a variety of conditions using a
+\gls{doe}, sampling the media at intermediate timepoints, and creating models to
+predict the outcome of the cultures. We should stress that the specific
+\glspl{cpp} and \glspl{cqa} determined by this aim are not necessarily
+universal, as this was not performed with equipment that would normally be used
+at scale. However, the process outlined here is one that can easily be adaptable
+to any system, and the specific findings themselves offer interesting insights
+that warrant further study.
 
 \section{methods}
 
@@ -2744,33 +2749,38 @@ interesting insights that warrant further study.
   \label{fig:mod_overview}
 \end{figure*}
 
-The first DOE resulted in a randomized 18-run I-optimal custom design where each
-DMS parameter was evaluated at three levels: IL2 concentration (10, 20, and 30
-U/uL), DMS concentration (500, 1500, 2500 carrier/uL), and functionalized
-antibody percent (60\%, 80\%, 100\%). These 18 runs consisted of 14 unique
-parameter combinations where 4 of them were replicated twice to assess
-prediction error. Process parameters for the ADOE were evaluated at multiple
-levels: IL2 concentration (30, 35, and 40 U/uL), DMS concentration (500, 1000,
-1500, 2000, 2500, 3000, 3500 carrier/uL), and functionalized antibody percent
-(100\%) as depicted in Fig.1b. To further optimize the initial region explored
-(DOE) in terms of total live CD4+ TN+TCM cells, a sequential adaptive
-design-of-experiment (ADOE) was designed with 10 unique parameter combinations,
-two of these replicated twice for a total of 12 additional samples (Fig.1b). The
-fusion of cytokine and NMR profiles from media to model these responses included
-30 cytokines from a custom Thermo Fisher ProcartaPlex Luminex kit and 20 NMR
-features. These 20 spectral features from NMR media analysis were selected out
-of approximately 250 peaks through the implementation of a variance-based
-feature selection approach and some manual inspection steps.
+The overall workflow of this aim is shown in \cref{fig:mod_overview_flow}.
+Experimental conditions within the design space were explored using a \gls{doe},
+and longitudinal samples were collected for each condition as the cultures
+progressed. Data from inputs and/or longitudinal samples were used to predict
+the endpoint response. The fusion of cytokine and \gls{nmr} profiles from media
+to model these responses included 30 cytokines from a custom Thermo Fisher
+ProcartaPlex Luminex kit and 20 \gls{nmr} features. These 20 spectral features
+from \gls{nmr} media analysis were selected out of approximately 250 peaks
+through the implementation of a variance-based feature selection approach and
+some manual inspection steps.
+
+The first \gls{doe} resulted in a randomized 18-run I-optimal custom design
+where each \gls{dms} parameter was evaluated at three levels: \pilII{} (10, 20,
+and 30 U/uL), \pdms{} (500, 1500, 2500 \si{\dms\per\ul}), and \pmab{} (60, 80,
+100 \si{\percent}). These 18 runs consisted of 14 unique parameter combinations
+where 4 of them were replicated twice to assess prediction error. To further
+optimize the initial region explored, an \gls{adoe} was designed with 10 unique
+parameter combinations, two of these replicated twice for a total of 12
+additional samples (\cref{fig:mod_overview_doe}). Process parameters for the
+\gls{adoe} were evaluated at multiple levels: \pilII{} (30, 35, and 40
+\si{\IU\per\ml}), \pdms{} (500, 1000, 1500, 2000, 2500, 3000, 3500
+\si{\dms\per\ml}), and \pmab{} (\SI{100}{\percent}) (\cref{fig:mod_overview}).
 
 \subsection{DMS fabrication}
 
 \glspl{dms} were fabricated as described in \cref{sec:dms_fab} with the
 following modifications in order to obtain a variable functional \gls{mab}
-surface density. During the \gls{mab} coating step, the anti-CD3/anti-CD28 mAb
-mixture was further combined with a biotinylated isotype control to reduce the
-overall fraction of targeted \glspl{mab} (for example the \SI{60}{\percent}
-\gls{mab} surface density corresponded to 3 mass parts \acd{3}, 3 mass parts
-\acd{28}, and 4 mass parts isotype control).
+surface density. During the \gls{mab} coating step, the \acd{3}/\acd{28}
+\gls{mab} mixture was further combined with a biotinylated isotype control to
+reduce the overall fraction of targeted \glspl{mab} (for example the
+\SI{60}{\percent} \gls{mab} surface density corresponded to 3 mass parts
+\acd{3}, 3 mass parts \acd{28}, and 4 mass parts isotype control).
 
 \subsection{T cell culture}
 
@@ -2779,9 +2789,9 @@ following modifications. At days 4, 6, 8, and 11, \SI{100}{\ul} media were
 collected for the Luminex assay and \gls{nmr} analysis. The volume of removed
 media was equivalently replaced during the media feeding step, which took place
 immediately after sample collection. Additionally, the same media feeding
-schedule was followed for the DOE and ADOE to improve consistency, and the same
-donor lot was used for both experiments. All cell counts were performed using
-\gls{aopi}.
+schedule was followed for the \gls{doe} and \gls{adoe} to improve consistency,
+and the same donor lot was used for both experiments. All cell counts were
+performed using \gls{aopi}.
 
 \subsection{flow cytometry}
 
@@ -2796,26 +2806,29 @@ Cytokines were quantified via Luminex as described in
 
 Prior to analysis, samples were centrifuged at \SI{2990}{\gforce} for
 \SI{20}{\minute} at \SI{4}{\degreeCelsius} to clear any debris\footnote{all
-  \gls{nmr} analysis was done by our collaborators at the University of
-  Georgia}. 5 uL of 100/3 mM DSS-D6 in deuterium oxide (Cambridge Isotope
-Laboratories) were added to 1.7 mm NMR tubes (Bruker BioSpin), followed by 45 uL
-of media from each sample that was added and mixed, for a final volume of 50 uL
-in each tube. Samples were prepared on ice and in predetermined, randomized
-order. The remaining volume from each sample in the rack (approx. 4 uL) was
-combined to create an internal pool. This material was used for internal
-controls within each rack as well as metabolite annotation.
+  \gls{nmr} analysis was done by our collaborators Max Colonna and Art Edison at
+  the University of Georgia; methods included here for reference}. \SI{5}{\ul} of
+100/3 \si{\mM} DSS-D6 in deuterium oxide (Cambridge Isotope Laboratories) were
+added to \SI{1.7}{\mm} \gls{nmr} tubes (Bruker BioSpin), followed by
+\SI{45}{\ul} of media from each sample that was added and mixed, for a final
+volume of \SI{50}{\ul} in each tube. Samples were prepared on ice and in
+predetermined, randomized order. The remaining volume from each sample in the
+rack (approx. \SI{4}{\ul}) was combined to create an internal pool. This
+material was used for internal controls within each rack as well as metabolite
+annotation.
 
-\gls{nmr} spectra were collected on a Bruker Avance III HD spectrometer at 600
-MHz using a 5-mm TXI cryogenic probe and TopSpin software (Bruker BioSpin).
-One-dimensional spectra were collected on all samples using the noesypr1d pulse
-sequence under automation using ICON NMR software. Two-dimensional \gls{hsqc}
-and TOCSY spectra were collected on internal pooled control samples for
-metabolite annotation.
+\gls{nmr} spectra were collected on a Bruker Avance III HD spectrometer at
+\SI{600}{\MHz} using a \SI{5}{\mm} TXI cryogenic probe and TopSpin software
+(Bruker BioSpin). One-dimensional spectra were collected on all samples using
+the noesypr1d pulse sequence under automation using ICON NMR software.
+Two-dimensional \gls{hsqc} and \gls{tocsy} spectra were collected on internal
+pooled control samples for metabolite annotation.
 
 One-dimensional spectra were manually phased and baseline corrected in TopSpin.
-Two-dimensional spectra were processed in NMRpipe37. One dimensional spectra
-were referenced, water/end regions removed, and normalized with the PQN
-algorithm38 using an in-house MATLAB (The MathWorks, Inc.) toolbox.
+Two-dimensional spectra were processed in NMRpipe\cite{Delaglio1995}. One
+dimensional spectra were referenced, water/end regions removed, and normalized
+with the PQN algorithm\cite{Dieterle2006} using an in-house MATLAB (The
+MathWorks, Inc.) toolbox.
 
 % TODO add the supplemental figure alluded to here?
 To reduce the total number of spectral features from approximately 250 peaks and
@@ -2824,19 +2837,22 @@ variance-based feature selection was performed within MATLAB. For each digitized
 point on the spectrum, the variance was calculated across all experimental
 samples and plotted. Clearly-resolved features corresponding to peaks in the
 variance spectrum were manually binned and integrated to obtain quantitative
-feature intensities across all samples (Supp.Fig.S24). In addition to highly
-variable features, several other clearly resolved and easily identifiable
-features were selected (glucose, BCAA region, etc). Some features were later
-discovered to belong to the same metabolite but were included in further
-analysis.
+feature intensities across all samples.
+% (Supp.Fig.S24).
+In addition to highly variable features, several other clearly resolved and
+easily identifiable features were selected (glucose, \gls{bcaa} region, etc).
+Some features were later discovered to belong to the same metabolite but were
+included in further analysis.
 
+% I think this is the right source? it seems wrong in the manuscript but this
+% source at least talks about an optimization score
 Two-dimensional spectra collected on pooled samples were uploaded to COLMARm web
-server10, where \gls{hsqc} peaks were automatically matched to database peaks.
+server, where \gls{hsqc} peaks were automatically matched to database peaks.
 \gls{hsqc} matches were manually reviewed with additional 2D and proton spectra
 to confirm the match. Annotations were assigned a confidence score based upon
-the levels of spectral data supporting the match as previously described11.
-Annotated metabolites were matched to previously selected features used for
-statistical analysis.
+the levels of spectral data supporting the match as previously
+described\cite{Dashti2017}. Annotated metabolites were matched to previously
+selected features used for statistical analysis.
 
 % I'm pretty sure this isn't relevant
 % Using the list of annotated metabolites obtained above, an approximation of a
@@ -2851,8 +2867,8 @@ statistical analysis.
 Several low abundance features selected for analysis did not have database
 matches and were not annotated. Statistical total correlation spectroscopy41
 suggested that some of these unknown features belonged to the same molecules
-(not shown). Additional multidimensional NMR experiments will be required to
-determine their identity.
+(not shown). Additional multidimensional \gls{nmr} experiments will be required
+to determine their identity.
 
 \subsection{machine learning and statistical analysis}
 
@@ -2861,66 +2877,70 @@ Linear regression analysis of the \glspl{doe} was performed as described in
 
 Seven \gls{ml} techniques were implemented to predict three responses related to
 the memory phenotype of the cultured T cells under different process parameters
-conditions (i.e. Total Live CD4+ TN and TCM, Total Live CD8+ TN+TCM, and Ratio
-CD4+/CD8+ TN+TCM). The \gls{ml} methods executed were \gls{rf}, \gls{gbm},
-\gls{cif}, \gls{lasso}, \gls{plsr}, \gls{svm}, and DataModeler’s
-\gls{sr}\footnote{of these seven methods, all except \gls{lasso} were performed
-  by collaborators}. Primarily, \gls{sr} models were used to optimize process
-parameter values based on TN+TCM phenotype and to extract early predictive
+conditions (i.e. \rmemh{}, \rmemk{}, and \rratio{}). The \gls{ml} methods
+executed were \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr},
+\gls{svm}, and DataModeler’s \gls{sr}\footnote{\gls{sr} was performed by Theresa
+  Kotanchek at Evolved Analytics, \gls{rf}, \gls{gbm}, \gls{cif}, \gls{plsr},
+  \gls{svm} were performed by Valerie Odeh-Couvertier at UPRM. Methods included
+  here for reference}. Primarily, \gls{sr} models were used to optimize process
+parameter values based on \ptmem{} phenotype and to extract early predictive
 variable combinations from the multi-omics experiments. Furthermore, all
 regression methods were executed, and the high-performing models were used to
 perform a consensus analysis of the important variables to extract potential
-critical quality attributes and critical process parameters predictive of T-cell
+critical quality attributes and critical process parameters predictive of T cell
 potency, safety, and consistency at the early stages of the manufacturing
 process.
 
 \gls{sr} was done using Evolved Analytics’ DataModeler software (Evolved
 Analytics LLC, Midland, MI). DataModeler utilizes genetic programming to evolve
 symbolic regression models (both linear and non-linear) rewarding simplicity and
-accuracy. Using the selection criteria of highest accuracy (R2>90\% or
-noise-power) and lowest complexity, the top-performing models were identified.
-Driving variables, variable combinations, and model dimensionality tables were
-generated. The top-performing variable combinations were used to generate model
-ensembles. In this analysis, DataModeler’s SymbolicRegression function was used
-to develop explicit algebraic (linear and nonlinear) models. The fittest models
-were analyzed to identify the dominant variables using the VariablePresence
-function, the dominant variable combinations using the VariableCombinations
+accuracy. Using the selection criteria of highest accuracy
+($R^2$>\SI{90}{\percent}) and lowest complexity, the top-performing models were
+identified. Driving variables, variable combinations, and model dimensionality
+tables were generated. The top-performing variable combinations were used to
+generate model ensembles. In this analysis, DataModeler’s
+\inlinecode{SymbolicRegression} function was used to develop explicit algebraic
+(linear and nonlinear) models. The fittest models were analyzed to identify the
+dominant variables using the \inlinecode{VariablePresence} function, the
+dominant variable combinations using the \inlinecode{VariableCombinations}
 function, and the model dimensionality (number of unique variables) using the
-ModelDimensionality function. CreateModelEnsemble was used to define trustable
-model ensembles using selected variable combinations and these were summarized
-(model expressions, model phenotype, model tree plot, ensemble quality, model
-quality, variable presence map, \gls{anova} tables, model prediction plot, exportable
-model forms) using the ModelSummaryTable function. Ensemble prediction and
-residual performance were respectively assessed via the EnsemblePredictionPlot
-and EnsembleResidualPlot subroutines. Model maxima (ModelMaximum function) and
-model minima (ModelMinimum function) were calculated and displayed using the
-ResponsePlotExplorer function. Trade-off performance of multiple responses was
-explored using the MultiTargetResponseExplorer and ResponseComparisonExplorer
-with additional insights derived from the ResponseContourPlotExplorer. Graphics
-and tables were generated by DataModeler. These model ensembles were used to
-identify predicted response values, potential optima in the responses, and
-regions of parameter values where the predictions diverge the most.
+\inlinecode{ModelDimensionality} function. \inlinecode{CreateModelEnsemble} was
+used to define trustable model ensembles using selected variable combinations
+and these were summarized (model expressions, model phenotype, model tree plot,
+ensemble quality, model quality, variable presence map, \gls{anova} tables,
+model prediction plot, exportable model forms) using the
+\inlinecode{ModelSummaryTable} function. Ensemble prediction and residual
+performance were assessed via the \inlinecode{EnsemblePredictionPlot} and
+\inlinecode{EnsembleResidualPlot} subroutines respectively. Model maxima
+(\inlinecode{ModelMaximum} function) and model minima (\inlinecode{ModelMinimum}
+function) were calculated and displayed using the
+\inlinecode{ResponsePlotExplorer} function. Trade-off between multiple
+responses was explored using \inlinecode{MultiTargetResponseExplorer} and
+\inlinecode{ResponseComparisonExplorer} with additional insights derived from
+\inlinecode{ResponseContourPlotExplorer}. Graphics and tables were generated by
+DataModeler. These model ensembles were used to identify predicted response
+values, potential optima in the responses, and regions of parameter values where
+the predictions diverge the most.
 
 Non-parametric tree-based ensembles were done through the
 \inlinecode{randomForest}, inlinecode{gbm}, and \inlinecode{cforest} regression
 functions in R, for \gls{rf}, \gls{gbm}, and \gls{cif} models, respectively.
 Both \gls{rf} and \gls{cif} construct multiple decision trees in parallel, by
 randomly choosing a subset of features at each decision tree split, in the
-training stage. Random forest individual decision trees are split using the Gini
+training stage. \gls{rf} individual decision trees are split using the Gini
 Index, while conditional inference forest uses a statistical significance test
 procedure to select the variables at each split, reducing correlation bias. In
 contrast, \gls{gbm} construct regression trees in series through an iterative
 procedure that adapts over the training set. This model learns from the mistakes
-of previous regression trees in an iterative fashion to correct errors from its
-precursors’ trees (i.e. minimize \gls{mse}). Prediction performance was
-evaluated using \gls{loocv} and permutation-based
-variable importance scores assessing \% increase of \gls{mse}, relative
-influence based on the increase of prediction error, coefficient values for
-\gls{rf}, \gls{gbm}, and \gls{cif}, respectively. \gls{plsr} was executed using
-the \inlinecode{plsr} function from the \inlinecode{pls} package in R while
-\gls{lasso} regression was performed using the \inlinecode{cv.glmnet} R package,
-both using leave-one-out cross-validation. Finally, the \inlinecode{kernlab} R
-package was used to construct the \gls{svm} models.
+of previous regression trees in an iterative fashion to correct errors
+(\gls{mse}) from its precursors’ trees. Prediction performance was evaluated
+using \gls{loocv} and permutation-based variable importance scores assessing
+percent increase of \gls{mse}, relative influence based on the increase of
+prediction error, coefficient values for \gls{rf}, \gls{gbm}, and \gls{cif},
+respectively. \gls{plsr} was executed using the \inlinecode{plsr} function from
+the \inlinecode{pls} package in R while \gls{lasso} regression was performed
+using the \inlinecode{cv.glmnet} R package, both using \gls{loocv}. Finally, the
+\inlinecode{kernlab} R package was used to construct the \gls{svm} models.
 
 Parameter tuning was done for all models in a grid search manner using the train
 function from the \inlinecode{caret} R package using \gls{loocv} as the
@@ -2936,12 +2956,12 @@ optimum \gls{loocv} performance as well. For \gls{plsr}, the optimal number of
 components to be used in the model was assessed based on the standard error of
 the cross-validation residuals using the function \inlinecode{selectNcomp} from
 the \inlinecode{pls} package. Moreover, \gls{lasso} regression was performed
-using the \inlinecode{cv.glmnet} package with alpha = 1. The best lambda for
-each response was chosen using the minimum error criteria. Lastly, a fixed
-linear kernel (i.e. \inlinecode{svmLinear}) was used to build the \gls{svm}
-regression models evaluating the cost parameter value with best \gls{loocv}.
-Prediction performance was measured for all models using the final model with
-\gls{loocv} tuned parameters.
+using the \inlinecode{cv.glmnet} package with $\upalpha$ = 1. The best
+$\uplambda$ for each response was chosen using the minimum error criteria.
+Lastly, a fixed linear kernel (\inlinecode{svmLinear}) was used to build
+the \gls{svm} regression models evaluating the cost parameter value with best
+\gls{loocv}. Prediction performance was measured for all models using the final
+model with \gls{loocv} tuned parameters.
 
 % TODO do I need this?
 % Table M2 shows the parameter values evaluated per model
@@ -2961,16 +2981,17 @@ values, absolute coefficient values, weighted sum of absolute coefficients
 values, and relative importance from sensitivity analysis determined for
 \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr}, and \gls{svm},
 respectively. Using these scores, key predictive variables were selected if
-their importance scores were within the 80th percentile ranking for the
+their importance scores were within the \nth{80} percentile ranking for the
 following \gls{ml} methods: \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso},
-\gls{plsr}, \gls{svm} while for \gls{sr} variables present in >30\% of the
-top-performing \gls{sr} models from DataModeler (R2>= 90\%, Complexity >= 100)
-were chosen to investigate consensus except for \gls{nmr} media models at day 4
-which considered a combination of the top-performing results of models excluding
-lactate ppms, and included those variables which were in > 40\% of the best
-performing models. Only variables with those high percentile scoring values were
-evaluated in terms of their logical relation (intersection across \gls{ml}
-models) and depicted using a Venn diagram from the \inlinecode{venn} R package.
+\gls{plsr}, \gls{svm} while for \gls{sr} variables present in >\SI{30}{\percent}
+of the top-performing \gls{sr} models from DataModeler ($R^2\ge$
+\SI{90}{\percent}, Complexity $\ge$ 100) were chosen to investigate consensus
+except for \gls{nmr} media models at day 4 which considered a combination of the
+top-performing results of models excluding lactate ppms, and included those
+variables which were in >\SI{40}{\percent} of the best performing models. Only
+variables with those high percentile scoring values were evaluated in terms of
+their logical relation (intersection across \gls{ml} models) and depicted using
+a Venn diagram from the \inlinecode{venn} R package.
 
 \section{results}
 
@@ -3033,8 +3054,6 @@ advantage at lower \gls{il2} concentrations compared to beads. For this reason,
 we decided to investigate the lower range of \gls{il2} concentrations starting
 at \SI{10}{\IU\per\ml} throughout the remainder of this aim.
 
-% RESULT this is not consistent with the next section since the responses are
-% different
 \subsection{DOE shows optimal conditions for expanded potent T cells}
 
 % TABLE not all of these were actually used, explain why by either adding columns
@@ -3045,7 +3064,6 @@ at \SI{10}{\IU\per\ml} throughout the remainder of this aim.
   \input{../tables/doe_runs.tex}
 \end{table}
 
-% RESULT integrate this figure into the results paragraph
 \begin{figure*}[ht!]
   \begingroup
 
@@ -3183,7 +3201,7 @@ qualitatively observed in the response plot (\cref{fig:doe_responses_mem}).
 Furthermore, the dataset parameter was weakly significant, indicating a possible
 batch effect between the \glspl{doe}. We should also note that despite many
 parameters being significant, this model was still only mediocre in describing
-this response; the $R^2$ was 0.741 but the adjusted $R^2$ was 0.583, indicating
+this response; the $R^2$ was 0.741 but the $R_{adj}^2$ was 0.583, indicating
 that our data might be underpowered for a model this complex. Further
 experiments beyond what was performed here may be needed to fully describe this
 response.
@@ -3193,7 +3211,7 @@ We performed linear regression on the other three responses, all of which
 performed much better than the \ptmem{} response as expected given the much
 lower apparent complexity in the response plots
 (\cref{fig:doe_responses_cd4,fig:doe_responses_mem4,fig:doe_responses_ratio}).
-All these models appeared to fit will, with $R^2$ and adjusted $R^2$ upward of
+All these models appeared to fit will, with $R^2$ and $R_{adj}^2$ upward of
 0.8. In all but the CD4:CD8 \ptmem{} ratio, the dataset parameter emerged as
 significant, indicating a batch effect between the \glspl{doe}. All other
 parameters except \pilII{} in the case of CD4:CD8 \ptmem{} ratio were
@@ -3216,16 +3234,24 @@ significant predictors.
   \label{fig:doe_sr_contour}
 \end{figure*}
 
-We then visualized the total \ptmemh{} cells and CD4:CD8 \ptmem{} ratio using
-the response explorer in DataModeler to create contour plots around the maximum
-responses. For both, it appeared that maximizing all three input parameters
-resulted in the maximum value for either response (\cref{fig:doe_responses}).
-While not all combinations at and around this optimum were tested, the model
-nonetheless showed that there were no other optimal values or regions elsewhere
-in the model.
+We then visualized the total \ptmemh{} cells and \rratio{} using the response
+explorer in DataModeler to create contour plots around the maximum responses.
+For both, it appeared that maximizing all three input parameters resulted in the
+maximum value for either response (\cref{fig:doe_sr_contour}). While not all
+combinations at and around this optimum were tested, the model nonetheless
+showed that there were no other optimal values or regions elsewhere in the
+model.
 
-% TODO this section header sucks
-\subsection{AI modeling reveals highly predictive species}
+\subsection{Modeling with artificial intelligence methods reveals potential
+  CQAs}
+
+Due to the heterogeneity of the multivariate data collected and knowing that no
+single model is perfect for all applications, we implemented an agnostic
+modeling approach to better understand these \ptmem{} responses. To achieve
+this, a consensus analysis using seven \gls{ml} techniques, \gls{rf}, \gls{gbm},
+\gls{cif}, \gls{lasso}, \gls{plsr}, \gls{svm}, and DataModeler’s \gls{sr}, was
+implemented to molecularly characterize \ptmem{} cells and to extract predictive
+features of quality early in their expansion process.
 
 \begin{figure*}[ht!]
   \begingroup
@@ -3238,34 +3264,38 @@ in the model.
   \label{fig:doe_luminex}
 \end{figure*}
 
-Due to the heterogeneity of the multivariate data collected and knowing that no
-single model structure is perfect for all applications, we implemented an
-agnostic modeling approach to better understand these TN+TCM responses. To
-achieve this, a consensus analysis using seven machine learning (ML) techniques,
-Random Forest (RF), Gradient Boosted Machine (GBM), Conditional Inference Forest
-(CIF), Least Absolute Shrinkage and Selection Operator (LASSO), Partial
-Least-Squares Regression (PLSR), Support Vector Machine (SVM), and DataModeler’s
-Symbolic Regression (SR), was implemented to molecularly characterize TN+TCM
-cells and to extract predictive features of quality early on their expansion
-process (Fig.1d-e).
+We collected secretome data via luminex for days 4, 6, 8, 11, and 14.
+Plotting the concentrations of these cytokines showed a large variation over all
+runs and between different timepoints, demonstrated that these could potentially
+be used to differentiate between different process conditions qualitatively
+simply based on variance (\cref{fig:doe_luminex}). These were also much higher
+in most cases that a set of bead based runs which were run in parallel, in
+agreement with the luminex data obtained previously in the Grex system (these
+data were collected in plates) (\cref{fig:grex_luminex}).
 
 % TABLE this table looks like crap, break it up into smaller tables
 \begin{table}[!h] \centering
-  \caption{Results for data-driven modeling}
+  \caption{Results for data-driven modeling using process parameters (PP) with
+    only \gls{nmr} on day 4 (N4), only \gls{nmr} on day 6 (N6), only secretome
+    on day 6 (S6), or various combindation of each for all seven \gls{ml}
+    techniques}
   \label{tab:mod_results}
   \input{../tables/model_results.tex}
 \end{table}
 
-SR models achieved the highest predictive performance (R2>93\%) when using
-multi-omics predictors for all endpoint responses (\cref{tab:mod_results}). SR
-achieved R2>98\% while GBM tree-based ensembles showed leave-one-out
-cross-validated R2 (LOO-R2) >95\% for CD4+ and CD4+/CD8+ TN+TCM responses.
-Similarly, LASSO, PLSR, and SVM methods showed consistent high LOO-R2, 92.9\%,
-99.7\%, and 90.5\%, respectively, to predict the CD4+/CD8+ TN+TCM. Yet, about
-10\% reduction in LOO-R2, 72.5\%-81.7\%, was observed for CD4+ TN+TCM with these
-three methods. Lastly, SR and PLSR achieved R2>90\% while other ML methods
-exhibited exceedingly variable LOO-R2 (0.3\%,RF-51.5\%,LASSO) for CD8+ TN+TCM
-cells.
+\gls{sr} models achieved the highest predictive performance
+($R^2$>\SI{93}{\percent}) when using multi-omics predictors for all endpoint
+responses (\cref{tab:mod_results}). \gls{sr} achieved $R^2$>\SI{98}{\percent}
+while \gls{gbm} tree-based ensembles showed \gls{loocv} $R^2$ >
+\SI{95}{\percent} for \rmemh{} and \rmemk{} responses. Similarly, \gls{lasso},
+\gls{plsr}, and \gls{svm} methods showed consistently high \gls{loocv},
+(\SI{92.9}{\percent}, \SI{99.7}{\percent}, and \SI{90.5}{\percent}
+respectively), to predict the \rratio{}. Yet, about \SI{10}{\percent} reduction
+in \gls{loocv}, \SIrange{72.5}{81.7}{\percent}, was observed for \rmemh{} with
+these three methods. Lastly, \gls{sr} and \gls{plsr} achieved
+$R^2$>\SI{90}{\percent} while other \gls{ml} methods exhibited exceedingly
+variable \gls{loocv} (\SI{0.3}{\percent} for \gls{rf} to \SI{51.5}{\percent} for
+\gls{lasso}) for \rmemk{}.
 
 \begin{figure*}[ht!]
   \begingroup
@@ -3279,13 +3309,12 @@ cells.
   \label{fig:sr_omics}
 \end{figure*}
 
-The top-performing technique, SR, showed that the median aggregated predictions
-for CD4+ and CD8+ TN+TCM cells increases when IL2 concentration, IL15, and IL2R
-increase while IL17a decreases in conjunction with other features. These
-patterns combined with low values of DMS concentration and GM-CSF uniquely
-characterized maximum CD8+ TN+TCM. Meanwhile, higher glycine but lower IL13 in
-combination with others showed maximum CD4+ TN+TCM predictions
-(\cref{fig:sr_omics}).
+The top-performing technique, \gls{sr}, showed that the median aggregated
+predictions for \rmemh{} \rmemk{} increases when IL2 concentration, IL15, and
+IL2R increase while IL17a decreases in conjunction with other features. These
+patterns combined with low values of \pdms{} and GM-CSF uniquely characterized
+maximum \rmemk{}. Meanwhile, higher glycine but lower IL13 in combination with
+others showed maximum \rmemh{} predictions (\cref{fig:sr_omics}).
 
 \begin{figure*}[ht!]
   \begingroup
@@ -3306,13 +3335,25 @@ combination with others showed maximum CD4+ TN+TCM predictions
   \label{fig:mod_flower}
 \end{figure*}
 
-Selecting CPPs and CQAs candidates consistently for T cell memory is desired.
-Here, \gls{tnfa} was found in consensus across all seven ML methods for predicting
-CD4+/CD8+ TN+TCM when considering features with the highest importance scores
-across models (Fig.3a;Methods). Other features, IL2R, IL4, IL17a, and DMS
-concentration, were commonly selected in >=5 ML methods (Fig.3a,c). Moreover,
-IL13 and IL15 were found predictive in combination with these using SR
-(Supp.Table.S4).
+Selecting \gls{cpp} and \glspl{cqa} candidates consistently for T cell memory is
+desired. Here, \gls{tnfa} was found in consensus across all seven \gls{ml}
+methods for predicting \rratio{} when considering features with the highest
+importance scores across models (\cref{fig:mod_flower_48r}). Other features,
+IL2R, IL4, IL17a, and \pdms{}, were commonly selected in $\ge$ 5 \gls{ml}
+methods (\cref{fig:mod_flower_48r}). When restricting the models only to include
+metabolome, formate emerged as the dominant predictor shared across all seven
+models.
+
+% Moreover, IL13 and IL15 were found predictive in combination
+% with these using \gls{sr} (Supp.Table.S4).
+
+When performing similar analysis on \rmemh{}, we observe that no species for
+either the secretome or metabolome was agreed upon by all seven models
+(\cref{fig:mod_flower_cd4}). We also observed that these models did not fit as
+well as they did for \rratio{} (\cref{tab:mod_results}). For the secretome, the
+species that were agreed upon by $\ge$ 5 models were IL4, IL17a, and IL2R. For
+the metabolome, formate once again was agreed upon by $\ge$ 5 models as well as
+lactate.
 
 \begin{figure*}[ht!]
   \begingroup
@@ -3338,55 +3379,61 @@ IL13 and IL15 were found predictive in combination with these using SR
   \label{fig:nmr_cors}
 \end{figure*}
 
+We also investigated the \gls{nmr} features extracted from day of expansion to
+assess if there was any predictive power for \ptmemh{}; in general these models
+had almost as good of fit despite being 2 days earlier in the process
+(\cref{fig:nmr_cors}). Lactate and formate were observed to correlate with each
+other, and both correlated with \rmemh{}. Furthermore, lactate was observed to
+positively correlate with \pdms{} and negatively correlate with glucose
+(\cref{fig:nmr_cors_lactate}). Formate also had the same correlation patterns
+(\cref{fig:nmr_cors_formate}). Glucose was only negatively correlated with
+formate and lactate (\cref{fig:nmr_cors_glucose}). Together, these data suggest
+that lactate, formate, \pdms{}, and \rmemh{} are fundamentally linked.
+
 \section{discussion}
 
-% optimization of process features
-% TODO this sounds like total fluff
-
-% DISCUSSION integrate figures
-
-CPPs modeling and understanding are critical to new product development and in
-cell therapy development, it can have life-saving implications. The challenges
-for effective modeling grow with the increasing complexity of processes due to
-high dimensionality, and the potential for process interactions and nonlinear
-relationships. Another critical challenge is the limited amount of available
-data, mostly small DOE datasets. SR has the necessary capabilities to resolve
-the issues of process effects modeling and has been applied across multiple
-industries12. SR discovers mathematical expressions that fit a given sample and
-differs from conventional regression techniques in that a model structure is not
-defined a priori13. Hence, a key advantage of this methodology is that
+\gls{cpp} modeling and understanding are critical to new product development and
+in cell therapy development, it can have life-saving implications. The
+challenges for effective modeling grow with the increasing complexity of
+processes due to high dimensionality, and the potential for process interactions
+and nonlinear relationships. Another critical challenge is the limited amount of
+available data, mostly small \gls{doe} datasets. \gls{sr} has the necessary
+capabilities to resolve the issues of process effects modeling and has been
+applied across multiple industries\cite{Kordona}. \gls{sr} discovers
+mathematical expressions that fit a given sample and differs from conventional
+regression techniques in that a model structure is not defined a
+priori\cite{Koza1992}. Hence, a key advantage of this methodology is that
 transparent, human-interpretable models can be generated from small and large
 datasets with no prior assumptions\cite{Kotancheka}.
 
 Since the model search process lets the data determine the model, diverse and
-competitive (e.g., accuracy, complexity) model structures are typically
-discovered. An ensemble of diverse models can be formed where its constituent
-models will tend to agree when constrained by observed data yet diverge in new
-regions. Collecting data in these regions helps to ensure that the target system
-is accurately modeled, and its optimum is accurately located\cite{Kotancheka}.
-Exploiting these features allows adaptive data collection and interactive
-modeling. Consequently, this adaptive-DOE approach is useful in a variety of
-scenarios, including maximizing model validity for model-based decision making,
-optimizing processing parameters to maximize target yields, and developing
-emulators for online optimization and human understanding\cite{Kotancheka}.
+competitive model structures are typically discovered. An ensemble of diverse
+models can be formed where its constituent models will tend to agree when
+constrained by observed data yet diverge in new regions. Collecting data in
+these regions helps to ensure that the target system is accurately modeled, and
+its optimum is accurately located\cite{Kotancheka}. Exploiting these features
+allows adaptive data collection and interactive modeling. Consequently, this
+\gls{adoe} approach is useful in a variety of scenarios, including maximizing
+model validity for model-based decision making, optimizing processing parameters
+to maximize target yields, and developing emulators for online optimization and
+human understanding\cite{Kotancheka}.
 
-% predictive features
-
-An in-depth characterization of potential DMS-based T-cell CQAs includes a list
-of cytokine and NMR features from media samples that are crucial in many aspects
-of T cell fate decisions and effector functions of immune cells. Cytokine
-features were observed to slightly improve prediction and dominated the ranking
-of important features and variable combinations when modeling together with NMR
-media analysis and process parameters (Fig.3b,d).
+An in-depth characterization of potential \gls{dms} based T cell \glspl{cqa}
+includes a list of cytokine and \gls{nmr} features from media samples that are
+crucial in many aspects of T cell fate decisions and effector functions of
+immune cells. Cytokine features were observed to slightly improve prediction and
+dominated the ranking of important features and variable combinations when
+modeling together with \gls{nmr} media analysis and process parameters
+(\cref{fig:mod_flower}).
 
 Predictive cytokine features such as \gls{tnfa}, IL2R, IL4, IL17a, IL13, and
 IL15 were biologically assessed in terms of their known functions and activities
 associated with T cells. T helper cells secrete more cytokines than T cytotoxic
 cells, as per their main functions, and activated T cells secrete more cytokines
 than resting T cells. It is possible that some cytokines simply reflect the
-CD4+/CD8+ ratio and the activation degree by proxy proliferation. However, the
-exact ratio of expected cytokine abundance is less clear and depends on the
-subtypes present, and thus examination of each relevant cytokine is needed.
+\rratio{} and the activation degree by proxy proliferation. However, the exact
+ratio of expected cytokine abundance is less clear and depends on the subtypes
+present, and thus examination of each relevant cytokine is needed.
 
 IL2R is secreted by activated T cells and binds to IL2, acting as a sink to
 dampen its effect on T cells\cite{Witkowska2005}. Since IL2R was much greater
@@ -3399,22 +3446,23 @@ form, this may either increase or decrease CD4+ ratio and/or memory T cells
 depending on the ratio of the membrane to soluble TNF\cite{Mehta2018}. Since
 only soluble TNF was measured, membrane TNF is needed to understand its impact
 on both CD4+ ratio and memory T cells. Furthermore, IL13 is known to be critical
-for Th2 response and therefore could be secreted if there are significant Th2 T
-cells already present in the starting population\cite{Wong2011}. This cytokine
-has limited signaling in T cells and is thought to be more of an effector than a
-differentiation cytokine\cite{Junttila2018}. It might be emerging as relevant
-due to an initially large number of Th2 cells or because Th2 cells were
-preferentially expanded; indeed, IL4, also found important, is the conical
-cytokine that induces Th2 cell differentiation (Fig.3). The role of these
-cytokines could be investigated by quantifying the Th1/2/17 subsets both in the
-starting population and longitudinally. Similar to IL13, IL17 is an effector
-cytokine produced by Th17 cells\cite{Amatya2017} thus may reflect the number of
-Th17 subset of T cells. GM-CSF has been linked with activated T cells,
-specifically Th17 cells, but it is not clear if this cytokine is inducing
+for \gls{th2} response and therefore could be secreted if there are significant
+\glspl{th2} already present in the starting population\cite{Wong2011}. This
+cytokine has limited signaling in T cells and is thought to be more of an
+effector than a differentiation cytokine\cite{Junttila2018}. It might be
+emerging as relevant due to an initially large number of \glspl{th2} or because
+\glspl{th2} were preferentially expanded; indeed, IL4, also found important, is
+the conical cytokine that induces \gls{th2} differentiation
+(\cref{fig:mod_flower}). The role of these cytokines could be investigated by
+quantifying \glspl{th1}, \glspl{th2}, or \glspl{th17} both in the starting
+population and longitudinally. Similar to IL13, IL17 is an effector cytokine
+produced by \glspl{th17}\cite{Amatya2017} thus may reflect the number of
+\glspl{th17} in the population. GM-CSF has been linked with activated T cells,
+specifically \glspl{th17}, but it is not clear if this cytokine is inducing
 differential expansion of CD8+ T cells or if it is simply a covariate with
 another cytokine inducing this expansion\cite{Becher2016}. Finally, IL15 has
-been shown to be essential for memory signaling and effective in skewing CAR-T
-cells toward the Tscm phenotype when using membrane-bound IL15Ra and
+been shown to be essential for memory signaling and effective in skewing
+\gls{car} T cells toward \glspl{tscm} when using membrane-bound IL15Ra and
 IL15R\cite{Hurton2016}. Its high predictive behavior goes with its ability to
 induce large numbers of memory T cells by functioning in an autocrine/paracrine
 manner and could be explored by blocking either the cytokine or its receptor.
@@ -3424,7 +3472,7 @@ activity associated with T cell activation and differentiation, yet it is not
 clear how the various combinations of metabolites relate with each other in a
 heterogeneous cell population. Formate and lactate were found to be highly
 predictive and observed to positively correlate with higher values of total live
-CD4+ TN+TCM cells (Fig.5a-b;Supp.Fig.28-S30,S38). Formate is a byproduct of the
+\rmemh{} cells (~\cref{fig:nmr_cors}). Formate is a byproduct of the
 one-carbon cycle implicated in promoting T cell activation\cite{RonHarel2016}.
 Importantly, this cycle occurs between the cytosol and mitochondria of cells and
 formate excreted\cite{Pietzke2020}. Mitochondrial biogenesis and function are
@@ -3432,18 +3480,18 @@ shown necessary for memory cell persistence\cite{van_der_Windt_2012,
   Vardhana2020}. Therefore, increased formate in media could be an indicator of
 one-carbon metabolism and mitochondrial activity in the culture.
 
-In addition to formate, lactate was found as a putative CQA of TN+TCM. Lactate
-is the end-product of aerobic glycolysis, characteristic of highly proliferating
-cells and activated T cells\cite{Lunt2011, Chang2013}. Glucose import and
-glycolytic genes are immediately upregulated in response to T cell stimulation,
-and thus generation of lactate. At earlier time-points, this abundance suggests
-a more robust induction of glycolysis and higher overall T cell proliferation.
-Interestingly, our models indicate that higher lactate predicts higher CD4+,
-both in total and in proportion to CD8+, seemingly contrary to previous studies
-showing that CD8+ T cells rely more on glycolysis for proliferation following
-activation\cite{Cao2014}. It may be that glycolytic cells dominate in the
-culture at the early time points used for prediction, and higher lactate
-reflects more cells.
+In addition to formate, lactate was found as a putative \gls{cqa} of \ptmem{}
+cells. Lactate is the end-product of aerobic glycolysis, characteristic of
+highly proliferating cells and activated T cells\cite{Lunt2011, Chang2013}.
+Glucose import and glycolytic genes are immediately upregulated in response to T
+cell stimulation, and thus generation of lactate. At earlier time-points, this
+abundance suggests a more robust induction of glycolysis and higher overall T
+cell proliferation. Interestingly, our models indicate that higher lactate
+predicts higher CD4+, both in total and in proportion to CD8+, seemingly
+contrary to previous studies showing that CD8+ T cells rely more on glycolysis
+for proliferation following activation\cite{Cao2014}. It may be that glycolytic
+cells dominate in the culture at the early time points used for prediction, and
+higher lactate reflects more cells.
 
 % TODO not sure how much I should include here since I didn't do this analysis
 % AT ALL
@@ -3461,20 +3509,20 @@ reflects more cells.
 % this looks fine since it is just parroting sources, just need to paraphrase a
 % little
 Metabolites that consistently decreased over time are consistent with the
-primary carbon source (glucose) and essential amino acids (BCAA, histidine) that
-must be continually consumed by proliferating cells. Moreover, the inclusion of
-glutamine in our predictive models also suggests the importance of other carbon
-sources for certain T cell subpopulations. Glutamine can be used for oxidative
-energy metabolism in T cells without the need for glycolysis\cite{Cao2014}.
-Overall, these results are consistent with existing literature that show
-different T cell subtypes require different relative levels of glycolytic and
-oxidative energy metabolism to sustain the biosynthetic and signaling needs of
-their respective phenotypes\cite{Almeida2016,Wang_2012}. It is worth noting that
-the trends of metabolite abundance here are potentially confounded by the
-partial replacement of media that occurred periodically during expansion
-(Methods), thus likely diluting some metabolic byproducts (i.e. formate,
-lactate) and elevating depleted precursors (i.e. glucose, amino acids). More
-definitive conclusions of metabolic activity across the expanding cell
+primary carbon source (glucose) and essential amino acids (\gls{bcaa},
+histidine) that must be continually consumed by proliferating cells. Moreover,
+the inclusion of glutamine in our predictive models also suggests the importance
+of other carbon sources for certain T cell subpopulations. Glutamine can be used
+for oxidative energy metabolism in T cells without the need for
+glycolysis\cite{Cao2014}. Overall, these results are consistent with existing
+literature that show different T cell subtypes require different relative levels
+of glycolytic and oxidative energy metabolism to sustain the biosynthetic and
+signaling needs of their respective phenotypes\cite{Almeida2016,Wang_2012}. It
+is worth noting that the trends of metabolite abundance here are potentially
+confounded by the partial replacement of media that occurred periodically during
+expansion, thus likely diluting some metabolic byproducts (such as formate,
+lactate) and elevating depleted precursors (such as glucose and amino acids).
+More definitive conclusions of metabolic activity across the expanding cell
 population can be addressed by a closed system, ideally with on-line process
 sensors and controls for formate, lactate, along with ethanol and glucose.