ENH proofread aim2a

This commit is contained in:
Nathan Dwarshuis 2021-08-04 18:24:06 -04:00
parent fa716f9d78
commit 4789723e72
9 changed files with 372 additions and 268 deletions

View File

@ -16,8 +16,8 @@
Observations & 30 \\ Observations & 30 \\
R$^{2}$ & 0.888 \\ R$^{2}$ & 0.888 \\
Adjusted R$^{2}$ & 0.870 \\ Adjusted R$^{2}$ & 0.870 \\
Residual Std. Error & 727,042.800 (df = 25) \\ % Residual Std. Error & 727,042.800 (df = 25) \\
F Statistic & 49.454$^{***}$ (df = 4; 25) \\ % F Statistic & 49.454$^{***}$ (df = 4; 25) \\
\hline \hline
\hline \\[-1.8ex] \hline \\[-1.8ex]
\textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\

View File

@ -16,8 +16,8 @@
Observations & 30 \\ Observations & 30 \\
R$^{2}$ & 0.331 \\ R$^{2}$ & 0.331 \\
Adjusted R$^{2}$ & 0.224 \\ Adjusted R$^{2}$ & 0.224 \\
Residual Std. Error & 3,659,501.000 (df = 25) \\ % Residual Std. Error & 3,659,501.000 (df = 25) \\
F Statistic & 3.096$^{**}$ (df = 4; 25) \\ % F Statistic & 3.096$^{**}$ (df = 4; 25) \\
\hline \hline
\hline \\[-1.8ex] \hline \\[-1.8ex]
\textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\

View File

@ -23,8 +23,8 @@
Observations & 30 \\ Observations & 30 \\
R$^{2}$ & 0.741 \\ R$^{2}$ & 0.741 \\
Adjusted R$^{2}$ & 0.583 \\ Adjusted R$^{2}$ & 0.583 \\
Residual Std. Error & 0.228 (df = 18) \\ % Residual Std. Error & 0.228 (df = 18) \\
F Statistic & 4.693$^{***}$ (df = 11; 18) \\ % F Statistic & 4.693$^{***}$ (df = 11; 18) \\
\hline \hline
\hline \\[-1.8ex] \hline \\[-1.8ex]
\textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\

View File

@ -14,8 +14,8 @@
Observations & 30 \\ Observations & 30 \\
R$^{2}$ & 0.835 \\ R$^{2}$ & 0.835 \\
Adjusted R$^{2}$ & 0.808 \\ Adjusted R$^{2}$ & 0.808 \\
Residual Std. Error & 493,168.700 (df = 25) \\ % Residual Std. Error & 493,168.700 (df = 25) \\
F Statistic & 31.571$^{***}$ (df = 4; 25) \\ % F Statistic & 31.571$^{***}$ (df = 4; 25) \\
\hline \hline
\hline \\[-1.8ex] \hline \\[-1.8ex]
\textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\

View File

@ -15,8 +15,8 @@
Observations & 30 \\ Observations & 30 \\
R$^{2}$ & 0.879 \\ R$^{2}$ & 0.879 \\
Adjusted R$^{2}$ & 0.860 \\ Adjusted R$^{2}$ & 0.860 \\
Residual Std. Error & 0.039 (df = 25) \\ % Residual Std. Error & 0.039 (df = 25) \\
F Statistic & 45.554$^{***}$ (df = 4; 25) \\ % F Statistic & 45.554$^{***}$ (df = 4; 25) \\
\hline \hline
\hline \\[-1.8ex] \hline \\[-1.8ex]
\textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\

View File

@ -1,7 +1,7 @@
\begin{tabular}{ccccc} \begin{tabular}{ccccc}
\\[-1.8ex]\hline \\[-1.8ex]\hline
\hline \\[-1.8ex] \hline \\[-1.8ex]
\\[-1.8ex] Dataset & Run & IL2 Conc (\si{\IU\per\ml}) & DMS Conc. (\si{\IU\per\ml}) \\[-1.8ex] Dataset & Run & IL2 Conc (\si{\IU\per\ml}) & DMS Conc. (\si{\dms\per\ml})
& Functional \gls{mab} (\si{\percent}) \\ & Functional \gls{mab} (\si{\percent}) \\
\hline \\[-1.8ex] \hline \\[-1.8ex]
DOE & 1 & 30 & 500 & 100\\ DOE & 1 & 30 & 500 & 100\\

View File

@ -17,7 +17,7 @@ PP+S6 & \SI{98}{\percent} & \SI{71.4}{\percent} & \SI{99.9}{\percent} & \SI{75.0
PP+S6+N6 & \SI{98}{\percent} & \SI{68.2}{\percent} & \SI{95.6}{\percent} & \SI{74.4}{\percent} & \SI{72.5}{\percent} & \SI{81.7}{\percent} & \SI{77.0}{\percent}\\ PP+S6+N6 & \SI{98}{\percent} & \SI{68.2}{\percent} & \SI{95.6}{\percent} & \SI{74.4}{\percent} & \SI{72.5}{\percent} & \SI{81.7}{\percent} & \SI{77.0}{\percent}\\
\hline \\ \hline \\
\multicolumn{8}{l}{\ptmemh{} cells} \\ \multicolumn{8}{l}{\ptmemk{} cells} \\
PP+N4 & \SI{93}{\percent} & \SI{4.7}{\percent} & \SI{44.4}{\percent} & \SI{9.2}{\percent} & \SI{1.2}{\percent} & \SI{65.1}{\percent} & \SI{9.1}{\percent}\\ PP+N4 & \SI{93}{\percent} & \SI{4.7}{\percent} & \SI{44.4}{\percent} & \SI{9.2}{\percent} & \SI{1.2}{\percent} & \SI{65.1}{\percent} & \SI{9.1}{\percent}\\
PP+N6 & \SI{86}{\percent} & \SI{2.0}{\percent} & \SI{29.9}{\percent} & \SI{15.8}{\percent} & \SI{28.5}{\percent} & \SI{63.3}{\percent} & \SI{30.6}{\percent}\\ PP+N6 & \SI{86}{\percent} & \SI{2.0}{\percent} & \SI{29.9}{\percent} & \SI{15.8}{\percent} & \SI{28.5}{\percent} & \SI{63.3}{\percent} & \SI{30.6}{\percent}\\
PP+S6 & \SI{93}{\percent} & \SI{7.8}{\percent} & \SI{28.0}{\percent} & \SI{15.1}{\percent} & \SI{76.2}{\percent} & \SI{98.4}{\percent} & \SI{49.8}{\percent}\\ PP+S6 & \SI{93}{\percent} & \SI{7.8}{\percent} & \SI{28.0}{\percent} & \SI{15.1}{\percent} & \SI{76.2}{\percent} & \SI{98.4}{\percent} & \SI{49.8}{\percent}\\

View File

@ -2673,6 +2673,62 @@ CONCLUSIONS: We developed a simplified, semi-closed system for the initial selec
publisher = {The American Association of Immunologists}, publisher = {The American Association of Immunologists},
} }
@Article{Delaglio1995,
author = {Frank Delaglio and Stephan Grzesiek and Geerten W. Vuister and Guang Zhu and John Pfeifer and Ad Bax},
journal = {Journal of Biomolecular {NMR}},
title = {{NMRPipe}: A multidimensional spectral processing system based on {UNIX} pipes},
year = {1995},
month = {nov},
number = {3},
volume = {6},
doi = {10.1007/bf00197809},
publisher = {Springer Science and Business Media {LLC}},
}
@Article{Dieterle2006,
author = {Frank Dieterle and Alfred Ross and Götz Schlotterbeck and Hans Senn},
journal = {Analytical Chemistry},
title = {Probabilistic Quotient Normalization as Robust Method to Account for Dilution of Complex Biological Mixtures. Application in {$^1$H} {NMR} Metabonomics},
year = {2006},
month = {jul},
number = {13},
pages = {4281--4290},
volume = {78},
doi = {10.1021/ac051632c},
publisher = {American Chemical Society ({ACS})},
}
@Article{Dashti2017,
author = {Hesam Dashti and William M. Westler and Marco Tonelli and Jonathan R. Wedell and John L. Markley and Hamid R. Eghbalnia},
journal = {Analytical Chemistry},
title = {Spin System Modeling of Nuclear Magnetic Resonance Spectra for Applications in Metabolomics and Small Molecule Screening},
year = {2017},
month = {nov},
number = {22},
pages = {12201--12208},
volume = {89},
doi = {10.1021/acs.analchem.7b02884},
publisher = {American Chemical Society ({ACS})},
}
@InProceedings{Kordona,
author = {A.K. Kordon and Ching-Tai Lue},
booktitle = {Proceedings of the 2004 Congress on Evolutionary Computation ({IEEE} Cat. No.04TH8753)},
title = {Symbolic regression modeling of blown film process effects},
year = {2004},
publisher = {{IEEE}},
doi = {10.1109/cec.2004.1330907},
}
@Book{Koza1992,
author = {Koza, John},
publisher = {MIT Press},
title = {Genetic programming : on the programming of computers by means of natural selection},
year = {1992},
address = {Cambridge, Mass},
isbn = {0262111705},
}
@Comment{jabref-meta: databaseType:bibtex;} @Comment{jabref-meta: databaseType:bibtex;}
@Comment{jabref-meta: grouping: @Comment{jabref-meta: grouping:

View File

@ -103,8 +103,9 @@
\tcellacronym{tc}{c}{cytotoxic}{} \tcellacronym{tc}{c}{cytotoxic}{}
\tcellacronym{th1}{h}{type 1 helper}{1} \tcellacronym{th1}{h}{type 1 helper}{1}
\tcellacronym{th2}{h}{type 2 helper}{2} \tcellacronym{th2}{h}{type 2 helper}{2}
% \tcellacronym{th17}{h}{\il{17} helper}{1} \tcellacronym{th17}{h}{IL-17 helper}{17}
\newacronym{bcaa}{BCAA}{branched-chain amino acid}
\newacronym{til}{TIL}{tumor infiltrating lymphocyte} \newacronym{til}{TIL}{tumor infiltrating lymphocyte}
\newacronym{tcr}{TCR}{T cell receptor} \newacronym{tcr}{TCR}{T cell receptor}
\newacronym{act}{ACT}{adoptive cell therapies} \newacronym{act}{ACT}{adoptive cell therapies}
@ -207,6 +208,7 @@
\newacronym{scfv}{scFv}{single-chain fragment variable} \newacronym{scfv}{scFv}{single-chain fragment variable}
\newacronym{hepes}{HEPES}{4-(2-hydroxyethyl)-1-piperazineethanesulfonic acid} \newacronym{hepes}{HEPES}{4-(2-hydroxyethyl)-1-piperazineethanesulfonic acid}
\newacronym{nhs}{NHS}{N-hydroxysulfosuccinimide} \newacronym{nhs}{NHS}{N-hydroxysulfosuccinimide}
\newacronym{tocsy}{TOCSY}{total correlation spectroscopy}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SI units for uber nerds % SI units for uber nerds
@ -282,10 +284,13 @@
% so I don't need to worry about abbreviating all the different interleukins % so I don't need to worry about abbreviating all the different interleukins
\newcommand{\il}[1]{\gls{il}-#1} \newcommand{\il}[1]{\gls{il}-#1}
% DOE responses I don't feel like typing ad-nauseam % DOE stuff I don't feel like typing ad-nauseam
\newcommand{\pilII}{\gls{il2} concentration} \newcommand{\pilII}{\gls{il2} concentration}
\newcommand{\pdms}{\gls{dms} concentration} \newcommand{\pdms}{\gls{dms} concentration}
\newcommand{\pmab}{functional \gls{mab} surface density} \newcommand{\pmab}{functional \gls{mab} surface density}
\newcommand{\rmemh}{total \ptmemh{} cells}
\newcommand{\rmemk}{total \ptmemk{} cells}
\newcommand{\rratio}{CD4/CD8 ratio}
% vendor and product stuff I don't feel like typing % vendor and product stuff I don't feel like typing
\newcommand{\catnum}[2]{(#1, #2)} \newcommand{\catnum}[2]{(#1, #2)}
@ -2683,12 +2688,12 @@ necessary to have a fully-automated manufacturing system.
The \gls{dms} system could be used as a drop in replacement for beads in many of The \gls{dms} system could be used as a drop in replacement for beads in many of
current allogeneic therapies. Indeed, given its higher potential for expansion current allogeneic therapies. Indeed, given its higher potential for expansion
(\cref{fig:dms_exp,tab:ci_controlled}, it may work in cases where the beads fail (\cref{fig:dms_exp,tab:ci_controlled}), it may work in cases where the beads
(although this would need to be tested by gathering data with many unhealthy fail (although this would need to be tested by gathering data with many
donors). However, in the autologous setting patients only need a fixed dose, and unhealthy donors). However, in the autologous setting patients only need a fixed
thus any expansion beyond the indicated dose would be wasted. Given this, it dose, and thus any expansion beyond the indicated dose would be wasted. Given
will be interesting to apply this technology in an allogeneic paradigm where this, it will be interesting to apply this technology in an allogeneic paradigm
this increased expansion potential would be well utilized. where this increased expansion potential would be well utilized.
Finally, we should note that while we demonstrated a method providing superior Finally, we should note that while we demonstrated a method providing superior
performance compared to bead-based expansion, the cell manufacturing field would performance compared to bead-based expansion, the cell manufacturing field would
@ -2702,20 +2707,20 @@ manufacturing companies.
\section{introduction} \section{introduction}
The purpose of this sub-aim was to develop computational methods to identify novel The purpose of this sub-aim was to develop computational methods to identify
\glspl{cqa} and \glspl{cpp} that could be used for release criteria, process novel \glspl{cqa} and \glspl{cpp} that could be used for release criteria,
control, and process optimization for the \gls{dms} platform. We hypothesized process control, and process optimization for the \gls{dms} platform. We
that T cells grown using the \gls{dms} system would produce detectable hypothesized that T cells grown using the \gls{dms} system would produce
biological signatures in the media supernatant which corresponded to clinically detectable biological signatures in the media supernatant which corresponded to
relevant responses such as the fold expansion of the T cells or the resulting clinically relevant responses such as fold expansion or phenotype. We tested
phenotype. We tested this hypothesis by activating T cells under a variety of this hypothesis by activating T cells under a variety of conditions using a
conditions using a \gls{doe}, sampling the media at intermediate timepoints, and \gls{doe}, sampling the media at intermediate timepoints, and creating models to
creating models to predict the outcome of the cultures. We should stress that predict the outcome of the cultures. We should stress that the specific
the specific \glspl{cpp} and \glspl{cqa} determined by this aim are not \glspl{cpp} and \glspl{cqa} determined by this aim are not necessarily
necessarily universal, as this was not performed with equipment that would universal, as this was not performed with equipment that would normally be used
normally be used at scale. However, the process outlined here is one that can at scale. However, the process outlined here is one that can easily be adapted
easily be adapted to any system, and the specific findings themselves offer to any system, and the specific findings themselves offer interesting insights
interesting insights that warrant further study. that warrant further study.
\section{methods} \section{methods}
@ -2744,33 +2749,38 @@ interesting insights that warrant further study.
\label{fig:mod_overview} \label{fig:mod_overview}
\end{figure*} \end{figure*}
The first DOE resulted in a randomized 18-run I-optimal custom design where each The overall workflow of this aim is shown in \cref{fig:mod_overview_flow}.
DMS parameter was evaluated at three levels: IL2 concentration (10, 20, and 30 Experimental conditions within the design space were explored using a \gls{doe},
U/uL), DMS concentration (500, 1500, 2500 carrier/uL), and functionalized and longitudinal samples were collected for each condition as the cultures
antibody percent (60\%, 80\%, 100\%). These 18 runs consisted of 14 unique progressed. Data from inputs and/or longitudinal samples were used to predict
parameter combinations where 4 of them were replicated twice to assess the endpoint response. The fusion of cytokine and \gls{nmr} profiles from media
prediction error. Process parameters for the ADOE were evaluated at multiple to model these responses included 30 cytokines from a custom Thermo Fisher
levels: IL2 concentration (30, 35, and 40 U/uL), DMS concentration (500, 1000, ProcartaPlex Luminex kit and 20 \gls{nmr} features. These 20 spectral features
1500, 2000, 2500, 3000, 3500 carrier/uL), and functionalized antibody percent from \gls{nmr} media analysis were selected out of approximately 250 peaks
(100\%) as depicted in Fig.1b. To further optimize the initial region explored through the implementation of a variance-based feature selection approach and
(DOE) in terms of total live CD4+ TN+TCM cells, a sequential adaptive some manual inspection steps.
design-of-experiment (ADOE) was designed with 10 unique parameter combinations,
two of these replicated twice for a total of 12 additional samples (Fig.1b). The The first \gls{doe} resulted in a randomized 18-run I-optimal custom design
fusion of cytokine and NMR profiles from media to model these responses included where each \gls{dms} parameter was evaluated at three levels: \pilII{} (10, 20,
30 cytokines from a custom Thermo Fisher ProcartaPlex Luminex kit and 20 NMR and 30 \si{\IU\per\ml}), \pdms{} (500, 1500, 2500 \si{\dms\per\ml}), and \pmab{} (60, 80,
features. These 20 spectral features from NMR media analysis were selected out 100 \si{\percent}). These 18 runs consisted of 14 unique parameter combinations
of approximately 250 peaks through the implementation of a variance-based where 4 of them were replicated twice to assess prediction error. To further
feature selection approach and some manual inspection steps. optimize the initial region explored, an \gls{adoe} was designed with 10 unique
parameter combinations, two of these replicated twice for a total of 12
additional samples (\cref{fig:mod_overview_doe}). Process parameters for the
\gls{adoe} were evaluated at multiple levels: \pilII{} (30, 35, and 40
\si{\IU\per\ml}), \pdms{} (500, 1000, 1500, 2000, 2500, 3000, 3500
\si{\dms\per\ml}), and \pmab{} (\SI{100}{\percent}) (\cref{fig:mod_overview}).
\subsection{DMS fabrication} \subsection{DMS fabrication}
\glspl{dms} were fabricated as described in \cref{sec:dms_fab} with the \glspl{dms} were fabricated as described in \cref{sec:dms_fab} with the
following modifications in order to obtain a variable functional \gls{mab} following modifications in order to obtain a variable functional \gls{mab}
surface density. During the \gls{mab} coating step, the anti-CD3/anti-CD28 mAb surface density. During the \gls{mab} coating step, the \acd{3}/\acd{28}
mixture was further combined with a biotinylated isotype control to reduce the \gls{mab} mixture was further combined with a biotinylated isotype control to
overall fraction of targeted \glspl{mab} (for example the \SI{60}{\percent} reduce the overall fraction of targeted \glspl{mab} (for example the
\gls{mab} surface density corresponded to 3 mass parts \acd{3}, 3 mass parts \SI{60}{\percent} \gls{mab} surface density corresponded to 3 mass parts
\acd{28}, and 4 mass parts isotype control). \acd{3}, 3 mass parts \acd{28}, and 4 mass parts isotype control).
\subsection{T cell culture} \subsection{T cell culture}
@ -2779,9 +2789,9 @@ following modifications. At days 4, 6, 8, and 11, \SI{100}{\ul} media were
collected for the Luminex assay and \gls{nmr} analysis. The volume of removed collected for the Luminex assay and \gls{nmr} analysis. The volume of removed
media was equivalently replaced during the media feeding step, which took place media was equivalently replaced during the media feeding step, which took place
immediately after sample collection. Additionally, the same media feeding immediately after sample collection. Additionally, the same media feeding
schedule was followed for the DOE and ADOE to improve consistency, and the same schedule was followed for the \gls{doe} and \gls{adoe} to improve consistency,
donor lot was used for both experiments. All cell counts were performed using and the same donor lot was used for both experiments. All cell counts were
\gls{aopi}. performed using \gls{aopi}.
\subsection{flow cytometry} \subsection{flow cytometry}
@ -2796,26 +2806,29 @@ Cytokines were quantified via Luminex as described in
Prior to analysis, samples were centrifuged at \SI{2990}{\gforce} for Prior to analysis, samples were centrifuged at \SI{2990}{\gforce} for
\SI{20}{\minute} at \SI{4}{\degreeCelsius} to clear any debris\footnote{all \SI{20}{\minute} at \SI{4}{\degreeCelsius} to clear any debris\footnote{all
\gls{nmr} analysis was done by our collaborators at the University of \gls{nmr} analysis was done by our collaborators Max Colonna and Art Edison at
Georgia}. 5 uL of 100/3 mM DSS-D6 in deuterium oxide (Cambridge Isotope the University of Georgia; methods included here for reference}. \SI{5}{\ul} of
Laboratories) were added to 1.7 mm NMR tubes (Bruker BioSpin), followed by 45 uL 100/3 \si{\mM} DSS-D6 in deuterium oxide (Cambridge Isotope Laboratories) were
of media from each sample that was added and mixed, for a final volume of 50 uL added to \SI{1.7}{\mm} \gls{nmr} tubes (Bruker BioSpin), followed by
in each tube. Samples were prepared on ice and in predetermined, randomized \SI{45}{\ul} of media from each sample that was added and mixed, for a final
order. The remaining volume from each sample in the rack (approx. 4 uL) was volume of \SI{50}{\ul} in each tube. Samples were prepared on ice and in
combined to create an internal pool. This material was used for internal predetermined, randomized order. The remaining volume from each sample in the
controls within each rack as well as metabolite annotation. rack (approx. \SI{4}{\ul}) was combined to create an internal pool. This
material was used for internal controls within each rack as well as metabolite
annotation.
\gls{nmr} spectra were collected on a Bruker Avance III HD spectrometer at 600 \gls{nmr} spectra were collected on a Bruker Avance III HD spectrometer at
MHz using a 5-mm TXI cryogenic probe and TopSpin software (Bruker BioSpin). \SI{600}{\MHz} using a \SI{5}{\mm} TXI cryogenic probe and TopSpin software
One-dimensional spectra were collected on all samples using the noesypr1d pulse (Bruker BioSpin). One-dimensional spectra were collected on all samples using
sequence under automation using ICON NMR software. Two-dimensional \gls{hsqc} the noesypr1d pulse sequence under automation using ICON NMR software.
and TOCSY spectra were collected on internal pooled control samples for Two-dimensional \gls{hsqc} and \gls{tocsy} spectra were collected on internal
metabolite annotation. pooled control samples for metabolite annotation.
One-dimensional spectra were manually phased and baseline corrected in TopSpin. One-dimensional spectra were manually phased and baseline corrected in TopSpin.
Two-dimensional spectra were processed in NMRpipe37. One dimensional spectra Two-dimensional spectra were processed in NMRpipe\cite{Delaglio1995}. One
were referenced, water/end regions removed, and normalized with the PQN dimensional spectra were referenced, water/end regions removed, and normalized
algorithm38 using an in-house MATLAB (The MathWorks, Inc.) toolbox. with the PQN algorithm\cite{Dieterle2006} using an in-house MATLAB (The
MathWorks, Inc.) toolbox.
% TODO add the supplemental figure alluded to here? % TODO add the supplemental figure alluded to here?
To reduce the total number of spectral features from approximately 250 peaks and To reduce the total number of spectral features from approximately 250 peaks and
@ -2824,19 +2837,22 @@ variance-based feature selection was performed within MATLAB. For each digitized
point on the spectrum, the variance was calculated across all experimental point on the spectrum, the variance was calculated across all experimental
samples and plotted. Clearly-resolved features corresponding to peaks in the samples and plotted. Clearly-resolved features corresponding to peaks in the
variance spectrum were manually binned and integrated to obtain quantitative variance spectrum were manually binned and integrated to obtain quantitative
feature intensities across all samples (Supp.Fig.S24). In addition to highly feature intensities across all samples.
variable features, several other clearly resolved and easily identifiable % (Supp.Fig.S24).
features were selected (glucose, BCAA region, etc). Some features were later In addition to highly variable features, several other clearly resolved and
discovered to belong to the same metabolite but were included in further easily identifiable features were selected (glucose, \gls{bcaa} region, etc).
analysis. Some features were later discovered to belong to the same metabolite but were
included in further analysis.
% I think this is the right source? it seems wrong in the manuscript but this
% source at least talks about an optimization score
Two-dimensional spectra collected on pooled samples were uploaded to COLMARm web Two-dimensional spectra collected on pooled samples were uploaded to COLMARm web
server10, where \gls{hsqc} peaks were automatically matched to database peaks. server, where \gls{hsqc} peaks were automatically matched to database peaks.
\gls{hsqc} matches were manually reviewed with additional 2D and proton spectra \gls{hsqc} matches were manually reviewed with additional 2D and proton spectra
to confirm the match. Annotations were assigned a confidence score based upon to confirm the match. Annotations were assigned a confidence score based upon
the levels of spectral data supporting the match as previously described11. the levels of spectral data supporting the match as previously
Annotated metabolites were matched to previously selected features used for described\cite{Dashti2017}. Annotated metabolites were matched to previously
statistical analysis. selected features used for statistical analysis.
% I'm pretty sure this isn't relevant % I'm pretty sure this isn't relevant
% Using the list of annotated metabolites obtained above, an approximation of a % Using the list of annotated metabolites obtained above, an approximation of a
@ -2851,8 +2867,8 @@ statistical analysis.
Several low abundance features selected for analysis did not have database Several low abundance features selected for analysis did not have database
matches and were not annotated. Statistical total correlation spectroscopy41 matches and were not annotated. Statistical total correlation spectroscopy41
suggested that some of these unknown features belonged to the same molecules suggested that some of these unknown features belonged to the same molecules
(not shown). Additional multidimensional NMR experiments will be required to (not shown). Additional multidimensional \gls{nmr} experiments will be required
determine their identity. to determine their identity.
\subsection{machine learning and statistical analysis} \subsection{machine learning and statistical analysis}
@ -2861,66 +2877,70 @@ Linear regression analysis of the \glspl{doe} was performed as described in
Seven \gls{ml} techniques were implemented to predict three responses related to Seven \gls{ml} techniques were implemented to predict three responses related to
the memory phenotype of the cultured T cells under different process parameter the memory phenotype of the cultured T cells under different process parameter
conditions (i.e. Total Live CD4+ TN and TCM, Total Live CD8+ TN+TCM, and Ratio conditions (i.e. \rmemh{}, \rmemk{}, and \rratio{}). The \gls{ml} methods
CD4+/CD8+ TN+TCM). The \gls{ml} methods executed were \gls{rf}, \gls{gbm}, executed were \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr},
\gls{cif}, \gls{lasso}, \gls{plsr}, \gls{svm}, and DataModeler's \gls{svm}, and DataModeler's \gls{sr}\footnote{\gls{sr} was performed by Theresa
\gls{sr}\footnote{of these seven methods, all except \gls{lasso} were performed Kotanchek at Evolved Analytics; \gls{rf}, \gls{gbm}, \gls{cif}, \gls{plsr}, and
by collaborators}. Primarily, \gls{sr} models were used to optimize process \gls{svm} were performed by Valerie Odeh-Couvertier at UPRM. Methods included
parameter values based on TN+TCM phenotype and to extract early predictive here for reference}. Primarily, \gls{sr} models were used to optimize process
parameter values based on \ptmem{} phenotype and to extract early predictive
variable combinations from the multi-omics experiments. Furthermore, all variable combinations from the multi-omics experiments. Furthermore, all
regression methods were executed, and the high-performing models were used to regression methods were executed, and the high-performing models were used to
perform a consensus analysis of the important variables to extract potential perform a consensus analysis of the important variables to extract potential
critical quality attributes and critical process parameters predictive of T-cell critical quality attributes and critical process parameters predictive of T cell
potency, safety, and consistency at the early stages of the manufacturing potency, safety, and consistency at the early stages of the manufacturing
process. process.
\gls{sr} was done using Evolved Analytics DataModeler software (Evolved \gls{sr} was done using Evolved Analytics DataModeler software (Evolved
Analytics LLC, Midland, MI). DataModeler utilizes genetic programming to evolve Analytics LLC, Midland, MI). DataModeler utilizes genetic programming to evolve
symbolic regression models (both linear and non-linear) rewarding simplicity and symbolic regression models (both linear and non-linear) rewarding simplicity and
accuracy. Using the selection criteria of highest accuracy (R2>90\% or accuracy. Using the selection criteria of highest accuracy
noise-power) and lowest complexity, the top-performing models were identified. ($R^2 > \SI{90}{\percent}$) and lowest complexity, the top-performing models were
Driving variables, variable combinations, and model dimensionality tables were identified. Driving variables, variable combinations, and model dimensionality
generated. The top-performing variable combinations were used to generate model tables were generated. The top-performing variable combinations were used to
ensembles. In this analysis, DataModeler's SymbolicRegression function was used generate model ensembles. In this analysis, DataModeler's
to develop explicit algebraic (linear and nonlinear) models. The fittest models \inlinecode{SymbolicRegression} function was used to develop explicit algebraic
were analyzed to identify the dominant variables using the VariablePresence (linear and nonlinear) models. The fittest models were analyzed to identify the
function, the dominant variable combinations using the VariableCombinations dominant variables using the \inlinecode{VariablePresence} function, the
dominant variable combinations using the \inlinecode{VariableCombinations}
function, and the model dimensionality (number of unique variables) using the function, and the model dimensionality (number of unique variables) using the
ModelDimensionality function. CreateModelEnsemble was used to define trustable \inlinecode{ModelDimensionality} function. \inlinecode{CreateModelEnsemble} was
model ensembles using selected variable combinations and these were summarized used to define trustable model ensembles using selected variable combinations
(model expressions, model phenotype, model tree plot, ensemble quality, model and these were summarized (model expressions, model phenotype, model tree plot,
quality, variable presence map, \gls{anova} tables, model prediction plot, exportable ensemble quality, model quality, variable presence map, \gls{anova} tables,
model forms) using the ModelSummaryTable function. Ensemble prediction and model prediction plot, exportable model forms) using the
residual performance were respectively assessed via the EnsemblePredictionPlot \inlinecode{ModelSummaryTable} function. Ensemble prediction and residual
and EnsembleResidualPlot subroutines. Model maxima (ModelMaximum function) and performance were assessed via the \inlinecode{EnsemblePredictionPlot} and
model minima (ModelMinimum function) were calculated and displayed using the \inlinecode{EnsembleResidualPlot} subroutines respectively. Model maxima
ResponsePlotExplorer function. Trade-off performance of multiple responses was (\inlinecode{ModelMaximum} function) and model minima (\inlinecode{ModelMinimum}
explored using the MultiTargetResponseExplorer and ResponseComparisonExplorer function) were calculated and displayed using the
with additional insights derived from the ResponseContourPlotExplorer. Graphics \inlinecode{ResponsePlotExplorer} function. Trade-off between multiple
and tables were generated by DataModeler. These model ensembles were used to responses was explored using \inlinecode{MultiTargetResponseExplorer} and
identify predicted response values, potential optima in the responses, and \inlinecode{ResponseComparisonExplorer} with additional insights derived from
regions of parameter values where the predictions diverge the most. \inlinecode{ResponseContourPlotExplorer}. Graphics and tables were generated by
DataModeler. These model ensembles were used to identify predicted response
values, potential optima in the responses, and regions of parameter values where
the predictions diverge the most.
Non-parametric tree-based ensembles were done through the Non-parametric tree-based ensembles were done through the
\inlinecode{randomForest}, \inlinecode{gbm}, and \inlinecode{cforest} regression \inlinecode{randomForest}, \inlinecode{gbm}, and \inlinecode{cforest} regression
functions in R, for \gls{rf}, \gls{gbm}, and \gls{cif} models, respectively. functions in R, for \gls{rf}, \gls{gbm}, and \gls{cif} models, respectively.
Both \gls{rf} and \gls{cif} construct multiple decision trees in parallel, by Both \gls{rf} and \gls{cif} construct multiple decision trees in parallel, by
randomly choosing a subset of features at each decision tree split, in the randomly choosing a subset of features at each decision tree split, in the
training stage. Random forest individual decision trees are split using the Gini training stage. \gls{rf} individual decision trees are split using the Gini
Index, while conditional inference forest uses a statistical significance test Index, while conditional inference forest uses a statistical significance test
procedure to select the variables at each split, reducing correlation bias. In procedure to select the variables at each split, reducing correlation bias. In
contrast, \gls{gbm} constructs regression trees in series through an iterative contrast, \gls{gbm} constructs regression trees in series through an iterative
procedure that adapts over the training set. This model learns from the mistakes procedure that adapts over the training set. This model learns from the mistakes
of previous regression trees in an iterative fashion to correct errors from its of previous regression trees in an iterative fashion to correct errors
precursor trees (i.e. minimize \gls{mse}). Prediction performance was (\gls{mse}) from its precursor trees. Prediction performance was evaluated
evaluated using \gls{loocv} and permutation-based using \gls{loocv} and permutation-based variable importance scores assessing
variable importance scores assessing \% increase of \gls{mse}, relative percent increase of \gls{mse}, relative influence based on the increase of
influence based on the increase of prediction error, coefficient values for prediction error, coefficient values for \gls{rf}, \gls{gbm}, and \gls{cif},
\gls{rf}, \gls{gbm}, and \gls{cif}, respectively. \gls{plsr} was executed using respectively. \gls{plsr} was executed using the \inlinecode{plsr} function from
the \inlinecode{plsr} function from the \inlinecode{pls} package in R while the \inlinecode{pls} package in R while \gls{lasso} regression was performed
\gls{lasso} regression was performed using the \inlinecode{cv.glmnet} function from the \inlinecode{glmnet} R package, using the \inlinecode{cv.glmnet} function from the \inlinecode{glmnet} R package, both using \gls{loocv}. Finally, the
both using leave-one-out cross-validation. Finally, the \inlinecode{kernlab} R \inlinecode{kernlab} R package was used to construct the \gls{svm} models.
package was used to construct the \gls{svm} models.
Parameter tuning was done for all models in a grid search manner using the train Parameter tuning was done for all models in a grid search manner using the train
function from the \inlinecode{caret} R package using \gls{loocv} as the function from the \inlinecode{caret} R package using \gls{loocv} as the
@ -2936,12 +2956,12 @@ optimum \gls{loocv} performance as well. For \gls{plsr}, the optimal number of
components to be used in the model was assessed based on the standard error of components to be used in the model was assessed based on the standard error of
the cross-validation residuals using the function \inlinecode{selectNcomp} from the cross-validation residuals using the function \inlinecode{selectNcomp} from
the \inlinecode{pls} package. Moreover, \gls{lasso} regression was performed the \inlinecode{pls} package. Moreover, \gls{lasso} regression was performed
using the \inlinecode{cv.glmnet} function with alpha = 1. The best lambda for using the \inlinecode{cv.glmnet} function with $\upalpha$ = 1. The best
each response was chosen using the minimum error criterion. Lastly, a fixed $\uplambda$ for each response was chosen using the minimum error criterion.
linear kernel (i.e. \inlinecode{svmLinear}) was used to build the \gls{svm} Lastly, a fixed linear kernel (\inlinecode{svmLinear}) was used to build
regression models evaluating the cost parameter value with best \gls{loocv}. the \gls{svm} regression models evaluating the cost parameter value with best
Prediction performance was measured for all models using the final model with \gls{loocv}. Prediction performance was measured for all models using the final
\gls{loocv} tuned parameters. model with \gls{loocv} tuned parameters.
% TODO do I need this? % TODO do I need this?
% Table M2 shows the parameter values evaluated per model % Table M2 shows the parameter values evaluated per model
@ -2961,16 +2981,17 @@ values, absolute coefficient values, weighted sum of absolute coefficients
values, and relative importance from sensitivity analysis determined for values, and relative importance from sensitivity analysis determined for
\gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr}, and \gls{svm}, \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr}, and \gls{svm},
respectively. Using these scores, key predictive variables were selected if respectively. Using these scores, key predictive variables were selected if
their importance scores were within the 80th percentile ranking for the their importance scores were within the \nth{80} percentile ranking for the
following \gls{ml} methods: \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, following \gls{ml} methods: \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso},
\gls{plsr}, \gls{svm} while for \gls{sr} variables present in >30\% of the \gls{plsr}, \gls{svm} while for \gls{sr} variables present in >\SI{30}{\percent}
top-performing \gls{sr} models from DataModeler (R2>= 90\%, Complexity >= 100) of the top-performing \gls{sr} models from DataModeler ($R^2\ge$
were chosen to investigate consensus except for \gls{nmr} media models at day 4 \SI{90}{\percent}, Complexity $\ge$ 100) were chosen to investigate consensus
which considered a combination of the top-performing results of models excluding except for \gls{nmr} media models at day 4 which considered a combination of the
lactate ppms, and included those variables which were in > 40\% of the best top-performing results of models excluding lactate ppms, and included those
performing models. Only variables with those high percentile scoring values were variables which were in >\SI{40}{\percent} of the best performing models. Only
evaluated in terms of their logical relation (intersection across \gls{ml} variables with those high percentile scoring values were evaluated in terms of
models) and depicted using a Venn diagram from the \inlinecode{venn} R package. their logical relation (intersection across \gls{ml} models) and depicted using
a Venn diagram from the \inlinecode{venn} R package.
\section{results} \section{results}
@ -3033,8 +3054,6 @@ advantage at lower \gls{il2} concentrations compared to beads. For this reason,
we decided to investigate the lower range of \gls{il2} concentrations starting we decided to investigate the lower range of \gls{il2} concentrations starting
at \SI{10}{\IU\per\ml} throughout the remainder of this aim. at \SI{10}{\IU\per\ml} throughout the remainder of this aim.
% RESULT this is not consistent with the next section since the responses are
% different
\subsection{DOE shows optimal conditions for expanded potent T cells} \subsection{DOE shows optimal conditions for expanded potent T cells}
% TABLE not all of these were actually used, explain why by either adding columns % TABLE not all of these were actually used, explain why by either adding columns
@ -3045,7 +3064,6 @@ at \SI{10}{\IU\per\ml} throughout the remainder of this aim.
\input{../tables/doe_runs.tex} \input{../tables/doe_runs.tex}
\end{table} \end{table}
% RESULT integrate this figure into the results paragraph
\begin{figure*}[ht!] \begin{figure*}[ht!]
\begingroup \begingroup
@ -3183,7 +3201,7 @@ qualitatively observed in the response plot (\cref{fig:doe_responses_mem}).
Furthermore, the dataset parameter was weakly significant, indicating a possible Furthermore, the dataset parameter was weakly significant, indicating a possible
batch effect between the \glspl{doe}. We should also note that despite many batch effect between the \glspl{doe}. We should also note that despite many
parameters being significant, this model was still only mediocre in describing parameters being significant, this model was still only mediocre in describing
this response; the $R^2$ was 0.741 but the adjusted $R^2$ was 0.583, indicating this response; the $R^2$ was 0.741 but the $R_{adj}^2$ was 0.583, indicating
that our data might be underpowered for a model this complex. Further that our data might be underpowered for a model this complex. Further
experiments beyond what was performed here may be needed to fully describe this experiments beyond what was performed here may be needed to fully describe this
response. response.
@ -3193,7 +3211,7 @@ We performed linear regression on the other three responses, all of which
performed much better than the \ptmem{} response as expected given the much performed much better than the \ptmem{} response as expected given the much
lower apparent complexity in the response plots lower apparent complexity in the response plots
(\cref{fig:doe_responses_cd4,fig:doe_responses_mem4,fig:doe_responses_ratio}). (\cref{fig:doe_responses_cd4,fig:doe_responses_mem4,fig:doe_responses_ratio}).
All these models appeared to fit well, with $R^2$ and adjusted $R^2$ upward of All these models appeared to fit well, with $R^2$ and $R_{adj}^2$ upward of
0.8. In all but the CD4:CD8 \ptmem{} ratio, the dataset parameter emerged as 0.8. In all but the CD4:CD8 \ptmem{} ratio, the dataset parameter emerged as
significant, indicating a batch effect between the \glspl{doe}. All other significant, indicating a batch effect between the \glspl{doe}. All other
parameters except \pilII{} in the case of CD4:CD8 \ptmem{} ratio were parameters except \pilII{} in the case of CD4:CD8 \ptmem{} ratio were
@ -3216,16 +3234,24 @@ significant predictors.
\label{fig:doe_sr_contour} \label{fig:doe_sr_contour}
\end{figure*} \end{figure*}
We then visualized the total \ptmemh{} cells and CD4:CD8 \ptmem{} ratio using We then visualized the total \ptmemh{} cells and \rratio{} using the response
the response explorer in DataModeler to create contour plots around the maximum explorer in DataModeler to create contour plots around the maximum responses.
responses. For both, it appeared that maximizing all three input parameters For both, it appeared that maximizing all three input parameters resulted in the
resulted in the maximum value for either response (\cref{fig:doe_responses}). maximum value for either response (\cref{fig:doe_sr_contour}). While not all
While not all combinations at and around this optimum were tested, the model combinations at and around this optimum were tested, the model nonetheless
nonetheless showed that there were no other optimal values or regions elsewhere showed that there were no other optimal values or regions elsewhere in the
in the model. model.
% TODO this section header sucks \subsection{Modeling with artificial intelligence methods reveals potential
\subsection{AI modeling reveals highly predictive species} CQAs}
Due to the heterogeneity of the multivariate data collected and knowing that no
single model is perfect for all applications, we implemented an agnostic
modeling approach to better understand these \ptmem{} responses. To achieve
this, a consensus analysis using seven \gls{ml} techniques, \gls{rf}, \gls{gbm},
\gls{cif}, \gls{lasso}, \gls{plsr}, \gls{svm}, and DataModeler's \gls{sr}, was
implemented to molecularly characterize \ptmem{} cells and to extract predictive
features of quality early in their expansion process.
\begin{figure*}[ht!] \begin{figure*}[ht!]
\begingroup \begingroup
@ -3238,34 +3264,38 @@ in the model.
\label{fig:doe_luminex} \label{fig:doe_luminex}
\end{figure*} \end{figure*}
Due to the heterogeneity of the multivariate data collected and knowing that no We collected secretome data via luminex for days 4, 6, 8, 11, and 14.
single model structure is perfect for all applications, we implemented an Plotting the concentrations of these cytokines showed a large variation over all
agnostic modeling approach to better understand these TN+TCM responses. To runs and between different timepoints, demonstrating that these could potentially
achieve this, a consensus analysis using seven machine learning (ML) techniques, be used to differentiate between different process conditions qualitatively
Random Forest (RF), Gradient Boosted Machine (GBM), Conditional Inference Forest simply based on variance (\cref{fig:doe_luminex}). These were also much higher
(CIF), Least Absolute Shrinkage and Selection Operator (LASSO), Partial in most cases than a set of bead-based runs which were run in parallel, in
Least-Squares Regression (PLSR), Support Vector Machine (SVM), and DataModelers agreement with the luminex data obtained previously in the Grex system (these
Symbolic Regression (SR), was implemented to molecularly characterize TN+TCM data were collected in plates) (\cref{fig:grex_luminex}).
cells and to extract predictive features of quality early on their expansion
process (Fig.1d-e).
% TABLE this table looks like crap, break it up into smaller tables % TABLE this table looks like crap, break it up into smaller tables
\begin{table}[!h] \centering \begin{table}[!h] \centering
\caption{Results for data-driven modeling} \caption{Results for data-driven modeling using process parameters (PP) with
only \gls{nmr} on day 4 (N4), only \gls{nmr} on day 6 (N6), only secretome
on day 6 (S6), or various combinations of each for all seven \gls{ml}
techniques}
\label{tab:mod_results} \label{tab:mod_results}
\input{../tables/model_results.tex} \input{../tables/model_results.tex}
\end{table} \end{table}
SR models achieved the highest predictive performance (R2>93\%) when using \gls{sr} models achieved the highest predictive performance
multi-omics predictors for all endpoint responses (\cref{tab:mod_results}). SR ($R^2$>\SI{93}{\percent}) when using multi-omics predictors for all endpoint
achieved R2>98\% while GBM tree-based ensembles showed leave-one-out responses (\cref{tab:mod_results}). \gls{sr} achieved $R^2$>\SI{98}{\percent}
cross-validated R2 (LOO-R2) >95\% for CD4+ and CD4+/CD8+ TN+TCM responses. while \gls{gbm} tree-based ensembles showed \gls{loocv} $R^2$ >
Similarly, LASSO, PLSR, and SVM methods showed consistent high LOO-R2, 92.9\%, \SI{95}{\percent} for \rmemh{} and \rmemk{} responses. Similarly, \gls{lasso},
99.7\%, and 90.5\%, respectively, to predict the CD4+/CD8+ TN+TCM. Yet, about \gls{plsr}, and \gls{svm} methods showed consistently high \gls{loocv} $R^2$
10\% reduction in LOO-R2, 72.5\%-81.7\%, was observed for CD4+ TN+TCM with these (\SI{92.9}{\percent}, \SI{99.7}{\percent}, and \SI{90.5}{\percent}
three methods. Lastly, SR and PLSR achieved R2>90\% while other ML methods respectively), to predict the \rratio{}. Yet, about \SI{10}{\percent} reduction
exhibited exceedingly variable LOO-R2 (0.3\%,RF-51.5\%,LASSO) for CD8+ TN+TCM in \gls{loocv} $R^2$, \SIrange{72.5}{81.7}{\percent}, was observed for \rmemh{} with
cells. these three methods. Lastly, \gls{sr} and \gls{plsr} achieved
$R^2$>\SI{90}{\percent} while other \gls{ml} methods exhibited exceedingly
variable \gls{loocv} $R^2$ (\SI{0.3}{\percent} for \gls{rf} to \SI{51.5}{\percent} for
\gls{lasso}) for \rmemk{}.
\begin{figure*}[ht!] \begin{figure*}[ht!]
\begingroup \begingroup
@ -3279,13 +3309,12 @@ cells.
\label{fig:sr_omics} \label{fig:sr_omics}
\end{figure*} \end{figure*}
The top-performing technique, SR, showed that the median aggregated predictions The top-performing technique, \gls{sr}, showed that the median aggregated
for CD4+ and CD8+ TN+TCM cells increases when IL2 concentration, IL15, and IL2R predictions for \rmemh{} and \rmemk{} increases when IL2 concentration, IL15, and
increase while IL17a decreases in conjunction with other features. These IL2R increase while IL17a decreases in conjunction with other features. These
patterns combined with low values of DMS concentration and GM-CSF uniquely patterns combined with low values of \pdms{} and GM-CSF uniquely characterized
characterized maximum CD8+ TN+TCM. Meanwhile, higher glycine but lower IL13 in maximum \rmemk{}. Meanwhile, higher glycine but lower IL13 in combination with
combination with others showed maximum CD4+ TN+TCM predictions others showed maximum \rmemh{} predictions (\cref{fig:sr_omics}).
(\cref{fig:sr_omics}).
\begin{figure*}[ht!] \begin{figure*}[ht!]
\begingroup \begingroup
@ -3306,13 +3335,25 @@ combination with others showed maximum CD4+ TN+TCM predictions
\label{fig:mod_flower} \label{fig:mod_flower}
\end{figure*} \end{figure*}
Selecting CPP and CQA candidates consistently for T cell memory is desired. Selecting \gls{cpp} and \gls{cqa} candidates consistently for T cell memory is
Here, \gls{tnfa} was found in consensus across all seven ML methods for predicting desired. Here, \gls{tnfa} was found in consensus across all seven \gls{ml}
CD4+/CD8+ TN+TCM when considering features with the highest importance scores methods for predicting \rratio{} when considering features with the highest
across models (Fig.3a;Methods). Other features, IL2R, IL4, IL17a, and DMS importance scores across models (\cref{fig:mod_flower_48r}). Other features,
concentration, were commonly selected in >=5 ML methods (Fig.3a,c). Moreover, IL2R, IL4, IL17a, and \pdms{}, were commonly selected in $\ge$ 5 \gls{ml}
IL13 and IL15 were found predictive in combination with these using SR methods (\cref{fig:mod_flower_48r}). When restricting the models only to include
(Supp.Table.S4). metabolome, formate emerged as the dominant predictor shared across all seven
models.
% Moreover, IL13 and IL15 were found predictive in combination
% with these using \gls{sr} (Supp.Table.S4).
When performing a similar analysis on \rmemh{}, we observed that no species for
either the secretome or metabolome was agreed upon by all seven models
(\cref{fig:mod_flower_cd4}). We also observed that these models did not fit as
well as they did for \rratio{} (\cref{tab:mod_results}). For the secretome, the
species that were agreed upon by $\ge$ 5 models were IL4, IL17a, and IL2R. For
the metabolome, formate once again was agreed upon by $\ge$ 5 models as well as
lactate.
\begin{figure*}[ht!] \begin{figure*}[ht!]
\begingroup \begingroup
@ -3338,55 +3379,61 @@ IL13 and IL15 were found predictive in combination with these using SR
\label{fig:nmr_cors} \label{fig:nmr_cors}
\end{figure*} \end{figure*}
We also investigated the \gls{nmr} features extracted from day 4 of expansion to
assess if there was any predictive power for \ptmemh{}; in general these models
had almost as good a fit despite being 2 days earlier in the process
(\cref{fig:nmr_cors}). Lactate and formate were observed to correlate with each
other, and both correlated with \rmemh{}. Furthermore, lactate was observed to
positively correlate with \pdms{} and negatively correlate with glucose
(\cref{fig:nmr_cors_lactate}). Formate also had the same correlation patterns
(\cref{fig:nmr_cors_formate}). Glucose was only negatively correlated with
formate and lactate (\cref{fig:nmr_cors_glucose}). Together, these data suggest
that lactate, formate, \pdms{}, and \rmemh{} are fundamentally linked.
\section{discussion} \section{discussion}
% optimization of process features \gls{cpp} modeling and understanding are critical to new product development and
% TODO this sounds like total fluff in cell therapy development, it can have life-saving implications. The
challenges for effective modeling grow with the increasing complexity of
% DISCUSSION integrate figures processes due to high dimensionality, and the potential for process interactions
and nonlinear relationships. Another critical challenge is the limited amount of
CPPs modeling and understanding are critical to new product development and in available data, mostly small \gls{doe} datasets. \gls{sr} has the necessary
cell therapy development, it can have life-saving implications. The challenges capabilities to resolve the issues of process effects modeling and has been
for effective modeling grow with the increasing complexity of processes due to applied across multiple industries\cite{Kordona}. \gls{sr} discovers
high dimensionality, and the potential for process interactions and nonlinear mathematical expressions that fit a given sample and differs from conventional
relationships. Another critical challenge is the limited amount of available regression techniques in that a model structure is not defined a
data, mostly small DOE datasets. SR has the necessary capabilities to resolve priori\cite{Koza1992}. Hence, a key advantage of this methodology is that
the issues of process effects modeling and has been applied across multiple
industries12. SR discovers mathematical expressions that fit a given sample and
differs from conventional regression techniques in that a model structure is not
defined a priori13. Hence, a key advantage of this methodology is that
transparent, human-interpretable models can be generated from small and large transparent, human-interpretable models can be generated from small and large
datasets with no prior assumptions\cite{Kotancheka}. datasets with no prior assumptions\cite{Kotancheka}.
Since the model search process lets the data determine the model, diverse and Since the model search process lets the data determine the model, diverse and
competitive (e.g., accuracy, complexity) model structures are typically competitive model structures are typically discovered. An ensemble of diverse
discovered. An ensemble of diverse models can be formed where its constituent models can be formed where its constituent models will tend to agree when
models will tend to agree when constrained by observed data yet diverge in new constrained by observed data yet diverge in new regions. Collecting data in
regions. Collecting data in these regions helps to ensure that the target system these regions helps to ensure that the target system is accurately modeled, and
is accurately modeled, and its optimum is accurately located\cite{Kotancheka}. its optimum is accurately located\cite{Kotancheka}. Exploiting these features
Exploiting these features allows adaptive data collection and interactive allows adaptive data collection and interactive modeling. Consequently, this
modeling. Consequently, this adaptive-DOE approach is useful in a variety of \gls{adoe} approach is useful in a variety of scenarios, including maximizing
scenarios, including maximizing model validity for model-based decision making, model validity for model-based decision making, optimizing processing parameters
optimizing processing parameters to maximize target yields, and developing to maximize target yields, and developing emulators for online optimization and
emulators for online optimization and human understanding\cite{Kotancheka}. human understanding\cite{Kotancheka}.
% predictive features An in-depth characterization of potential \gls{dms} based T cell \glspl{cqa}
includes a list of cytokine and \gls{nmr} features from media samples that are
An in-depth characterization of potential DMS-based T-cell CQAs includes a list crucial in many aspects of T cell fate decisions and effector functions of
of cytokine and NMR features from media samples that are crucial in many aspects immune cells. Cytokine features were observed to slightly improve prediction and
of T cell fate decisions and effector functions of immune cells. Cytokine dominated the ranking of important features and variable combinations when
features were observed to slightly improve prediction and dominated the ranking modeling together with \gls{nmr} media analysis and process parameters
of important features and variable combinations when modeling together with NMR (\cref{fig:mod_flower}).
media analysis and process parameters (Fig.3b,d).
Predictive cytokine features such as \gls{tnfa}, IL2R, IL4, IL17a, IL13, and Predictive cytokine features such as \gls{tnfa}, IL2R, IL4, IL17a, IL13, and
IL15 were biologically assessed in terms of their known functions and activities IL15 were biologically assessed in terms of their known functions and activities
associated with T cells. T helper cells secrete more cytokines than T cytotoxic associated with T cells. T helper cells secrete more cytokines than T cytotoxic
cells, as per their main functions, and activated T cells secrete more cytokines cells, as per their main functions, and activated T cells secrete more cytokines
than resting T cells. It is possible that some cytokines simply reflect the than resting T cells. It is possible that some cytokines simply reflect the
CD4+/CD8+ ratio and the activation degree by proxy proliferation. However, the \rratio{} and the activation degree by proxy proliferation. However, the exact
exact ratio of expected cytokine abundance is less clear and depends on the ratio of expected cytokine abundance is less clear and depends on the subtypes
subtypes present, and thus examination of each relevant cytokine is needed. present, and thus examination of each relevant cytokine is needed.
IL2R is secreted by activated T cells and binds to IL2, acting as a sink to IL2R is secreted by activated T cells and binds to IL2, acting as a sink to
dampen its effect on T cells\cite{Witkowska2005}. Since IL2R was much greater dampen its effect on T cells\cite{Witkowska2005}. Since IL2R was much greater
@ -3399,22 +3446,23 @@ form, this may either increase or decrease CD4+ ratio and/or memory T cells
depending on the ratio of the membrane to soluble TNF\cite{Mehta2018}. Since depending on the ratio of the membrane to soluble TNF\cite{Mehta2018}. Since
only soluble TNF was measured, membrane TNF is needed to understand its impact only soluble TNF was measured, membrane TNF is needed to understand its impact
on both CD4+ ratio and memory T cells. Furthermore, IL13 is known to be critical on both CD4+ ratio and memory T cells. Furthermore, IL13 is known to be critical
for Th2 response and therefore could be secreted if there are significant Th2 T for \gls{th2} response and therefore could be secreted if there are significant
cells already present in the starting population\cite{Wong2011}. This cytokine \glspl{th2} already present in the starting population\cite{Wong2011}. This
has limited signaling in T cells and is thought to be more of an effector than a cytokine has limited signaling in T cells and is thought to be more of an
differentiation cytokine\cite{Junttila2018}. It might be emerging as relevant effector than a differentiation cytokine\cite{Junttila2018}. It might be
due to an initially large number of Th2 cells or because Th2 cells were emerging as relevant due to an initially large number of \glspl{th2} or because
preferentially expanded; indeed, IL4, also found important, is the canonical \glspl{th2} were preferentially expanded; indeed, IL4, also found important, is
cytokine that induces Th2 cell differentiation (Fig.3). The role of these the canonical cytokine that induces \gls{th2} differentiation
cytokines could be investigated by quantifying the Th1/2/17 subsets both in the (\cref{fig:mod_flower}). The role of these cytokines could be investigated by
starting population and longitudinally. Similar to IL13, IL17 is an effector quantifying \glspl{th1}, \glspl{th2}, or \glspl{th17} both in the starting
cytokine produced by Th17 cells\cite{Amatya2017} and thus may reflect the number of population and longitudinally. Similar to IL13, IL17 is an effector cytokine
Th17 subset of T cells. GM-CSF has been linked with activated T cells, produced by \glspl{th17}\cite{Amatya2017} and thus may reflect the number of
specifically Th17 cells, but it is not clear if this cytokine is inducing \glspl{th17} in the population. GM-CSF has been linked with activated T cells,
specifically \glspl{th17}, but it is not clear if this cytokine is inducing
differential expansion of CD8+ T cells or if it is simply a covariate with differential expansion of CD8+ T cells or if it is simply a covariate with
another cytokine inducing this expansion\cite{Becher2016}. Finally, IL15 has another cytokine inducing this expansion\cite{Becher2016}. Finally, IL15 has
been shown to be essential for memory signaling and effective in skewing CAR-T been shown to be essential for memory signaling and effective in skewing
cells toward the Tscm phenotype when using membrane-bound IL15Ra and \gls{car} T cells toward \glspl{tscm} when using membrane-bound IL15Ra and
IL15R\cite{Hurton2016}. Its high predictive behavior goes with its ability to IL15R\cite{Hurton2016}. Its high predictive behavior goes with its ability to
induce large numbers of memory T cells by functioning in an autocrine/paracrine induce large numbers of memory T cells by functioning in an autocrine/paracrine
manner and could be explored by blocking either the cytokine or its receptor. manner and could be explored by blocking either the cytokine or its receptor.
@ -3424,7 +3472,7 @@ activity associated with T cell activation and differentiation, yet it is not
clear how the various combinations of metabolites relate with each other in a clear how the various combinations of metabolites relate with each other in a
heterogeneous cell population. Formate and lactate were found to be highly heterogeneous cell population. Formate and lactate were found to be highly
predictive and observed to positively correlate with higher values of total live predictive and observed to positively correlate with higher values of total live
CD4+ TN+TCM cells (Fig.5a-b;Supp.Fig.28-S30,S38). Formate is a byproduct of the \rmemh{} cells (\cref{fig:nmr_cors}). Formate is a byproduct of the
one-carbon cycle implicated in promoting T cell activation\cite{RonHarel2016}. one-carbon cycle implicated in promoting T cell activation\cite{RonHarel2016}.
Importantly, this cycle occurs between the cytosol and mitochondria of cells and Importantly, this cycle occurs between the cytosol and mitochondria of cells and
formate is excreted\cite{Pietzke2020}. Mitochondrial biogenesis and function are formate is excreted\cite{Pietzke2020}. Mitochondrial biogenesis and function are
@ -3432,18 +3480,18 @@ shown necessary for memory cell persistence\cite{van_der_Windt_2012,
Vardhana2020}. Therefore, increased formate in media could be an indicator of Vardhana2020}. Therefore, increased formate in media could be an indicator of
one-carbon metabolism and mitochondrial activity in the culture. one-carbon metabolism and mitochondrial activity in the culture.
In addition to formate, lactate was found as a putative CQA of TN+TCM. Lactate In addition to formate, lactate was found as a putative \gls{cqa} of \ptmem{}
is the end-product of aerobic glycolysis, characteristic of highly proliferating cells. Lactate is the end-product of aerobic glycolysis, characteristic of
cells and activated T cells\cite{Lunt2011, Chang2013}. Glucose import and highly proliferating cells and activated T cells\cite{Lunt2011, Chang2013}.
glycolytic genes are immediately upregulated in response to T cell stimulation, Glucose import and glycolytic genes are immediately upregulated in response to T
and thus generation of lactate. At earlier time-points, this abundance suggests cell stimulation, and thus generation of lactate. At earlier time-points, this
a more robust induction of glycolysis and higher overall T cell proliferation. abundance suggests a more robust induction of glycolysis and higher overall T
Interestingly, our models indicate that higher lactate predicts higher CD4+, cell proliferation. Interestingly, our models indicate that higher lactate
both in total and in proportion to CD8+, seemingly contrary to previous studies predicts higher CD4+, both in total and in proportion to CD8+, seemingly
showing that CD8+ T cells rely more on glycolysis for proliferation following contrary to previous studies showing that CD8+ T cells rely more on glycolysis
activation\cite{Cao2014}. It may be that glycolytic cells dominate in the for proliferation following activation\cite{Cao2014}. It may be that glycolytic
culture at the early time points used for prediction, and higher lactate cells dominate in the culture at the early time points used for prediction, and
reflects more cells. higher lactate reflects more cells.
% TODO not sure how much I should include here since I didn't do this analysis % TODO not sure how much I should include here since I didn't do this analysis
% AT ALL % AT ALL
@ -3461,20 +3509,20 @@ reflects more cells.
% this looks fine since it is just parroting sources, just need to paraphrase a % this looks fine since it is just parroting sources, just need to paraphrase a
% little % little
Metabolites that consistently decreased over time are consistent with the Metabolites that consistently decreased over time are consistent with the
primary carbon source (glucose) and essential amino acids (BCAA, histidine) that primary carbon source (glucose) and essential amino acids (\gls{bcaa},
must be continually consumed by proliferating cells. Moreover, the inclusion of histidine) that must be continually consumed by proliferating cells. Moreover,
glutamine in our predictive models also suggests the importance of other carbon the inclusion of glutamine in our predictive models also suggests the importance
sources for certain T cell subpopulations. Glutamine can be used for oxidative of other carbon sources for certain T cell subpopulations. Glutamine can be used
energy metabolism in T cells without the need for glycolysis\cite{Cao2014}. for oxidative energy metabolism in T cells without the need for
Overall, these results are consistent with existing literature that shows glycolysis\cite{Cao2014}. Overall, these results are consistent with existing
different T cell subtypes require different relative levels of glycolytic and literature that shows different T cell subtypes require different relative levels
oxidative energy metabolism to sustain the biosynthetic and signaling needs of of glycolytic and oxidative energy metabolism to sustain the biosynthetic and
their respective phenotypes\cite{Almeida2016,Wang_2012}. It is worth noting that signaling needs of their respective phenotypes\cite{Almeida2016,Wang_2012}. It
the trends of metabolite abundance here are potentially confounded by the is worth noting that the trends of metabolite abundance here are potentially
partial replacement of media that occurred periodically during expansion confounded by the partial replacement of media that occurred periodically during
(Methods), thus likely diluting some metabolic byproducts (e.g. formate, expansion, thus likely diluting some metabolic byproducts (such as formate and
lactate) and elevating depleted precursors (e.g. glucose, amino acids). More lactate) and elevating depleted precursors (such as glucose and amino acids).
definitive conclusions of metabolic activity across the expanding cell More definitive conclusions of metabolic activity across the expanding cell
population can be addressed by a closed system, ideally with on-line process population can be addressed by a closed system, ideally with on-line process
sensors and controls for formate and lactate, along with ethanol and glucose. sensors and controls for formate and lactate, along with ethanol and glucose.