ENH proof aim2a

This commit is contained in:
Nathan Dwarshuis 2021-09-08 19:14:31 -04:00
parent 597c316a5c
commit eb19087bd2
9 changed files with 213 additions and 220 deletions

View File

@ -3,8 +3,8 @@
\begin{tabular}{@{\extracolsep{5pt}}lc}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{1}{c}{\textit{Dependent variable:}} \\
\cline{2-2}
% & \multicolumn{1}{c}{\textit{Dependent variable:}} \\
% \cline{2-2}
\\[-1.8ex] & CD4+ Cells \\
\hline \\[-1.8ex]
Dataset [2] & 1,271,171.000$^{**}$ \\
@ -13,7 +13,7 @@
DMS Conc. (1/ml) & 1,742.752$^{***}$ \\
Intercept & $-$5,344,494.000$^{***}$ \\
\hline \\[-1.8ex]
Observations & 30 \\
% Observations & 30 \\
R$^{2}$ & 0.888 \\
Adjusted R$^{2}$ & 0.870 \\
% Residual Std. Error & 727,042.800 (df = 25) \\

View File

@ -3,8 +3,8 @@
\begin{tabular}{@{\extracolsep{5pt}}lc}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{1}{c}{\textit{Dependent variable:}} \\
\cline{2-2}
% & \multicolumn{1}{c}{\textit{Dependent variable:}} \\
% \cline{2-2}
\\[-1.8ex] & CD62L+CCR7+ Cells \\
\hline \\[-1.8ex]
Dataset [2] & 4,661,754.000$^{*}$ \\
@ -13,7 +13,7 @@
DMS Conc. (1/ml) & 240.038 \\
Intercept & $-$3,478,851.000 \\
\hline \\[-1.8ex]
Observations & 30 \\
% Observations & 30 \\
R$^{2}$ & 0.331 \\
Adjusted R$^{2}$ & 0.224 \\
% Residual Std. Error & 3,659,501.000 (df = 25) \\

View File

@ -3,8 +3,8 @@
\begin{tabular}{@{\extracolsep{5pt}}lc}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{1}{c}{\textit{Dependent variable:}} \\
\cline{2-2}
% & \multicolumn{1}{c}{\textit{Dependent variable:}} \\
% \cline{2-2}
\\[-1.8ex] & log(CD62L+CCR7+ Cells) \\
\hline \\[-1.8ex]
Dataset [2] & 0.269 \\
@ -20,7 +20,7 @@
(Functional mAb \%)*(IL2 Conc. (IU/ml)*(DMS Conc. (1/ml)) & 0.00000$^{*}$ \\
Intercept & 20.899$^{***}$ \\
\hline \\[-1.8ex]
Observations & 30 \\
% Observations & 30 \\
R$^{2}$ & 0.741 \\
Adjusted R$^{2}$ & 0.583 \\
% Residual Std. Error & 0.228 (df = 18) \\

View File

@ -11,7 +11,7 @@
DMS Conc. (1/ml) & 926.925$^{***}$ \\
Intercept & $-$3,368,762.000$^{***}$ \\
\hline \\[-1.8ex]
Observations & 30 \\
% Observations & 30 \\
R$^{2}$ & 0.835 \\
Adjusted R$^{2}$ & 0.808 \\
% Residual Std. Error & 493,168.700 (df = 25) \\

View File

@ -12,7 +12,7 @@
DMS Conc. (1/ml) & 0.0001$^{***}$ \\
Intercept & $-$0.144$^{*}$ \\
\hline \\[-1.8ex]
Observations & 30 \\
% Observations & 30 \\
R$^{2}$ & 0.879 \\
Adjusted R$^{2}$ & 0.860 \\
% Residual Std. Error & 0.039 (df = 25) \\

View File

@ -23,7 +23,7 @@ DOE & 16 & 10 & 500 & 100\\
DOE & 17 & 20 & 1500 & 60\\
DOE & 18 & 30 & 2500 & 60\\
ADOE & 1 & 40 & 500 & 100\\
ADOE & 2 & 35 & 2000 & 100\\
ADOE & 2\tnote{a} & 35 & 2000 & 100\\
ADOE & 3 & 30 & 1500 & 100\\
ADOE & 4 & 30 & 2500 & 100\\
ADOE & 5 & 40 & 2500 & 100\\
@ -31,14 +31,14 @@ ADOE & 6 & 40 & 1500 & 100\\
ADOE & 7 & 30 & 500 & 100\\
ADOE & 8 & 35 & 2000 & 100\\
ADOE & 9 & 35 & 1000 & 100\\
ADOE & 10 & 30 & 1500 & 100\\
ADOE & 10\tnote{a} & 30 & 1500 & 100\\
ADOE & 11 & 35 & 3000 & 100\\
ADOE & 12 & 30 & 2500 & 100\\
ADOE & 13 & 40 & 1500 & 100\\
ADOE & 14 & 40 & 500 & 100\\
ADOE & 15 & 30 & 500 & 100\\
ADOE & 16 & 35 & 1000 & 100\\
ADOE & 17 & 35 & 3000 & 100\\
ADOE & 13\tnote{a} & 40 & 1500 & 100\\
ADOE & 14\tnote{a} & 40 & 500 & 100\\
ADOE & 15\tnote{a} & 30 & 500 & 100\\
ADOE & 16\tnote{a} & 35 & 1000 & 100\\
ADOE & 17\tnote{a} & 35 & 3000 & 100\\
ADOE & 18 & 40 & 3500 & 100\\
ADOE & 19 & 40 & 2500 & 100\\
ADOE & 20 & 40 & 3500 & 100\\

View File

@ -4,20 +4,23 @@
\\[-1.8ex] Response/Predictors & SR & RF & GBM & CIF & LASSO & PLSR & SVM \\
\hline \\[-1.8ex]
\multicolumn{8}{l}{CD4:CD8 Ratio} \\
\\[-1.8ex]
PP+N4 & \SI{99}{\percent} & \SI{86.8}{\percent} & \SI{96.3}{\percent} & \SI{84.5}{\percent} & \SI{88.6}{\percent} & \SI{92.5}{\percent} & \SI{88.5}{\percent}\\
PP+N6 & \SI{99}{\percent} & \SI{73.6}{\percent} & \SI{95.9}{\percent} & \SI{70.1}{\percent} & \SI{81.0}{\percent} & \SI{95.8}{\percent} & \SI{79.7}{\percent}\\
PP+S6 & \SI{99}{\percent} & \SI{87.1}{\percent} & \SI{99.9}{\percent} & \SI{83.4}{\percent} & \SI{87.2}{\percent} & \SI{97.9}{\percent} & \SI{86.8}{\percent}\\
PP+S6+N6 & \SI{99}{\percent} & \SI{85.5}{\percent} & \SI{95.3}{\percent} & \SI{83.4}{\percent} & \SI{92.9}{\percent} & \SI{99.7}{\percent} & \SI{90.5}{\percent}\\
\hline \\
\hline \\[-1.8ex]
\multicolumn{8}{l}{\ptmemh{} cells} \\
\\[-1.8ex]
PP+N4 & \SI{97}{\percent} & \SI{67.0}{\percent} & \SI{93.6}{\percent} & \SI{69.3}{\percent} & \SI{34.3}{\percent} & \SI{90.1}{\percent} & \SI{75.5}{\percent}\\
PP+N6 & \SI{96}{\percent} & \SI{45.9}{\percent} & \SI{92.6}{\percent} & \SI{51.2}{\percent} & \SI{42.8}{\percent} & \SI{92.1}{\percent} & \SI{79.4}{\percent}\\
PP+S6 & \SI{98}{\percent} & \SI{71.4}{\percent} & \SI{99.9}{\percent} & \SI{75.0}{\percent} & \SI{74.9}{\percent} & \SI{80.0}{\percent} & \SI{75.5}{\percent}\\
PP+S6+N6 & \SI{98}{\percent} & \SI{68.2}{\percent} & \SI{95.6}{\percent} & \SI{74.4}{\percent} & \SI{72.5}{\percent} & \SI{81.7}{\percent} & \SI{77.0}{\percent}\\
\hline \\
\hline \\[-1.8ex]
\multicolumn{8}{l}{\ptmemk{} cells} \\
\\[-1.8ex]
PP+N4 & \SI{93}{\percent} & \SI{4.7}{\percent} & \SI{44.4}{\percent} & \SI{9.2}{\percent} & \SI{1.2}{\percent} & \SI{65.1}{\percent} & \SI{9.1}{\percent}\\
PP+N6 & \SI{86}{\percent} & \SI{2.0}{\percent} & \SI{29.9}{\percent} & \SI{15.8}{\percent} & \SI{28.5}{\percent} & \SI{63.3}{\percent} & \SI{30.6}{\percent}\\
PP+S6 & \SI{93}{\percent} & \SI{7.8}{\percent} & \SI{28.0}{\percent} & \SI{15.1}{\percent} & \SI{76.2}{\percent} & \SI{98.4}{\percent} & \SI{49.8}{\percent}\\

View File

@ -2803,6 +2803,19 @@ CONCLUSIONS: We developed a simplified, semi-closed system for the initial selec
isbn = {046509760X},
}
@Article{Holmes2006,
author = {E. Holmes and O. Cloarec and J. K. Nicholson},
journal = {Journal of Proteome Research},
title = {Probing Latent Biomarker Signatures and in Vivo Pathway Activity in Experimental Disease States via Statistical Total Correlation Spectroscopy ({STOCSY}) of Biofluids: Application to {HgCl$_2$} Toxicity},
year = {2006},
month = {jun},
number = {6},
pages = {1313--1320},
volume = {5},
doi = {10.1021/pr050399w},
publisher = {American Chemical Society ({ACS})},
}
@Comment{jabref-meta: databaseType:bibtex;}
@Comment{jabref-meta: grouping:

View File

@ -21,6 +21,7 @@
\usepackage{listings}
\usepackage{tocloft}
\usepackage{epigraph}
\usepackage{threeparttable}
\hypersetup{
colorlinks=true,
@ -2876,8 +2877,8 @@ The purpose of this sub-aim was to develop computational methods to identify
novel \glspl{cqa} and \glspl{cpp} that could be used for release criteria,
process control, and process optimization for the \gls{dms} platform. We
hypothesized that T cells grown using the \gls{dms} system would produce
detectable biological signatures in the media supernatant which corresponded to
clinically relevant responses such as fold expansion or phenotype. We tested
detectable biological signatures in the media supernatant which would correspond
to clinically relevant responses such as fold expansion or phenotype. We tested
this hypothesis by activating T cells under a variety of conditions using a
\gls{doe}, sampling the media at intermediate timepoints, and creating models to
predict the outcome of the cultures. We should stress that the specific
@ -2921,13 +2922,12 @@ progressed. Data from inputs and/or longitudinal samples were used to predict
the endpoint response. The fusion of cytokine and \gls{nmr} profiles from media
to model these responses included 30 cytokines from a custom Thermo Fisher
ProcartaPlex Luminex kit and 20 \gls{nmr} features. These 20 spectral features
from \gls{nmr} media analysis were selected out of approximately 250 peaks
through the implementation of a variance-based feature selection approach and
some manual inspection steps.
from \gls{nmr} media analysis were selected out of approximately 250 peaks using
a variance-based feature selection approach and some manual inspection steps.
The first \gls{doe} resulted in a randomized 18-run I-optimal custom design
where each \gls{dms} parameter was evaluated at three levels: \pilII{} (10, 20,
and 30 U/uL), \pdms{} (500, 1500, 2500 \si{\dms\per\ul}), and \pmab{} (60, 80,
and 30 \si{\IU\per\ml}), \pdms{} (500, 1500, 2500 \si{\dms\per\ml}), and \pmab{} (60, 80,
100 \si{\percent}). These 18 runs consisted of 14 unique parameter combinations
where 4 of them were replicated twice to assess prediction error. To further
optimize the initial region explored, an \gls{adoe} was designed with 10 unique
@ -2972,11 +2972,11 @@ Cytokines were quantified via Luminex as described in
Prior to analysis, samples were centrifuged at \SI{2990}{\gforce} for
\SI{20}{\minute} at \SI{4}{\degreeCelsius} to clear any debris\footnote{all
\gls{nmr} analysis was done by our collaborators Max Colonna and Art Edison at
the University of Georgia; methods included here for reference}. \SI{5}{\ul} of
100/3 \si{\mM} DSS-D6 in deuterium oxide (Cambridge Isotope Laboratories) were
added to \SI{1.7}{\mm} \gls{nmr} tubes (Bruker BioSpin), followed by
the University of Georgia; methods included here for reference}. \SI{5}{\ul}
of 100/3 \si{\mM} DSS-D6 in deuterium oxide (Cambridge Isotope Laboratories)
were added to \SI{1.7}{\mm} \gls{nmr} tubes (Bruker BioSpin), followed by
\SI{45}{\ul} of media from each sample that was added and mixed, for a final
volume of \SI{50}{\ul} in each tube. Samples were prepared on ice and in
volume of \SI{50}{\ul} in each tube. Samples were prepared on ice in
predetermined, randomized order. The remaining volume from each sample in the
rack (approx. \SI{4}{\ul}) was combined to create an internal pool. This
material was used for internal controls within each rack as well as metabolite
@ -3010,15 +3010,15 @@ Two-dimensional spectra collected on pooled samples were uploaded to COLMARm web
server, where \gls{hsqc} peaks were automatically matched to database peaks.
\gls{hsqc} matches were manually reviewed with additional 2D and proton spectra
to confirm the match. Annotations were assigned a confidence score based upon
the levels of spectral data supporting the match as previously
spectral data levels supporting the match as previously
described\cite{Dashti2017}. Annotated metabolites were matched to previously
selected features used for statistical analysis.
Several low abundance features selected for analysis did not have database
matches and were not annotated. Statistical total correlation spectroscopy41
suggested that some of these unknown features belonged to the same molecules
(not shown). Additional multidimensional \gls{nmr} experiments will be required
to determine their identity.
matches and were not annotated. Statistical total correlation
spectroscopy\cite{Holmes2006} suggested that some of these unknown features
belonged to the same molecules (not shown). Additional multidimensional
\gls{nmr} experiments will be required to determine their identity.
\subsection{Machine Learning and Statistical Analysis}
@ -3026,26 +3026,24 @@ Linear regression analysis of the \glspl{doe} was performed as described in
\cref{sec:statistics}.
Seven \gls{ml} techniques were implemented to predict three responses related to
the memory phenotype of the cultured T cells under different process
conditions (\rmemh{}, \rmemk{}, and \rratio{}). The \gls{ml} methods
executed were \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr},
\gls{svm}, and DataModelers \gls{sr}\footnote{\gls{sr} was performed by Theresa
Kotanchek at Evolved Analytics, \gls{rf}, \gls{gbm}, \gls{cif}, \gls{plsr},
\gls{svm} were performed by Valerie Odeh-Couvertier at UPRM. Methods included
here for reference}. Primarily, \gls{sr} models were used to optimize process
parameter values based on \ptmem{} phenotype and to extract early predictive
variable combinations from the multi-omics experiments. Furthermore, all
regression methods were executed, and the high-performing models were used to
perform a consensus analysis of the important variables to extract potential
critical quality attributes and critical process parameters predictive of T cell
potency, safety, and consistency at the early stages of the manufacturing
process.
the memory phenotype of the cultured T cells under different process conditions
(\rmemh{}, \rmemk{}, and \rratio{}). The \gls{ml} methods executed were
\gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr}, \gls{svm}, and
DataModelers \gls{sr}\footnote{\gls{sr} was performed by Theresa Kotanchek at
Evolved Analytics; \gls{rf}, \gls{gbm}, \gls{cif}, \gls{plsr}, and \gls{svm} were
performed by Valerie Odeh-Couvertier at UPRM. Methods included here for
reference}. Primarily, \gls{sr} models were used to optimize process parameter
values based on \ptmem{} phenotype and to extract early predictive variable
combinations from the multi-omics experiments. Furthermore, high-performing
models from each method were used in consensus analysis to extract potential
\glspl{cqa} and \glspl{cpp} predictive of T cell potency, safety, and
consistency at the early stages of the manufacturing process.
\gls{sr} was done using Evolved Analytics DataModeler software (Evolved
Analytics LLC, Midland, MI). DataModeler utilizes genetic programming to evolve
symbolic regression models (both linear and non-linear) rewarding simplicity and
accuracy. Using the selection criteria of highest accuracy
($R^2$>\SI{90}{\percent}) and lowest complexity, the top-performing models were
($R^2>\SI{90}{\percent}$) and lowest complexity, the top-performing models were
identified. Driving variables, variable combinations, and model dimensionality
tables were generated. The top-performing variable combinations were used to
generate model ensembles. In this analysis, DataModelers
@ -3073,7 +3071,7 @@ values, potential optima in the responses, and regions of parameter values where
the predictions diverge the most.
Non-parametric tree-based ensembles were done through the
\inlinecode{randomForest}, inlinecode{gbm}, and \inlinecode{cforest} regression
\inlinecode{randomForest}, \inlinecode{gbm}, and \inlinecode{cforest} regression
functions in R, for \gls{rf}, \gls{gbm}, and \gls{cif} models, respectively.
Both \gls{rf} and \gls{cif} construct multiple decision trees in parallel, by
randomly choosing a subset of features at each decision tree split, in the
@ -3117,8 +3115,8 @@ model with \gls{loocv} tuned parameters.
Consensus analysis of the relevant variables extracted from each machine
learning model was done to identify consistent predictive features of quality at
the early stages of manufacturing. First importance scores for all features were
measured across all \gls{ml} models using \inlinecode{varImp} with
the early stages of manufacturing. First, importance scores for all features
were measured across all \gls{ml} models using \inlinecode{varImp} with
\inlinecode{caret} R package except for scores for \gls{svm}, for which the
\inlinecode{rminer} R package was used. These importance scores were percent
increase in \gls{mse}, relative importance through average increase in
@ -3130,26 +3128,25 @@ respectively. Using these scores, key predictive variables were selected if
their importance scores were within the \nth{80} percentile ranking for the
following \gls{ml} methods: \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso},
\gls{plsr}, \gls{svm} while for \gls{sr} variables present in $>\SI{30}{\percent}$
of the top-performing \gls{sr} models from DataModeler ($R^2\ge$
\SI{90}{\percent}, Complexity $\ge$ 100) were chosen to investigate consensus
except for \gls{nmr} media models at day 4 which considered a combination of the
top-performing results of models excluding lactate ppms, and included those
variables which were in >\SI{40}{\percent} of the best performing models. Only
variables with those high percentile scoring values were evaluated in terms of
their logical relation (intersection across \gls{ml} models) and depicted using
a Venn diagram from the \inlinecode{venn} R package.
of the top-performing \gls{sr} models from DataModeler
($R^2\ge \SI{90}{\percent}$, Complexity $\ge 100$) were chosen to investigate
consensus except for \gls{nmr} media models at day 4 which considered a
combination of the top-performing results of models excluding lactate ppms, and
included those variables which were in $>\SI{40}{\percent}$ of the best performing
models. Only variables with high percentile scoring values were evaluated in
terms of their logical relation (intersection across \gls{ml} models) and
depicted using a Venn diagram from the \inlinecode{venn} R package.
\section{Results}
\subsection{DMSs Grow T Cells With Lower IL2 Concentrations}
Prior to the main experiments in this aim, we performed a preliminary experiment
to assess the effect of lowering the \gls{il2} concentration on the T cells
grown with either bead or \gls{dms}. One of the hypotheses for the \gls{dms}
system was that the higher cell density would enable more efficient cross-talk
between T cells. Since \gls{il2} is secreted by activated T cells themselves,
T cells in the \gls{dms} system may need less or no \gls{il2} if this hypothesis
were true.
Prior to the main experiments in this aim, we assessed the effect of lowering
the \gls{il2} concentration on the T cells grown with either bead or \gls{dms}.
One of our hypotheses for the \gls{dms} system was that higher cell density
would enhance cross-talk between T cells. Since \gls{il2} is secreted by
activated T cells themselves, T cells in the \gls{dms} system may need less or
no \gls{il2} if this is true.
\begin{figure*}[ht!]
\begingroup
@ -3164,7 +3161,7 @@ were true.
\caption[T Cells Grown at Varying IL2 Concentrations]
{\glspl{dms} grow T cells effectively at lower IL2 concentrations.
\subcap{fig:il2_mod_timecourse}{Longitudinal cell counts of T cells grown
with either bead or \glspl{dms} using varying IL2 concentrations}
with either bead or \glspl{dms} using varying IL2 concentrations.}
Day 14 counts of either \subcap{fig:il2_mod_total}{total cells} or
\subcap{fig:il2_mod_mem}{\ptmem{} cells} plotted against \gls{il2}
concentration.
@ -3179,14 +3176,9 @@ expanded T cells as described in \cref{sec:tcellculture}. T cells grown with
either method expanded robustly as \gls{il2} concentration was increased
(\cref{fig:il2_mod_timecourse}). Surprisingly, neither the bead nor the \gls{dms}
group expanded at all with \SI{0}{\IU\per\ml} \gls{il2}. When examining the
endpoint fold change after \SI{14}{\day}, we observe that the difference between
the bead and \gls{dms} appears to be greater at lower \gls{il2} concentrations
(\cref{fig:il2_mod_total}).
% This is further supported by fitting a non-linear
% least squares equation to the data following a hyperbolic curve (which should be
% a plausible model given that this curve describes receptor-ligand kinetics,
% which we can assume \gls{il2} to follow).
Furthermore, the same trend can be
endpoint fold change after \SI{14}{\day}, we observed that the difference
between the bead and \gls{dms} appears to be greater at lower \gls{il2}
concentrations (\cref{fig:il2_mod_total}). Furthermore, the same trend can be
seen when only examining the \ptmem{} cell expansion at day 14
(\cref{fig:il2_mod_mem}). In this case, the \ptmemp{} of the T cells seemed to
be relatively close at higher \gls{il2} concentrations, but separated further at
@ -3196,16 +3188,24 @@ Taken together, these data do not support the hypothesis that the \gls{dms}
system does not need \gls{il2} at all; however, it appears to have a modest
advantage at lower \gls{il2} concentrations compared to beads. For this reason,
we decided to investigate the lower range of \gls{il2} concentrations starting
at \SI{10}{\IU\per\ml} throughout the remainder of this aim.
at \SI{10}{\IU\per\ml} in the remainder of this aim.
\subsection{DOE Shows Optimal Conditions for Potent T Cells}
% TABLE not all of these were actually used, explain why by either adding columns
% or marking with an asterisk
\begin{table}[!h] \centering
\begin{table}[!h]
\centering
\begin{threeparttable}
\caption{DOE Runs}
\label{tab:doe_runs}
\input{../tables/doe_runs.tex}
\begin{tablenotes}
\item[a] It was determined later that the total \glspl{mab} surface density
may not be consistent across each batch of \gls{dms} used. Thus, these
runs were taken out as they were created at a different scale and with a
different operator compared to the rest. Leaving them in may produce
unobserved confounding factors.
\end{tablenotes}
\end{threeparttable}
\end{table}
\begin{figure*}[ht!]
@ -3224,38 +3224,29 @@ at \SI{10}{\IU\per\ml} throughout the remainder of this aim.
\label{fig:doe_response_first}
\end{figure*}
% RESULT maybe add regression tables to this, although it doesn't really matter
% since we end up doing regression on the full thing later anyways.
We conducted two consecutive \glspl{doe} to optimize the \pth{} and \ptmem{}
responses for the \gls{dms} system. In the first \gls{doe} we, tested \pilII{} in
the range of \SIrange{10}{30}{\IU\per\ml}, \pdms{} in the range of
responses for the \gls{dms} system. In the first, we tested \pilII{} in the
range of \SIrange{10}{30}{\IU\per\ml}, \pdms{} in the range of
\SIrange{500}{2500}{\dms\per\ml}, and \pmab{} in the range of
\SIrange{60}{100}{\percent}. When looking at the total \ptmemp{} output, we
observed that \pilII{} showed a positive linear trend with the \pdms{} and
\pmab{} showing possible second-order effects with maximums and minimums at the
intermediate level (\cref{fig:doe_response_first_mem}). In the case of \pth{},
we observed that all parameters seemed to have a positive linear response, with
\pilII{} and \pdms{} showing slight second order effects that suggest a maximum
might exist at a higher value for each.
\SIrange{60}{100}{\percent}. When looking at total \ptmemp{} cells, \pilII{}
showed a positive linear trend and \pdms{} and \pmab{} showed possible
second-order effects with intermediate maximums and minimums respectively
(\cref{fig:doe_response_first_mem}). In the case of \pth{}, all parameters
showed a positive linear response, with \pilII{} and \pdms{} showing slight
second-order effects that suggest a maximum might exist at a higher value for
each.
After performing the first \gls{doe} we augmented the original design matrix
After performing the first \gls{doe}, we augmented the original design matrix
with an \gls{adoe} which was built with three goals in mind. Firstly, we wished
to validate the first \gls{doe} by assessing the strength and responses of each
effect. Secondly, we wished to improve our confidence in regions that showed
high complexity, such as the peak in the \gls{dms} concentration for the total
\ptmem{} cell response. Thirdly, we wished to explore additional ranges of each
response. Since \pilII{} and \pdms{} appeared to continue positively influence
multiple responses beyond our tested range, we were curious if there was an
optimum at some higher setting of either of these values. For this reason, we
increased the \pilII{} to include \SI{40}{\IU\per\ml} and the \pdms{} to
response. Notably, the responses appeared to keep increasing with \pilII{} and
\pdms{} beyond our tested range; thus, we were curious if there was an optimum at
some higher setting of either parameter. For this reason,
we increased the \pilII{} to include \SI{40}{\IU\per\ml} and the \pdms{} to
\SI{3500}{\dms\per\ml}. Note that it was impossible to go beyond
\SI{100}{\percent} for the \pmab{}, so runs were positioned for this parameter
with validation and confidence improvements in mind. The runs for each \gls{doe}
were shown in \cref{tab:doe_runs}\footnote{Not all runs in this table were used.
It was determined later that the total \glspl{mab} surface density may not be
consistent across each batch of \gls{dms} used, primarily due to the fact that a
subset were created at different scale and with a different operator. To remove
this bias in our data, these runs were not used.}.
are shown in \cref{tab:doe_runs}.
\begin{figure*}[ht!]
\begingroup
@ -3329,10 +3320,10 @@ responses showed mostly linear relationships in all parameter cases
% anything to be significant
We performed linear regression on the three input parameters as well as a binary
parameter representing if a given run came from the first or second \gls{doe}
(called `dataset'). Starting with the total \ptmem{} cells response, we fit a
(called ``dataset''). Starting with the total \ptmem{} cells response, we fit a
first order regression model using these four parameters
(\cref{tab:doe_mem1.tex}). While \pilII{} was found to be a significant
predictor, the model fit was extremely poor ($R^2$ of 0.331). This was not
predictor, the model fit was extremely poor ($R^2 = 0.331$). This was not
surprising given the apparent complexity of this response
(\cref{fig:doe_responses_mem}). To obtain a better fit, we added second and
third degree terms (\cref{tab:doe_mem2.tex}). Note that the dataset parameter
@ -3350,9 +3341,8 @@ that our data might be underpowered for a model this complex. Further
experiments beyond what was performed here may be needed to fully describe this
response.
% TABLE combine these tables into one
We performed linear regression on the other three responses, all of which
performed much better than the \ptmem{} response as expected given the much
performed much better than the \ptmem{} response as expected given the
lower apparent complexity in the response plots
(\cref{fig:doe_responses_cd4,fig:doe_responses_mem4,fig:doe_responses_ratio}).
All these models appeared to fit well, with $R^2$ and $R_{adj}^2$ upward of
@ -3380,11 +3370,10 @@ significant predictors.
We then visualized the total \ptmemh{} cells and \rratio{} using the response
explorer in DataModeler to create contour plots around the maximum responses.
For both, it appeared that maximizing all three input parameters resulted in the
maximum value for either response (\cref{fig:doe_sr_contour}). While not all
combinations at and around this optimum were tested, the model nonetheless
showed that there were no other optimal values or regions elsewhere in the
model.
Maximizing all three input parameters maximized both responses
(\cref{fig:doe_sr_contour}). While not all combinations at and around this
optimum were tested, these plots suggest that there were no other optimal values
elsewhere.
\subsection{Modeling with Machine Learning Reveals Putative CQAs}
@ -3407,16 +3396,15 @@ features of quality early in their expansion process.
\label{fig:doe_luminex}
\end{figure*}
We collected secretome data via luminex for days 4, 6, 8, 11, and 14.
Plotting the concentrations of these cytokines showed a large variation over all
runs and between different timepoints, demonstrated that these could potentially
be used to differentiate between different process conditions qualitatively
simply based on variance (\cref{fig:doe_luminex}). These were also much higher
in most cases that a set of bead based runs which were run in parallel, in
agreement with the luminex data obtained previously in the Grex system (these
data were collected in plates) (\cref{fig:grex_luminex}).
We collected secretome data via luminex for days 4, 6, 8, 11, and 14. Plotting
the concentrations of these cytokines showed a large variation over all runs and
between different timepoints, demonstrating that these could be used to
differentiate between different process conditions qualitatively simply based on
variance (\cref{fig:doe_luminex}). These were also much higher in most cases
than a set of bead-based runs which were run in parallel, in agreement with the
luminex data obtained previously in the Grex system (these data were collected
in plates) (\cref{fig:grex_luminex}).
% TABLE this table looks like crap, break it up into smaller tables
\begin{table}[!h] \centering
\caption[Machine Learning Model Results]
{Results for \gls{ml} modeling using process parameters (PP) with
@ -3428,15 +3416,15 @@ data were collected in plates) (\cref{fig:grex_luminex}).
\end{table}
\gls{sr} models achieved the highest predictive performance
($R^2$>\SI{93}{\percent}) when using multi-omics predictors for all endpoint
responses (\cref{tab:mod_results}). \gls{sr} achieved $R^2$>\SI{98}{\percent}
while \gls{gbm} ensembles showed \gls{loocv} $R^2$ > \SI{95}{\percent} for
($R^2>\SI{93}{\percent}$) when using multi-omics predictors for all endpoint
responses (\cref{tab:mod_results}). \gls{sr} achieved $R^2>\SI{98}{\percent}$
while \gls{gbm} ensembles showed \gls{loocv} $R^2>\SI{95}{\percent}$ for
\rmemh{} and \rmemk{} responses. Similarly, \gls{lasso}, \gls{plsr}, and
\gls{svm} methods showed consistently high \gls{loocv}, (\SI{92.9}{\percent},
\SI{99.7}{\percent}, and \SI{90.5}{\percent} respectively), to predict the
\rratio{}. Yet, about \SI{10}{\percent} reduction in \gls{loocv},
\SIrange{72.5}{81.7}{\percent}, was observed for \rmemh{} with these three
methods. Lastly, \gls{sr} and \gls{plsr} achieved $R^2$>\SI{90}{\percent} while
methods. Lastly, \gls{sr} and \gls{plsr} achieved $R^2>\SI{90}{\percent}$ while
other \gls{ml} methods exhibited exceedingly variable \gls{loocv}
(\SI{0.3}{\percent} for \gls{rf} to \SI{51.5}{\percent} for \gls{lasso}) for
\rmemk{}.
@ -3485,18 +3473,13 @@ methods for predicting \rratio{} when considering features with the highest
importance scores across models (\cref{fig:mod_flower_48r}). Other features,
IL2R, IL4, IL17a, and \pdms{}, were commonly selected in $\ge$ 5 \gls{ml}
methods (\cref{fig:mod_flower_48r}). When restricting the models only to include
metabolome, formate emerged as the dominant predictor shared across all seven
models.
metabolome, formate was the sole predictor shared by all seven models.
% Moreover, IL13 and IL15 were found predictive in combination
% with these using \gls{sr} (Supp.Table.S4).
When performing similar analysis on \rmemh{}, we observe that no species for
either the secretome or metabolome was agreed upon by all seven models
(\cref{fig:mod_flower_cd4}). We also observed that these models did not fit as
well as they did for \rratio{} (\cref{tab:mod_results}). For the secretome, the
species that were agreed upon by $\ge$ 5 models were IL4, IL17a, and IL2R. For
the metabolome, formate once again was agreed upon by $\ge$ 5 models as well as
When performing similar analysis on \rmemh{}, no species for either secretome or
metabolome was shared by all models (\cref{fig:mod_flower_cd4}). These models
also had worse fits compared to those for \rratio{} (\cref{tab:mod_results}).
For the secretome, IL4, IL17a, and IL2R were agreed upon by $\ge$ 5 models. For
the metabolome, formate once again was shared by $\ge$ 5 models as well as
lactate.
\begin{figure*}[ht!]
@ -3523,12 +3506,12 @@ lactate.
\label{fig:nmr_cors}
\end{figure*}
We also investigated the \gls{nmr} features extracted from day of expansion to
assess if there was any predictive power for \ptmemh{}; in general these models
had almost as good of fit despite being 2 days earlier in the process
(\cref{fig:nmr_cors}). Lactate and formate were observed to correlate with each
other, and both correlated with \rmemh{}. Furthermore, lactate was observed to
positively correlate with \pdms{} and negatively correlate with glucose
We also asked if day 4 \gls{nmr} features could predict \ptmemh{}; these models
generally fit well despite being 2 days earlier in the process
(\cref{fig:nmr_cors})\footnote{for anyone wondering why we don't have the
matching secretome data for day 4, blame UPS for losing our samples}. Lactate
and formate correlated with each other and with \rmemh{}. Furthermore, lactate
positively correlated with \pdms{} and negatively correlated with glucose
(\cref{fig:nmr_cors_lactate}). Formate also had the same correlation patterns
(\cref{fig:nmr_cors_formate}). Glucose was only negatively correlated with
formate and lactate (\cref{fig:nmr_cors_glucose}). Together, these data suggest
@ -3537,38 +3520,34 @@ that lactate, formate, \pdms{}, and \rmemh{} are fundamentally linked.
\section{Discussion}
\gls{cpp} modeling and understanding are critical to new product development and
in cell therapy development, it can have life-saving implications. The
challenges for effective modeling grow with the increasing complexity of
processes due to high dimensionality, and the potential for process interactions
and nonlinear relationships. Another critical challenge is the limited amount of
available data, mostly small \gls{doe} datasets. \gls{sr} has the necessary
have life-saving implications in the context of cell therapy. The challenges for
effective modeling grow with the increasing process complexity due to high
dimensionality, interactions between parameters, and nonlinearity. Another critical
challenge is the limited amount of available data. \gls{sr} has the necessary
capabilities to resolve the issues of process effects modeling and has been
applied across multiple industries\cite{Kordona}. \gls{sr} discovers
mathematical expressions that fit a given sample and differs from conventional
regression techniques in that a model structure is not defined \textit{a
priori}\cite{Koza1992}. Hence, a key advantage of this methodology is that
transparent, human-interpretable models can be generated from small and large
datasets with no prior assumptions\cite{Kotancheka}.
datasets with few prior assumptions\cite{Kotancheka}.
Since the model search process lets the data determine the model, diverse and
competitive model structures are typically discovered. An ensemble of diverse
models can be formed where its constituent models will tend to agree when
constrained by observed data yet diverge in new regions. Collecting data in
these regions helps to ensure that the target system is accurately modeled, and
its optimum is accurately located\cite{Kotancheka}. Exploiting these features
allows adaptive data collection and interactive modeling. Consequently, this
\gls{adoe} approach is useful in a variety of scenarios, including maximizing
model validity for model-based decision making, optimizing processing parameters
to maximize target yields, and developing emulators for online optimization and
human understanding\cite{Kotancheka}.
competitive model structures are typically discovered. A diverse ensemble will
contain models that agree in regions constrained by observable data and diverge
in regions without data. Collecting data in divergent regions ensures the system
is accurately modeled and its optimum accurately located\cite{Kotancheka}.
Consequently, this \gls{adoe} approach is useful in many scenarios, including
maximizing model validity for model-based decision making, optimizing processing
parameters to maximize yield, and developing emulators for online optimization
and human understanding\cite{Kotancheka}.
An in-depth characterization of potential \gls{dms}-based T cell \glspl{cqa}
includes a list of cytokine and \gls{nmr} features from media samples that are
crucial in many aspects of T cell fate decisions and effector functions of
immune cells. Cytokine features were observed to slightly improve prediction and
dominated the ranking of important features and variable combinations when
modeling together with \gls{nmr} media analysis and process parameters
(\cref{fig:mod_flower}).
immune cells. Cytokine features slightly improved prediction and dominated the
ranking of important features and variable combinations when modeling together
with \gls{nmr} media analysis and process parameters (\cref{fig:mod_flower}).
Predictive cytokine features such as \gls{tnfa}, IL2R, IL4, IL17a, IL13, and
IL15 were biologically assessed in terms of their known functions and activities
@ -3577,35 +3556,35 @@ cells, as per their main functions, and activated T cells secrete more cytokines
than resting T cells. It is possible that some cytokines simply reflect the
\rratio{} and the activation degree by proxy proliferation. However, the exact
ratio of expected cytokine abundance is less clear and depends on the subtypes
present, and thus examination of each relevant cytokine is needed.
present; thus, examination of each relevant cytokine is needed.
IL2R is secreted by activated T cells and binds to IL2, acting as a sink to
dampen its effect on T cells\cite{Witkowska2005}. Since IL2R was much greater
dampen its effect on T cells\cite{Witkowska2005}. Since IL2R was more abundant
than IL2 in solution, this might reduce the overall effect of IL2, which could
be further investigated by blocking IL2R with an antibody. In T cells, TNF can
increase IL2R, proliferation, and cytokine production\cite{Mehta2018}. It may
also induce apoptosis depending on concentration and alter the CD4+ to CD8+
also induce apoptosis depending on concentration and alter the CD4:CD8
ratio\cite{Vudattu2005}. Given that TNF has both a soluble and membrane-bound
form, this may either increase or decrease CD4+ ratio and/or memory T cells
form, this may either increase or decrease CD4:CD8 ratio and/or memory T cells
depending on the ratio of the membrane to soluble TNF\cite{Mehta2018}. Since
only soluble TNF was measured, measuring membrane TNF is needed to understand its impact
on both CD4+ ratio and memory T cells. Furthermore, IL13 is known to be critical
for \gls{th2} response and therefore could be secreted if there are significant
\glspl{th2} already present in the starting population\cite{Wong2011}. This
cytokine has limited signaling in T cells and is thought to be more of an
effector than a differentiation cytokine\cite{Junttila2018}. It might be
emerging as relevant due to an initially large number of \glspl{th2} or because
\glspl{th2} were preferentially expanded; indeed, IL4, also found important, is
the canonical cytokine that induces \gls{th2} differentiation
(\cref{fig:mod_flower}). The role of these cytokines could be investigated by
quantifying \glspl{th1}, \glspl{th2}, or \glspl{th17} both in the starting
population and longitudinally. Similar to IL13, IL17 is an effector cytokine
produced by \glspl{th17}\cite{Amatya2017} and thus may reflect the number of
\glspl{th17} in the population. GM-CSF has been linked with activated T cells,
specifically \glspl{th17}, but it is not clear if this cytokine is inducing
differential expansion of CD8+ T cells or if it is simply a covariate with
another cytokine inducing this expansion\cite{Becher2016}. Finally, IL15 has
been shown to be essential for memory signaling and effective in skewing
on both CD4:CD8 ratio and memory T cells. Furthermore, IL13 is known to be
critical for \gls{th2} response and therefore could be secreted if there are
significant \glspl{th2} already present in the starting
population\cite{Wong2011}. This cytokine has limited signaling in T cells and is
thought to be more of an effector than a differentiation
cytokine\cite{Junttila2018}. It might be emerging here due to an initially large
number of \glspl{th2} or because \glspl{th2} were preferentially expanded;
indeed, IL4, also found important, is the canonical cytokine that induces
\gls{th2} differentiation (\cref{fig:mod_flower}). The role of these cytokines
could be investigated by quantifying \glspl{th1}, \glspl{th2}, or \glspl{th17}
both in the starting population and longitudinally. Similar to IL13, IL17 is an
effector cytokine produced by \glspl{th17}\cite{Amatya2017} and thus may reflect the
number of \glspl{th17} in the population. GM-CSF has been linked with activated
T cells, specifically \glspl{th17}, but it is not clear if this cytokine is
inducing differential expansion of CD8+ T cells or if it is simply a covariate
with another cytokine inducing this expansion\cite{Becher2016}. Finally, IL15
has been shown to be essential for memory signaling and effective in skewing
\gls{car} T cells toward \glspl{tscm} when using membrane-bound IL15Ra and
IL15R\cite{Hurton2016}. Its high predictive behavior goes with its ability to
induce large numbers of memory T cells by functioning in an autocrine/paracrine
@ -3616,24 +3595,24 @@ activity associated with T cell activation and differentiation, yet it is not
clear how the various combinations of metabolites relate with each other in a
heterogeneous cell population. Formate and lactate were found to be highly
predictive and observed to positively correlate with higher values of total live
\rmemh{} cells (\cref{fig:nmr_cors}). Formate is a byproduct of the
one-carbon cycle implicated in promoting T cell activation\cite{RonHarel2016}.
Importantly, this cycle occurs between the cytosol and mitochondria of cells and
formate is excreted\cite{Pietzke2020}. Mitochondrial biogenesis and function are
shown necessary for memory cell persistence\cite{van_der_Windt_2012,
Vardhana2020}. Therefore, increased formate in media could be an indicator of
one-carbon metabolism and mitochondrial activity in the culture.
\rmemh{} cells (\cref{fig:nmr_cors}). Formate is a byproduct of the one-carbon
cycle implicated in promoting T cell activation\cite{RonHarel2016}. Importantly,
this cycle occurs between the cytosol and mitochondria, from which formate is
excreted\cite{Pietzke2020}. Mitochondrial biogenesis and function are shown to
be necessary for memory cell persistence\cite{van_der_Windt_2012, Vardhana2020}.
Therefore, increased formate in media could be an indicator of one-carbon
metabolism and mitochondrial activity in the culture.
In addition to formate, lactate was found to be a putative \gls{cqa} of \ptmem{}
cells. Lactate is the end-product of aerobic glycolysis, characteristic of
highly proliferating cells and activated T cells\cite{Lunt2011, Chang2013}.
Glucose import and glycolytic genes are immediately upregulated in response to T
cell stimulation, and thus generation of lactate. At earlier time-points, this
abundance suggests a more robust induction of glycolysis and higher overall T
cell proliferation. Interestingly, our models indicate that higher lactate
predicts higher CD4+, both in total and in proportion to CD8+, seemingly
contrary to previous studies showing that CD8+ T cells rely more on glycolysis
for proliferation following activation\cite{Cao2014}. It may be that glycolytic
Glucose import and glycolytic genes are upregulated in response to T cell
stimulation, thus leading to lactate production. At earlier time-points, this abundance
suggests a more robust induction of glycolysis and higher overall T cell
proliferation. Interestingly, our models indicate that higher lactate predicts
higher CD4+, both in total and in proportion to CD8+, seemingly contrary to
previous studies showing that CD8+ T cells rely more on glycolysis for
proliferation following activation\cite{Cao2014}. It may be that glycolytic
cells dominate in the culture at the early time points used for prediction, and
higher lactate reflects more cells.
@ -3652,28 +3631,26 @@ confounded by the partial replacement of media that occurred periodically during
expansion, thus likely diluting some metabolic byproducts (such as formate,
and lactate) and elevating depleted precursors (such as glucose and amino acids).
More definitive conclusions of metabolic activity across the expanding cell
population can be addressed by a closed system, ideally with on-line process
sensors and controls for formate, lactate, along with ethanol and glucose.
population can be addressed by a closed system, ideally with on-line sensors and
controls for formate, lactate, ethanol, and glucose.
Practically, knowledge of how cytokines and/or metabolites are related to
outcome can be utilized for process control, which involves measuring the
current state of the culture, comparing it to a desired state, and intervening
if it is outside an acceptable range. In the case of lactate and formate, a
benchtop \gls{nmr} can be utilized to sample the media in real time during
culture. This \gls{nmr} can be tuned to automatically quantify the presence of
lactate and formate. Formate is part of the one-carbon pathway, and thus culture
fate may be controlled by altering the inputs to this pathway (glycine, serine,
choline) and/or adding folic acid inhibitors\cite{Ducker2017}. Since lactate is
a direct byproduct of glycolysis, this may be controlled by altering the
concentration of glucose in solution. Each of these control schemes would need
further study to assess if they have enough precision and temporal resolution to
reasonably ensure product quality. In the case of cytokines, there is currently
no analogue to a benchtop \gls{nmr}; however, research is underway to develop
protein-specific sensors using aptamers\cite{Parolo2020}. Even without these
developments, one could still use \gls{elisa} or Luminex to assess protein
levels in a semi-automated manner, but the disadvantage is that these assays are
temporally discrete and impose a significant time lag before the intervention
can be performed.
benchtop \gls{nmr} can be tuned to quantify lactate and formate by sampling the
media in real time during culture. Formate is part of the one-carbon pathway,
and thus culture fate may be controlled by altering the inputs to this pathway
(glycine, serine, choline) and/or adding folic acid inhibitors\cite{Ducker2017}.
Since lactate is a direct byproduct of glycolysis, this may be controlled by
altering the concentration of glucose in solution. Each of these control schemes
would need further study to assess if they have enough precision and temporal
resolution to reasonably ensure product quality. For cytokines, there is
currently no analogue to a benchtop \gls{nmr}; however, research is underway to
develop protein-specific sensors using aptamers\cite{Parolo2020}. Even without
these developments, \gls{elisa} or Luminex can still quantify cytokines in a
semi-automated manner. However, these are temporally discrete and impose a
non-trivial delay before the intervention can be performed.
\chapter{AIM 2B}\label{aim2b}