From eb19087bd2fe7322326c5d3d1fb4e3c39e3a8d3e Mon Sep 17 00:00:00 2001 From: ndwarshuis Date: Wed, 8 Sep 2021 19:14:31 -0400 Subject: [PATCH] ENH proof aim2a --- tables/doe_cd4.tex | 6 +- tables/doe_mem1.tex | 6 +- tables/doe_mem2.tex | 6 +- tables/doe_mem4.tex | 2 +- tables/doe_ratio.tex | 2 +- tables/doe_runs.tex | 14 +- tables/model_results.tex | 13 +- tex/references.bib | 13 ++ tex/thesis.tex | 371 ++++++++++++++++++--------------------- 9 files changed, 213 insertions(+), 220 deletions(-) diff --git a/tables/doe_cd4.tex b/tables/doe_cd4.tex index 868d9fd..432e650 100644 --- a/tables/doe_cd4.tex +++ b/tables/doe_cd4.tex @@ -3,8 +3,8 @@ \begin{tabular}{@{\extracolsep{5pt}}lc} \\[-1.8ex]\hline \hline \\[-1.8ex] - & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ -\cline{2-2} +% & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ +% \cline{2-2} \\[-1.8ex] & CD4+ Cells \\ \hline \\[-1.8ex] Dataset [2] & 1,271,171.000$^{**}$ \\ @@ -13,7 +13,7 @@ DMS Conc. (1/ml) & 1,742.752$^{***}$ \\ Intercept & $-$5,344,494.000$^{***}$ \\ \hline \\[-1.8ex] -Observations & 30 \\ +% Observations & 30 \\ R$^{2}$ & 0.888 \\ Adjusted R$^{2}$ & 0.870 \\ % Residual Std. Error & 727,042.800 (df = 25) \\ diff --git a/tables/doe_mem1.tex b/tables/doe_mem1.tex index 5e07427..1e5b4e1 100644 --- a/tables/doe_mem1.tex +++ b/tables/doe_mem1.tex @@ -3,8 +3,8 @@ \begin{tabular}{@{\extracolsep{5pt}}lc} \\[-1.8ex]\hline \hline \\[-1.8ex] - & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ -\cline{2-2} +% & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ +% \cline{2-2} \\[-1.8ex] & CD62L+CCR7+ Cells \\ \hline \\[-1.8ex] Dataset [2] & 4,661,754.000$^{*}$ \\ @@ -13,7 +13,7 @@ DMS Conc. (1/ml) & 240.038 \\ Intercept & $-$3,478,851.000 \\ \hline \\[-1.8ex] -Observations & 30 \\ +% Observations & 30 \\ R$^{2}$ & 0.331 \\ Adjusted R$^{2}$ & 0.224 \\ % Residual Std. 
Error & 3,659,501.000 (df = 25) \\ diff --git a/tables/doe_mem2.tex b/tables/doe_mem2.tex index 9421e08..8c031da 100644 --- a/tables/doe_mem2.tex +++ b/tables/doe_mem2.tex @@ -3,8 +3,8 @@ \begin{tabular}{@{\extracolsep{5pt}}lc} \\[-1.8ex]\hline \hline \\[-1.8ex] - & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ -\cline{2-2} +% & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ +% \cline{2-2} \\[-1.8ex] & log(CD62L+CCR7+ Cells) \\ \hline \\[-1.8ex] Dataset [2] & 0.269 \\ @@ -20,7 +20,7 @@ (Functional mAb \%)*(IL2 Conc. (IU/ml)*(DMS Conc. (1/ml)) & 0.00000$^{*}$ \\ Intercept & 20.899$^{***}$ \\ \hline \\[-1.8ex] -Observations & 30 \\ +% Observations & 30 \\ R$^{2}$ & 0.741 \\ Adjusted R$^{2}$ & 0.583 \\ % Residual Std. Error & 0.228 (df = 18) \\ diff --git a/tables/doe_mem4.tex b/tables/doe_mem4.tex index bb50a28..28d3fd8 100644 --- a/tables/doe_mem4.tex +++ b/tables/doe_mem4.tex @@ -11,7 +11,7 @@ DMS Conc. (1/ml) & 926.925$^{***}$ \\ Intercept & $-$3,368,762.000$^{***}$ \\ \hline \\[-1.8ex] -Observations & 30 \\ +% Observations & 30 \\ R$^{2}$ & 0.835 \\ Adjusted R$^{2}$ & 0.808 \\ % Residual Std. Error & 493,168.700 (df = 25) \\ diff --git a/tables/doe_ratio.tex b/tables/doe_ratio.tex index 8c2e49a..9b60bca 100644 --- a/tables/doe_ratio.tex +++ b/tables/doe_ratio.tex @@ -12,7 +12,7 @@ DMS Conc. (1/ml) & 0.0001$^{***}$ \\ Intercept & $-$0.144$^{*}$ \\ \hline \\[-1.8ex] -Observations & 30 \\ +% Observations & 30 \\ R$^{2}$ & 0.879 \\ Adjusted R$^{2}$ & 0.860 \\ % Residual Std. 
Error & 0.039 (df = 25) \\ diff --git a/tables/doe_runs.tex b/tables/doe_runs.tex index 3363c6e..3ab5311 100644 --- a/tables/doe_runs.tex +++ b/tables/doe_runs.tex @@ -23,7 +23,7 @@ DOE & 16 & 10 & 500 & 100\\ DOE & 17 & 20 & 1500 & 60\\ DOE & 18 & 30 & 2500 & 60\\ ADOE & 1 & 40 & 500 & 100\\ -ADOE & 2 & 35 & 2000 & 100\\ +ADOE & 2\tnote{a} & 35 & 2000 & 100\\ ADOE & 3 & 30 & 1500 & 100\\ ADOE & 4 & 30 & 2500 & 100\\ ADOE & 5 & 40 & 2500 & 100\\ @@ -31,14 +31,14 @@ ADOE & 6 & 40 & 1500 & 100\\ ADOE & 7 & 30 & 500 & 100\\ ADOE & 8 & 35 & 2000 & 100\\ ADOE & 9 & 35 & 1000 & 100\\ -ADOE & 10 & 30 & 1500 & 100\\ +ADOE & 10\tnote{a} & 30 & 1500 & 100\\ ADOE & 11 & 35 & 3000 & 100\\ ADOE & 12 & 30 & 2500 & 100\\ -ADOE & 13 & 40 & 1500 & 100\\ -ADOE & 14 & 40 & 500 & 100\\ -ADOE & 15 & 30 & 500 & 100\\ -ADOE & 16 & 35 & 1000 & 100\\ -ADOE & 17 & 35 & 3000 & 100\\ +ADOE & 13\tnote{a} & 40 & 1500 & 100\\ +ADOE & 14\tnote{a} & 40 & 500 & 100\\ +ADOE & 15\tnote{a} & 30 & 500 & 100\\ +ADOE & 16\tnote{a} & 35 & 1000 & 100\\ +ADOE & 17\tnote{a} & 35 & 3000 & 100\\ ADOE & 18 & 40 & 3500 & 100\\ ADOE & 19 & 40 & 2500 & 100\\ ADOE & 20 & 40 & 3500 & 100\\ diff --git a/tables/model_results.tex b/tables/model_results.tex index 57ad9de..47736c5 100644 --- a/tables/model_results.tex +++ b/tables/model_results.tex @@ -3,21 +3,24 @@ \hline \\[-1.8ex] \\[-1.8ex] Response/Predictors & SR & RF & GBM & CIF & LASSO & PLSR & SVM \\ \hline \\[-1.8ex] -\multicolumn{8}{l}{CD4:CD8 Ratio} \\ +\multicolumn{8}{l}{CD4:CD8 Ratio} \\ +\\[-1.8ex] PP+N4 & \SI{99}{\percent} & \SI{86.8}{\percent} & \SI{96.3}{\percent} & \SI{84.5}{\percent} & \SI{88.6}{\percent} & \SI{92.5}{\percent} & \SI{88.5}{\percent}\\ PP+N6 & \SI{99}{\percent} & \SI{73.6}{\percent} & \SI{95.9}{\percent} & \SI{70.1}{\percent} & \SI{81.0}{\percent} & \SI{95.8}{\percent} & \SI{79.7}{\percent}\\ PP+S6 & \SI{99}{\percent} & \SI{87.1}{\percent} & \SI{99.9}{\percent} & \SI{83.4}{\percent} & \SI{87.2}{\percent} & \SI{97.9}{\percent} & 
\SI{86.8}{\percent}\\ PP+S6+N6 & \SI{99}{\percent} & \SI{85.5}{\percent} & \SI{95.3}{\percent} & \SI{83.4}{\percent} & \SI{92.9}{\percent} & \SI{99.7}{\percent} & \SI{90.5}{\percent}\\ -\hline \\ -\multicolumn{8}{l}{\ptmemh{} cells} \\ +\hline \\[-1.8ex] +\multicolumn{8}{l}{\ptmemh{} cells} \\ +\\[-1.8ex] PP+N4 & \SI{97}{\percent} & \SI{67.0}{\percent} & \SI{93.6}{\percent} & \SI{69.3}{\percent} & \SI{34.3}{\percent} & \SI{90.1}{\percent} & \SI{75.5}{\percent}\\ PP+N6 & \SI{96}{\percent} & \SI{45.9}{\percent} & \SI{92.6}{\percent} & \SI{51.2}{\percent} & \SI{42.8}{\percent} & \SI{92.1}{\percent} & \SI{79.4}{\percent}\\ PP+S6 & \SI{98}{\percent} & \SI{71.4}{\percent} & \SI{99.9}{\percent} & \SI{75.0}{\percent} & \SI{74.9}{\percent} & \SI{80.0}{\percent} & \SI{75.5}{\percent}\\ PP+S6+N6 & \SI{98}{\percent} & \SI{68.2}{\percent} & \SI{95.6}{\percent} & \SI{74.4}{\percent} & \SI{72.5}{\percent} & \SI{81.7}{\percent} & \SI{77.0}{\percent}\\ -\hline \\ -\multicolumn{8}{l}{\ptmemk{} cells} \\ +\hline \\[-1.8ex] +\multicolumn{8}{l}{\ptmemk{} cells} \\ +\\[-1.8ex] PP+N4 & \SI{93}{\percent} & \SI{4.7}{\percent} & \SI{44.4}{\percent} & \SI{9.2}{\percent} & \SI{1.2}{\percent} & \SI{65.1}{\percent} & \SI{9.1}{\percent}\\ PP+N6 & \SI{86}{\percent} & \SI{2.0}{\percent} & \SI{29.9}{\percent} & \SI{15.8}{\percent} & \SI{28.5}{\percent} & \SI{63.3}{\percent} & \SI{30.6}{\percent}\\ PP+S6 & \SI{93}{\percent} & \SI{7.8}{\percent} & \SI{28.0}{\percent} & \SI{15.1}{\percent} & \SI{76.2}{\percent} & \SI{98.4}{\percent} & \SI{49.8}{\percent}\\ diff --git a/tex/references.bib b/tex/references.bib index 5ceb261..e076f21 100644 --- a/tex/references.bib +++ b/tex/references.bib @@ -2803,6 +2803,19 @@ CONCLUSIONS: We developed a simplified, semi-closed system for the initial selec isbn = {046509760X}, } +@Article{Holmes2006, + author = {E. Holmes and O. Cloarec and J. K. 
Nicholson}, + journal = {Journal of Proteome Research}, + title = {Probing Latent Biomarker Signatures and in Vivo Pathway Activity in Experimental Disease States via Statistical Total Correlation Spectroscopy ({STOCSY}) of Biofluids: Application to {HgCl$_2$} Toxicity}, + year = {2006}, + month = {jun}, + number = {6}, + pages = {1313--1320}, + volume = {5}, + doi = {10.1021/pr050399w}, + publisher = {American Chemical Society ({ACS})}, +} + @Comment{jabref-meta: databaseType:bibtex;} @Comment{jabref-meta: grouping: diff --git a/tex/thesis.tex b/tex/thesis.tex index ff55f45..d957571 100644 --- a/tex/thesis.tex +++ b/tex/thesis.tex @@ -21,6 +21,7 @@ \usepackage{listings} \usepackage{tocloft} \usepackage{epigraph} +\usepackage{threeparttable} \hypersetup{ colorlinks=true, @@ -2876,8 +2877,8 @@ The purpose of this sub-aim was to develop computational methods to identify novel \glspl{cqa} and \glspl{cpp} that could be used for release criteria, process control, and process optimization for the \gls{dms} platform. We hypothesized that T cells grown using the \gls{dms} system would produce -detectable biological signatures in the media supernatent which corresponded to -clinically relevent responses such as fold expansion or phenotype. We tested +detectable biological signatures in the media supernatant which would correspond +to clinically relevant responses such as fold expansion or phenotype. We tested this hypothesis by activating T cells under a variety of conditions using a \gls{doe}, sampling the media at intermediate timepoints, and creating models to predict the outcome of the cultures. We should stress that the specific @@ -2921,13 +2922,12 @@ progressed. Data from inputs and/or longitudinal samples were used to predict the endpoint response. The fusion of cytokine and \gls{nmr} profiles from media to model these responses included 30 cytokines from a custom Thermo Fisher ProcartaPlex Luminex kit and 20 \gls{nmr} features.
These 20 spectral features -from \gls{nmr} media analysis were selected out of approximately 250 peaks -through the implementation of a variance-based feature selection approach and -some manual inspection steps. +from \gls{nmr} media analysis were selected out of approximately 250 peaks using +a variance-based feature selection approach and some manual inspection steps. The first \gls{doe} resulted in a randomized 18-run I-optimal custom design where each \gls{dms} parameter was evaluated at three levels: \pilII{} (10, 20, -and 30 U/uL), \pdms{} (500, 1500, 2500 \si{\dms\per\ul}), and \pmab{} (60, 80, +and 30 \si{\IU\per\ml}), \pdms{} (500, 1500, 2500 \si{\dms\per\ml}), and \pmab{} (60, 80, 100 \si{\percent}). These 18 runs consisted of 14 unique parameter combinations where 4 of them were replicated twice to assess prediction error. To further optimize the initial region explored, an \gls{adoe} was designed with 10 unique @@ -2972,11 +2972,11 @@ Cytokines were quantified via Luminex as described in Prior to analysis, samples were centrifuged at \SI{2990}{\gforce} for \SI{20}{\minute} at \SI{4}{\degreeCelsius} to clear any debris\footnote{all \gls{nmr} analysis was done by our collaborators Max Colonna and Art Edison at - the University of Georgia; methods included here for reference}. \SI{5}{\ul} of -100/3 \si{\mM} DSS-D6 in deuterium oxide (Cambridge Isotope Laboratories) were -added to \SI{1.7}{\mm} \gls{nmr} tubes (Bruker BioSpin), followed by + the University of Georgia; methods included here for reference}. \SI{5}{\ul} +of 100/3 \si{\mM} DSS-D6 in deuterium oxide (Cambridge Isotope Laboratories) +were added to \SI{1.7}{\mm} \gls{nmr} tubes (Bruker BioSpin), followed by \SI{45}{\ul} of media from each sample that was added and mixed, for a final -volume of \SI{50}{\ul} in each tube. Samples were prepared on ice and in +volume of \SI{50}{\ul} in each tube. Samples were prepared on ice in predetermined, randomized order.
The remaining volume from each sample in the rack (approx. \SI{4}{\ul}) was combined to create an internal pool. This material was used for internal controls within each rack as well as metabolite @@ -3010,15 +3010,15 @@ Two-dimensional spectra collected on pooled samples were uploaded to COLMARm web server, where \gls{hsqc} peaks were automatically matched to database peaks. \gls{hsqc} matches were manually reviewed with additional 2D and proton spectra to confirm the match. Annotations were assigned a confidence score based upon -the levels of spectral data supporting the match as previously +spectral data levels supporting the match as previously described\cite{Dashti2017}. Annotated metabolites were matched to previously selected features used for statistical analysis. Several low abundance features selected for analysis did not have database -matches and were not annotated. Statistical total correlation spectroscopy41 -suggested that some of these unknown features belonged to the same molecules -(not shown). Additional multidimensional \gls{nmr} experiments will be required -to determine their identity. +matches and were not annotated. Statistical total correlation +spectroscopy\cite{Holmes2006} suggested that some of these unknown features +belonged to the same molecules (not shown). Additional multidimensional +\gls{nmr} experiments will be required to determine their identity. \subsection{Machine Learning and Statistical Analysis} @@ -3026,26 +3026,24 @@ Linear regression analysis of the \glspl{doe} was performed as described in \cref{sec:statistics}. Seven \gls{ml} techniques were implemented to predict three responses related to -the memory phenotype of the cultured T cells under different process -conditions (\rmemh{}, \rmemk{}, and \rratio{}). 
The \gls{ml} methods -executed were \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr}, -\gls{svm}, and DataModeler’s \gls{sr}\footnote{\gls{sr} was performed by Theresa - Kotanchek at Evolved Analytics, \gls{rf}, \gls{gbm}, \gls{cif}, \gls{plsr}, - \gls{svm} were performed by Valerie Odeh-Couvertier at UPRM. Methods included - here for reference}. Primarily, \gls{sr} models were used to optimize process -parameter values based on \ptmem{} phenotype and to extract early predictive -variable combinations from the multi-omics experiments. Furthermore, all -regression methods were executed, and the high-performing models were used to -perform a consensus analysis of the important variables to extract potential -critical quality attributes and critical process parameters predictive of T cell -potency, safety, and consistency at the early stages of the manufacturing -process. +the memory phenotype of the cultured T cells under different process conditions +(\rmemh{}, \rmemk{}, and \rratio{}). The \gls{ml} methods executed were +\gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr}, \gls{svm}, and +DataModeler’s \gls{sr}\footnote{\gls{sr} was performed by Theresa Kotanchek at + Evolved Analytics, \gls{rf}, \gls{gbm}, \gls{cif}, \gls{plsr}, \gls{svm} were + performed by Valerie Odeh-Couvertier at UPRM. Methods included here for + reference}. Primarily, \gls{sr} models were used to optimize process parameter +values based on \ptmem{} phenotype and to extract early predictive variable +combinations from the multi-omics experiments. Furthermore, high-performing +models from each method were used in consensus analysis to extract potential +\glspl{cqa} and \glspl{cpp} predictive of T cell potency, safety, and +consistency at the early stages of the manufacturing process. \gls{sr} was done using Evolved Analytics’ DataModeler software (Evolved Analytics LLC, Midland, MI). 
DataModeler utilizes genetic programming to evolve symbolic regression models (both linear and non-linear) rewarding simplicity and accuracy. Using the selection criteria of highest accuracy -($R^2$>\SI{90}{\percent}) and lowest complexity, the top-performing models were +($R^2>\SI{90}{\percent}$) and lowest complexity, the top-performing models were identified. Driving variables, variable combinations, and model dimensionality tables were generated. The top-performing variable combinations were used to generate model ensembles. In this analysis, DataModeler’s @@ -3073,7 +3071,7 @@ values, potential optima in the responses, and regions of parameter values where the predictions diverge the most. Non-parametric tree-based ensembles were done through the -\inlinecode{randomForest}, inlinecode{gbm}, and \inlinecode{cforest} regression +\inlinecode{randomForest}, \inlinecode{gbm}, and \inlinecode{cforest} regression functions in R, for \gls{rf}, \gls{gbm}, and \gls{cif} models, respectively. Both \gls{rf} and \gls{cif} construct multiple decision trees in parallel, by randomly choosing a subset of features at each decision tree split, in the @@ -3117,8 +3115,8 @@ model with \gls{loocv} tuned parameters. Consensus analysis of the relevant variables extracted from each machine learning model was done to identify consistent predictive features of quality at -the early stages of manufacturing. First importance scores for all features were -measured across all \gls{ml} models using \inlinecode{varImp} with +the early stages of manufacturing. First, importance scores for all features +were measured across all \gls{ml} models using \inlinecode{varImp} with \inlinecode{caret} R package except for scores for \gls{svm} which \inlinecode{rminer} R package was used. These importance scores were percent increase in \gls{mse}, relative importance through average increase in @@ -3130,26 +3128,25 @@ respectively. 
Using these scores, key predictive variables were selected if their importance scores were within the \nth{80} percentile ranking for the following \gls{ml} methods: \gls{rf}, \gls{gbm}, \gls{cif}, \gls{lasso}, \gls{plsr}, \gls{svm} while for \gls{sr} variables present in >\SI{30}{\percent} -of the top-performing \gls{sr} models from DataModeler ($R^2\ge$ -\SI{90}{\percent}, Complexity $\ge$ 100) were chosen to investigate consensus -except for \gls{nmr} media models at day 4 which considered a combination of the -top-performing results of models excluding lactate ppms, and included those -variables which were in >\SI{40}{\percent} of the best performing models. Only -variables with those high percentile scoring values were evaluated in terms of -their logical relation (intersection across \gls{ml} models) and depicted using -a Venn diagram from the \inlinecode{venn} R package. +of the top-performing \gls{sr} models from DataModeler +($R^2\ge \SI{90}{\percent}$, Complexity $\ge 100$) were chosen to investigate +consensus except for \gls{nmr} media models at day 4 which considered a +combination of the top-performing results of models excluding lactate ppms, and +included those variables which were in >\SI{40}{\percent} of the best performing +models. Only variables with high percentile scoring values were evaluated in +terms of their logical relation (intersection across \gls{ml} models) and +depicted using a Venn diagram from the \inlinecode{venn} R package. \section{Results} \subsection{DMSs Grow T Cells With Lower IL2 Concentrations} -Prior to the main experiments in this aim, we performed a preliminary experiment -to assess the effect of lowering the \gls{il2} concentration on the T cells -grown with either bead or \gls{dms}. One of the hypotheses for the \gls{dms} -system was that the higher cell density would enable more efficient cross-talk -between T cells. 
Since \gls{il2} is secreted by activated T cells themselves, -T cells in the \gls{dms} system may need less or no \gls{il2} if this hypothesis -were true. +Prior to the main experiments in this aim, we assessed the effect of lowering +the \gls{il2} concentration on the T cells grown with either bead or \gls{dms}. +One of our hypotheses for the \gls{dms} system was that higher cell density +would enhance cross-talk between T cells. Since \gls{il2} is secreted by +activated T cells themselves, T cells in the \gls{dms} system may need less or +no \gls{il2} if this is true. \begin{figure*}[ht!] \begingroup @@ -3164,7 +3161,7 @@ were true. \caption[T Cells Grown at Varying IL2 Concentrations] {\glspl{dms} grow T cells effectively at lower IL2 concentrations. \subcap{fig:il2_mod_timecourse}{Longitudinal cell counts of T cells grown - with either bead or \glspl{dms} using varying IL2 concentrations} + with either bead or \glspl{dms} using varying IL2 concentrations.} Day 14 counts of either \subcap{fig:il2_mod_total}{total cells} or \subcap{fig:il2_mod_mem}{\ptmem{} cells} plotted against \gls{il2} concentration. @@ -3179,14 +3176,9 @@ expanded T cells as described in \cref{sec:tcellculture}. T cells grown with either method expanded robustly as \gls{il2} concentration was increased (\cref{fig:il2_mod_timecourse}). Surprisingly, neither the bead or the \gls{dms} group expanded at all with \SI{0}{\IU\per\ml} \gls{il2}. When examining the -endpoint fold change after \SI{14}{\day}, we observe that the difference between -the bead and \gls{dms} appears to be greater at lower \gls{il2} concentrations -(\cref{fig:il2_mod_total}). -% This is further supported by fitting a non-linear -% least squares equation to the data following a hyperbolic curve (which should be -% a plausible model given that this curve describes receptor-ligand kinetics, -% which we can assume \gls{il2} to follow). 
-Furthermore, the same trend can be +endpoint fold change after \SI{14}{\day}, we observed that the difference +between the bead and \gls{dms} appeared to be greater at lower \gls{il2} +concentrations (\cref{fig:il2_mod_total}). Furthermore, the same trend can be seen when only examining the \ptmem{} cell expansion at day 14 (\cref{fig:il2_mod_mem}). In this case, the \ptmemp{} of the T cells seemed to be relatively close at higher \gls{il2} concentrations, but separated further at @@ -3196,16 +3188,24 @@ Taken together, these data do not support the hypothesis that the \gls{dms} system does not need \gls{il2} at all; however, it appears to have a modest advantage at lower \gls{il2} concentrations compared to beads. For this reason, we decided to investigate the lower range of \gls{il2} concentrations starting -at \SI{10}{\IU\per\ml} throughout the remainder of this aim. +at \SI{10}{\IU\per\ml} in the remainder of this aim. \subsection{DOE Shows Optimal Conditions for Potent T Cells} -% TABLE not all of these were actually used, explain why by either adding columns -% or marking with an asterisk -\begin{table}[!h] \centering - \caption{DOE Runs} - \label{tab:doe_runs} - \input{../tables/doe_runs.tex} +\begin{table}[!h] + \centering + \begin{threeparttable} + \caption{DOE Runs} + \label{tab:doe_runs} + \input{../tables/doe_runs.tex} + \begin{tablenotes} + \item[a] It was determined later that the total \glspl{mab} surface density + may not be consistent across each batch of \gls{dms} used. Thus, these + runs were taken out as they were created at a different scale and with a + different operator compared to the rest. Leaving them in may produce + unobserved confounding factors. + \end{tablenotes} + \end{threeparttable} \end{table} \begin{figure*}[ht!] \begingroup @@ -3224,38 +3224,29 @@ at \SI{10}{\IU\per\ml} throughout the remainder of this aim.
\label{fig:doe_response_first} \end{figure*} -% RESULT maybe add regression tables to this, although it doesn't really matter -% since we end up doing regression on the full thing later anyways. We conducted two consecutive \glspl{doe} to optimize the \pth{} and \ptmem{} -responses for the \gls{dms} system. In the first \gls{doe} we, tested \pilII{} in -the range of \SIrange{10}{30}{\IU\per\ml}, \pdms{} in the range of +responses for the \gls{dms} system. In the first, we tested \pilII{} in the +range of \SIrange{10}{30}{\IU\per\ml}, \pdms{} in the range of \SIrange{500}{2500}{\dms\per\ml}, and \pmab{} in the range of -\SIrange{60}{100}{\percent}. When looking at the total \ptmemp{} output, we -observed that \pilII{} showed a positive linear trend with the \pdms{} and -\pmab{} showing possible second-order effects with maximums and minimums at the -intermediate level (\cref{fig:doe_response_first_mem}). In the case of \pth{}, -we observed that all parameters seemed to have a positive linear response, with -\pilII{} and \pdms{} showing slight second order effects that suggest a maximum -might exist at a higher value for each. +\SIrange{60}{100}{\percent}. When looking at total \ptmemp{} cells, \pilII{} +showed a positive linear trend and \pdms{} and \pmab{} showed possible +second-order effects with intermediate maximums and minimums respectively +(\cref{fig:doe_response_first_mem}). In the case of \pth{}, all parameters +showed a positive linear response, suggesting a maximum might exist at a higher value for each. -After performing the first \gls{doe} we augmented the original design matrix +After performing the first \gls{doe}, we augmented the original design matrix with an \gls{adoe} which was built with three goals in mind. Firstly we wished to validate the first \gls{doe} by assessing the strength and responses of each effect.
Secondly, we wished to improve our confidence in regions that showed high complexity, such as the peak in the \gls{dms} concentration for the total \ptmem{} cell response. Thirdly, we wished to explore additional ranges of each -response. Since \pilII{} and \pdms{} appeared to continue positively influence -multiple responses beyond our tested range, we were curious if there was an -optimum at some higher setting of either of these values. For this reason, we -increased the \pilII{} to include \SI{40}{\IU\per\ml} and the \pdms{} to +response. Notably, the responses appeared to keep increasing with \pilII{} beyond our tested range; thus +we were curious if there was an optimum at some higher setting. For this reason, +we increased the \pilII{} to include \SI{40}{\IU\per\ml} and the \pdms{} to \SI{3500}{\dms\per\ml}. Note that it was impossible to go beyond \SI{100}{\percent} for the \pmab{}, so runs were positioned for this parameter with validation and confidence improvements in mind. The runs for each \gls{doe} -were shown in \cref{tab:doe_runs}\footnote{Not all runs in this table were used. -It was determined later that the total \glspl{mab} surface density may not be -consistent across each batch of \gls{dms} used, primarily due to the fact that a -subset were created at different scale and with a different operator. To remove -this bias in our data, these runs were not used.}. +are shown in \cref{tab:doe_runs}. \begin{figure*}[ht!] \begingroup @@ -3329,10 +3320,10 @@ responses showed mostly linear relationships in all parameter cases % anything to be significant We performed linear regression on the three input parameters as well as a binary parameter representing if a given run came from the first or second \gls{doe} -(called `dataset'). Starting with the total \ptmem{} cells response, we fit a +(called ``dataset''). Starting with the total \ptmem{} cells response, we fit a first order regression model using these four parameters (\cref{tab:doe_mem1.tex}).
While \pilII{} was found to be a significant -predictor, the model fit was extremely poor ($R^2$ of 0.331). This was not +predictor, the model fit was extremely poor ($R^2 = 0.331$). This was not surprising given the apparent complexity of this response (\cref{fig:doe_responses_mem}). To obtain a better fit, we added second and third degree terms (\cref{tab:doe_mem2.tex}). Note that the dataset parameter @@ -3350,9 +3341,8 @@ that our data might be underpowered for a model this complex. Further experiments beyond what was performed here may be needed to fully describe this response. -% TABLE combine these tables into one We performed linear regression on the other three responses, all of which -performed much better than the \ptmem{} response as expected given the much +performed much better than the \ptmem{} response as expected given the lower apparent complexity in the response plots (\cref{fig:doe_responses_cd4,fig:doe_responses_mem4,fig:doe_responses_ratio}). All these models appeared to fit will, with $R^2$ and $R_{adj}^2$ upward of @@ -3380,11 +3370,10 @@ significant predictors. We then visualized the total \ptmemh{} cells and \rratio{} using the response explorer in DataModeler to create contour plots around the maximum responses. -For both, it appeared that maximizing all three input parameters resulted in the -maximum value for either response (\cref{fig:doe_sr_contour}). While not all -combinations at and around this optimum were tested, the model nonetheless -showed that there were no other optimal values or regions elsewhere in the -model. +For both, maximizing all input parameters maximized both responses +(\cref{fig:doe_sr_contour}). While not all combinations at and around this +optimum were tested, these plots suggest that there were no other optimal values +elsewhere. \subsection{Modeling with Machine Learning Reveals Putative CQAs} @@ -3407,16 +3396,15 @@ features of quality early in their expansion process. 
\label{fig:doe_luminex} \end{figure*} -We collected secretome data via luminex for days 4, 6, 8, 11, and 14. -Plotting the concentrations of these cytokines showed a large variation over all -runs and between different timepoints, demonstrated that these could potentially -be used to differentiate between different process conditions qualitatively -simply based on variance (\cref{fig:doe_luminex}). These were also much higher -in most cases that a set of bead based runs which were run in parallel, in -agreement with the luminex data obtained previously in the Grex system (these -data were collected in plates) (\cref{fig:grex_luminex}). +We collected secretome data via luminex for days 4, 6, 8, 11, and 14. Plotting +the concentrations of these cytokines showed a large variation over all runs and +between different timepoints, demonstrating that these could be used to +differentiate between different process conditions qualitatively simply based on +variance (\cref{fig:doe_luminex}). These were also much higher in most cases +than a set of bead based runs which were run in parallel, in agreement with the +luminex data obtained previously in the Grex system (these data were collected +in plates) (\cref{fig:grex_luminex}). -% TABLE this table looks like crap, break it up into smaller tables \begin{table}[!h] \centering \caption[Machine Learning Model Results] {Results for \gls{ml} modeling using process parameters (PP) with @@ -3428,15 +3416,15 @@ data were collected in plates) (\cref{fig:grex_luminex}). \end{table} \gls{sr} models achieved the highest predictive performance -($R^2$>\SI{93}{\percent}) when using multi-omics predictors for all endpoint -responses (\cref{tab:mod_results}). \gls{sr} achieved $R^2$>\SI{98}{\percent} -while \gls{gbm} ensembles showed \gls{loocv} $R^2$ > \SI{95}{\percent} for +($R^2>\SI{93}{\percent}$) when using multi-omics predictors for all endpoint +responses (\cref{tab:mod_results}).
\gls{sr} achieved $R^2>\SI{98}{\percent}$ +while \gls{gbm} ensembles showed \gls{loocv} $R^2>\SI{95}{\percent}$ for \rmemh{} and \rmemk{} responses. Similarly, \gls{lasso}, \gls{plsr}, and \gls{svm} methods showed consistently high \gls{loocv}, (\SI{92.9}{\percent}, \SI{99.7}{\percent}, and \SI{90.5}{\percent} respectively), to predict the \rratio{}. Yet, about \SI{10}{\percent} reduction in \gls{loocv}, \SIrange{72.5}{81.7}{\percent}, was observed for \rmemh{} with these three -methods. Lastly, \gls{sr} and \gls{plsr} achieved $R^2$>\SI{90}{\percent} while +methods. Lastly, \gls{sr} and \gls{plsr} achieved $R^2>\SI{90}{\percent}$ while other \gls{ml} methods exhibited exceedingly variable \gls{loocv} (\SI{0.3}{\percent} for \gls{rf} to \SI{51.5}{\percent} for \gls{lasso}) for \rmemk{}. @@ -3485,18 +3473,13 @@ methods for predicting \rratio{} when considering features with the highest importance scores across models (\cref{fig:mod_flower_48r}). Other features, IL2R, IL4, IL17a, and \pdms{}, were commonly selected in $\ge$ 5 \gls{ml} methods (\cref{fig:mod_flower_48r}). When restricting the models only to include -metabolome, formate emerged as the dominant predictor shared across all seven -models. +metabolome, formate was the sole predictor shared by all. -% Moreover, IL13 and IL15 were found predictive in combination -% with these using \gls{sr} (Supp.Table.S4). - -When performing similar analysis on \rmemh{}, we observe that no species for -either the secretome or metabolome was agreed upon by all seven models -(\cref{fig:mod_flower_cd4}). We also observed that these models did not fit as -well as they did for \rratio{} (\cref{tab:mod_results}). For the secretome, the -species that were agreed upon by $\ge$ 5 models were IL4, IL17a, and IL2R. 
For -the metabolome, formate once again was agreed upon by $\ge$ 5 models as well as +When performing similar analysis on \rmemh{}, no species for either secretome or +metabolome was shared by all models (\cref{fig:mod_flower_cd4}). These models +also had worse fits compared to those for \rratio{} (\cref{tab:mod_results}). +For the secretome, IL4, IL17a, and IL2R were agreed upon by $\ge$ 5 models. For +the metabolome, formate once again was shared by $\ge$ 5 models as well as lactate. \begin{figure*}[ht!] @@ -3523,12 +3506,12 @@ lactate. \label{fig:nmr_cors} \end{figure*} -We also investigated the \gls{nmr} features extracted from day of expansion to -assess if there was any predictive power for \ptmemh{}; in general these models -had almost as good of fit despite being 2 days earlier in the process -(\cref{fig:nmr_cors}). Lactate and formate were observed to correlate with each -other, and both correlated with \rmemh{}. Furthermore, lactate was observed to -positively correlate with \pdms{} and negatively correlate with glucose +We also asked if day 4 \gls{nmr} features could predict \ptmemh{}; these models +generally fit well despite being 2 days earlier in the process +(\cref{fig:nmr_cors})\footnote{for anyone wondering why we don't have the + matching secretome data for day 4, blame UPS for losing our samples}. Lactate +and formate correlated with each other and with \rmemh{}. Furthermore, lactate +positively correlated with \pdms{} and negatively correlated with glucose (\cref{fig:nmr_cors_lactate}). Formate also had the same correlation patterns (\cref{fig:nmr_cors_formate}). Glucose was only negatively correlated with formate and lactate (\cref{fig:nmr_cors_glucose}). Together, these data suggest @@ -3537,38 +3520,34 @@ that lactate, formate, \pdms{}, and \rmemh{} are fundamentally linked. \section{Discussion} \gls{cpp} modeling and understanding are critical to new product development and -in cell therapy development, it can have life-saving implications. 
The -challenges for effective modeling grow with the increasing complexity of -processes due to high dimensionality, and the potential for process interactions -and nonlinear relationships. Another critical challenge is the limited amount of -available data, mostly small \gls{doe} datasets. \gls{sr} has the necessary +have life-saving implications in the context of cell therapy. The challenges for +effective modeling grow with the increasing process complexity due to high +dimensionality, interactions between parameters, and nonlinearity. Another critical +challenge is the limited amount of available data. \gls{sr} has the necessary capabilities to resolve the issues of process effects modeling and has been applied across multiple industries\cite{Kordona}. \gls{sr} discovers mathematical expressions that fit a given sample and differs from conventional regression techniques in that a model structure is not defined \textit{a -priori}\cite{Koza1992}. Hence, a key advantage of this methodology is that +  priori}\cite{Koza1992}. Hence, a key advantage of this methodology is that transparent, human-interpretable models can be generated from small and large -datasets with no prior assumptions\cite{Kotancheka}. +datasets with few prior assumptions\cite{Kotancheka}. Since the model search process lets the data determine the model, diverse and -competitive model structures are typically discovered. An ensemble of diverse -models can be formed where its constituent models will tend to agree when -constrained by observed data yet diverge in new regions. Collecting data in -these regions helps to ensure that the target system is accurately modeled, and -its optimum is accurately located\cite{Kotancheka}. Exploiting these features -allows adaptive data collection and interactive modeling.
Consequently, this -\gls{adoe} approach is useful in a variety of scenarios, including maximizing -model validity for model-based decision making, optimizing processing parameters -to maximize target yields, and developing emulators for online optimization and -human understanding\cite{Kotancheka}. +competitive model structures are typically discovered. A diverse ensemble will +contain models that agree in regions constrained by observable data and diverge +in regions without data. Collecting data in divergent regions ensures the system +is accurately modeled and its optimum accurately located\cite{Kotancheka}. +Consequently, this \gls{adoe} approach is useful in many scenarios, including +maximizing model validity for model-based decision making, optimizing processing +parameters to maximize yield, and developing emulators for online optimization +and human understanding\cite{Kotancheka}. An in-depth characterization of potential \gls{dms} based T cell \glspl{cqa} includes a list of cytokine and \gls{nmr} features from media samples that are crucial in many aspects of T cell fate decisions and effector functions of -immune cells. Cytokine features were observed to slightly improve prediction and -dominated the ranking of important features and variable combinations when -modeling together with \gls{nmr} media analysis and process parameters -(\cref{fig:mod_flower}). +immune cells. Cytokine features slightly improved prediction and dominated the +ranking of important features and variable combinations when modeling together +with \gls{nmr} media analysis and process parameters (\cref{fig:mod_flower}). Predictive cytokine features such as \gls{tnfa}, IL2R, IL4, IL17a, IL13, and IL15 were biologically assessed in terms of their known functions and activities @@ -3577,35 +3556,35 @@ cells, as per their main functions, and activated T cells secrete more cytokines than resting T cells.
It is possible that some cytokines simply reflect the \rratio{} and the activation degree by proxy proliferation. However, the exact ratio of expected cytokine abundance is less clear and depends on the subtypes -present, and thus examination of each relevant cytokine is needed. +present; thus, examination of each relevant cytokine is needed. IL2R is secreted by activated T cells and binds to IL2, acting as a sink to -dampen its effect on T cells\cite{Witkowska2005}. Since IL2R was much greater +dampen its effect on T cells\cite{Witkowska2005}. Since IL2R was more abundant than IL2 in solution, this might reduce the overall effect of IL2, which could be further investigated by blocking IL2R with an antibody. In T cells, TNF can increase IL2R, proliferation, and cytokine production\cite{Mehta2018}. It may -also induce apoptosis depending on concentration and alter the CD4+ to CD8+ +also induce apoptosis depending on concentration and alter the CD4:CD8 ratio\cite{Vudattu2005}. Given that TNF has both a soluble and membrane-bound -form, this may either increase or decrease CD4+ ratio and/or memory T cells +form, this may either increase or decrease CD4:CD8 ratio and/or memory T cells depending on the ratio of the membrane to soluble TNF\cite{Mehta2018}. Since only soluble TNF was measured, membrane TNF is needed to understand its impact -on both CD4+ ratio and memory T cells. Furthermore, IL13 is known to be critical -for \gls{th2} response and therefore could be secreted if there are significant -\glspl{th2} already present in the starting population\cite{Wong2011}. This -cytokine has limited signaling in T cells and is thought to be more of an -effector than a differentiation cytokine\cite{Junttila2018}. It might be -emerging as relevant due to an initially large number of \glspl{th2} or because -\glspl{th2} were preferentially expanded; indeed, IL4, also found important, is -the conical cytokine that induces \gls{th2} differentiation -(\cref{fig:mod_flower}).
The role of these cytokines could be investigated by -quantifying \glspl{th1}, \glspl{th2}, or \glspl{th17} both in the starting -population and longitudinally. Similar to IL13, IL17 is an effector cytokine -produced by \glspl{th17}\cite{Amatya2017} thus may reflect the number of -\glspl{th17} in the population. GM-CSF has been linked with activated T cells, -specifically \glspl{th17}, but it is not clear if this cytokine is inducing -differential expansion of CD8+ T cells or if it is simply a covariate with -another cytokine inducing this expansion\cite{Becher2016}. Finally, IL15 has -been shown to be essential for memory signaling and effective in skewing +on both CD4:CD8 ratio and memory T cells. Furthermore, IL13 is known to be +critical for \gls{th2} response and therefore could be secreted if there are +significant \glspl{th2} already present in the starting +population\cite{Wong2011}. This cytokine has limited signaling in T cells and is +thought to be more of an effector than a differentiation +cytokine\cite{Junttila2018}. It might be emerging here due to an initially large +number of \glspl{th2} or because \glspl{th2} were preferentially expanded; +indeed, IL4, also found important, is the canonical cytokine that induces +\gls{th2} differentiation (\cref{fig:mod_flower}). The role of these cytokines +could be investigated by quantifying \glspl{th1}, \glspl{th2}, or \glspl{th17} +both in the starting population and longitudinally. Similar to IL13, IL17 is an +effector cytokine produced by \glspl{th17}\cite{Amatya2017} thus may reflect the +number of \glspl{th17} in the population. GM-CSF has been linked with activated +T cells, specifically \glspl{th17}, but it is not clear if this cytokine is +inducing differential expansion of CD8+ T cells or if it is simply a covariate +with another cytokine inducing this expansion\cite{Becher2016}. 
Finally, IL15 +has been shown to be essential for memory signaling and effective in skewing \gls{car} T cells toward \glspl{tscm} when using membrane-bound IL15Ra and IL15R\cite{Hurton2016}. Its high predictive behavior goes with its ability to induce large numbers of memory T cells by functioning in an autocrine/paracrine @@ -3616,24 +3595,24 @@ activity associated with T cell activation and differentiation, yet it is not clear how the various combinations of metabolites relate with each other in a heterogeneous cell population. Formate and lactate were found to be highly predictive and observed to positively correlate with higher values of total live -\rmemh{} cells (~\cref{fig:nmr_cors}). Formate is a byproduct of the -one-carbon cycle implicated in promoting T cell activation\cite{RonHarel2016}. -Importantly, this cycle occurs between the cytosol and mitochondria of cells and -formate excreted\cite{Pietzke2020}. Mitochondrial biogenesis and function are -shown necessary for memory cell persistence\cite{van_der_Windt_2012, - Vardhana2020}. Therefore, increased formate in media could be an indicator of -one-carbon metabolism and mitochondrial activity in the culture. +\rmemh{} cells (\cref{fig:nmr_cors}). Formate is a byproduct of the one-carbon +cycle implicated in promoting T cell activation\cite{RonHarel2016}. Importantly, +this cycle occurs between the cytosol and mitochondria, from which formate is +excreted\cite{Pietzke2020}. Mitochondrial biogenesis and function are shown to +be necessary for memory cell persistence\cite{van_der_Windt_2012, Vardhana2020}. +Therefore, increased formate in media could be an indicator of one-carbon +metabolism and mitochondrial activity in the culture. In addition to formate, lactate was found as a putative \gls{cqa} of \ptmem{} cells. Lactate is the end-product of aerobic glycolysis, characteristic of highly proliferating cells and activated T cells\cite{Lunt2011, Chang2013}. 
-Glucose import and glycolytic genes are immediately upregulated in response to T -cell stimulation, and thus generation of lactate. At earlier time-points, this -abundance suggests a more robust induction of glycolysis and higher overall T -cell proliferation. Interestingly, our models indicate that higher lactate -predicts higher CD4+, both in total and in proportion to CD8+, seemingly -contrary to previous studies showing that CD8+ T cells rely more on glycolysis -for proliferation following activation\cite{Cao2014}. It may be that glycolytic +Glucose import and glycolytic genes are upregulated in response to T cell +stimulation, thus leading to lactate production. At earlier time-points, this abundance +suggests a more robust induction of glycolysis and higher overall T cell +proliferation. Interestingly, our models indicate that higher lactate predicts +higher CD4+, both in total and in proportion to CD8+, seemingly contrary to +previous studies showing that CD8+ T cells rely more on glycolysis for +proliferation following activation\cite{Cao2014}. It may be that glycolytic cells dominate in the culture at the early time points used for prediction, and higher lactate reflects more cells. @@ -3652,28 +3631,26 @@ confounded by the partial replacement of media that occurred periodically during expansion, thus likely diluting some metabolic byproducts (such as formate, lactate) and elevating depleted precursors (such as glucose and amino acids). More definitive conclusions of metabolic activity across the expanding cell -population can be addressed by a closed system, ideally with on-line process -sensors and controls for formate, lactate, along with ethanol and glucose. +population can be addressed by a closed system, ideally with on-line sensors and +controls for formate, lactate, ethanol, and glucose.
Practically, knowledge of how cytokines and/or metabolites are related to outcome can be utilized for process control, which involves measuring the current state of the culture, comparing it to a desired state, and intervening if it is outside an acceptable range. In the case of lactate and formate, a -benchtop \gls{nmr} can be utilized to sample the media in real time during -culture. This \gls{nmr} can be tuned to automatically quantify the presence of -lactate and formate. Formate is part of the one-carbon pathway, and thus culture -fate may be controlled by altering the inputs to this pathway (glycine, serine, -choline) and/or adding folic acid inhibitors\cite{Ducker2017}. Since lactate is -a direct byproduct of glycolysis, this may be controlled by altering the -concentration of glucose in solution. Each of these control schemes would need -further study to assess if they have enough precision and temporal resolution to -reasonably ensure product quality. In the case of cytokines, there is currently -no analogue to a benchtop \gls{nmr}; however, research is underway to develop -protein-specific sensors using aptamers\cite{Parolo2020}. Even without these -developments, one could still use \gls{elisa} or Luminex to assess protein -levels in a semi-automated manner, but the disadvantage is that these assays are -temporally discrete and impose a significant time lag before the intervention -can be performed. +benchtop \gls{nmr} can be tuned to quantify lactate and formate by sampling the +media in real time during culture. Formate is part of the one-carbon pathway, +and thus culture fate may be controlled by altering the inputs to this pathway +(glycine, serine, choline) and/or adding folic acid inhibitors\cite{Ducker2017}. +Since lactate is a direct byproduct of glycolysis, this may be controlled by +altering the concentration of glucose in solution.
Each of these control schemes +would need further study to assess if they have enough precision and temporal +resolution to reasonably ensure product quality. For cytokines, there is +currently no analogue to a benchtop \gls{nmr}; however, research is underway to +develop protein-specific sensors using aptamers\cite{Parolo2020}. Even without +these developments, \gls{elisa} or Luminex can still quantify cytokines in a +semi-automated manner. However, these are temporally discrete and impose a +non-trivial delay before the intervention can be performed. \chapter{AIM 2B}\label{aim2b}