{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 3. Apply TriMap to predict novel epitopes from gut bacterial proteomes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "At the last step, we will use 10 pretrained THEmap models with different random initialization to predict novel epitopes from gut bacterial proteomes.\n", "\n", "**The model parameters can be downloaded from** [here](https://drive.google.com/drive/folders/1nyxjbuJEZ4BRiNSVdlG9nkLKnsURB9TL?usp=drive_link)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load required libraries" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import gzip\n", "from Bio import SeqIO\n", "import torch\n", "from trimap.model import TCRbind\n", "import pandas as pd\n", "from themap import utils\n", "import torch\n", "import numpy as np\n", "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "# for each microbe, predict the epitopes using the TCRs\n", "name = ['RJX1181','RJX1596','RJX1754','RJX1996','RJX1119','RJX1588','GCA_029961225','GCF_902362795','GCF_902387715','GCF_902858935','GCA_009731575','GCA_009738105','GCA_020735445','GCA_000144405','GCA_000205025','GCA_001412635']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Scan the proteome files in order to find the protein name for each peptide" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Bacterial proteomes can be downloaded from** [here](https://drive.google.com/drive/folders/18VGxJh_6d-OJAexfdKDrd5KaSr450OTA?usp=drive_link)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing RJX1181\n", "Number of unique peptides found: 591463\n" ] } ], "source": [ "n = name[0] # Change this to process a different microbe\n", "print('Processing {}'.format(n))\n", "file_path = 'bacterial/{}.faa.gz'.format(n)\n", "all_seqs = []\n", "protein_seqs = []\n", "protein_dict = {}\n", "\n", "def scan_strings(input_list, protein_seqs, protein_dict):\n", " all_peptides = []\n", " for item, protein in zip(input_list, protein_seqs):\n", " for i in range(0, len(item) - 8, 1):\n", " new_str = item[i:i+9]\n", " peptide_seq = new_str\n", " all_peptides.append(protein)\n", " if peptide_seq not in protein_dict:\n", " protein_dict[peptide_seq] = [protein]\n", " else:\n", " protein_dict[peptide_seq].append(protein)\n", " print('Number of unique peptides found: {}'.format(len(protein_dict)))\n", " \n", "with gzip.open(file_path, \"rt\") as handle:\n", " for record in SeqIO.parse(handle, \"fasta\"):\n", " protein_id = record.description\n", " sequence = str(record.seq)\n", " all_seqs.append(sequence)\n", " protein_seqs.append(protein_id)\n", "scan_strings(all_seqs, protein_seqs, protein_dict)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Find HLA-B27 high-affinity peptides according to NetMHCpan outputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**The NetMHCpan outputs can be downloaded from** [here](https://drive.google.com/drive/folders/1WjUtQSiI8V5mFa7ZIpy1JFAUDX61SwFE?usp=drive_link)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('NetMHCpan_output/{}.xls'.format(n), sep='\\t', header=1)\n", "df = df[df['NB']==1]\n", "df = df[df['EL_Rank']<5]\n", "df = df[df['BA_Rank']<5]\n", "df.reset_index(drop=True, inplace=True)\n", "all_peptides = df['Peptide'].values.tolist()\n", "\n", "df['Protein_ID'] = df['Peptide'].apply(lambda x: protein_dict[x] if x in protein_dict else x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load interested TCR sequences (TRBV9 TCR for AS)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Download the TRBV9 TCR sequences from** [here](https://drive.google.com/file/d/1xHsha2IrAUzwng-r-_DnPqcFW3Bm1k-b/view?usp=drive_link)." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "TRBV9_TCR = pd.read_csv('TRBV9_TCR.csv')\n", "TRA = utils.determine_tcr_seq_vj(TRBV9_TCR['alpha'].tolist(), TRBV9_TCR['V_alpha'].tolist(), TRBV9_TCR['J_alpha'].tolist(), chain='A')\n", "TRB = utils.determine_tcr_seq_vj(TRBV9_TCR['beta'].tolist(), TRBV9_TCR['V_beta'].tolist(), TRBV9_TCR['J_beta'].tolist(), chain='B')\n", "TRBV9_TCR['alpha'] = TRA\n", "TRBV9_TCR['beta'] = TRB\n", "TCR_list = ['AS3.1', 'AS4.1', 'AS4.2', 'AS4.3', 'AS8.4']\n", "df_TCR_selected = TRBV9_TCR[TRBV9_TCR['name'].isin(TCR_list)].reset_index(drop=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict TCR recognition of the peptides using 10 aggregated THEmap models" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/local_home/cao/miniconda3/envs/general/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n", " WeightNorm.apply(module, name, dim)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/local_home/cao/miniconda3/envs/general/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n", "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:11<00:00, 3.39it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:14<00:00, 3.24it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:18<00:00, 3.08it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:21<00:00, 2.95it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 4\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:24<00:00, 2.87it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 5\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:29<00:00, 2.71it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 6\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:06<00:00, 3.62it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 7\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:09<00:00, 3.50it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 8\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:10<00:00, 3.45it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model seed 9\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:themap.model:Loading alpha_dict.pt\n", "INFO:themap.model:No new alpha sequences found\n", "INFO:themap.model:Loading beta_dict.pt\n", "INFO:themap.model:No new beta sequences found\n", "INFO:themap.model:Predicting...\n", "100%|██████████| 242/242 [01:03<00:00, 3.78it/s]\n" ] } ], "source": [ "all_preds = []\n", "trimap_model = TCRbind().to(device)\n", "trimap_model.eval()\n", "\n", "for seed in range(10):\n", " print(f'Loading model seed {seed}')\n", " trimap_model.load_state_dict(torch.load(f'model/TriMap_AS_use_all_synthetic_and_natural_peptides_{seed}.pt'))\n", "\n", " df_all = pd.DataFrame(np.repeat(df_TCR_selected.values, len(all_peptides), axis=0), columns=df_TCR_selected.columns)\n", " df_all['Epitope'] = all_peptides * len(df_TCR_selected)\n", "\n", " result, _, _ = trimap_model.test_model(df_test=df_all, device=device)\n", " all_preds.append(result)\n", "\n", "df_all['pred'] = np.mean(all_preds, axis=0)\n", "df_all.to_csv(f'{n}_predict.csv', sep='\\t', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Show predictions" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameEpitopepred
0AS3.1GQYLITWIF0.484653
1AS3.1SRWNDYKIV0.218381
2AS3.1HRKVLANLK0.089420
3AS3.1RKVLANLKK0.175217
4AS3.1TRIAKYFMM0.430186
............
61825AS8.4QRMRWLDGI0.055362
61826AS8.4RLSDFTFTF0.240501
61827AS8.4ARLAGPIFS0.087526
61828AS8.4IRVFKAGVL0.179825
61829AS8.4FKAGVLLEY0.196660
\n", "

61830 rows × 3 columns

\n", "
" ], "text/plain": [ " name Epitope pred\n", "0 AS3.1 GQYLITWIF 0.484653\n", "1 AS3.1 SRWNDYKIV 0.218381\n", "2 AS3.1 HRKVLANLK 0.089420\n", "3 AS3.1 RKVLANLKK 0.175217\n", "4 AS3.1 TRIAKYFMM 0.430186\n", "... ... ... ...\n", "61825 AS8.4 QRMRWLDGI 0.055362\n", "61826 AS8.4 RLSDFTFTF 0.240501\n", "61827 AS8.4 ARLAGPIFS 0.087526\n", "61828 AS8.4 IRVFKAGVL 0.179825\n", "61829 AS8.4 FKAGVLLEY 0.196660\n", "\n", "[61830 rows x 3 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_all[['name', 'Epitope', 'pred']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top 5 peptides for each TCR" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameEpitopepredProtein_ID
0AS3.1ARVMALMPF0.923957[RJX1181_1545 30S]
1AS3.1GRIVVLLVP0.770734[RJX1181_1038 hypothetical]
2AS3.1ARVLLITPF0.769916[RJX1181_0828 Ribosome-recycling]
3AS3.1SRVMFPGWY0.762865[RJX1181_1310 Phosphoenolpyruvate]
4AS3.1GRMAIMIYY0.751273[RJX1181_0774 hypothetical]
5AS4.1ARVMALMPF0.879250[RJX1181_1545 30S]
6AS4.1GRCWMFAAL0.701453[RJX1181_1851 Aminopeptidase]
7AS4.1WRLSTLVPF0.700965[RJX1181_1725 hypothetical]
8AS4.1WRKVVASPK0.666782[RJX1181_0410 Carbamate]
9AS4.1NQWWWPESK0.657591[RJX1181_0878 hypothetical]
10AS4.2ARVMALMPF0.971340[RJX1181_1545 30S]
11AS4.2GRCWMFAAL0.885641[RJX1181_1851 Aminopeptidase]
12AS4.2GRIVVLLVP0.882411[RJX1181_1038 hypothetical]
13AS4.2GRMAIMIYY0.849540[RJX1181_0774 hypothetical]
14AS4.2SRMAMWIIL0.846020[RJX1181_0432 hypothetical]
15AS4.3GRMVPLNTK0.950566[RJX1181_0267 Bifunctional]
16AS4.3GRALMSTPK0.939676[RJX1181_0806 High-affinity]
17AS4.3ARVLATSPK0.936035[RJX1181_0855 Phosphate]
18AS4.3TRAPNPMIV0.927110[RJX1181_0099 hypothetical]
19AS4.3NRVMVSPDL0.923938[RJX1181_1492 hypothetical]
20AS8.4ARVMALMPF0.927952[RJX1181_1545 30S]
21AS8.4ARVLLITPF0.669127[RJX1181_0828 Ribosome-recycling]
22AS8.4GRCWMFAAL0.664344[RJX1181_1851 Aminopeptidase]
23AS8.4SRVMFPGWY0.653470[RJX1181_1310 Phosphoenolpyruvate]
24AS8.4GRMAIMIYY0.628679[RJX1181_0774 hypothetical]
\n", "
" ], "text/plain": [ " name Epitope pred Protein_ID\n", "0 AS3.1 ARVMALMPF 0.923957 [RJX1181_1545 30S]\n", "1 AS3.1 GRIVVLLVP 0.770734 [RJX1181_1038 hypothetical]\n", "2 AS3.1 ARVLLITPF 0.769916 [RJX1181_0828 Ribosome-recycling]\n", "3 AS3.1 SRVMFPGWY 0.762865 [RJX1181_1310 Phosphoenolpyruvate]\n", "4 AS3.1 GRMAIMIYY 0.751273 [RJX1181_0774 hypothetical]\n", "5 AS4.1 ARVMALMPF 0.879250 [RJX1181_1545 30S]\n", "6 AS4.1 GRCWMFAAL 0.701453 [RJX1181_1851 Aminopeptidase]\n", "7 AS4.1 WRLSTLVPF 0.700965 [RJX1181_1725 hypothetical]\n", "8 AS4.1 WRKVVASPK 0.666782 [RJX1181_0410 Carbamate]\n", "9 AS4.1 NQWWWPESK 0.657591 [RJX1181_0878 hypothetical]\n", "10 AS4.2 ARVMALMPF 0.971340 [RJX1181_1545 30S]\n", "11 AS4.2 GRCWMFAAL 0.885641 [RJX1181_1851 Aminopeptidase]\n", "12 AS4.2 GRIVVLLVP 0.882411 [RJX1181_1038 hypothetical]\n", "13 AS4.2 GRMAIMIYY 0.849540 [RJX1181_0774 hypothetical]\n", "14 AS4.2 SRMAMWIIL 0.846020 [RJX1181_0432 hypothetical]\n", "15 AS4.3 GRMVPLNTK 0.950566 [RJX1181_0267 Bifunctional]\n", "16 AS4.3 GRALMSTPK 0.939676 [RJX1181_0806 High-affinity]\n", "17 AS4.3 ARVLATSPK 0.936035 [RJX1181_0855 Phosphate]\n", "18 AS4.3 TRAPNPMIV 0.927110 [RJX1181_0099 hypothetical]\n", "19 AS4.3 NRVMVSPDL 0.923938 [RJX1181_1492 hypothetical]\n", "20 AS8.4 ARVMALMPF 0.927952 [RJX1181_1545 30S]\n", "21 AS8.4 ARVLLITPF 0.669127 [RJX1181_0828 Ribosome-recycling]\n", "22 AS8.4 GRCWMFAAL 0.664344 [RJX1181_1851 Aminopeptidase]\n", "23 AS8.4 SRVMFPGWY 0.653470 [RJX1181_1310 Phosphoenolpyruvate]\n", "24 AS8.4 GRMAIMIYY 0.628679 [RJX1181_0774 hypothetical]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top5_df = df_all.sort_values(['name', 'pred'], ascending=[True, False]) \\\n", " .groupby('name').head(5).reset_index(drop=True)\n", "# find proteins for top 5 epitopes\n", "top5_df['Protein_ID'] = top5_df['Epitope'].apply(lambda x: protein_dict[x] if x in protein_dict else x)\n", "top5_df[['name', 'Epitope', 'pred', 'Protein_ID']]" ] } ], "metadata": { "kernelspec": { "display_name": "general", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 2 }