{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 3. Apply TriMap to predict novel epitopes from gut bacterial proteomes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At the last step, we will use 10 pretrained THEmap models with different random initialization to predict novel epitopes from gut bacterial proteomes.\n",
    "\n",
    "**The model parameters can be downloaded from** [here](https://drive.google.com/drive/folders/1nyxjbuJEZ4BRiNSVdlG9nkLKnsURB9TL?usp=drive_link)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load required libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "from Bio import SeqIO\n",
    "import torch\n",
    "from trimap.model import TCRbind\n",
    "import pandas as pd\n",
    "from themap import utils\n",
    "import torch\n",
    "import numpy as np\n",
    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# for each microbe, predict the epitopes using the TCRs\n",
    "name = ['RJX1181','RJX1596','RJX1754','RJX1996','RJX1119','RJX1588','GCA_029961225','GCF_902362795','GCF_902387715','GCF_902858935','GCA_009731575','GCA_009738105','GCA_020735445','GCA_000144405','GCA_000205025','GCA_001412635']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scan the proteome files in order to find the protein name for each peptide"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Bacterial proteomes can be downloaded from** [here](https://drive.google.com/drive/folders/18VGxJh_6d-OJAexfdKDrd5KaSr450OTA?usp=drive_link)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing RJX1181\n",
      "Number of unique peptides found: 591463\n"
     ]
    }
   ],
   "source": [
    "n = name[0]  # Change this to process a different microbe\n",
    "print('Processing {}'.format(n))\n",
    "file_path = 'bacterial/{}.faa.gz'.format(n)\n",
    "all_seqs = []\n",
    "protein_seqs = []\n",
    "protein_dict = {}\n",
    "\n",
    "def scan_strings(input_list, protein_seqs, protein_dict):\n",
    "    all_peptides = []\n",
    "    for item, protein in zip(input_list, protein_seqs):\n",
    "        for i in range(0, len(item) - 8, 1):\n",
    "            new_str = item[i:i+9]\n",
    "            peptide_seq = new_str\n",
    "            all_peptides.append(protein)\n",
    "            if peptide_seq not in protein_dict:\n",
    "                protein_dict[peptide_seq] = [protein]\n",
    "            else:\n",
    "                protein_dict[peptide_seq].append(protein)\n",
    "    print('Number of unique peptides found: {}'.format(len(protein_dict)))\n",
    "                    \n",
    "with gzip.open(file_path, \"rt\") as handle:\n",
    "    for record in SeqIO.parse(handle, \"fasta\"):\n",
    "        protein_id = record.description\n",
    "        sequence = str(record.seq)\n",
    "        all_seqs.append(sequence)\n",
    "        protein_seqs.append(protein_id)\n",
    "scan_strings(all_seqs, protein_seqs, protein_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Find HLA-B27 high-affinity peptides according to NetMHCpan outputs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**The NetMHCpan outputs can be downloaded from** [here](https://drive.google.com/drive/folders/1WjUtQSiI8V5mFa7ZIpy1JFAUDX61SwFE?usp=drive_link)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('NetMHCpan_output/{}.xls'.format(n), sep='\\t', header=1)\n",
    "df = df[df['NB']==1]\n",
    "df = df[df['EL_Rank']<5]\n",
    "df = df[df['BA_Rank']<5]\n",
    "df.reset_index(drop=True, inplace=True)\n",
    "all_peptides = df['Peptide'].values.tolist()\n",
    "\n",
    "df['Protein_ID'] = df['Peptide'].apply(lambda x: protein_dict[x] if x in protein_dict else x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load interested TCR sequences (TRBV9 TCR for AS)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Download the TRBV9 TCR sequences from** [here](https://drive.google.com/file/d/1xHsha2IrAUzwng-r-_DnPqcFW3Bm1k-b/view?usp=drive_link)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "TRBV9_TCR = pd.read_csv('TRBV9_TCR.csv')\n",
    "TRA = utils.determine_tcr_seq_vj(TRBV9_TCR['alpha'].tolist(), TRBV9_TCR['V_alpha'].tolist(), TRBV9_TCR['J_alpha'].tolist(), chain='A')\n",
    "TRB = utils.determine_tcr_seq_vj(TRBV9_TCR['beta'].tolist(), TRBV9_TCR['V_beta'].tolist(), TRBV9_TCR['J_beta'].tolist(), chain='B')\n",
    "TRBV9_TCR['alpha'] = TRA\n",
    "TRBV9_TCR['beta'] = TRB\n",
    "TCR_list = ['AS3.1', 'AS4.1', 'AS4.2', 'AS4.3', 'AS8.4']\n",
    "df_TCR_selected = TRBV9_TCR[TRBV9_TCR['name'].isin(TCR_list)].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict TCR recognition of the peptides using 10 aggregated THEmap models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local_home/cao/miniconda3/envs/general/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
      "  WeightNorm.apply(module, name, dim)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local_home/cao/miniconda3/envs/general/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n",
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:11<00:00,  3.39it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:14<00:00,  3.24it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:18<00:00,  3.08it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:21<00:00,  2.95it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 4\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:24<00:00,  2.87it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:29<00:00,  2.71it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 6\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:06<00:00,  3.62it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 7\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:09<00:00,  3.50it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 8\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:10<00:00,  3.45it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model seed 9\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:themap.model:Loading alpha_dict.pt\n",
      "INFO:themap.model:No new alpha sequences found\n",
      "INFO:themap.model:Loading beta_dict.pt\n",
      "INFO:themap.model:No new beta sequences found\n",
      "INFO:themap.model:Predicting...\n",
      "100%|██████████| 242/242 [01:03<00:00,  3.78it/s]\n"
     ]
    }
   ],
   "source": [
    "all_preds = []\n",
    "trimap_model = TCRbind().to(device)\n",
    "trimap_model.eval()\n",
    "\n",
    "for seed in range(10):\n",
    "    print(f'Loading model seed {seed}')\n",
    "    trimap_model.load_state_dict(torch.load(f'model/TriMap_AS_use_all_synthetic_and_natural_peptides_{seed}.pt'))\n",
    "\n",
    "    df_all = pd.DataFrame(np.repeat(df_TCR_selected.values, len(all_peptides), axis=0), columns=df_TCR_selected.columns)\n",
    "    df_all['Epitope'] = all_peptides * len(df_TCR_selected)\n",
    "\n",
    "    result, _, _ = trimap_model.test_model(df_test=df_all, device=device)\n",
    "    all_preds.append(result)\n",
    "\n",
    "df_all['pred'] = np.mean(all_preds, axis=0)\n",
    "df_all.to_csv(f'{n}_predict.csv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Show predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>Epitope</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>GQYLITWIF</td>\n",
       "      <td>0.484653</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>SRWNDYKIV</td>\n",
       "      <td>0.218381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>HRKVLANLK</td>\n",
       "      <td>0.089420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>RKVLANLKK</td>\n",
       "      <td>0.175217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>TRIAKYFMM</td>\n",
       "      <td>0.430186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61825</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>QRMRWLDGI</td>\n",
       "      <td>0.055362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61826</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>RLSDFTFTF</td>\n",
       "      <td>0.240501</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61827</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>ARLAGPIFS</td>\n",
       "      <td>0.087526</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61828</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>IRVFKAGVL</td>\n",
       "      <td>0.179825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61829</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>FKAGVLLEY</td>\n",
       "      <td>0.196660</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>61830 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        name    Epitope      pred\n",
       "0      AS3.1  GQYLITWIF  0.484653\n",
       "1      AS3.1  SRWNDYKIV  0.218381\n",
       "2      AS3.1  HRKVLANLK  0.089420\n",
       "3      AS3.1  RKVLANLKK  0.175217\n",
       "4      AS3.1  TRIAKYFMM  0.430186\n",
       "...      ...        ...       ...\n",
       "61825  AS8.4  QRMRWLDGI  0.055362\n",
       "61826  AS8.4  RLSDFTFTF  0.240501\n",
       "61827  AS8.4  ARLAGPIFS  0.087526\n",
       "61828  AS8.4  IRVFKAGVL  0.179825\n",
       "61829  AS8.4  FKAGVLLEY  0.196660\n",
       "\n",
       "[61830 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_all[['name', 'Epitope', 'pred']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top 5 peptides for each TCR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>Epitope</th>\n",
       "      <th>pred</th>\n",
       "      <th>Protein_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>ARVMALMPF</td>\n",
       "      <td>0.923957</td>\n",
       "      <td>[RJX1181_1545 30S]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>GRIVVLLVP</td>\n",
       "      <td>0.770734</td>\n",
       "      <td>[RJX1181_1038 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>ARVLLITPF</td>\n",
       "      <td>0.769916</td>\n",
       "      <td>[RJX1181_0828 Ribosome-recycling]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>SRVMFPGWY</td>\n",
       "      <td>0.762865</td>\n",
       "      <td>[RJX1181_1310 Phosphoenolpyruvate]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>AS3.1</td>\n",
       "      <td>GRMAIMIYY</td>\n",
       "      <td>0.751273</td>\n",
       "      <td>[RJX1181_0774 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>AS4.1</td>\n",
       "      <td>ARVMALMPF</td>\n",
       "      <td>0.879250</td>\n",
       "      <td>[RJX1181_1545 30S]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>AS4.1</td>\n",
       "      <td>GRCWMFAAL</td>\n",
       "      <td>0.701453</td>\n",
       "      <td>[RJX1181_1851 Aminopeptidase]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>AS4.1</td>\n",
       "      <td>WRLSTLVPF</td>\n",
       "      <td>0.700965</td>\n",
       "      <td>[RJX1181_1725 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>AS4.1</td>\n",
       "      <td>WRKVVASPK</td>\n",
       "      <td>0.666782</td>\n",
       "      <td>[RJX1181_0410 Carbamate]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>AS4.1</td>\n",
       "      <td>NQWWWPESK</td>\n",
       "      <td>0.657591</td>\n",
       "      <td>[RJX1181_0878 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>AS4.2</td>\n",
       "      <td>ARVMALMPF</td>\n",
       "      <td>0.971340</td>\n",
       "      <td>[RJX1181_1545 30S]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>AS4.2</td>\n",
       "      <td>GRCWMFAAL</td>\n",
       "      <td>0.885641</td>\n",
       "      <td>[RJX1181_1851 Aminopeptidase]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>AS4.2</td>\n",
       "      <td>GRIVVLLVP</td>\n",
       "      <td>0.882411</td>\n",
       "      <td>[RJX1181_1038 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>AS4.2</td>\n",
       "      <td>GRMAIMIYY</td>\n",
       "      <td>0.849540</td>\n",
       "      <td>[RJX1181_0774 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>AS4.2</td>\n",
       "      <td>SRMAMWIIL</td>\n",
       "      <td>0.846020</td>\n",
       "      <td>[RJX1181_0432 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>AS4.3</td>\n",
       "      <td>GRMVPLNTK</td>\n",
       "      <td>0.950566</td>\n",
       "      <td>[RJX1181_0267 Bifunctional]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>AS4.3</td>\n",
       "      <td>GRALMSTPK</td>\n",
       "      <td>0.939676</td>\n",
       "      <td>[RJX1181_0806 High-affinity]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>AS4.3</td>\n",
       "      <td>ARVLATSPK</td>\n",
       "      <td>0.936035</td>\n",
       "      <td>[RJX1181_0855 Phosphate]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>AS4.3</td>\n",
       "      <td>TRAPNPMIV</td>\n",
       "      <td>0.927110</td>\n",
       "      <td>[RJX1181_0099 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>AS4.3</td>\n",
       "      <td>NRVMVSPDL</td>\n",
       "      <td>0.923938</td>\n",
       "      <td>[RJX1181_1492 hypothetical]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>ARVMALMPF</td>\n",
       "      <td>0.927952</td>\n",
       "      <td>[RJX1181_1545 30S]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>ARVLLITPF</td>\n",
       "      <td>0.669127</td>\n",
       "      <td>[RJX1181_0828 Ribosome-recycling]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>GRCWMFAAL</td>\n",
       "      <td>0.664344</td>\n",
       "      <td>[RJX1181_1851 Aminopeptidase]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>SRVMFPGWY</td>\n",
       "      <td>0.653470</td>\n",
       "      <td>[RJX1181_1310 Phosphoenolpyruvate]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>AS8.4</td>\n",
       "      <td>GRMAIMIYY</td>\n",
       "      <td>0.628679</td>\n",
       "      <td>[RJX1181_0774 hypothetical]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name    Epitope      pred                          Protein_ID\n",
       "0   AS3.1  ARVMALMPF  0.923957                  [RJX1181_1545 30S]\n",
       "1   AS3.1  GRIVVLLVP  0.770734         [RJX1181_1038 hypothetical]\n",
       "2   AS3.1  ARVLLITPF  0.769916   [RJX1181_0828 Ribosome-recycling]\n",
       "3   AS3.1  SRVMFPGWY  0.762865  [RJX1181_1310 Phosphoenolpyruvate]\n",
       "4   AS3.1  GRMAIMIYY  0.751273         [RJX1181_0774 hypothetical]\n",
       "5   AS4.1  ARVMALMPF  0.879250                  [RJX1181_1545 30S]\n",
       "6   AS4.1  GRCWMFAAL  0.701453       [RJX1181_1851 Aminopeptidase]\n",
       "7   AS4.1  WRLSTLVPF  0.700965         [RJX1181_1725 hypothetical]\n",
       "8   AS4.1  WRKVVASPK  0.666782            [RJX1181_0410 Carbamate]\n",
       "9   AS4.1  NQWWWPESK  0.657591         [RJX1181_0878 hypothetical]\n",
       "10  AS4.2  ARVMALMPF  0.971340                  [RJX1181_1545 30S]\n",
       "11  AS4.2  GRCWMFAAL  0.885641       [RJX1181_1851 Aminopeptidase]\n",
       "12  AS4.2  GRIVVLLVP  0.882411         [RJX1181_1038 hypothetical]\n",
       "13  AS4.2  GRMAIMIYY  0.849540         [RJX1181_0774 hypothetical]\n",
       "14  AS4.2  SRMAMWIIL  0.846020         [RJX1181_0432 hypothetical]\n",
       "15  AS4.3  GRMVPLNTK  0.950566         [RJX1181_0267 Bifunctional]\n",
       "16  AS4.3  GRALMSTPK  0.939676        [RJX1181_0806 High-affinity]\n",
       "17  AS4.3  ARVLATSPK  0.936035            [RJX1181_0855 Phosphate]\n",
       "18  AS4.3  TRAPNPMIV  0.927110         [RJX1181_0099 hypothetical]\n",
       "19  AS4.3  NRVMVSPDL  0.923938         [RJX1181_1492 hypothetical]\n",
       "20  AS8.4  ARVMALMPF  0.927952                  [RJX1181_1545 30S]\n",
       "21  AS8.4  ARVLLITPF  0.669127   [RJX1181_0828 Ribosome-recycling]\n",
       "22  AS8.4  GRCWMFAAL  0.664344       [RJX1181_1851 Aminopeptidase]\n",
       "23  AS8.4  SRVMFPGWY  0.653470  [RJX1181_1310 Phosphoenolpyruvate]\n",
       "24  AS8.4  GRMAIMIYY  0.628679         [RJX1181_0774 hypothetical]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top5_df = df_all.sort_values(['name', 'pred'], ascending=[True, False]) \\\n",
    "                .groupby('name').head(5).reset_index(drop=True)\n",
    "# find proteins for top 5 epitopes\n",
    "top5_df['Protein_ID'] = top5_df['Epitope'].apply(lambda x: protein_dict[x] if x in protein_dict else x)\n",
    "top5_df[['name', 'Epitope', 'pred', 'Protein_ID']]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "general",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}