Step 3. Apply TriMap to predict novel epitopes from gut bacterial proteomes¶

At the last step, we will use 10 pretrained THEmap models with different random initialization to predict novel epitopes from gut bacterial proteomes.

The model parameters can be downloaded from here.

Load required libraries¶

import gzip
from Bio import SeqIO
import torch
from trimap.model import TCRbind
import pandas as pd
from themap import utils
import torch
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# for each microbe, predict the epitopes using the TCRs
name = ['RJX1181','RJX1596','RJX1754','RJX1996','RJX1119','RJX1588','GCA_029961225','GCF_902362795','GCF_902387715','GCF_902858935','GCA_009731575','GCA_009738105','GCA_020735445','GCA_000144405','GCA_000205025','GCA_001412635']

Scan the proteome files in order to find the protein name for each peptide¶

Bacterial proteomes can be downloaded from here.

n = name[0]  # Change this to process a different microbe
print('Processing {}'.format(n))
file_path = 'bacterial/{}.faa.gz'.format(n)
all_seqs = []
protein_seqs = []
protein_dict = {}

def scan_strings(input_list, protein_seqs, protein_dict):
    all_peptides = []
    for item, protein in zip(input_list, protein_seqs):
        for i in range(0, len(item) - 8, 1):
            new_str = item[i:i+9]
            peptide_seq = new_str
            all_peptides.append(protein)
            if peptide_seq not in protein_dict:
                protein_dict[peptide_seq] = [protein]
            else:
                protein_dict[peptide_seq].append(protein)
    print('Number of unique peptides found: {}'.format(len(protein_dict)))
                    
with gzip.open(file_path, "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        protein_id = record.description
        sequence = str(record.seq)
        all_seqs.append(sequence)
        protein_seqs.append(protein_id)
scan_strings(all_seqs, protein_seqs, protein_dict)

Processing RJX1181
Number of unique peptides found: 591463

Find HLA-B27 high-affinity peptides according to NetMHCpan outputs¶

The NetMHCpan outputs can be downloaded from here.

df = pd.read_csv('NetMHCpan_output/{}.xls'.format(n), sep='\t', header=1)
df = df[df['NB']==1]
df = df[df['EL_Rank']<5]
df = df[df['BA_Rank']<5]
df.reset_index(drop=True, inplace=True)
all_peptides = df['Peptide'].values.tolist()

df['Protein_ID'] = df['Peptide'].apply(lambda x: protein_dict[x] if x in protein_dict else x)

Load interested TCR sequences (TRBV9 TCR for AS)¶

Download the TRBV9 TCR sequences from here.

TRBV9_TCR = pd.read_csv('TRBV9_TCR.csv')
TRA = utils.determine_tcr_seq_vj(TRBV9_TCR['alpha'].tolist(), TRBV9_TCR['V_alpha'].tolist(), TRBV9_TCR['J_alpha'].tolist(), chain='A')
TRB = utils.determine_tcr_seq_vj(TRBV9_TCR['beta'].tolist(), TRBV9_TCR['V_beta'].tolist(), TRBV9_TCR['J_beta'].tolist(), chain='B')
TRBV9_TCR['alpha'] = TRA
TRBV9_TCR['beta'] = TRB
TCR_list = ['AS3.1', 'AS4.1', 'AS4.2', 'AS4.3', 'AS8.4']
df_TCR_selected = TRBV9_TCR[TRBV9_TCR['name'].isin(TCR_list)].reset_index(drop=True)

Predict TCR recognition of the peptides using 10 aggregated THEmap models¶

all_preds = []
trimap_model = TCRbind().to(device)
trimap_model.eval()

for seed in range(10):
    print(f'Loading model seed {seed}')
    trimap_model.load_state_dict(torch.load(f'model/TriMap_AS_use_all_synthetic_and_natural_peptides_{seed}.pt'))

    df_all = pd.DataFrame(np.repeat(df_TCR_selected.values, len(all_peptides), axis=0), columns=df_TCR_selected.columns)
    df_all['Epitope'] = all_peptides * len(df_TCR_selected)

    result, _, _ = trimap_model.test_model(df_test=df_all, device=device)
    all_preds.append(result)

df_all['pred'] = np.mean(all_preds, axis=0)
df_all.to_csv(f'{n}_predict.csv', sep='\t', index=False)

/local_home/cao/miniconda3/envs/general/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
  WeightNorm.apply(module, name, dim)

Loading model seed 0

/local_home/cao/miniconda3/envs/general/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:11<00:00,  3.39it/s]

Loading model seed 1

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:14<00:00,  3.24it/s]

Loading model seed 2

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:18<00:00,  3.08it/s]

Loading model seed 3

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:21<00:00,  2.95it/s]

Loading model seed 4

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:24<00:00,  2.87it/s]

Loading model seed 5

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:29<00:00,  2.71it/s]

Loading model seed 6

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:06<00:00,  3.62it/s]

Loading model seed 7

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:09<00:00,  3.50it/s]

Loading model seed 8

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:10<00:00,  3.45it/s]

Loading model seed 9

INFO:themap.model:Loading alpha_dict.pt
INFO:themap.model:No new alpha sequences found
INFO:themap.model:Loading beta_dict.pt
INFO:themap.model:No new beta sequences found
INFO:themap.model:Predicting...
100%|██████████| 242/242 [01:03<00:00,  3.78it/s]

Show predictions¶

df_all[['name', 'Epitope', 'pred']]

	name	Epitope	pred
0	AS3.1	GQYLITWIF	0.484653
1	AS3.1	SRWNDYKIV	0.218381
2	AS3.1	HRKVLANLK	0.089420
3	AS3.1	RKVLANLKK	0.175217
4	AS3.1	TRIAKYFMM	0.430186
...	...	...	...
61825	AS8.4	QRMRWLDGI	0.055362
61826	AS8.4	RLSDFTFTF	0.240501
61827	AS8.4	ARLAGPIFS	0.087526
61828	AS8.4	IRVFKAGVL	0.179825
61829	AS8.4	FKAGVLLEY	0.196660

61830 rows × 3 columns

Top 5 peptides for each TCR¶

top5_df = df_all.sort_values(['name', 'pred'], ascending=[True, False]) \
                .groupby('name').head(5).reset_index(drop=True)
# find proteins for top 5 epitopes
top5_df['Protein_ID'] = top5_df['Epitope'].apply(lambda x: protein_dict[x] if x in protein_dict else x)
top5_df[['name', 'Epitope', 'pred', 'Protein_ID']]

	name	Epitope	pred	Protein_ID
0	AS3.1	ARVMALMPF	0.923957	[RJX1181_1545 30S]
1	AS3.1	GRIVVLLVP	0.770734	[RJX1181_1038 hypothetical]
2	AS3.1	ARVLLITPF	0.769916	[RJX1181_0828 Ribosome-recycling]
3	AS3.1	SRVMFPGWY	0.762865	[RJX1181_1310 Phosphoenolpyruvate]
4	AS3.1	GRMAIMIYY	0.751273	[RJX1181_0774 hypothetical]
5	AS4.1	ARVMALMPF	0.879250	[RJX1181_1545 30S]
6	AS4.1	GRCWMFAAL	0.701453	[RJX1181_1851 Aminopeptidase]
7	AS4.1	WRLSTLVPF	0.700965	[RJX1181_1725 hypothetical]
8	AS4.1	WRKVVASPK	0.666782	[RJX1181_0410 Carbamate]
9	AS4.1	NQWWWPESK	0.657591	[RJX1181_0878 hypothetical]
10	AS4.2	ARVMALMPF	0.971340	[RJX1181_1545 30S]
11	AS4.2	GRCWMFAAL	0.885641	[RJX1181_1851 Aminopeptidase]
12	AS4.2	GRIVVLLVP	0.882411	[RJX1181_1038 hypothetical]
13	AS4.2	GRMAIMIYY	0.849540	[RJX1181_0774 hypothetical]
14	AS4.2	SRMAMWIIL	0.846020	[RJX1181_0432 hypothetical]
15	AS4.3	GRMVPLNTK	0.950566	[RJX1181_0267 Bifunctional]
16	AS4.3	GRALMSTPK	0.939676	[RJX1181_0806 High-affinity]
17	AS4.3	ARVLATSPK	0.936035	[RJX1181_0855 Phosphate]
18	AS4.3	TRAPNPMIV	0.927110	[RJX1181_0099 hypothetical]
19	AS4.3	NRVMVSPDL	0.923938	[RJX1181_1492 hypothetical]
20	AS8.4	ARVMALMPF	0.927952	[RJX1181_1545 30S]
21	AS8.4	ARVLLITPF	0.669127	[RJX1181_0828 Ribosome-recycling]
22	AS8.4	GRCWMFAAL	0.664344	[RJX1181_1851 Aminopeptidase]
23	AS8.4	SRVMFPGWY	0.653470	[RJX1181_1310 Phosphoenolpyruvate]
24	AS8.4	GRMAIMIYY	0.628679	[RJX1181_0774 hypothetical]