# %%
import torch
import json
import random
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModel
from loss import batch_all_triplet_loss, batch_hard_triplet_loss
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
import gc

# %%
# Load the fine-tuned Siamese encoder checkpoint
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# MODEL_NAME = 'distilbert-base-cased'
# MODEL_NAME = 'bert-base-cased'
MODEL_NAME = 'prajjwal1/bert-small'  # alternatives: 'bert-base-cased', 'distilbert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# state_dict = torch.load('./checkpoint/siamese.pt')
state_dict = torch.load('./checkpoint/siamese_simple.pt', map_location=DEVICE)

# Apply the state dictionary to the base model and switch to inference mode
model.load_state_dict(state_dict)
model.to(DEVICE)
model.eval()

# %%
# Build entity_id -> canonical name lookups for all entities and for the training split
with open('../esAppMod/tca_entities.json', 'r') as file:
    entities = json.load(file)
all_entity_id_name = {entity['entity_id']: entity['entity_name'] for _, entity in entities['data'].items()}

with open('../esAppMod/train.json', 'r') as file:
    train = json.load(file)
train_entity_id_mentions = {data['entity_id']: data['mentions'] for _, data in train['data'].items()}
train_entity_id_name = {data['entity_id']: all_entity_id_name[data['entity_id']] for _, data in train['data'].items()}

# %%
# Load test mentions and their gold entity ids
with open('../esAppMod/infer.json', 'r') as file:
    test = json.load(file)
x_test = [d['mention'] for _, d in test['data'].items()]
y_test = [d['entity_id'] for _, d in test['data'].items()]

train_entities, labels = list(train_entity_id_name.values()), list(train_entity_id_name.keys())


def batch_list(data, batch_size):
    """Yield successive batch_size-sized chunks from data."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]


# Embed the canonical training entity names ([CLS] token of the last hidden layer)
batches = batch_list(train_entities, 64)
embedding_list = []
with torch.no_grad():
    for batch in batches:
        inputs = tokenizer(batch, padding=True, return_tensors='pt')
        outputs = model(
            input_ids=inputs['input_ids'].to(DEVICE),
            attention_mask=inputs['attention_mask'].to(DEVICE)
        )
        output = outputs.last_hidden_state[:, 0, :]
        output = output.detach().cpu().numpy()
        embedding_list.append(output)
cls = np.concatenate(embedding_list)

# %%
# Free cached GPU memory before embedding the test mentions
gc.collect()
torch.cuda.empty_cache()

# %%
# Embed the test mentions with the same encoder
batches = batch_list(x_test, 64)
embedding_list = []
with torch.no_grad():
    for batch in batches:
        inputs = tokenizer(batch, padding=True, return_tensors='pt')
        outputs = model(
            input_ids=inputs['input_ids'].to(DEVICE),
            attention_mask=inputs['attention_mask'].to(DEVICE)
        )
        output = outputs.last_hidden_state[:, 0, :]
        output = output.detach().cpu().numpy()
        embedding_list.append(output)
cls_test = np.concatenate(embedding_list)

# %%
# Nearest-neighbour evaluation: top-k retrieval accuracy under cosine distance
knn = KNeighborsClassifier(n_neighbors=1, metric='cosine').fit(cls, labels)
n_neighbors = [1, 3, 5, 10]

with open("results/output.txt", "w") as f:
    for n in n_neighbors:
        distances, indices = knn.kneighbors(cls_test, n_neighbors=n)
        num = 0
        for a, b in zip(y_test, indices):
            b = [labels[i] for i in b]
            if a in b:
                num += 1
        print(f'Top-{n:<3} accuracy: {num / len(y_test)}', file=f)
        print(np.min(distances), np.max(distances), file=f)

# %%