First commit
- added classification-based mapping for esAppMod data

This commit is contained in: commit a1d000d9c8

@@ -0,0 +1 @@
__pycache__
@@ -0,0 +1,80 @@
# %%
import json
import pandas as pd
from utils import Retriever, cosine_similarity_chunked
from sklearn.metrics.pairwise import cosine_similarity

##########################################
# %%

# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Initialize an empty list to store the rows
rows = []

# %%
# Loop through all entities in the JSON
for entity in data["data"].items():
    entity_data = entity[1]
    entity_id = entity_data['entity_id']
    entity_name = entity_data['entity_name']

    # Add each entity id and name to the rows list
    rows.append({"id": entity_id, "name": entity_name})

# Create a DataFrame from the rows
df = pd.DataFrame(rows)


# %%
# df.to_csv('entity.csv', index=False)


# %%
# we want to automatically identify clusters
class Embedder():
    input_df: pd.DataFrame
    fold: int

    def __init__(self, input_df):
        self.input_df = input_df


    def make_embedding(self, checkpoint_path):

        def generate_input_list(df):
            input_list = []
            for _, row in df.iterrows():
                desc = row['name']
                input_list.append(desc)
            return input_list

        # prepare reference embeddings
        train_data = list(generate_input_list(self.input_df))
        retriever_train = Retriever(train_data, checkpoint_path)
        retriever_train.make_embedding(batch_size=64)
        return retriever_train.embeddings.to('cpu')

# model_checkpoint = 'google-bert/bert-base-cased'
model_checkpoint = '../train/class_bert_simple/checkpoint/checkpoint-4500'
embedder = Embedder(input_df=df)
embeddings = embedder.make_embedding(model_checkpoint)

# %%
similarity_matrix = cosine_similarity(embeddings)

# %%
similarity_matrix.shape

# %%
from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(metric='precomputed', linkage='average')
clustering.fit(1 - similarity_matrix)  # Use distance = 1 - similarity

print(clustering.labels_)  # Cluster assignments
# %%
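A note on the clustering cell above: AgglomerativeClustering defaults to n_clusters=2, so the fit always yields exactly two clusters. Since the stated goal is to identify clusters automatically, the following is a minimal sketch (not part of this commit; the threshold value is an assumption that would need tuning) that derives the number of clusters from a distance threshold instead:

# sketch: let a distance threshold determine the number of clusters
from sklearn.cluster import AgglomerativeClustering

distance_matrix = 1 - similarity_matrix
clustering = AgglomerativeClustering(
    metric='precomputed',
    linkage='average',
    n_clusters=None,
    distance_threshold=0.2,  # assumed value; tune against the embedding space
)
labels = clustering.fit_predict(distance_matrix)
print(f"{len(set(labels))} clusters found")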
@@ -0,0 +1,17 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt

# %%
# import training file
data_path = '../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)


# %%
id_counts = train_df['entity_id'].value_counts()

# %%

plt.hist(id_counts, bins=50)
# %%
@@ -0,0 +1,95 @@
|
|||
# %%
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
##########################################
|
||||
# %%
|
||||
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/tca_entities.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# %%
|
||||
# Loop through all entities in the JSON
|
||||
for entity in data["data"].items():
|
||||
entity_data = entity[1]
|
||||
entity_id = entity_data['entity_id']
|
||||
entity_name = entity_data['entity_name']
|
||||
entity_type_id = entity_data['entity_type_id']
|
||||
entity_type_name = entity_data['entity_type_name']
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
rows.append(
|
||||
{
|
||||
'id': entity_id,
|
||||
'name': entity_name,
|
||||
'type_id': entity_type_id,
|
||||
'type_name': entity_type_name
|
||||
})
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
df = pd.DataFrame(rows)
|
||||
|
||||
# %%
|
||||
# df.to_csv('entity.csv', index=False)
|
||||
df
|
||||
|
||||
# %%
|
||||
df['type_name'].value_counts()
|
||||
# %%
|
||||
df['type_id'].value_counts()
|
||||
|
||||
# %%
|
||||
name_list = df['name'].to_list()
|
||||
# %%
|
||||
name_list
|
||||
|
||||
# %%
|
||||
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
||||
import numpy as np
|
||||
|
||||
# %%
|
||||
# Define labels
|
||||
labels = name_list
|
||||
|
||||
# Create a prefix-based distance matrix
def prefix_distance(label1, label2):
    prefix1 = label1.split()
    prefix2 = label2.split()
    # Find common prefix length
    common_prefix_length = len([w1 for w1, w2 in zip(prefix1, prefix2) if w1 == w2])
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)

# Create a pairwise distance matrix
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        distance_matrix[i, j] = prefix_distance(labels[i], labels[j])

# Perform hierarchical clustering
# linkage expects a condensed distance matrix, so convert the square matrix first
# (checks=False ignores the nonzero diagonal produced by prefix_distance(x, x))
from scipy.spatial.distance import squareform
linkage_matrix = linkage(squareform(distance_matrix, checks=False), method='average')

# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()

# %%
linkage_matrix
# %%
# Extract flat clusters with a distance threshold
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# Display clusters
for i, cluster_id in enumerate(clusters):
    print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")

# %%
|
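As a small follow-up sketch (not in this commit), the flat cluster assignments printed above can be grouped into a dictionary so that each prefix-based cluster is easy to inspect as a list of names:

# sketch: group entity names by their flat cluster id
from collections import defaultdict

groups = defaultdict(list)
for label, cluster_id in zip(labels, clusters):
    groups[cluster_id].append(label)
for cluster_id, members in sorted(groups.items()):
    print(cluster_id, members)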
|
@@ -0,0 +1,71 @@
|
|||
# %%
|
||||
import pandas as pd
|
||||
|
||||
# %%
|
||||
# import training file
|
||||
data_path = '../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# import test file
|
||||
data_path = '../data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
# import entity file
|
||||
data_path = '../data_import/entity.csv'
|
||||
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
id2label = {}
|
||||
for _, row in entity_df.iterrows():
|
||||
id2label[row['id']] = row['name']
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
data_path = '../train/class_bert_process/classification_prediction/exports/result.csv'
|
||||
prediction_df = pd.read_csv(data_path)
|
||||
|
||||
# %%
|
||||
predicted_entity_list = []
|
||||
for element in prediction_df['class_prediction']:
|
||||
predicted_entity_list.append(id2label[element])
|
||||
|
||||
prediction_df['predicted_name'] = predicted_entity_list
|
||||
# %%
|
||||
new_df = pd.concat((test_df, prediction_df ), axis=1)
|
||||
|
||||
# %%
|
||||
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
|
||||
mismatch_df = new_df[mismatch_mask]
|
||||
|
||||
|
||||
# %%
|
||||
# print the top 10 offending classes
|
||||
print(mismatch_df['entity_id'].value_counts()[:10])
|
||||
|
||||
# %%
|
||||
# Convert the whole dataframe to a markdown string and print it
print(mismatch_df.to_markdown())
|
||||
|
||||
|
||||
# %%
|
||||
# let us see the test mentions
|
||||
select_value = 434
|
||||
select_mask = mismatch_df['entity_id'] == select_value
|
||||
mismatch_df[select_mask]
|
||||
|
||||
# %%
|
||||
# let us see the train mentions
|
||||
select_value = 434
|
||||
select_mask = train_df['entity_id'] == select_value
|
||||
train_df[select_mask]
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
mismatch_df[select_mask]['class_prediction'].to_list()
|
||||
|
||||
# %%
|
||||
# %%
|
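Beyond the raw mismatch counts above, a small follow-up sketch (not part of this commit) normalises the per-class mismatches by the number of test mentions for each class, so frequently-mentioned classes do not dominate the ranking:

# sketch: per-class error rate on the test set
mismatch_counts = mismatch_df['entity_id'].value_counts()
total_counts = new_df['entity_id'].value_counts()
error_rate = (mismatch_counts / total_counts).fillna(0).sort_values(ascending=False)
print(error_rate.head(10))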
|
@@ -0,0 +1,81 @@
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import torch.nn.functional as F


class Retriever:
    def __init__(self, input_texts, model_checkpoint):
        # we need to generate the embedding from a list of input strings
        self.embeddings = []
        self.inputs = input_texts
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)

        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # device = "cpu"
        model.to(self.device)
        self.model = model.eval()


    def make_embedding(self, batch_size=64):
        all_embeddings = self.embeddings
        input_texts = self.inputs

        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i+batch_size]
            # Tokenize the input text
            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
            input_ids = inputs.input_ids.to(self.device)
            attention_mask = inputs.attention_mask.to(self.device)

            # Pass the input through the encoder and retrieve the embeddings
            with torch.no_grad():
                encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
                # get the last hidden layer
                embeddings = encoder_outputs.hidden_states[-1]
                # get the [CLS] token embedding
                cls_embeddings = embeddings[:, 0, :]  # Shape: (batch_size, hidden_size)
                all_embeddings.append(cls_embeddings)

        # concatenate the per-batch tensors into one large tensor; dim=0 stacks row-wise
        all_embeddings = torch.cat(all_embeddings, dim=0)

        self.embeddings = all_embeddings


def cosine_similarity_chunked(batch1, batch2, chunk_size=1024):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    batch1_size = batch1.size(0)
    batch2_size = batch2.size(0)
    # .to() is not in-place, so the result must be reassigned
    batch2 = batch2.to(device)

    # Prepare an empty tensor to store results
    cos_sim = torch.empty(batch1_size, batch2_size, device=device)

    # Process batch1 in chunks
    for i in range(0, batch1_size, chunk_size):
        batch1_chunk = batch1[i:i + chunk_size].to(device)  # Get chunk of batch1 and move it to the device

        # Expand batch1 chunk and entire batch2 for comparison
        # batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, seq_len)
        # batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, seq_len)
        batch2_norms = batch2.norm(dim=1, keepdim=True)

        # Compute cosine similarity for the chunk and store it in the final tensor
        # cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)

        # Compute cosine similarity by matrix multiplication and normalizing
        sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8)

        # Store the results in the appropriate part of the final tensor
        cos_sim[i:i + chunk_size] = sim_chunk

    return cos_sim
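For orientation, a minimal usage sketch of the helpers above (the checkpoint path and the example strings are placeholders, not values from this commit):

# sketch: embed two string lists and match each mention to its closest entity
from utils import Retriever, cosine_similarity_chunked

mentions = ["websphere application server", "was 8.5", "db2 v11"]   # placeholder strings
entities = ["WebSphere Application Server", "IBM DB2"]              # placeholder strings

mention_retriever = Retriever(mentions, "<checkpoint_path>")        # placeholder checkpoint
mention_retriever.make_embedding(batch_size=64)
entity_retriever = Retriever(entities, "<checkpoint_path>")
entity_retriever.make_embedding(batch_size=64)

# rows: mentions, columns: entities; pick the best-matching entity per mention
sims = cosine_similarity_chunked(mention_retriever.embeddings, entity_retriever.embeddings)
best_match = sims.argmax(dim=1)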
@@ -0,0 +1 @@
*.csv
@@ -0,0 +1,41 @@
|
|||
# %%
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
##########################################
|
||||
# %%
|
||||
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/tca_entities.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# %%
|
||||
# Loop through all entities in the JSON
|
||||
for entity in data["data"].items():
|
||||
entity_data = entity[1]
|
||||
entity_id = entity_data['entity_id']
|
||||
entity_name = entity_data['entity_name']
|
||||
entity_type_id = entity_data['entity_type_id']
|
||||
entity_type_name = entity_data['entity_type_name']
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
rows.append(
|
||||
{
|
||||
'id': entity_id,
|
||||
'name': entity_name,
|
||||
'type_id': entity_type_id,
|
||||
'type_name': entity_type_name
|
||||
})
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
df = pd.DataFrame(rows)
|
||||
|
||||
# %%
|
||||
df.to_csv('entity.csv', index=False)
|
||||
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,85 @@
|
|||
# %%
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
##########################################
|
||||
# %%
|
||||
# import entity information
|
||||
|
||||
# %%
|
||||
data_path = 'entity.csv'
|
||||
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
id2label = {}
|
||||
for _, row in entity_df.iterrows():
|
||||
id2label[row['id']] = row['name']
|
||||
|
||||
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/train.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# Loop through all entities in the JSON
|
||||
for entity_key, entity_data in data["data"].items():
|
||||
mentions = entity_data["mentions"]
|
||||
entity_id = entity_data["entity_id"]
|
||||
entity_name = id2label[entity_id]
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
for mention in mentions:
|
||||
rows.append(
|
||||
{
|
||||
"mention": mention,
|
||||
"entity_id": entity_id,
|
||||
"entity_name": entity_name
|
||||
})
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
train_df = pd.DataFrame(rows)
|
||||
|
||||
train_class_set = set(train_df['entity_id'].to_list())
|
||||
|
||||
# %%
|
||||
train_df.to_csv('train.csv', index=False)
|
||||
##########################################
|
||||
# %%
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/infer.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# Loop through all entities in the JSON
|
||||
for entity_key, entity_data in data["data"].items():
|
||||
mention = entity_data["mention"]
|
||||
entity_id = entity_data["entity_id"]
|
||||
entity_name = id2label[entity_id]
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
rows.append(
|
||||
{
|
||||
"mention": mention,
|
||||
"entity_id": entity_id,
|
||||
"entity_name": entity_name
|
||||
})
|
||||
|
||||
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
test_df = pd.DataFrame(rows)
|
||||
|
||||
test_class_set = (set(test_df['entity_id'].to_list()))
|
||||
|
||||
# %%
|
||||
test_df.to_csv('test.csv', index=False)
|
||||
|
||||
# %%
|
||||
# an empty set here shows that every test class is also present in the train set
test_class_set - train_class_set
|
||||
|
||||
# %%
|
(4 file diffs suppressed because they are too large)
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,6 @@

********************************************************************************
Accuracy: 0.79090
F1 Score: 0.80996
Precision: 0.88827
Recall: 0.79090
@@ -0,0 +1,262 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
data_path = '../../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all uppercase
|
||||
text = text.upper()
|
||||
|
||||
# 2. Remove punctuations
|
||||
# text = re.sub(r'[^\w\s]', '', text) # Retains only alphanumeric and spaces
|
||||
|
||||
# 3. Substitute digits with '#'
|
||||
text = re.sub(r'\d', '#', text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# test (evaluation split)
|
||||
data_path = '../../../data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# combined_data = DatasetDict({
|
||||
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
# })
|
||||
return Dataset.from_list(process_df_to_dict(test_df))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# we can save the classification prediction output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
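confusion_matrix is imported above but never used; the following sketch (not part of this commit, and assuming the y_true, y_pred, id2label and target_id_list variables from the evaluation above are in scope) shows one way it could surface the most frequently confused class pairs:

# sketch: list the largest off-diagonal confusion-matrix entries
import numpy as np
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred, labels=list(range(len(target_id_list))))
off_diag = cm.copy()
np.fill_diagonal(off_diag, 0)  # ignore correct predictions
for flat_idx in np.argsort(off_diag, axis=None)[::-1][:10]:
    true_idx, pred_idx = np.unravel_index(flat_idx, cm.shape)
    print(id2label[true_idx], '->', id2label[pred_idx], off_diag[true_idx, pred_idx])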
|
@@ -0,0 +1,283 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=5
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all uppercase
|
||||
text = text.upper()
|
||||
|
||||
# 2. Remove punctuations
|
||||
# text = re.sub(r'[^\w\s]', '', text) # Retains only alphanumeric and spaces
|
||||
|
||||
# 3. Substitute digits with '#'
|
||||
text = re.sub(r'\d', '#', text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Return the input text together with n randomly shuffled word-order variants.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
processed_descs = shuffle_text(desc, n_shuffles=SHUFFLES)
|
||||
|
||||
for desc in processed_descs:
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
model_checkpoint = 'google-bert/bert-base-cased'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<DESC>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-4,
|
||||
per_device_train_batch_size=128,
|
||||
per_device_eval_batch_size=128,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=120,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
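To illustrate the augmentation above: every training mention is preprocessed and expanded into 1 + SHUFFLES word-order variants that all keep the same label. A quick sketch (not in this commit; the mention string is a made-up example):

# sketch: inspect the shuffled variants generated for one hypothetical mention
example = preprocess_text("Websphere Application Server 8.5")
for variant in shuffle_text(example, n_shuffles=SHUFFLES):
    print(variant)
# prints 'WEBSPHERE APPLICATION SERVER #.#' followed by 5 random word orderings of it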
|
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,6 @@

********************************************************************************
Accuracy: 0.70070
F1 Score: 0.73260
Precision: 0.84815
Recall: 0.70070
@@ -0,0 +1,246 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
data_path = '../../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : f"{desc}",
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# test (evaluation split)
|
||||
data_path = '../../../data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# combined_data = DatasetDict({
|
||||
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
# })
|
||||
return Dataset.from_list(process_df_to_dict(test_df))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# we can save the classification prediction output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@@ -0,0 +1,200 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : f"{desc}",
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
model_checkpoint = 'google-bert/bert-base-cased'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<DESC>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-4,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=250,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1,2 @@
__pycache__
exports/
@@ -0,0 +1,150 @@
|
|||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import (
|
||||
T5TokenizerFast,
|
||||
AutoModelForSeq2SeqLM,
|
||||
)
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
from datasets import Dataset
|
||||
import numpy as np
|
||||
|
||||
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
||||
|
||||
|
||||
class Inference():
|
||||
tokenizer: T5TokenizerFast
|
||||
model: torch.nn.Module
|
||||
dataloader: DataLoader
|
||||
|
||||
def __init__(self, checkpoint_path):
|
||||
self._create_tokenizer()
|
||||
self._load_model(checkpoint_path)
|
||||
|
||||
|
||||
def _create_tokenizer(self):
|
||||
# %%
|
||||
# load tokenizer
|
||||
self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
def _load_model(self, checkpoint_path: str):
|
||||
# load model
|
||||
# Define the directory and the pattern
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
|
||||
model = torch.compile(model)
|
||||
# set model to eval
|
||||
self.model = model.eval()
|
||||
|
||||
|
||||
|
||||
|
||||
def prepare_dataloader(self, input_df, batch_size, max_length):
|
||||
"""
|
||||
*arguments*
|
||||
- input_df: input dataframe containing fields 'mention' and 'entity_name'
|
||||
- batch_size: the batch size of dataloader output
|
||||
- max_length: length of tokenizer output
|
||||
"""
|
||||
print("preparing dataloader")
|
||||
# convert each dataframe row into a dictionary
|
||||
# outputs a list of dictionaries
|
||||
|
||||
def _process_df(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
}
|
||||
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
def _preprocess_function(example):
|
||||
input = example['input']
|
||||
target = example['output']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = self.tokenizer(
|
||||
input,
|
||||
text_target=target,
|
||||
max_length=max_length,
|
||||
return_tensors="pt",
|
||||
padding='max_length',
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
test_dataset = Dataset.from_list(_process_df(input_df))
|
||||
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
_preprocess_function,
|
||||
batched=True,
|
||||
num_proc=1,
|
||||
remove_columns=test_dataset.column_names,
|
||||
)
|
||||
# datasets = _preprocess_function(test_dataset)
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
# create dataloader
|
||||
self.dataloader = DataLoader(datasets, batch_size=batch_size)
|
||||
|
||||
|
||||
def generate(self):
|
||||
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
|
||||
MAX_GENERATE_LENGTH = 128
|
||||
|
||||
pred_generations = []
|
||||
pred_labels = []
|
||||
|
||||
print("start generation")
|
||||
for batch in tqdm(self.dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
pred_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
self.model.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
outputs = self.model.generate(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_length=MAX_GENERATE_LENGTH)
|
||||
|
||||
# Decode the output and print the results
|
||||
pred_generations.extend(outputs.to("cpu"))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
def process_tensor_output(tokens):
|
||||
predictions = self.tokenizer.decode(tokens, skip_special_tokens=True)
|
||||
return predictions
|
||||
|
||||
# decode prediction labels
|
||||
def decode_preds(tokens_list):
|
||||
prediction_list = []
|
||||
for tokens in tokens_list:
|
||||
predicted_seq = process_tensor_output(tokens)
|
||||
prediction_list.append(predicted_seq)
|
||||
return prediction_list
|
||||
|
||||
prediction_list = decode_preds(pred_generations)
|
||||
return prediction_list
|
||||
|
|
@@ -0,0 +1,2 @@

Accuracy for fold: 0.5846658466584665
@@ -0,0 +1,62 @@
|
|||
|
||||
import pandas as pd
|
||||
import os
|
||||
import glob
|
||||
from inference import Inference
|
||||
|
||||
checkpoint_directory = '../'
|
||||
|
||||
BATCH_SIZE = 512
|
||||
|
||||
def infer():
|
||||
print(f"Inference for data")
|
||||
# import test data
|
||||
data_path = '../../../data_import/test.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
##########################################
|
||||
# run inference
|
||||
# checkpoint
|
||||
# Use glob to find matching paths
|
||||
directory = os.path.join(checkpoint_directory, f'checkpoint')
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
|
||||
|
||||
|
||||
infer = Inference(checkpoint_path)
|
||||
infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
|
||||
prediction_list = infer.generate()
|
||||
|
||||
# add labels too
|
||||
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
|
||||
# Convert the list to a Pandas DataFrame
|
||||
df_out = pd.DataFrame({
|
||||
'predictions': prediction_list
|
||||
})
|
||||
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
|
||||
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
|
||||
df = pd.concat([df, df_out], axis=1)
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
# evaluate exact-match accuracy of the generated entity names
|
||||
condition_correct = df['predictions'] == df['entity_name']
|
||||
pred_correct_proportion = sum(condition_correct)/len(df)
|
||||
|
||||
# write output to file output.txt
|
||||
with open("output.txt", "a") as f:
|
||||
print(f'Accuracy for fold: {pred_correct_proportion}', file=f)
|
||||
|
||||
###########################################
|
||||
# execute
|
||||
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
|
||||
infer()
|
|
@@ -0,0 +1,190 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
T5TokenizerFast,
|
||||
AutoModelForSeq2SeqLM,
|
||||
DataCollatorForSeq2Seq,
|
||||
Seq2SeqTrainer,
|
||||
EarlyStoppingCallback,
|
||||
Seq2SeqTrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
# outputs a list of dictionaries
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = f"../../data_import/train.csv"
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# function to perform training for a given fold
|
||||
def train():
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "t5-small"
|
||||
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['input']
|
||||
target = example['output']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
text_target=target,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns=split_datasets["train"].column_names,
|
||||
)
|
||||
|
||||
# https://github.com/huggingface/transformers/pull/28414
|
||||
# model_checkpoint = "google/t5-efficient-tiny"
|
||||
# device_map set to auto to force it to load contiguous weights
|
||||
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
|
||||
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
|
||||
metric = evaluate.load("sacrebleu")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
# In case the model returns more than the prediction logits
|
||||
if isinstance(preds, tuple):
|
||||
preds = preds[0]
|
||||
|
||||
decoded_preds = tokenizer.batch_decode(preds,
|
||||
skip_special_tokens=False)
|
||||
|
||||
# Replace -100s in the labels as we can't decode them
|
||||
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||
decoded_labels = tokenizer.batch_decode(labels,
|
||||
skip_special_tokens=False)
|
||||
|
||||
# Remove <PAD> tokens from decoded predictions and labels
|
||||
decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
|
||||
decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
|
||||
|
||||
# Some simple post-processing
|
||||
# decoded_preds = [pred.strip() for pred in decoded_preds]
|
||||
# decoded_labels = [[label.strip()] for label in decoded_labels]
|
||||
# print(decoded_preds, decoded_labels)
|
||||
|
||||
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
|
||||
return {"bleu": result["score"]}
|
||||
|
||||
|
||||
# Generation Config
|
||||
# from transformers import GenerationConfig
|
||||
gen_config = model.generation_config
|
||||
gen_config.max_length = 64
|
||||
|
||||
# compile
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# Trainer
|
||||
|
||||
args = Seq2SeqTrainingArguments(
|
||||
f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-3,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=80,
|
||||
predict_with_generate=True,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
generation_config=gen_config,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Seq2SeqTrainer(
|
||||
model,
|
||||
args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
data_collator=data_collator,
|
||||
tokenizer=tokenizer,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|