First commit

- added classification-based mapping for esAppMod data
Richard Wong 2025-01-13 19:05:13 +09:00
commit a1d000d9c8
29 changed files with 25962 additions and 0 deletions

1
analysis/.gitignore vendored Normal file

@ -0,0 +1 @@
__pycache__

View File

@ -0,0 +1,80 @@
# %%
import json
import pandas as pd
from utils import Retriever, cosine_similarity_chunked
from sklearn.metrics.pairwise import cosine_similarity
##########################################
# %%
# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
data = json.load(file)
# Initialize an empty list to store the rows
rows = []
# %%
# Loop through all entities in the JSON
for _, entity_data in data["data"].items():
entity_id = entity_data['entity_id']
entity_name = entity_data['entity_name']
# Add each mention and its entity_id to the rows list
rows.append({"id": entity_id, "name": entity_name})
# Create a DataFrame from the rows
df = pd.DataFrame(rows)
# %%
# df.to_csv('entity.csv', index=False)
# %%
# we want to automatically identify clusters
class Embedder():
input_df: pd.DataFrame
fold: int
def __init__(self, input_df):
self.input_df = input_df
def make_embedding(self, checkpoint_path):
def generate_input_list(df):
input_list = []
for _, row in df.iterrows():
desc = row['name']
input_list.append(desc)
return input_list
# prepare reference embed
train_data = list(generate_input_list(self.input_df))
# Define the directory and the pattern
retriever_train = Retriever(train_data, checkpoint_path)
retriever_train.make_embedding(batch_size=64)
return retriever_train.embeddings.to('cpu')
# model_checkpoint = 'google-bert/bert-base-cased'
model_checkpoint = '../train/class_bert_simple/checkpoint/checkpoint-4500'
embedder = Embedder(input_df=df)
embeddings = embedder.make_embedding(model_checkpoint)
# %%
similarity_matrix = cosine_similarity(embeddings)
# %%
similarity_matrix.shape
# %%
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(metric='precomputed', linkage='average')
# note: the default n_clusters=2 forces exactly two clusters; to let the data
# decide the cluster count, pass n_clusters=None together with a distance_threshold
clustering.fit(1 - similarity_matrix)  # use distance = 1 - similarity
print(clustering.labels_)  # cluster assignments
# %%
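To make the assignments easier to read, a short follow-up cell could group the entity names by their cluster label. This is a sketch; `cluster_df` is an illustrative name and it assumes `df` and `clustering` from the cells above.
# %%
# group entity names by cluster label for quick inspection
cluster_df = df.assign(cluster=clustering.labels_)
for cluster_id, group in cluster_df.groupby('cluster'):
    print(cluster_id, group['name'].head(10).to_list())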

View File

@ -0,0 +1,17 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
# %%
# import training file
data_path = '../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# %%
id_counts = train_df['entity_id'].value_counts()
# %%
plt.hist(id_counts, bins=50)
# %%
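Beyond the histogram, a brief numeric summary of the class imbalance can be printed from the same `id_counts` series; this is a sketch and the threshold of 5 mentions is arbitrary.
# %%
# summarize how many training mentions each entity id has
print(id_counts.describe())
print('entities with fewer than 5 mentions:', (id_counts < 5).sum())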

View File

@ -0,0 +1,95 @@
# %%
import json
import pandas as pd
##########################################
# %%
# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
data = json.load(file)
# Initialize an empty list to store the rows
rows = []
# %%
# Loop through all entities in the JSON
for _, entity_data in data["data"].items():
entity_id = entity_data['entity_id']
entity_name = entity_data['entity_name']
entity_type_id = entity_data['entity_type_id']
entity_type_name = entity_data['entity_type_name']
# Add each mention and its entity_id to the rows list
rows.append(
{
'id': entity_id,
'name': entity_name,
'type_id': entity_type_id,
'type_name': entity_type_name
})
# Create a DataFrame from the rows
df = pd.DataFrame(rows)
# %%
# df.to_csv('entity.csv', index=False)
df
# %%
df['type_name'].value_counts()
# %%
df['type_id'].value_counts()
# %%
name_list = df['name'].to_list()
# %%
name_list
# %%
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import numpy as np
# %%
# Define labels
labels = name_list
# Create a prefix-based distance matrix
def prefix_distance(label1, label2):
prefix1 = label1.split()
prefix2 = label2.split()
# Find common prefix length
common_prefix_length = len([w1 for w1, w2 in zip(prefix1, prefix2) if w1 == w2])
# Distance is inversely proportional to common prefix length
return 1.0 / (common_prefix_length + 1)
# Create a pairwise distance matrix (symmetric, with a zero diagonal)
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        if i != j:
            distance_matrix[i, j] = prefix_distance(labels[i], labels[j])
# Perform hierarchical clustering
# linkage() expects a condensed distance matrix (or raw observation vectors),
# so convert the square matrix with squareform first
from scipy.spatial.distance import squareform
linkage_matrix = linkage(squareform(distance_matrix), method='average')
# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()
# %%
linkage_matrix
# %%
# Extract flat clusters with a distance threshold
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')
# Display clusters
for i, cluster_id in enumerate(clusters):
print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")
# %%
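The flat clusters are easier to read when grouped per cluster id. A minimal sketch, assuming `labels` and `clusters` from above; `cluster_members` is an illustrative name.
# %%
from collections import defaultdict
cluster_members = defaultdict(list)
for label, cluster_id in zip(labels, clusters):
    cluster_members[cluster_id].append(label)
for cluster_id, members in sorted(cluster_members.items()):
    print(cluster_id, members)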

View File

@ -0,0 +1,71 @@
# %%
import pandas as pd
# %%
# import training file
data_path = '../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# import test file
data_path = '../data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)
# import entity file
data_path = '../data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {}
for _, row in entity_df.iterrows():
id2label[row['id']] = row['name']
# %%
data_path = '../train/class_bert_process/classification_prediction/exports/result.csv'
prediction_df = pd.read_csv(data_path)
# %%
predicted_entity_list = []
for element in prediction_df['class_prediction']:
predicted_entity_list.append(id2label[element])
prediction_df['predicted_name'] = predicted_entity_list
# %%
new_df = pd.concat((test_df, prediction_df ), axis=1)
# %%
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
mismatch_df = new_df[mismatch_mask]
# %%
# print the top 10 offending classes
print(mismatch_df['entity_id'].value_counts()[:10])
# %%
# display the full mismatch dataframe as a markdown table
print(mismatch_df.to_markdown())
# %%
# let us see the test mentions
select_value = 434
select_mask = mismatch_df['entity_id'] == select_value
mismatch_df[select_mask]
# %%
# let us see the train mentions
select_value = 434
select_mask = train_df['entity_id'] == select_value
train_df[select_mask]
# %%
# select_mask above was rebuilt against train_df, so recompute the mask for mismatch_df
mismatch_df[mismatch_df['entity_id'] == select_value]['class_prediction'].to_list()
# %%
# %%
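A per-class accuracy breakdown is a natural companion to the mismatch counts above. A sketch, assuming `new_df` from this notebook; `per_class_accuracy` is an illustrative name.
# %%
# accuracy per entity class, worst classes first
per_class_accuracy = (
    (new_df['entity_id'] == new_df['class_prediction'])
    .groupby(new_df['entity_id'])
    .mean()
)
print(per_class_accuracy.sort_values().head(10))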

81
analysis/utils.py Normal file

@ -0,0 +1,81 @@
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
)
import torch.nn.functional as F
class Retriever:
def __init__(self, input_texts, model_checkpoint):
# we need to generate the embedding from list of input strings
self.embeddings = []
self.inputs = input_texts
self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
model.to(self.device)
self.model = model.eval()
def make_embedding(self, batch_size=64):
all_embeddings = self.embeddings
input_texts = self.inputs
for i in range(0, len(input_texts), batch_size):
batch_texts = input_texts[i:i+batch_size]
# Tokenize the input text
inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
input_ids = inputs.input_ids.to(self.device)
attention_mask = inputs.attention_mask.to(self.device)
# Pass the input through the encoder and retrieve the embeddings
with torch.no_grad():
encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
# get last layer
embeddings = encoder_outputs.hidden_states[-1]
# get cls token embedding
cls_embeddings = embeddings[:, 0, :] # Shape: (batch_size, hidden_size)
all_embeddings.append(cls_embeddings)
# remove the batch list and makes a single large tensor, dim=0 increases row-wise
all_embeddings = torch.cat(all_embeddings, dim=0)
self.embeddings = all_embeddings
def cosine_similarity_chunked(batch1, batch2, chunk_size=1024):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    batch1_size = batch1.size(0)
    batch2_size = batch2.size(0)
    batch2 = batch2.to(device)  # .to() is not in-place, so reassign the result
# Prepare an empty tensor to store results
cos_sim = torch.empty(batch1_size, batch2_size, device=device)
# Process batch1 in chunks
for i in range(0, batch1_size, chunk_size):
batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1
        batch1_chunk = batch1_chunk.to(device)  # reassign; .to() is not in-place
# Expand batch1 chunk and entire batch2 for comparison
# batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len)
# batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len)
batch2_norms = batch2.norm(dim=1, keepdim=True)
# Compute cosine similarity for the chunk and store it in the final tensor
# cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
# Compute cosine similarity by matrix multiplication and normalizing
sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8)
# Store the results in the appropriate part of the final tensor
cos_sim[i:i + chunk_size] = sim_chunk
return cos_sim
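A minimal usage sketch for the two helpers above. The checkpoint name and strings are illustrative, and the block only runs when the module is executed directly.
if __name__ == "__main__":
    # embed two small lists of strings and compare them
    queries = Retriever(["MYSQL #.#", "POSTGRES"], "google-bert/bert-base-cased")
    queries.make_embedding(batch_size=64)
    corpus = Retriever(["MySQL", "PostgreSQL", "Oracle Database"], "google-bert/bert-base-cased")
    corpus.make_embedding(batch_size=64)
    similarity = cosine_similarity_chunked(queries.embeddings, corpus.embeddings, chunk_size=1024)
    print(similarity.argmax(dim=1))  # index of the closest corpus entry for each query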

1
data_import/.gitignore vendored Normal file

@ -0,0 +1 @@
*.csv

View File

@ -0,0 +1,41 @@
# %%
import json
import pandas as pd
##########################################
# %%
# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
data = json.load(file)
# Initialize an empty list to store the rows
rows = []
# %%
# Loop through all entities in the JSON
for _, entity_data in data["data"].items():
entity_id = entity_data['entity_id']
entity_name = entity_data['entity_name']
entity_type_id = entity_data['entity_type_id']
entity_type_name = entity_data['entity_type_name']
# Add each mention and its entity_id to the rows list
rows.append(
{
'id': entity_id,
'name': entity_name,
'type_id': entity_type_id,
'type_name': entity_type_name
})
# Create a DataFrame from the rows
df = pd.DataFrame(rows)
# %%
df.to_csv('entity.csv', index=False)
# %%

View File

@ -0,0 +1,85 @@
# %%
import json
import pandas as pd
##########################################
# %%
# import entity information
# %%
data_path = 'entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {}
for _, row in entity_df.iterrows():
id2label[row['id']] = row['name']
# Load the JSON file
data_path = '../esAppMod/train.json'
with open(data_path, 'r') as file:
data = json.load(file)
# Initialize an empty list to store the rows
rows = []
# Loop through all entities in the JSON
for entity_key, entity_data in data["data"].items():
mentions = entity_data["mentions"]
entity_id = entity_data["entity_id"]
entity_name = id2label[entity_id]
# Add each mention and its entity_id to the rows list
for mention in mentions:
rows.append(
{
"mention": mention,
"entity_id": entity_id,
"entity_name": entity_name
})
# Create a DataFrame from the rows
train_df = pd.DataFrame(rows)
train_class_set = set(train_df['entity_id'].to_list())
# %%
train_df.to_csv('train.csv', index=False)
##########################################
# %%
# Load the JSON file
data_path = '../esAppMod/infer.json'
with open(data_path, 'r') as file:
data = json.load(file)
# Initialize an empty list to store the rows
rows = []
# Loop through all entities in the JSON
for entity_key, entity_data in data["data"].items():
mention = entity_data["mention"]
entity_id = entity_data["entity_id"]
entity_name = id2label[entity_id]
# Add each mention and its entity_id to the rows list
rows.append(
{
"mention": mention,
"entity_id": entity_id,
"entity_name": entity_name
})
# Create a DataFrame from the rows
test_df = pd.DataFrame(rows)
test_class_set = (set(test_df['entity_id'].to_list()))
# %%
test_df.to_csv('test.csv', index=False)
# %%
# an empty result here shows that every entity class in the test set also appears in the train set
test_class_set - train_class_set
# %%
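A quick size check on the exported splits can also be printed here; a sketch using the dataframes defined above.
# %%
print(f"train: {len(train_df)} mentions across {train_df['entity_id'].nunique()} entities")
print(f"test: {len(test_df)} mentions across {test_df['entity_id'].nunique()} entities")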

9763
esAppMod/infer.json Normal file

File diff suppressed because it is too large Load Diff

1687
esAppMod/infer_negative.json Normal file

File diff suppressed because it is too large Load Diff

4891
esAppMod/tca_entities.json Normal file

File diff suppressed because it is too large Load Diff

7732
esAppMod/train.json Normal file

File diff suppressed because it is too large Load Diff

2
train/class_bert_process/.gitignore vendored Normal file

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log

View File

@ -0,0 +1 @@
exports

View File

@ -0,0 +1,6 @@
*******************************************************************************
Accuracy: 0.79090
F1 Score: 0.80996
Precision: 0.88827
Recall: 0.79090

View File

@ -0,0 +1,262 @@
# %%
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import re
import torch
from torch.utils.data import DataLoader
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from tqdm import tqdm
torch.set_float32_matmul_precision('high')
BATCH_SIZE = 256
# %%
data_path = '../../../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# build the set of entity ids present in the training data; these become the class labels
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
id2label[idx] = val
label2id[val] = idx
# introduce pre-processing functions
def preprocess_text(text):
# 1. Make all uppercase
text = text.upper()
# 2. Remove punctuations
# text = re.sub(r'[^\w\s]', '', text) # Retains only alphanumeric and spaces
# 3. Substitute digits with '#'
text = re.sub(r'\d', '#', text)
return text
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = row['mention']
desc = preprocess_text(desc)
index = row['entity_id']
element = {
'text' : desc,
'label': label2id[index], # ensure labels starts from 0
}
output_list.append(element)
return output_list
def create_dataset():
    # test
data_path = '../../../data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)
# combined_data = DatasetDict({
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
# })
return Dataset.from_list(process_df_to_dict(test_df))
# %%
def test():
test_dataset = create_dataset()
# prepare tokenizer
checkpoint_directory = f'../checkpoint'
# Use glob to find matching paths
    # path is usually checkpoint/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# %%
# compute max token length
max_length = 0
for sample in test_dataset['text']:
# Tokenize the sample and get the length
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
length = len(input_ids)
# Update max_length if this sample is longer
if length > max_length:
max_length = length
print(max_length)
# %%
max_length = 128
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
# truncation=True,
padding='max_length'
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
# %%
# compute metrics
# metric = evaluate.load("accuracy")
#
#
# def compute_metrics(eval_preds):
# preds, labels = eval_preds
# preds = np.argmax(preds, axis=1)
# return metric.compute(predictions=preds, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(target_id_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
model = model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
pred_labels = []
actual_labels = []
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
for batch in tqdm(dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
actual_labels.extend(batch['label'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Perform inference
with torch.no_grad():
logits = model(
input_ids,
attention_mask).logits
predicted_class_ids = logits.argmax(dim=1).to("cpu")
pred_labels.extend(predicted_class_ids)
pred_labels = [tensor.item() for tensor in pred_labels]
# %%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
y_true = actual_labels
y_pred = pred_labels
# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
average_parameter = 'weighted'
zero_division_parameter = 0
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
with open("output.txt", "a") as f:
print('*' * 80, file=f)
# Print the results
print(f'Accuracy: {accuracy:.5f}', file=f)
print(f'F1 Score: {f1:.5f}', file=f)
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
label_list = [id2label[id] for id in pred_labels]
df = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
    # save the classification predictions here
df.to_csv(f"exports/result.csv", index=False)
# %%
# reset file before writing to it
with open("output.txt", "w") as f:
print('', file=f)
test()

View File

@ -0,0 +1,283 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import re
import random
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
def set_seed(seed):
"""
Set the random seed for reproducibility.
"""
random.seed(seed) # Python random module
np.random.seed(seed) # NumPy random
torch.manual_seed(seed) # PyTorch CPU
torch.cuda.manual_seed(seed) # PyTorch GPU
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
set_seed(42)
SHUFFLES=5
# %%
# import training file
data_path = '../../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# build the set of entity ids present in the training data; these become the class labels
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
id2label[idx] = val
label2id[val] = idx
# %%
# introduce pre-processing functions
def preprocess_text(text):
# 1. Make all uppercase
text = text.upper()
# 2. Remove punctuations
# text = re.sub(r'[^\w\s]', '', text) # Retains only alphanumeric and spaces
# 3. Substitute digits with '#'
text = re.sub(r'\d', '#', text)
return text
def generate_random_shuffles(text, n):
"""
Generate n strings with randomly shuffled words from the input text.
Args:
text (str): The input text.
n (int): The number of random variations to generate.
Returns:
list: A list of strings with shuffled words.
"""
words = text.split() # Split the input into words
shuffled_variations = []
for _ in range(n):
shuffled = words[:] # Copy the word list to avoid in-place modification
random.shuffle(shuffled) # Randomly shuffle the words
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
return shuffled_variations
# generate n more shuffled examples
def shuffle_text(text, n_shuffles=SHUFFLES):
"""
    Return the input text together with n randomly shuffled variations of it.
    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate.
    Returns:
        list: The original string followed by its shuffled variations.
"""
all_processed = []
all_processed.append(text)
# Generate random shuffles
shuffled_variations = generate_random_shuffles(text, n_shuffles)
all_processed.extend(shuffled_variations)
return all_processed
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
# produce shuffling
index = row['entity_id']
desc = row['mention']
desc = preprocess_text(desc)
processed_descs = shuffle_text(desc, n_shuffles=SHUFFLES)
for desc in processed_descs:
element = {
'text' : desc,
'label': label2id[index], # ensure labels starts from 0
}
output_list.append(element)
return output_list
def create_dataset():
# train
data_path = '../../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df)),
})
return combined_data
# %%
def train():
save_path = f'checkpoint'
split_datasets = create_dataset()
# prepare tokenizer
# model_checkpoint = "distilbert/distilbert-base-uncased"
model_checkpoint = 'google-bert/bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<DESC>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
truncation=True,
padding=True
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# %%
# compute metrics
metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
preds, labels = eval_preds
preds = np.argmax(preds, axis=1)
return metric.compute(predictions=preds, references=labels)
# %%
# create id2label and label2id
# %%
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(target_id_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
# model = torch.compile(model, backend="inductor", dynamic=True)
# %%
# Trainer
training_args = TrainingArguments(
output_dir=f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-4,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=120,
bf16=True,
push_to_hub=False,
remove_unused_columns=False,
)
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
train()
# %%
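To see what the augmentation pipeline feeds the model, the two helpers can be exercised on a single mention. A sketch; the example string is illustrative and the shuffles depend on the seed.
# %%
example = preprocess_text('Microsoft SQL Server 2019')  # -> 'MICROSOFT SQL SERVER ####'
print(shuffle_text(example, n_shuffles=2))  # original plus two shuffled variants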

2
train/class_bert_simple/.gitignore vendored Normal file

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log

View File

@ -0,0 +1 @@
exports

View File

@ -0,0 +1,6 @@
*******************************************************************************
Accuracy: 0.70070
F1 Score: 0.73260
Precision: 0.84815
Recall: 0.70070

View File

@ -0,0 +1,246 @@
# %%
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from torch.utils.data import DataLoader
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from tqdm import tqdm
torch.set_float32_matmul_precision('high')
BATCH_SIZE = 256
# %%
data_path = '../../../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# build the set of entity ids present in the training data; these become the class labels
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
id2label[idx] = val
label2id[val] = idx
# %%
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = row['mention']
index = row['entity_id']
element = {
'text' : f"{desc}",
'label': label2id[index], # ensure labels starts from 0
}
output_list.append(element)
return output_list
def create_dataset():
    # test
data_path = '../../../data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)
# combined_data = DatasetDict({
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
# })
return Dataset.from_list(process_df_to_dict(test_df))
# %%
def test():
test_dataset = create_dataset()
# prepare tokenizer
checkpoint_directory = f'../checkpoint'
# Use glob to find matching paths
    # path is usually checkpoint/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# %%
# compute max token length
max_length = 0
for sample in test_dataset['text']:
# Tokenize the sample and get the length
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
length = len(input_ids)
# Update max_length if this sample is longer
if length > max_length:
max_length = length
print(max_length)
# %%
max_length = 128
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
# truncation=True,
padding='max_length'
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
# %%
# compute metrics
# metric = evaluate.load("accuracy")
#
#
# def compute_metrics(eval_preds):
# preds, labels = eval_preds
# preds = np.argmax(preds, axis=1)
# return metric.compute(predictions=preds, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(target_id_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
model = model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
pred_labels = []
actual_labels = []
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
for batch in tqdm(dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
actual_labels.extend(batch['label'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Perform inference
with torch.no_grad():
logits = model(
input_ids,
attention_mask).logits
predicted_class_ids = logits.argmax(dim=1).to("cpu")
pred_labels.extend(predicted_class_ids)
pred_labels = [tensor.item() for tensor in pred_labels]
# %%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
y_true = actual_labels
y_pred = pred_labels
# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
average_parameter = 'weighted'
zero_division_parameter = 0
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
with open("output.txt", "a") as f:
print('*' * 80, file=f)
# Print the results
print(f'Accuracy: {accuracy:.5f}', file=f)
print(f'F1 Score: {f1:.5f}', file=f)
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
label_list = [id2label[id] for id in pred_labels]
df = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
    # save the classification predictions here
df.to_csv(f"exports/result.csv", index=False)
# %%
# reset file before writing to it
with open("output.txt", "w") as f:
print('', file=f)
test()

View File

@ -0,0 +1,200 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# import training file
data_path = '../../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# build the set of entity ids present in the training data; these become the class labels
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
id2label[idx] = val
label2id[val] = idx
# %%
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = row['mention']
index = row['entity_id']
element = {
'text' : f"{desc}",
'label': label2id[index], # ensure labels starts from 0
}
output_list.append(element)
return output_list
def create_dataset():
# train
data_path = '../../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df)),
})
return combined_data
# %%
def train():
save_path = f'checkpoint'
split_datasets = create_dataset()
# prepare tokenizer
# model_checkpoint = "distilbert/distilbert-base-uncased"
model_checkpoint = 'google-bert/bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<DESC>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
truncation=True,
padding=True
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# %%
# compute metrics
metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
preds, labels = eval_preds
preds = np.argmax(preds, axis=1)
return metric.compute(predictions=preds, references=labels)
# %%
# create id2label and label2id
# %%
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(target_id_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
# model = torch.compile(model, backend="inductor", dynamic=True)
# %%
# Trainer
training_args = TrainingArguments(
output_dir=f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=250,
bf16=True,
push_to_hub=False,
remove_unused_columns=False,
)
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
train()
# %%

2
train/seq2seq_t5_simple/.gitignore vendored Normal file

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log

View File

@ -0,0 +1,2 @@
__pycache__
exports/

View File

@ -0,0 +1,150 @@
import torch
from torch.utils.data import DataLoader
from transformers import (
T5TokenizerFast,
AutoModelForSeq2SeqLM,
)
import os
from tqdm import tqdm
from datasets import Dataset
import numpy as np
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
class Inference():
tokenizer: T5TokenizerFast
model: torch.nn.Module
dataloader: DataLoader
def __init__(self, checkpoint_path):
self._create_tokenizer()
self._load_model(checkpoint_path)
def _create_tokenizer(self):
# %%
# load tokenizer
self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
# Add the additional special tokens to the tokenizer
# self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
def _load_model(self, checkpoint_path: str):
# load model
# Define the directory and the pattern
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
model = torch.compile(model)
# set model to eval
self.model = model.eval()
def prepare_dataloader(self, input_df, batch_size, max_length):
"""
*arguments*
        - input_df: input dataframe containing fields 'mention' and 'entity_name'
- batch_size: the batch size of dataloader output
- max_length: length of tokenizer output
"""
print("preparing dataloader")
# convert each dataframe row into a dictionary
# outputs a list of dictionaries
def _process_df(df):
output_list = []
for _, row in df.iterrows():
desc = row['mention']
label = row['entity_name']
element = {
'input' : desc,
'output': label
}
output_list.append(element)
return output_list
def _preprocess_function(example):
input = example['input']
target = example['output']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = self.tokenizer(
input,
text_target=target,
max_length=max_length,
return_tensors="pt",
padding='max_length',
truncation=True,
)
return model_inputs
test_dataset = Dataset.from_list(_process_df(input_df))
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
_preprocess_function,
batched=True,
num_proc=1,
remove_columns=test_dataset.column_names,
)
# datasets = _preprocess_function(test_dataset)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# create dataloader
self.dataloader = DataLoader(datasets, batch_size=batch_size)
def generate(self):
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []
pred_labels = []
print("start generation")
for batch in tqdm(self.dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
pred_labels.extend(batch['labels'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
self.model.to(device)
# Perform inference
with torch.no_grad():
outputs = self.model.generate(input_ids,
attention_mask=attention_mask,
max_length=MAX_GENERATE_LENGTH)
# Decode the output and print the results
pred_generations.extend(outputs.to("cpu"))
# %%
def process_tensor_output(tokens):
predictions = self.tokenizer.decode(tokens, skip_special_tokens=True)
return predictions
# decode prediction labels
def decode_preds(tokens_list):
prediction_list = []
for tokens in tokens_list:
predicted_seq = process_tensor_output(tokens)
prediction_list.append(predicted_seq)
return prediction_list
prediction_list = decode_preds(pred_generations)
return prediction_list

View File

@ -0,0 +1,2 @@
Accuracy for fold: 0.5846658466584665

View File

@ -0,0 +1,62 @@
import pandas as pd
import os
import glob
from inference import Inference
checkpoint_directory = '../'
BATCH_SIZE = 512
def infer():
print(f"Inference for data")
# import test data
data_path = '../../../data_import/test.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
##########################################
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint')
# Use glob to find matching paths
# path is usually checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
infer = Inference(checkpoint_path)
infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
prediction_list = infer.generate()
# add labels too
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
# Convert the list to a Pandas DataFrame
df_out = pd.DataFrame({
'predictions': prediction_list
})
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
df = pd.concat([df, df_out], axis=1)
# we can save the t5 generation output here
df.to_csv(f"exports/result.csv", index=False)
    # evaluate exact-match accuracy of the generated entity names against the ground truth
condition_correct = df['predictions'] == df['entity_name']
pred_correct_proportion = sum(condition_correct)/len(df)
# write output to file output.txt
with open("output.txt", "a") as f:
print(f'Accuracy for fold: {pred_correct_proportion}', file=f)
###########################################
# execute
# reset file before writing to it
with open("output.txt", "w") as f:
print('', file=f)
infer()
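Since the exact-match check above is case- and whitespace-sensitive, a slightly looser comparison can be sketched from the exported file; this is only a sketch, and `result_df` / `loose_match` are illustrative names.
# a looser, case-insensitive accuracy check on the exported predictions
result_df = pd.read_csv('exports/result.csv')
loose_match = (
    result_df['predictions'].str.strip().str.casefold()
    == result_df['entity_name'].str.strip().str.casefold()
)
print('case-insensitive accuracy:', loose_match.mean())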

View File

@ -0,0 +1,190 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from transformers import (
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = row['mention']
label = row['entity_name']
element = {
'input' : desc,
'output': label
}
output_list.append(element)
return output_list
def create_dataset():
# train
data_path = f"../../data_import/train.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df)),
})
return combined_data
# function to perform training for a given fold
def train():
save_path = f'checkpoint'
split_datasets = create_dataset()
# prepare tokenizer
model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['input']
target = example['output']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
text_target=target,
max_length=max_length,
truncation=True,
padding=True
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns=split_datasets["train"].column_names,
)
# https://github.com/huggingface/transformers/pull/28414
# model_checkpoint = "google/t5-efficient-tiny"
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
def compute_metrics(eval_preds):
preds, labels = eval_preds
# In case the model returns more than the prediction logits
if isinstance(preds, tuple):
preds = preds[0]
decoded_preds = tokenizer.batch_decode(preds,
skip_special_tokens=False)
# Replace -100s in the labels as we can't decode them
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels,
skip_special_tokens=False)
# Remove <PAD> tokens from decoded predictions and labels
decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
# Some simple post-processing
# decoded_preds = [pred.strip() for pred in decoded_preds]
# decoded_labels = [[label.strip()] for label in decoded_labels]
# print(decoded_preds, decoded_labels)
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
return {"bleu": result["score"]}
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
# Trainer
args = Seq2SeqTrainingArguments(
f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
generation_config=gen_config,
remove_unused_columns=False,
)
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
train()