diff --git a/analysis/error_analysis.py b/analysis/error_analysis_esAppMod.py similarity index 100% rename from analysis/error_analysis.py rename to analysis/error_analysis_esAppMod.py diff --git a/analysis_biomedical/data_properties.py b/analysis_biomedical/data_properties.py new file mode 100644 index 0000000..d5c3c1a --- /dev/null +++ b/analysis_biomedical/data_properties.py @@ -0,0 +1,58 @@ +# %% +import pandas as pd + +# %% +############################# +# How much data +# data_path = '../biomedical_data_import/bc2gm_test.csv' +# data_path = '../biomedical_data_import/bc2gm_test.csv' +data_path = '../biomedical_data_import/bc5cdr-chemical_train.csv' +df = pd.read_csv(data_path) +len(df) + +# %% + +# %% +# bc2gm: +# train: 288939 +# test: 1034 + +# %% +################################ +# check for NA values +df[df['mention'].isna()] + + + +# %% +############################## +# how many labels? +data_path = '../biomedical_data_import/bc2gm_test.csv' +df = pd.read_csv(data_path) + +id_list = sorted(list(set(df['entity_id'].to_list()))) + +# %% +len(id_list) + +# %% +for id in id_list: + if isinstance(id,int): + continue + else: + print(id) +# %% +# bc2gm: +# 61641 - holy shit + +# %% +############################### +# max length +max_length = 0 +for mention in df['mention']: + current_length = len(mention) + if current_length > max_length: + max_length = current_length +print(max_length) + +# %% diff --git a/analysis_biomedical/measure_tokenization_length.py b/analysis_biomedical/measure_tokenization_length.py new file mode 100644 index 0000000..0ebee92 --- /dev/null +++ b/analysis_biomedical/measure_tokenization_length.py @@ -0,0 +1,17 @@ +# %% +from transformers import AutoTokenizer +import pandas as pd + + +data_path = '../biomedical_data_import/bc2gm_train.csv' +df = pd.DataFrame(data_path) + +# Load the tokenizer (e.g., BERT tokenizer) +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + +# %% +# Calculate token lengths +df['token_length'] = df['mention'].apply(lambda x: len(tokenizer.tokenize(x))) + +# Display the dataset with token lengths +print(df) \ No newline at end of file diff --git a/biomedical_data_import/.gitignore b/biomedical_data_import/.gitignore new file mode 100644 index 0000000..16f2dc5 --- /dev/null +++ b/biomedical_data_import/.gitignore @@ -0,0 +1 @@ +*.csv \ No newline at end of file diff --git a/biomedical_data_import/original_data_processing.py b/biomedical_data_import/original_data_processing.py new file mode 100644 index 0000000..f6778bc --- /dev/null +++ b/biomedical_data_import/original_data_processing.py @@ -0,0 +1,36 @@ +# %% +from collections import defaultdict + +# %% +data_name = 'bc2gm' # and the other 3 names +train_path = 'test_dictionary.txt' +test_path = 'processed_test_refined' + +# %% +vocab = defaultdict(set) +with open(f'../biomedical/{data_name}/{train_path}') as f: + for line in f: + term_list = line.strip().split('||') + vocab[term_list[0]].add(term_list[1].lower()) + +cui_to_id, id_to_cui = {}, {} +vocab_entity_id_mentions = {} +for id, cui in enumerate(vocab): + cui_to_id[cui] = id + id_to_cui[id] = cui +for cui, mention in vocab.items(): + vocab_entity_id_mentions[cui_to_id[cui]] = mention + +vocab_mentions, vocab_ids = [], [] +for id, mentions in vocab_entity_id_mentions.items(): + vocab_mentions.extend(mentions) + vocab_ids.extend([id]*len(mentions)) + +# %% +test_mentions, test_cuis = [], [] +with open(f'../biomedical/{data_name}/{test_path}/0.concept') as f: + for line in f: + term_list = line.strip().split('||') + 
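        # Note (added for clarity): each line of the 'processed_test_refined/*.concept'
        # file is assumed to be '||'-delimited, e.g. something like
        #   doc_id||span||semantic_type||mention text||CUI
        # The exact column order is an assumption, not confirmed by this diff; the code
        # below relies only on the gold CUI being the last field (term_list[-1]) and the
        # surface mention being the second-to-last field (term_list[-2]).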
test_cuis.append(term_list[-1]) + test_mentions.append(term_list[-2].lower()) + diff --git a/biomedical_data_import/process_to_df.py b/biomedical_data_import/process_to_df.py new file mode 100644 index 0000000..b605746 --- /dev/null +++ b/biomedical_data_import/process_to_df.py @@ -0,0 +1,134 @@ +# %% +import pandas as pd +from tqdm import tqdm +import multiprocessing + +# %% +######################### +# we first process training data +def process_train_to_csv(data_path, output): + # data_path = '../esAppMod_data_import/parent_train.csv' + input_df = pd.read_csv(data_path, sep=f'\|\|', engine='python', skipinitialspace=True, header=None) + input_df = input_df.rename(columns={0: 'entity_id', 1: 'mention',}) + + # handle 'or' values in the number column + df = input_df.copy() + new_rows = [] + for idx,row in df.iterrows(): + index = row['entity_id'] + mention = row['mention'] + + # omit nan values + if row['mention'] == 'NaN' or pd.isna(row['mention']): + df = df.drop(index=[idx]) + continue + + # handle possible multiple indices in index field + if '|' in row['entity_id']: + # print(row[0]) + df = df.drop(index=[idx]) + index_list = index.split('|') + + for new_index in index_list: + element = { + 'entity_id': new_index, + 'mention': mention, + } + new_rows.append(element) + + df_new = pd.DataFrame(new_rows, columns=df.columns) + df = pd.concat([df, df_new], ignore_index=True) + df = df.reset_index(drop=True) + + df.to_csv(output, index=False) + + +# %% +name_list =[ + ('../biomedical/bc2gm/test_dictionary.txt', 'bc2gm_train.csv'), + ('../biomedical/bc5cdr-chemical/test_dictionary.txt', 'bc5cdr-chemical_train.csv'), + ('../biomedical/bc5cdr-disease/test_dictionary.txt', 'bc5cdr-disease_train.csv'), + ('../biomedical/ncbi/test_dictionary.txt', 'ncbi_train.csv'), +] + +# for data_path, output in name_list: +# process_train_to_csv(data_path, output) + +if __name__ == "__main__": + # Create a pool of workers + num_workers = 4 # set number of cpus to use + with multiprocessing.Pool(num_workers) as pool: + # starmap + # an iterable of [(1,2), (3, 4)] results in [func(1,2), func(3,4)]. 
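        # Each (data_path, output) tuple in name_list is unpacked by starmap into a call
        # process_train_to_csv(data_path, output), so the four dictionaries are converted
        # in parallel across num_workers processes -- equivalent to the commented-out
        # serial loop above. The `if __name__ == "__main__":` guard is required so that
        # spawn-based platforms do not re-execute this module-level code inside every
        # worker process.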
+ pool.starmap(process_train_to_csv, name_list) + + +# %% +################################################# +# process test data + +def is_int_string(s): + try: + int(s) + return True + except ValueError: + return False + +def process_test_to_csv(data_path, output): + # data_path = '../esAppMod_data_import/parent_train.csv' + input_df = pd.read_csv(data_path, sep=f'\|\|', engine='python', skipinitialspace=True, header=None) + input_df = input_df.drop(columns=[0, 1, 2]) + input_df = input_df.rename(columns={3: 'mention', 4: 'entity_id'}) + + # handle 'or' values in the number column + df = input_df.copy() + new_rows = [] + for idx,row in df.iterrows(): + + # handle possible multiple indices + if '|' in row['entity_id']: + index = row['entity_id'] + mention = row['mention'] + df = df.drop(index=[idx]) + index_list = index.split('|') + + for new_index in index_list: + element = { + 'entity_id': new_index, + 'mention': mention, + } + new_rows.append(element) + + df_new = pd.DataFrame(new_rows, columns=df.columns) + df = pd.concat([df, df_new], ignore_index=True) + df = df.reset_index(drop=True) + + # do some cleanup + df['entity_id'].isna() + + df.to_csv(output, index=False) + + +# %% +name_list =[ + ('../biomedical/bc2gm/processed_test_refined/0.concept', 'bc2gm_test.csv'), + ('../biomedical/bc5cdr-chemical/processed_test_refined/0.concept', 'bc5cdr-chemical_test.csv'), + ('../biomedical/bc5cdr-disease/processed_test_refined/0.concept', 'bc5cdr-disease_test.csv'), + ('../biomedical/ncbi/processed_test_refined/0.concept', 'ncbi_test.csv'), +] + +# for data_path, output in name_list: +# process_test_to_csv(data_path, output) +if __name__ == "__main__": + # Create a pool of workers + num_workers = 4 # set number of cpus to use + with multiprocessing.Pool(num_workers) as pool: + # starmap + # an iterable of [(1,2), (3, 4)] results in [func(1,2), func(3,4)]. + pool.starmap(process_test_to_csv, name_list) + + + +# %% + +# %% diff --git a/train/class_bert_augmentation/.gitignore b/biomedical_train/bc2gm/augmentation/.gitignore similarity index 100% rename from train/class_bert_augmentation/.gitignore rename to biomedical_train/bc2gm/augmentation/.gitignore diff --git a/biomedical_train/bc2gm/augmentation/dynamic_train.py b/biomedical_train/bc2gm/augmentation/dynamic_train.py new file mode 100644 index 0000000..b5d5b24 --- /dev/null +++ b/biomedical_train/bc2gm/augmentation/dynamic_train.py @@ -0,0 +1,388 @@ +# %% +from torch.utils.data import Dataset, DataLoader + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + TrainerCallback +) +import evaluate +import numpy as np +import pandas as pd +import math +from functools import partial +import warnings + +warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0') +warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.') + +# import matplotlib.pyplot as plt + + + +torch.set_float32_matmul_precision('high') + +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +# %% +# PARAMETERS +SAMPLES=20 +SHUFFLES=5 +AMPLIFY_FACTOR=5 + +# %% +################################################### +# import code +# import training file +data_path = '../../esAppMod_data_import/train.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +df["training_id"] = df["entity_id"].map(label2id) + +# %% +############################################################## +# augmentation code + +# basic preprocessing +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +def shuffle_text(text, n_shuffles=SHUFFLES): + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + +def corrupt_word(word): + """Corrupt a single word using random corruption techniques.""" + if len(word) <= 1: # Skip corruption for single-character words + return word + + corruption_type = random.choice(["delete", "swap"]) + + if corruption_type == "delete": + # Randomly delete a character + idx = random.randint(0, len(word) - 1) + word = word[:idx] + word[idx + 1:] + + elif corruption_type == "swap": + # Swap two adjacent characters + if len(word) > 1: + idx = random.randint(0, len(word) - 2) + word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]) + + + return word + +def corrupt_string(sentence, corruption_probability=0.01): + """Corrupt each word in the string with a given probability.""" + words = sentence.split() + corrupted_words = [ + corrupt_word(word) if random.random() < corruption_probability else word + for word in words + ] + return " ".join(corrupted_words) + + + + +# %% +def create_example(index, mention): + return {'training_id': index, 'mention': mention} + +# augment whole dataset +def augment_data(df): + output_list = [] + + for idx,row in df.iterrows(): + index = row['training_id'] + parent_desc = row['mention'] + parent_desc = preprocess_text(parent_desc) + + # add basic example + output_list.append(create_example(index, parent_desc)) + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + output_list.append(create_example(index, desc)) + + # add corrupted strings + desc = corrupt_string(parent_desc, corruption_probability=0.1) + 
if (desc != parent_desc): + output_list.append(create_example(index, desc)) + + # add example with stripped non-alphanumerics + desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + if (desc != parent_desc): + output_list.append(create_example(index, desc)) + + # short sequence amplifier + # short sequences are rare, and we must compensate by including more examples + # also, short sequence don't usually get affected by shuffle + words = parent_desc.split() + word_count = len(words) + if word_count <= 2: + for _ in range(AMPLIFY_FACTOR): + output_list.append(create_example(index, desc)) + + new_df = pd.DataFrame(output_list) + return new_df + + +############################################################### +# regeneration code +# %% +# we want to sample n samples from each class +# sample_size refers to the number of samples per class +def sample_from_df(df, sample_size_per_class=5): + sampled_df = (df.groupby( "training_id")[['training_id', 'mention']] # explicit give column names + .apply(lambda x: x.sample(n=min(sample_size_per_class, len(x)))) + .reset_index(drop=True)) + + return sampled_df + + + +# %% +class DynamicDataset(Dataset): + def __init__(self, df, sample_size_per_class, tokenizer): + """ + Args: + df (pd.DataFrame): Original DataFrame with class (id) and data columns. + sample_size_per_class (int): Number of samples to draw per class for each epoch. + """ + self.df = df + self.sample_size_per_class = sample_size_per_class + self.tokenizer = tokenizer + self.current_data = None + self.regenerate_data() # Generate the initial dataset + + def regenerate_data(self): + """ + Generate a new sampled dataset for the current epoch. + + dynamic callback function to regenerate data each time we call this + method, it updates the current_data we can: + + - re-sample the dataframe for a new set of n_samples + - generate fresh augmentations this effectively + + This allows us to re-sample and re-augment at the start of each epoch + """ + # Sample `sample_size_per_class` rows per class + sampled_df = sample_from_df(self.df, self.sample_size_per_class) + + # perform future edits here + sampled_df = augment_data(sampled_df) + + # perform tokenization here + # Batch tokenize the entire column of data + tokenized_batch = self.tokenizer( + sampled_df["mention"].to_list(), # Pass all text data at once + truncation=True, + # return_tensors="pt" # disabled because pt requires equal length tensors + ) + + # Store the tokenized data with labels + self.current_data = [ + { + "input_ids": torch.tensor(tokenized_batch["input_ids"][i]), + "attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]), + "labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label + } + for i in range(len(sampled_df)) + ] + + + def __len__(self): + return len(self.current_data) + + def __getitem__(self, idx): + return self.current_data[idx] + +# %% +class RegenerateDatasetCallback(TrainerCallback): + def __init__(self, dataset): + self.dataset = dataset + + def on_epoch_begin(self, args, state, control, **kwargs): + print(f"Epoch {int(math.ceil(state.epoch + 1))}: Regenerating dataset") + self.dataset.regenerate_data() + + + +# %% +def custom_collate_fn(batch): + # Dynamically pad tensors to the longest sequence in the batch + input_ids = [item["input_ids"] for item in batch] + attention_masks = [item["attention_mask"] for item in batch] + labels = torch.stack([item["labels"] for item in batch]) + + # Pad inputs to the same length + input_ids = 
torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True) + attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True) + + return { + "input_ids": input_ids, + "attention_mask": attention_masks, + "labels": labels + } + + +########################################################################## +# training code +# %% +def train(): + + save_path = f'checkpoint' + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True) + + # make the dataset + + + # Define the callback + lean_df = df.drop(columns=['entity_name']) + dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer) + + # create the regeneration callback + regeneration_callback = RegenerateDatasetCallback(dynamic_dataset) + + # compute metrics + metric = evaluate.load("accuracy") + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + save_strategy="steps", + save_steps=500, + load_best_model_at_end=False, + learning_rate=5e-5, + per_device_train_batch_size=64, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=120, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=dynamic_dataset, + tokenizer=tokenizer, + data_collator=custom_collate_fn, + compute_metrics=compute_metrics, + callbacks=[regeneration_callback] + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/train/class_bert_augmentation/prediction/.gitignore b/biomedical_train/bc2gm/augmentation/prediction/.gitignore similarity index 100% rename from train/class_bert_augmentation/prediction/.gitignore rename to biomedical_train/bc2gm/augmentation/prediction/.gitignore diff --git a/train/class_bert_augmentation/prediction/output.txt b/biomedical_train/bc2gm/augmentation/prediction/output.txt similarity index 53% rename from train/class_bert_augmentation/prediction/output.txt rename to biomedical_train/bc2gm/augmentation/prediction/output.txt index 8a8215b..93e7dd1 100644 --- a/train/class_bert_augmentation/prediction/output.txt +++ b/biomedical_train/bc2gm/augmentation/prediction/output.txt @@ -1,6 +1,6 @@ ******************************************************************************* -Accuracy: 0.80197 -F1 Score: 0.81948 -Precision: 0.88067 -Recall: 0.80197 +Accuracy: 0.80655 +F1 Score: 0.82821 +Precision: 0.87847 +Recall: 0.80655 diff --git 
a/biomedical_train/bc2gm/augmentation/prediction/predict.py b/biomedical_train/bc2gm/augmentation/prediction/predict.py new file mode 100644 index 0000000..a7e1b62 --- /dev/null +++ b/biomedical_train/bc2gm/augmentation/prediction/predict.py @@ -0,0 +1,236 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 32 + +# %% +# construct the target id list +data_path = '../../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) +# target_id_list = [id for id in target_id_list] + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + row_id = row['entity_id'] + element = { + 'text' : desc, + 'labels': label2id[row_id], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../biomedical_data_import/bc2gm_test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'test': Dataset.from_list(process_df_to_dict(test_df)), + }) + return combined_data + + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # %% + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # print datasets['test'] columns + 
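    # After set_format(..., type='torch') each test row yields input_ids, attention_mask
    # and labels as torch tensors; the loop below only prints the dataset schema as a
    # quick sanity check before the checkpoint is loaded.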
column_info = datasets['test'].features + for column, dtype in column_info.items(): + print(f"Column: {column}, Type: {dtype}") + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + dataloader = DataLoader( + datasets['test'], + batch_size=BATCH_SIZE, + shuffle=False, + collate_fn=data_collator) + + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/biomedical_train/bc2gm/augmentation/train.py b/biomedical_train/bc2gm/augmentation/train.py new file mode 100644 index 0000000..1e413f5 --- /dev/null +++ b/biomedical_train/bc2gm/augmentation/train.py @@ -0,0 +1,367 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=0 # 0 shuffles means it does not re-sample + +# %% + +# We want to map the entity_id to a consecutive set of id's +# import training file +data_path = '../../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + + +###################################### + +# augmentation by text corruption + +def corrupt_word(word): + """Corrupt a single word using random corruption techniques.""" + if len(word) <= 1: # Skip corruption for single-character words + return word + + corruption_type = random.choice(["delete", "swap"]) + + if corruption_type == "delete": + # Randomly delete a character + idx = random.randint(0, len(word) - 1) + word = word[:idx] + word[idx + 1:] + + elif corruption_type == "swap": + # Swap two adjacent characters + if len(word) > 1: + idx = random.randint(0, len(word) - 2) + word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]) + + + return word + +def corrupt_string(sentence, corruption_probability=0.01): + """Corrupt each word in the string with a given probability.""" + words = sentence.split() + corrupted_words = [ + corrupt_word(word) if random.random() < corruption_probability else word + for word in words + ] + return " ".join(corrupted_words) + + +############################################################# +# Data Run code here + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label + +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + if isinstance(parent_desc, float): + print(parent_desc) + parent_desc = f'{parent_desc}' + parent_desc = preprocess_text(parent_desc) + + # unaugmented data + element = { + 'text' : parent_desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # # short sequences are rare, and we must compensate by including more examples + # # mutation of other longer sequences might drown out rare short sequences + # words = parent_desc.split() + # word_count = len(words) + # if word_count < 3: + # for _ in range(10): + # element = { + # 'text': parent_desc, + # 'label': label2id[index], + # } + # output_list.append(element) + + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # # corrupt string + # desc = corrupt_string(parent_desc, corruption_probability=0.1) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + # # augmentation + # # remove all non-alphanumerics + # desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + + data_path = '../../../biomedical_data_import/bc2gm_train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% +######################################### +# training function + +def 
train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, # enable truncation for efficiency + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", # we only need the tokenization, not the original strings + ) + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=512, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, # data_collator performs dynamic padding + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/train/class_bert_hierarchical/.gitignore b/biomedical_train/bc2gm/simple/.gitignore similarity index 100% rename from train/class_bert_hierarchical/.gitignore rename to biomedical_train/bc2gm/simple/.gitignore diff --git a/biomedical_train/bc2gm/simple/dynamic_train.py b/biomedical_train/bc2gm/simple/dynamic_train.py new file mode 100644 index 0000000..5556aa8 --- /dev/null +++ b/biomedical_train/bc2gm/simple/dynamic_train.py @@ -0,0 +1,280 @@ +# %% +from torch.utils.data import Dataset, DataLoader + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import 
random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + TrainerCallback +) +import evaluate +import numpy as np +import pandas as pd +from functools import partial +import warnings + +warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0') +warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.') + +# import matplotlib.pyplot as plt + + + +torch.set_float32_matmul_precision('high') + +def set_seed(seed): + """ + Set the random seed for reproducibility. + """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +# %% +# PARAMETERS +SAMPLES=20 + +# %% +################################################### +# import code +# import training file +data_path = '../../../biomedical_data_import/bc2gm_train.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +df["training_id"] = df["entity_id"].map(label2id) + +############################################################### +# regeneration code +# %% +# we want to sample n samples from each class +# sample_size refers to the number of samples per class +def sample_from_df(df, sample_size_per_class=5): + sampled_df = (df.groupby( "training_id")[['training_id', 'mention']] # explicit give column names + .apply(lambda x: x.sample(n=min(sample_size_per_class, len(x)))) + .reset_index(drop=True)) + + return sampled_df + + +# %% +# augment whole dataset +# for now, we just return the same df +def augment_data(df): + return df + +# %% +class DynamicDataset(Dataset): + def __init__(self, df, sample_size_per_class, tokenizer): + """ + Args: + df (pd.DataFrame): Original DataFrame with class (id) and data columns. + sample_size_per_class (int): Number of samples to draw per class for each epoch. + """ + self.df = df + self.sample_size_per_class = sample_size_per_class + self.tokenizer = tokenizer + self.current_data = None + self.regenerate_data() # Generate the initial dataset + + def regenerate_data(self): + """ + Generate a new sampled dataset for the current epoch. 
+ + dynamic callback function to regenerate data each time we call this + method, it updates the current_data we can: + + - re-sample the dataframe for a new set of n_samples + - generate fresh augmentations this effectively + + This allows us to re-sample and re-augment at the start of each epoch + """ + # Sample `sample_size_per_class` rows per class + sampled_df = sample_from_df(self.df, self.sample_size_per_class) + + # perform future edits here + sampled_df = augment_data(sampled_df) + + # perform tokenization here + # Batch tokenize the entire column of data + tokenized_batch = self.tokenizer( + sampled_df["mention"].to_list(), # Pass all text data at once + truncation=True, + # return_tensors="pt" # disabled because pt requires equal length tensors + ) + + # Store the tokenized data with labels + self.current_data = [ + { + "input_ids": torch.tensor(tokenized_batch["input_ids"][i]), + "attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]), + "labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label + } + for i in range(len(sampled_df)) + ] + + + def __len__(self): + return len(self.current_data) + + def __getitem__(self, idx): + return self.current_data[idx] + +# %% +class RegenerateDatasetCallback(TrainerCallback): + def __init__(self, dataset, every_n_epochs=2): + """ + Args: + dataset: The dataset instance that supports regeneration. + every_n_epochs (int): Number of epochs to wait before regenerating the dataset. + """ + self.dataset = dataset + self.every_n_epochs = every_n_epochs + + def on_epoch_begin(self, args, state, control, **kwargs): + # Check if the current epoch is a multiple of `every_n_epochs` + if (state.epoch + 1) % self.every_n_epochs == 0: + print(f"Epoch {int(state.epoch + 1)}: Regenerating dataset...") + self.dataset.regenerate_data() + + +# %% +def custom_collate_fn(batch): + # Dynamically pad tensors to the longest sequence in the batch + input_ids = [item["input_ids"] for item in batch] + attention_masks = [item["attention_mask"] for item in batch] + labels = torch.stack([item["labels"] for item in batch]) + + # Pad inputs to the same length + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True) + attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True) + + return { + "input_ids": input_ids, + "attention_mask": attention_masks, + "labels": labels + } + + +########################################################################## +# training code +# %% +def train(): + + save_path = f'checkpoint' + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True) + + # make the dataset + + + # Define the callback + # lean_df = df.drop(columns=['entity_name']) + dynamic_dataset = DynamicDataset(df = df, sample_size_per_class=SAMPLES, tokenizer=tokenizer) + + # create the regeneration callback + regeneration_callback = RegenerateDatasetCallback(dynamic_dataset, every_n_epochs=2) + + # compute metrics + metric = evaluate.load("accuracy") + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + + 
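    # resize_token_embeddings() only changes anything if extra tokens were added to the
    # tokenizer; with the stock distilbert-base-uncased vocabulary it is a no-op, but it
    # is kept so the script stays correct if the vocab is extended later. Note that
    # num_labels equals the number of distinct entity_ids in the training dictionary, so
    # the classification head can become very large for datasets like bc2gm.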
model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-4, + per_device_train_batch_size=256, + # per_device_eval_batch_size=256, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=200, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=dynamic_dataset, + tokenizer=tokenizer, + data_collator=custom_collate_fn, + compute_metrics=compute_metrics, + callbacks=[regeneration_callback] + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/train/class_bert_hierarchical/prediction/.gitignore b/biomedical_train/bc2gm/simple/prediction/.gitignore similarity index 100% rename from train/class_bert_hierarchical/prediction/.gitignore rename to biomedical_train/bc2gm/simple/prediction/.gitignore diff --git a/biomedical_train/bc2gm/simple/prediction/output.txt b/biomedical_train/bc2gm/simple/prediction/output.txt new file mode 100644 index 0000000..7811f07 --- /dev/null +++ b/biomedical_train/bc2gm/simple/prediction/output.txt @@ -0,0 +1,6 @@ + +******************************************************************************* +Accuracy: 0.15093 +F1 Score: 0.14063 +Precision: 0.15594 +Recall: 0.15093 diff --git a/biomedical_train/bc2gm/simple/prediction/predict.py b/biomedical_train/bc2gm/simple/prediction/predict.py new file mode 100644 index 0000000..19cdb35 --- /dev/null +++ b/biomedical_train/bc2gm/simple/prediction/predict.py @@ -0,0 +1,246 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 32 + +# %% +# construct the target id list +data_path = '../../../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) +# target_id_list = [id for id in target_id_list] + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. 
Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def is_int_string(s): + try: + int(s) + return True + except ValueError: + return False + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + row_id = row['entity_id'] + if not is_int_string(row_id): + continue + row_id = int(row_id) + desc = row['mention'] + desc = preprocess_text(desc) + element = { + 'text' : desc, + 'labels': label2id[row_id], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../../biomedical_data_import/bc2gm_test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'test': Dataset.from_list(process_df_to_dict(test_df)), + }) + return combined_data + + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # %% + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # print datasets['test'] columns + column_info = datasets['test'].features + for column, dtype in column_info.items(): + print(f"Column: {column}, Type: {dtype}") + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + dataloader = DataLoader( + datasets['test'], + batch_size=BATCH_SIZE, + shuffle=False, + collate_fn=data_collator) + + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/biomedical_train/bc2gm/simple/train.py b/biomedical_train/bc2gm/simple/train.py new file mode 100644 index 0000000..614de64 --- /dev/null +++ b/biomedical_train/bc2gm/simple/train.py @@ -0,0 +1,368 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=0 # 0 shuffles means it does not re-sample + +# %% + +# We want to map the entity_id to a consecutive set of id's +# import training file +data_path = '../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + + +###################################### + +# augmentation by text corruption + +def corrupt_word(word): + """Corrupt a single word using random corruption techniques.""" + if len(word) <= 1: # Skip corruption for single-character words + return word + + corruption_type = random.choice(["delete", "swap"]) + + if corruption_type == "delete": + # Randomly delete a character + idx = random.randint(0, len(word) - 1) + word = word[:idx] + word[idx + 1:] + + elif corruption_type == "swap": + # Swap two adjacent characters + if len(word) > 1: + idx = random.randint(0, len(word) - 2) + word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]) + + + return word + +def corrupt_string(sentence, corruption_probability=0.01): + """Corrupt each word in the string with a given probability.""" + words = sentence.split() + corrupted_words = [ + corrupt_word(word) if random.random() < corruption_probability else word + for word in words + ] + return " ".join(corrupted_words) + + +############################################################# +# Data Run code here + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label + +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + if isinstance(parent_desc, float): + print(parent_desc) + parent_desc = f'{parent_desc}' + parent_desc = preprocess_text(parent_desc) + + # unaugmented data + element = { + 'text' : parent_desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # # short sequences are rare, and we must compensate by including more examples + # # mutation of other longer sequences might drown out rare short sequences + # words = parent_desc.split() + # word_count = len(words) + # if word_count < 3: + # for _ in range(10): + # element = { + # 'text': parent_desc, + # 'label': label2id[index], + # } + # output_list.append(element) + + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # # corrupt string + # desc = corrupt_string(parent_desc, corruption_probability=0.1) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + # # augmentation + # # remove all non-alphanumerics + # desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + + data_path = '../../biomedical_data_import/bc2gm_train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% +######################################### +# training function + +def 
train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + # max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, # enable truncation for efficiency + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", # we only need the tokenization, not the original strings + ) + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=512, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, # data_collator performs dynamic padding + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/train/class_bert_simple/.gitignore b/biomedical_train/bc5cdr-chemical/augmentation/.gitignore similarity index 100% rename from train/class_bert_simple/.gitignore rename to biomedical_train/bc5cdr-chemical/augmentation/.gitignore diff --git a/train/class_bert_simple/classification_prediction/.gitignore b/biomedical_train/bc5cdr-chemical/augmentation/prediction/.gitignore similarity index 100% rename from train/class_bert_simple/classification_prediction/.gitignore rename to biomedical_train/bc5cdr-chemical/augmentation/prediction/.gitignore diff --git a/biomedical_train/bc5cdr-chemical/augmentation/prediction/output.txt b/biomedical_train/bc5cdr-chemical/augmentation/prediction/output.txt new file mode 100644 index 
0000000..8b13789 --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/augmentation/prediction/output.txt @@ -0,0 +1 @@ + diff --git a/biomedical_train/bc5cdr-chemical/augmentation/prediction/predict.py b/biomedical_train/bc5cdr-chemical/augmentation/prediction/predict.py new file mode 100644 index 0000000..04c059f --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/augmentation/prediction/predict.py @@ -0,0 +1,236 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% +# construct the target id list +data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) +# target_id_list = [id for id in target_id_list] + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + row_id = row['entity_id'] + element = { + 'text' : desc, + 'labels': label2id[row_id], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../biomedical_data_import/bc5cdr-chemical_test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'test': Dataset.from_list(process_df_to_dict(test_df)), + }) + return combined_data + + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # %% + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + 
batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # print datasets['test'] columns + column_info = datasets['test'].features + for column, dtype in column_info.items(): + print(f"Column: {column}, Type: {dtype}") + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + dataloader = DataLoader( + datasets['test'], + batch_size=BATCH_SIZE, + shuffle=False, + collate_fn=data_collator) + + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/biomedical_train/bc5cdr-chemical/augmentation/train.py b/biomedical_train/bc5cdr-chemical/augmentation/train.py new file mode 100644 index 0000000..6e4d7f9 --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/augmentation/train.py @@ -0,0 +1,368 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + 
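# Illustrative sketch, not part of the original script: after pinning
# CUDA_DEVICE_ORDER / CUDA_VISIBLE_DEVICES above, it can be worth confirming
# how many devices PyTorch actually sees before launching a multi-GPU run,
# since a typo in those environment variables silently falls back to fewer GPUs.
if torch.cuda.is_available():
    print(f"visible CUDA devices: {torch.cuda.device_count()}")
else:
    print("CUDA not available; training would fall back to CPU")
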
+torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. + """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=0 # 0 shuffles means it does not re-sample + +# %% + +# We want to map the entity_id to a consecutive set of id's +# import training file +data_path = '../../biomedical_data_import/bc5cdr-chemical_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + + +###################################### + +# augmentation by text corruption + +def corrupt_word(word): + """Corrupt a single word using random corruption techniques.""" + if len(word) <= 1: # Skip corruption for single-character words + return word + + corruption_type = random.choice(["delete", "swap"]) + + if corruption_type == "delete": + # Randomly delete a character + idx = random.randint(0, len(word) - 1) + word = word[:idx] + word[idx + 1:] + + elif corruption_type == "swap": + # Swap two adjacent characters + if len(word) > 1: + idx = random.randint(0, len(word) - 2) + word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]) + + + return word + +def corrupt_string(sentence, corruption_probability=0.01): + """Corrupt each word in the string with a given probability.""" + words = sentence.split() + corrupted_words = [ + corrupt_word(word) if random.random() < corruption_probability else word + for word in words + ] + return " ".join(corrupted_words) + + +############################################################# +# Data Run code here + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label + +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + if isinstance(parent_desc, float): + print(parent_desc) + parent_desc = f'{parent_desc}' + parent_desc = preprocess_text(parent_desc) + + # unaugmented data + element = { + 'text' : parent_desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # # short sequences are rare, and we must compensate by including more examples + # # mutation of other longer sequences might drown out rare short sequences + # words = parent_desc.split() + # word_count = len(words) + # if word_count < 3: + # for _ in range(10): + # element = { + # 'text': parent_desc, + # 'labels': label2id[index], + # } + # output_list.append(element) + + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # # corrupt string + # desc = corrupt_string(parent_desc, corruption_probability=0.1) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'labels': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + # # augmentation + # # remove all non-alphanumerics + # desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'labels': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + + data_path = '../../biomedical_data_import/bc5cdr-chemical.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% +######################################### +# training function 
+ +def train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + # max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, # enable truncation for efficiency + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", # we only need the tokenization, not the original strings + ) + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=512, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, # data_collator performs dynamic padding + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/biomedical_train/bc5cdr-chemical/simple/.gitignore b/biomedical_train/bc5cdr-chemical/simple/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/simple/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/biomedical_train/bc5cdr-chemical/simple/prediction/.gitignore b/biomedical_train/bc5cdr-chemical/simple/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/simple/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/biomedical_train/bc5cdr-chemical/simple/prediction/output.txt b/biomedical_train/bc5cdr-chemical/simple/prediction/output.txt 
new file mode 100644 index 0000000..a4e2c6d --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/simple/prediction/output.txt @@ -0,0 +1,6 @@ + +******************************************************************************* +Accuracy: 0.04872 +F1 Score: 0.04283 +Precision: 0.04903 +Recall: 0.04872 diff --git a/biomedical_train/bc5cdr-chemical/simple/prediction/predict.py b/biomedical_train/bc5cdr-chemical/simple/prediction/predict.py new file mode 100644 index 0000000..24bef63 --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/simple/prediction/predict.py @@ -0,0 +1,234 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 32 + +# %% +# construct the target id list +data_path = '../../../../biomedical_data_import/bc5cdr-chemical_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) +# target_id_list = [id for id in target_id_list] + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + row_id = row['entity_id'] + element = { + 'text' : desc, + 'labels': label2id[row_id], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../../biomedical_data_import/bc5cdr-chemical_test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'test': Dataset.from_list(process_df_to_dict(test_df)), + }) + return combined_data + + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # %% + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return 
model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + column_info = datasets['test'].features + for column, dtype in column_info.items(): + print(f"Column: {column}, Type: {dtype}") + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + dataloader = DataLoader( + datasets['test'], + batch_size=BATCH_SIZE, + shuffle=False, + collate_fn=data_collator) + + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/biomedical_train/bc5cdr-chemical/simple/train.py b/biomedical_train/bc5cdr-chemical/simple/train.py new file mode 100644 index 0000000..3ad9ec8 --- /dev/null +++ b/biomedical_train/bc5cdr-chemical/simple/train.py @@ -0,0 +1,367 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy 
as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. + """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=0 # 0 shuffles means it does not re-sample + +# %% + +# We want to map the entity_id to a consecutive set of id's +# import training file +data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + + +###################################### + +# augmentation by text corruption + +def corrupt_word(word): + """Corrupt a single word using random corruption techniques.""" + if len(word) <= 1: # Skip corruption for single-character words + return word + + corruption_type = random.choice(["delete", "swap"]) + + if corruption_type == "delete": + # Randomly delete a character + idx = random.randint(0, len(word) - 1) + word = word[:idx] + word[idx + 1:] + + elif corruption_type == "swap": + # Swap two adjacent characters + if len(word) > 1: + idx = random.randint(0, len(word) - 2) + word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]) + + + return word + +def corrupt_string(sentence, corruption_probability=0.01): + """Corrupt each word in the string with a given probability.""" + words = sentence.split() + corrupted_words = [ + corrupt_word(word) if random.random() < corruption_probability else word + for word in words + ] + return " ".join(corrupted_words) + + +############################################################# +# Data Run code here + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label + +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + if isinstance(parent_desc, float): + print(parent_desc) + parent_desc = f'{parent_desc}' + parent_desc = preprocess_text(parent_desc) + + # unaugmented data + element = { + 'text' : parent_desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # # short sequences are rare, and we must compensate by including more examples + # # mutation of other longer sequences might drown out rare short sequences + # words = parent_desc.split() + # word_count = len(words) + # if word_count < 3: + # for _ in range(10): + # element = { + # 'text': parent_desc, + # 'labels': label2id[index], + # } + # output_list.append(element) + + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # # corrupt string + # desc = corrupt_string(parent_desc, corruption_probability=0.1) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'labels': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + # # augmentation + # # remove all non-alphanumerics + # desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'labels': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + + data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% +######################################### +# training 
function + +def train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + # max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, # enable truncation for efficiency + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", # we only need the tokenization, not the original strings + ) + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=5e-5, + per_device_train_batch_size=64, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, # data_collator performs dynamic padding + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + diff --git a/biomedical_train/bc5cdr-disease/.gitignore b/biomedical_train/bc5cdr-disease/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/biomedical_train/bc5cdr-disease/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/biomedical_train/bc5cdr-disease/prediction/.gitignore b/biomedical_train/bc5cdr-disease/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/biomedical_train/bc5cdr-disease/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/biomedical_train/bc5cdr-disease/prediction/output.txt b/biomedical_train/bc5cdr-disease/prediction/output.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ 
b/biomedical_train/bc5cdr-disease/prediction/output.txt @@ -0,0 +1 @@ + diff --git a/biomedical_train/bc5cdr-disease/prediction/predict.py b/biomedical_train/bc5cdr-disease/prediction/predict.py new file mode 100644 index 0000000..c692e72 --- /dev/null +++ b/biomedical_train/bc5cdr-disease/prediction/predict.py @@ -0,0 +1,236 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% +# construct the target id list +data_path = '../../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) +# target_id_list = [id for id in target_id_list] + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + row_id = row['entity_id'] + element = { + 'text' : desc, + 'labels': label2id[row_id], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../biomedical_data_import/bc2gm_test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'test': Dataset.from_list(process_df_to_dict(test_df)), + }) + return combined_data + + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # %% + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', 
columns=['input_ids', 'attention_mask', 'labels']) + + # print datasets['test'] columns + column_info = datasets['test'].features + for column, dtype in column_info.items(): + print(f"Column: {column}, Type: {dtype}") + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + dataloader = DataLoader( + datasets['test'], + batch_size=BATCH_SIZE, + shuffle=False, + collate_fn=data_collator) + + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/biomedical_train/bc5cdr-disease/train.py b/biomedical_train/bc5cdr-disease/train.py new file mode 100644 index 0000000..614de64 --- /dev/null +++ b/biomedical_train/bc5cdr-disease/train.py @@ -0,0 +1,368 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=0 # 0 shuffles means it does not re-sample + +# %% + +# We want to map the entity_id to a consecutive set of id's +# import training file +data_path = '../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + + +###################################### + +# augmentation by text corruption + +def corrupt_word(word): + """Corrupt a single word using random corruption techniques.""" + if len(word) <= 1: # Skip corruption for single-character words + return word + + corruption_type = random.choice(["delete", "swap"]) + + if corruption_type == "delete": + # Randomly delete a character + idx = random.randint(0, len(word) - 1) + word = word[:idx] + word[idx + 1:] + + elif corruption_type == "swap": + # Swap two adjacent characters + if len(word) > 1: + idx = random.randint(0, len(word) - 2) + word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]) + + + return word + +def corrupt_string(sentence, corruption_probability=0.01): + """Corrupt each word in the string with a given probability.""" + words = sentence.split() + corrupted_words = [ + corrupt_word(word) if random.random() < corruption_probability else word + for word in words + ] + return " ".join(corrupted_words) + + +############################################################# +# Data Run code here + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label + +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + if isinstance(parent_desc, float): + print(parent_desc) + parent_desc = f'{parent_desc}' + parent_desc = preprocess_text(parent_desc) + + # unaugmented data + element = { + 'text' : parent_desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # # short sequences are rare, and we must compensate by including more examples + # # mutation of other longer sequences might drown out rare short sequences + # words = parent_desc.split() + # word_count = len(words) + # if word_count < 3: + # for _ in range(10): + # element = { + # 'text': parent_desc, + # 'label': label2id[index], + # } + # output_list.append(element) + + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # # corrupt string + # desc = corrupt_string(parent_desc, corruption_probability=0.1) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + # # augmentation + # # remove all non-alphanumerics + # desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + + data_path = '../../biomedical_data_import/bc2gm_train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% +######################################### +# training function + +def 
train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + # max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, # enable truncation for efficiency + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", # we only need the tokenization, not the original strings + ) + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=512, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, # data_collator performs dynamic padding + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/biomedical_train/ncbi/.gitignore b/biomedical_train/ncbi/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/biomedical_train/ncbi/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/biomedical_train/ncbi/prediction/.gitignore b/biomedical_train/ncbi/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/biomedical_train/ncbi/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/biomedical_train/ncbi/prediction/output.txt b/biomedical_train/ncbi/prediction/output.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/biomedical_train/ncbi/prediction/output.txt @@ -0,0 +1 @@ + diff --git 
a/biomedical_train/ncbi/prediction/predict.py b/biomedical_train/ncbi/prediction/predict.py new file mode 100644 index 0000000..c692e72 --- /dev/null +++ b/biomedical_train/ncbi/prediction/predict.py @@ -0,0 +1,236 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% +# construct the target id list +data_path = '../../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) +# target_id_list = [id for id in target_id_list] + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + row_id = row['entity_id'] + element = { + 'text' : desc, + 'labels': label2id[row_id], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../biomedical_data_import/bc2gm_test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'test': Dataset.from_list(process_df_to_dict(test_df)), + }) + return combined_data + + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # %% + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # print datasets['test'] columns + column_info = datasets['test'].features + 
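    # Added note (ours, not in the original script): at this point set_format
    # has restricted the dataset to torch tensors for 'input_ids',
    # 'attention_mask' and 'labels'; padding to a common length is deferred to
    # DataCollatorWithPadding below, which pads each batch to its own longest
    # sequence (dynamic padding). The loop that follows only prints the schema.
    print(f"test examples: {datasets['test'].num_rows}")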
for column, dtype in column_info.items(): + print(f"Column: {column}, Type: {dtype}") + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + dataloader = DataLoader( + datasets['test'], + batch_size=BATCH_SIZE, + shuffle=False, + collate_fn=data_collator) + + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/biomedical_train/ncbi/train.py b/biomedical_train/ncbi/train.py new file mode 100644 index 0000000..614de64 --- /dev/null +++ b/biomedical_train/ncbi/train.py @@ -0,0 +1,368 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=0 # 0 shuffles means it does not re-sample + +# %% + +# We want to map the entity_id to a consecutive set of id's +# import training file +data_path = '../../biomedical_data_import/bc2gm_train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + + +###################################### + +# augmentation by text corruption + +def corrupt_word(word): + """Corrupt a single word using random corruption techniques.""" + if len(word) <= 1: # Skip corruption for single-character words + return word + + corruption_type = random.choice(["delete", "swap"]) + + if corruption_type == "delete": + # Randomly delete a character + idx = random.randint(0, len(word) - 1) + word = word[:idx] + word[idx + 1:] + + elif corruption_type == "swap": + # Swap two adjacent characters + if len(word) > 1: + idx = random.randint(0, len(word) - 2) + word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]) + + + return word + +def corrupt_string(sentence, corruption_probability=0.01): + """Corrupt each word in the string with a given probability.""" + words = sentence.split() + corrupted_words = [ + corrupt_word(word) if random.random() < corruption_probability else word + for word in words + ] + return " ".join(corrupted_words) + + +############################################################# +# Data Run code here + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label + +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + if isinstance(parent_desc, float): + print(parent_desc) + parent_desc = f'{parent_desc}' + parent_desc = preprocess_text(parent_desc) + + # unaugmented data + element = { + 'text' : parent_desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # # short sequences are rare, and we must compensate by including more examples + # # mutation of other longer sequences might drown out rare short sequences + # words = parent_desc.split() + # word_count = len(words) + # if word_count < 3: + # for _ in range(10): + # element = { + # 'text': parent_desc, + # 'label': label2id[index], + # } + # output_list.append(element) + + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # # corrupt string + # desc = corrupt_string(parent_desc, corruption_probability=0.1) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + # # augmentation + # # remove all non-alphanumerics + # desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + + data_path = '../../biomedical_data_import/bc2gm_train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% +######################################### +# training function + +def 
train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + # max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, # enable truncation for efficiency + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", # we only need the tokenization, not the original strings + ) + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=512, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, # data_collator performs dynamic padding + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/esAppMod_train/augmentation/.gitignore b/esAppMod_train/augmentation/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/esAppMod_train/augmentation/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/esAppMod_train/augmentation/dynamic_train.py b/esAppMod_train/augmentation/dynamic_train.py new file mode 100644 index 0000000..a75d98c --- /dev/null +++ b/esAppMod_train/augmentation/dynamic_train.py @@ -0,0 +1,388 @@ +# %% +from torch.utils.data import Dataset, DataLoader + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] 
= "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + TrainerCallback +) +import evaluate +import numpy as np +import pandas as pd +import math +from functools import partial +import warnings + +warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0') +warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.') + +# import matplotlib.pyplot as plt + + + +torch.set_float32_matmul_precision('high') + +def set_seed(seed): + """ + Set the random seed for reproducibility. + """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +# %% +# PARAMETERS +SAMPLES=20 +SHUFFLES=5 +AMPLIFY_FACTOR=5 + +# %% +################################################### +# import code +# import training file +data_path = '../../esAppMod_data_import/train.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +df["training_id"] = df["entity_id"].map(label2id) + +# %% +############################################################## +# augmentation code + +# basic preprocessing +def preprocess_text(text): + # 1. 
make everything lowercase
+    text = text.lower()
+
+    # standardize spacing
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    return text
+
+
+def generate_random_shuffles(text, n):
+    words = text.split()  # Split the input into words
+    shuffled_variations = []
+
+    for _ in range(n):
+        shuffled = words[:]  # Copy the word list to avoid in-place modification
+        random.shuffle(shuffled)  # Randomly shuffle the words
+        shuffled_variations.append(" ".join(shuffled))  # Join the words back into a string
+
+    return shuffled_variations
+
+
+def shuffle_text(text, n_shuffles=SHUFFLES):
+    all_processed = []
+    # add the original text
+    all_processed.append(text)
+
+    # Generate random shuffles
+    shuffled_variations = generate_random_shuffles(text, n_shuffles)
+    all_processed.extend(shuffled_variations)
+
+    return all_processed
+
+def corrupt_word(word):
+    """Corrupt a single word using random corruption techniques."""
+    if len(word) <= 1:  # Skip corruption for single-character words
+        return word
+
+    corruption_type = random.choice(["delete", "swap"])
+
+    if corruption_type == "delete":
+        # Randomly delete a character
+        idx = random.randint(0, len(word) - 1)
+        word = word[:idx] + word[idx + 1:]
+
+    elif corruption_type == "swap":
+        # Swap two adjacent characters
+        if len(word) > 1:
+            idx = random.randint(0, len(word) - 2)
+            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
+
+    return word
+
+def corrupt_string(sentence, corruption_probability=0.01):
+    """Corrupt each word in the string with a given probability."""
+    words = sentence.split()
+    corrupted_words = [
+        corrupt_word(word) if random.random() < corruption_probability else word
+        for word in words
+    ]
+    return " ".join(corrupted_words)
+
+
+
+
+# %%
+def create_example(index, mention):
+    return {'training_id': index, 'mention': mention}
+
+# augment the whole dataset
+def augment_data(df):
+    output_list = []
+
+    for idx, row in df.iterrows():
+        index = row['training_id']
+        parent_desc = row['mention']
+        parent_desc = preprocess_text(parent_desc)
+
+        # add the basic, unaugmented example
+        output_list.append(create_example(index, parent_desc))
+
+        # add shuffled strings
+        processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
+        for desc in processed_descs:
+            if (desc != parent_desc):
+                output_list.append(create_example(index, desc))
+
+        # add corrupted strings
+        desc = corrupt_string(parent_desc, corruption_probability=0.1)
+        if (desc != parent_desc):
+            output_list.append(create_example(index, desc))
+
+        # add an example with non-alphanumerics stripped
+        desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # retains only alphanumerics and spaces
+        if (desc != parent_desc):
+            output_list.append(create_example(index, desc))
+
+        # short sequence amplifier
+        # short sequences are rare, so we compensate by adding more copies;
+        # they are also largely unaffected by word shuffling
+        # note: this repeats `desc` (the most recent variant); use `parent_desc`
+        # here if the original mention should be amplified instead
+        words = parent_desc.split()
+        word_count = len(words)
+        if word_count <= 2:
+            for _ in range(AMPLIFY_FACTOR):
+                output_list.append(create_example(index, desc))
+
+    new_df = pd.DataFrame(output_list)
+    return new_df
+
+
+###############################################################
+# regeneration code
+# %%
+# we want to sample n examples from each class
+# sample_size_per_class is the number of samples drawn per class
+def sample_from_df(df, sample_size_per_class=5):
+    sampled_df = (df.groupby("training_id")[['training_id', 'mention']]  # explicitly name the columns
+                  .apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
+                  .reset_index(drop=True))
+
+    return sampled_df
+
+
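# Minimal usage sketch (an illustration with made-up values, not part of the
# training pipeline): how per-class sampling and augmentation compose.
#
#   _toy = pd.DataFrame({
#       'training_id': [0, 0, 0, 1],
#       'mention': ['websphere application server', 'websphere', 'was 8.5', 'tomcat 9'],
#   })
#   _sampled = sample_from_df(_toy, sample_size_per_class=2)  # at most 2 rows per class
#   _augmented = augment_data(_sampled)                       # adds shuffled/corrupted/stripped variants
#   print(len(_sampled), len(_augmented))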
+# %% +class DynamicDataset(Dataset): + def __init__(self, df, sample_size_per_class, tokenizer): + """ + Args: + df (pd.DataFrame): Original DataFrame with class (id) and data columns. + sample_size_per_class (int): Number of samples to draw per class for each epoch. + """ + self.df = df + self.sample_size_per_class = sample_size_per_class + self.tokenizer = tokenizer + self.current_data = None + self.regenerate_data() # Generate the initial dataset + + def regenerate_data(self): + """ + Generate a new sampled dataset for the current epoch. + + dynamic callback function to regenerate data each time we call this + method, it updates the current_data we can: + + - re-sample the dataframe for a new set of n_samples + - generate fresh augmentations this effectively + + This allows us to re-sample and re-augment at the start of each epoch + """ + # Sample `sample_size_per_class` rows per class + sampled_df = sample_from_df(self.df, self.sample_size_per_class) + + # perform future edits here + sampled_df = augment_data(sampled_df) + + # perform tokenization here + # Batch tokenize the entire column of data + tokenized_batch = self.tokenizer( + sampled_df["mention"].to_list(), # Pass all text data at once + truncation=True, + # return_tensors="pt" # disabled because pt requires equal length tensors + ) + + # Store the tokenized data with labels + self.current_data = [ + { + "input_ids": torch.tensor(tokenized_batch["input_ids"][i]), + "attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]), + "labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label + } + for i in range(len(sampled_df)) + ] + + + def __len__(self): + return len(self.current_data) + + def __getitem__(self, idx): + return self.current_data[idx] + +# %% +class RegenerateDatasetCallback(TrainerCallback): + def __init__(self, dataset): + self.dataset = dataset + + def on_epoch_begin(self, args, state, control, **kwargs): + print(f"Epoch {int(math.ceil(state.epoch + 1))}: Regenerating dataset") + self.dataset.regenerate_data() + + + +# %% +def custom_collate_fn(batch): + # Dynamically pad tensors to the longest sequence in the batch + input_ids = [item["input_ids"] for item in batch] + attention_masks = [item["attention_mask"] for item in batch] + labels = torch.stack([item["labels"] for item in batch]) + + # Pad inputs to the same length + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True) + attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True) + + return { + "input_ids": input_ids, + "attention_mask": attention_masks, + "labels": labels + } + + +########################################################################## +# training code +# %% +def train(): + + save_path = f'checkpoint' + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True) + + # make the dataset + + + # Define the callback + lean_df = df.drop(columns=['entity_name']) + dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=SAMPLES, tokenizer=tokenizer) + + # create the regeneration callback + regeneration_callback = RegenerateDatasetCallback(dynamic_dataset) + + # compute metrics + metric = evaluate.load("accuracy") + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return 
metric.compute(predictions=preds, references=labels) + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + save_strategy="steps", + save_steps=500, + load_best_model_at_end=False, + learning_rate=5e-5, + per_device_train_batch_size=64, + # per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=120, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=dynamic_dataset, + tokenizer=tokenizer, + data_collator=custom_collate_fn, + compute_metrics=compute_metrics, + callbacks=[regeneration_callback] + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/esAppMod_train/augmentation/prediction/.gitignore b/esAppMod_train/augmentation/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/esAppMod_train/augmentation/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/esAppMod_train/augmentation/prediction/output.txt b/esAppMod_train/augmentation/prediction/output.txt new file mode 100644 index 0000000..5b098e0 --- /dev/null +++ b/esAppMod_train/augmentation/prediction/output.txt @@ -0,0 +1,6 @@ + +******************************************************************************* +Accuracy: 0.76958 +F1 Score: 0.79382 +Precision: 0.88705 +Recall: 0.76958 diff --git a/train/class_bert_augmentation/prediction/predict.py b/esAppMod_train/augmentation/prediction/predict.py similarity index 100% rename from train/class_bert_augmentation/prediction/predict.py rename to esAppMod_train/augmentation/prediction/predict.py diff --git a/esAppMod_train/class_bert_augmentation/.gitignore b/esAppMod_train/class_bert_augmentation/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/esAppMod_train/class_bert_augmentation/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/esAppMod_train/class_bert_augmentation/prediction/.gitignore b/esAppMod_train/class_bert_augmentation/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/esAppMod_train/class_bert_augmentation/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/esAppMod_train/class_bert_augmentation/prediction/output.txt b/esAppMod_train/class_bert_augmentation/prediction/output.txt new file mode 100644 index 0000000..d13147d --- /dev/null +++ b/esAppMod_train/class_bert_augmentation/prediction/output.txt @@ -0,0 +1,6 @@ + +******************************************************************************* +Accuracy: 0.80689 +F1 Score: 0.82527 +Precision: 0.89684 +Recall: 0.80689 diff --git a/esAppMod_train/class_bert_augmentation/prediction/predict.py b/esAppMod_train/class_bert_augmentation/prediction/predict.py new file mode 
100644 index 0000000..12b1954 --- /dev/null +++ b/esAppMod_train/class_bert_augmentation/prediction/predict.py @@ -0,0 +1,264 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% +# construct the target id list +# data_path = '../../../esAppMod_data_import/train.csv' +data_path = '../../../esAppMod_data_import/train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + index = row['entity_id'] + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../esAppMod_data_import/test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + # combined_data = DatasetDict({ + # 'train': Dataset.from_list(process_df_to_dict(train_df)), + # }) + return Dataset.from_list(process_df_to_dict(test_df)) + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + # additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 128 + + # given a dataset entry, run it through the 
tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/esAppMod_train/class_bert_augmentation/train.py b/esAppMod_train/class_bert_augmentation/train.py new file mode 100644 index 0000000..4344bf6 --- /dev/null +++ b/esAppMod_train/class_bert_augmentation/train.py @@ -0,0 
+1,558 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. + """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=5 + +# %% + +# import training file +data_path = '../../esAppMod_data_import/train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + # add the original text + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + +acronym_mapping = { + 'hpsa': 'hp server automation', + 'tam': 'tivoli access manager', + 'adf': 'application development facility', + 'html': 'hypertext markup language', + 'wff': 'microsoft web farm framework', + 'jsp': 'javaserver pages', + 'bw': 'business works', + 'ssrs': 'sql server reporting services', + 'cl': 'control language', + 'vba': 'visual basic for applications', + 'esapi': 'enterprise security api', + 'gwt': 'google web toolkit', + 'pki': 'perkin elmer informatics', + 'rtd': 'oracle realtime decisions', + 'jms': 'java message service', + 'db': 'database', + 'soa': 'service oriented architecture', + 'xsl': 'extensible stylesheet language', + 'com': 'compopent object model', + 'ldap': 'lightweight directory access protocol', + 'odm': 'ibm operational decision manager', + 'soql': 'salesforce object query language', + 'oms': 'order management system', + 'cfml': 'coldfusion markup language', + 'nas': 'netscape application server', + 'sql': 'structured query language', + 'bde': 'borland database engine', + 'imap': 'internet message access protocol', + 'uws': 'ultidev web server', + 'birt': 'business intelligence and reporting tools', + 'mdw': 'model driven workflow', + 'tws': 'tivoli workload scheduler', + 'jre': 'java runtime environment', + 'wcs': 'websphere commerce suite', + 'was': 'websphere application server', + 'ssis': 'sql server integration services', + 'xhtml': 'extensible hypertext markup language', + 'soap': 'simple object access protocol', + 'san': 'storage area network', + 'elk': 'elastic stack', + 'arr': 'application request routing', + 'xlst': 'extensible stylesheet language transformations', + 'sccm': 'microsoft endpoint configuration manager', + 'ejb': 'enterprise java beans', + 'css': 'cascading style sheets', + 'hpoo': 'hp operations orchestration', + 'xml': 'extensible markup language', + 'esb': 'enterprise service bus', + 'edi': 'electronic data interchange', + 'imsva': 'interscan messaging security virtual appliance', + 'wtx': 'ibm websphere transformation extender', + 'cgi': 'common gateway interface', + 'bal': 'ibm basic assembly language', + 'issow': 'integrated safe system of work', + 'dcl': 'data control language', + 'jdom': 'java document object model', + 'fim': 'microsoft forefront identity manager', + 'npl': 'niakwa programming language', + 'wf': 'windows workflow foundation', + 'lm': 'etap license manager', + 'wts': 'windows terminal server', + 'asp': 'active server pages', + 'jil': 'job information language', + 'mvc': 'model view controller', + 'rmi': 'remote method invocation', + 'ad': 'active directory', + 'owb': 'oracle warehouse builder', + 'rest': 'representational state transfer', + 'jdk': 'java development kit', + 'ids': 'integrated data store', + 'bms': 'batch management software', + 'vsx': 'vmware solution exchange', + 'ssas': 'sql server analysis services', + 'atl': 'atlas transformation language', + 'ice': 'infobright community edition', + 'esql': 'extended structured query language', + 'corba': 'common object request broker architecture', + 'dpe': 'device provisioning engines', + 'rac': 'oracle real application clusters', + 'iemt': 'iis easy migration tool', + 'mes': 'manufacturing execution system', + 'odbc': 'open database connectivity', + 'lms': 'lan management solution', + 'wcf': 
'windows communication foundation',
+    'nes': 'netscape enterprise server',
+    'jsf': 'javaserver faces',
+    'alm': 'application lifecycle management',
+    'hlasm': 'high level assembler',
+    'cmod': 'content manager ondemand'}
+
+external_source = {
+    'vb.net': 'visual basic dot net',
+    'jes': 'job entry subsystem',
+    'svn': 'subversion',
+    'vcs': 'version control system',
+    'lims': 'laboratory information management system',
+    'ide': 'integrated development environment',
+    'sdk': 'software development kit',
+    'mq': 'message queue',
+    'ims': 'information management system',
+    'isa': 'internet security and acceleration',
+    'vs': 'visual studio',
+    'esr': 'extended support release',
+    'ff': 'firefox',
+    'vb': 'visual basic',
+    'rhel': 'red hat enterprise linux',
+    'iis': 'internet information server',
+    'api': 'application programming interface',
+    'se': 'standard edition',
+    r'\.net': 'dot net',
+    'c#': 'c sharp'
+}
+
+
+# synonyms = {
+#     'windows server': 'windows nt',
+#     'windows 7': 'windows desktop',
+#     'windows 8': 'windows desktop',
+#     'windows 10': 'windows desktop'
+# }
+
+
+# merge the extra acronyms into the main mapping
+acronym_mapping.update(external_source)
+
+
+# use raw f-strings so that \b is a regex word boundary rather than a backspace character
+abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
+term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
+
+def replace_terms_with_abbreviations(text):
+    for pattern, replacement in term_to_abbrev.items():
+        text = re.sub(pattern, replacement, text)
+    return text
+
+def replace_abbreviations_with_terms(text):
+    for pattern, replacement in abbrev_to_term.items():
+        text = re.sub(pattern, replacement, text)
+    return text
+
+######################################
+
+# augmentation by text corruption
+
+def corrupt_word(word):
+    """Corrupt a single word using random corruption techniques."""
+    if len(word) <= 1:  # Skip corruption for single-character words
+        return word
+
+    corruption_type = random.choice(["delete", "swap"])
+
+    if corruption_type == "delete":
+        # Randomly delete a character
+        idx = random.randint(0, len(word) - 1)
+        word = word[:idx] + word[idx + 1:]
+
+    elif corruption_type == "swap":
+        # Swap two adjacent characters
+        if len(word) > 1:
+            idx = random.randint(0, len(word) - 2)
+            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
+
+    return word
+
+def corrupt_string(sentence, corruption_probability=0.01):
+    """Corrupt each word in the string with a given probability."""
+    words = sentence.split()
+    corrupted_words = [
+        corrupt_word(word) if random.random() < corruption_probability else word
+        for word in words
+    ]
+    return " ".join(corrupted_words)
+
+
+
+
+# outputs a list of dictionaries
+# processes the dataframe into a list of dictionaries
+# each element maps an input to an output
+# input: mention
+# output: class label
+label_flag_list = []
+
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        # produce shuffling
+        index = row['entity_id']
+        parent_desc = row['mention']
+        parent_desc = preprocess_text(parent_desc)
+
+        # unaugmented data
+        element = {
+            'text': parent_desc,
+            'labels': label2id[index],  # ensure labels start from 0
+        }
+        output_list.append(element)
+
+        # short sequences are rare, and we must compensate by including more examples
+        # mutation of other, longer sequences might otherwise drown out rare short sequences
+        words = parent_desc.split()
+        word_count = len(words)
+        if word_count < 3:
+            for _ in range(10):
+                element = {
+                    'text': parent_desc,
+                    'labels': label2id[index],  # use the same 'labels' key as the other examples
+                }
+                output_list.append(element)
+
+
+        # check if label
is in label_flag_list + if index not in label_flag_list: + + entity_name = row['entity_name'] + # add the "entity_name" label as a mention + element = { + 'text': entity_name, + 'labels': label2id[index], + } + output_list.append(element) + + # remove all non-alphanumerics + desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + if (desc != parent_desc): + element = { + 'text' : desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # add shufles of the original entity name + no_of_shuffles = SHUFFLES + processed_descs = shuffle_text(entity_name, n_shuffles=no_of_shuffles) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + label_flag_list.append(index) + + + + # add shuffled strings + processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # corrupt string + desc = corrupt_string(parent_desc, corruption_probability=0.1) + if (desc != parent_desc): + element = { + 'text' : desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # augmentation + # remove all non-alphanumerics + desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + if (desc != parent_desc): + element = { + 'text' : desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # # augmentation + # # perform abbrev_to_term + # temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + # desc = replace_terms_with_abbreviations(temp_desc) + # if (desc != temp_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + # # augmentation + # # perform term to abbrev + # desc = replace_abbreviations_with_terms(parent_desc) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + data_path = '../../esAppMod_data_import/train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% + +def train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + 
# tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=5e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/esAppMod_train/class_bert_hierarchical/.gitignore b/esAppMod_train/class_bert_hierarchical/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/esAppMod_train/class_bert_hierarchical/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/esAppMod_train/class_bert_hierarchical/prediction/.gitignore b/esAppMod_train/class_bert_hierarchical/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/esAppMod_train/class_bert_hierarchical/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/train/class_bert_hierarchical/prediction/output.txt b/esAppMod_train/class_bert_hierarchical/prediction/output.txt similarity index 100% rename from train/class_bert_hierarchical/prediction/output.txt rename to esAppMod_train/class_bert_hierarchical/prediction/output.txt diff --git a/train/class_bert_hierarchical/prediction/output_1.txt b/esAppMod_train/class_bert_hierarchical/prediction/output_1.txt similarity index 100% rename from train/class_bert_hierarchical/prediction/output_1.txt rename to esAppMod_train/class_bert_hierarchical/prediction/output_1.txt diff --git a/train/class_bert_hierarchical/prediction/output_2.txt b/esAppMod_train/class_bert_hierarchical/prediction/output_2.txt similarity index 100% rename from train/class_bert_hierarchical/prediction/output_2.txt rename to esAppMod_train/class_bert_hierarchical/prediction/output_2.txt diff --git a/train/class_bert_hierarchical/prediction/predict_1.py b/esAppMod_train/class_bert_hierarchical/prediction/predict_1.py similarity index 100% rename from train/class_bert_hierarchical/prediction/predict_1.py rename to esAppMod_train/class_bert_hierarchical/prediction/predict_1.py diff 
--git a/train/class_bert_hierarchical/prediction/predict_2.py b/esAppMod_train/class_bert_hierarchical/prediction/predict_2.py similarity index 100% rename from train/class_bert_hierarchical/prediction/predict_2.py rename to esAppMod_train/class_bert_hierarchical/prediction/predict_2.py diff --git a/train/class_bert_hierarchical/train_1.py b/esAppMod_train/class_bert_hierarchical/train_1.py similarity index 100% rename from train/class_bert_hierarchical/train_1.py rename to esAppMod_train/class_bert_hierarchical/train_1.py diff --git a/train/class_bert_hierarchical/train_2.py b/esAppMod_train/class_bert_hierarchical/train_2.py similarity index 100% rename from train/class_bert_hierarchical/train_2.py rename to esAppMod_train/class_bert_hierarchical/train_2.py diff --git a/esAppMod_train/class_bert_simple/.gitignore b/esAppMod_train/class_bert_simple/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/esAppMod_train/class_bert_simple/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/esAppMod_train/class_bert_simple/classification_prediction/.gitignore b/esAppMod_train/class_bert_simple/classification_prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/esAppMod_train/class_bert_simple/classification_prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/train/class_bert_simple/classification_prediction/output.txt b/esAppMod_train/class_bert_simple/classification_prediction/output.txt similarity index 100% rename from train/class_bert_simple/classification_prediction/output.txt rename to esAppMod_train/class_bert_simple/classification_prediction/output.txt diff --git a/train/class_bert_simple/classification_prediction/predict.py b/esAppMod_train/class_bert_simple/classification_prediction/predict.py similarity index 100% rename from train/class_bert_simple/classification_prediction/predict.py rename to esAppMod_train/class_bert_simple/classification_prediction/predict.py diff --git a/train/class_bert_simple/train.py b/esAppMod_train/class_bert_simple/train.py similarity index 100% rename from train/class_bert_simple/train.py rename to esAppMod_train/class_bert_simple/train.py diff --git a/esAppMod_train/golden_sample/.gitignore b/esAppMod_train/golden_sample/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/esAppMod_train/golden_sample/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/esAppMod_train/golden_sample/prediction/.gitignore b/esAppMod_train/golden_sample/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/esAppMod_train/golden_sample/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/esAppMod_train/golden_sample/prediction/output.txt b/esAppMod_train/golden_sample/prediction/output.txt new file mode 100644 index 0000000..d13147d --- /dev/null +++ b/esAppMod_train/golden_sample/prediction/output.txt @@ -0,0 +1,6 @@ + +******************************************************************************* +Accuracy: 0.80689 +F1 Score: 0.82527 +Precision: 0.89684 +Recall: 0.80689 diff --git a/esAppMod_train/golden_sample/prediction/predict.py b/esAppMod_train/golden_sample/prediction/predict.py new file mode 100644 index 0000000..12b1954 --- /dev/null +++ b/esAppMod_train/golden_sample/prediction/predict.py @@ -0,0 +1,264 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' 
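# Note on the environment flags in this block: NCCL_P2P_DISABLE and
# NCCL_IB_DISABLE turn off NCCL's peer-to-peer and InfiniBand transports (a
# common workaround for hangs on some multi-GPU hosts), while
# CUDA_DEVICE_ORDER=PCI_BUS_ID makes device numbering follow the PCI bus and
# CUDA_VISIBLE_DEVICES exposes GPUs 0-3.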
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% +# construct the target id list +# data_path = '../../../esAppMod_data_import/train.csv' +data_path = '../../../esAppMod_data_import/train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + index = row['entity_id'] + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../esAppMod_data_import/test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + # combined_data = DatasetDict({ + # 'train': Dataset.from_list(process_df_to_dict(train_df)), + # }) + return Dataset.from_list(process_df_to_dict(test_df)) + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + # additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 128 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + 
padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/train/class_bert_augmentation/train.py b/esAppMod_train/golden_sample/train.py similarity index 97% rename from train/class_bert_augmentation/train.py rename to esAppMod_train/golden_sample/train.py index 7a90289..8cae5da 100644 --- a/train/class_bert_augmentation/train.py +++ b/esAppMod_train/golden_sample/train.py @@ -45,7 +45,7 @@ def set_seed(seed): set_seed(42) -SHUFFLES=10 +SHUFFLES=5 # %% @@ -411,15 +411,15 @@ def process_df_to_dict(df): # } # output_list.append(element) 
- # augmentation - # perform term to abbrev - desc = replace_abbreviations_with_terms(parent_desc) - if (desc != parent_desc): - element = { - 'text' : desc, - 'label': label2id[index], # ensure labels starts from 0 - } - output_list.append(element) + # # augmentation + # # perform term to abbrev + # desc = replace_abbreviations_with_terms(parent_desc) + # if (desc != parent_desc): + # element = { + # 'text' : desc, + # 'label': label2id[index], # ensure labels starts from 0 + # } + # output_list.append(element) return output_list diff --git a/train/seq2seq_t5_simple/.gitignore b/esAppMod_train/seq2seq_t5_simple/.gitignore similarity index 100% rename from train/seq2seq_t5_simple/.gitignore rename to esAppMod_train/seq2seq_t5_simple/.gitignore diff --git a/train/seq2seq_t5_simple/prediction/.gitignore b/esAppMod_train/seq2seq_t5_simple/prediction/.gitignore similarity index 100% rename from train/seq2seq_t5_simple/prediction/.gitignore rename to esAppMod_train/seq2seq_t5_simple/prediction/.gitignore diff --git a/train/seq2seq_t5_simple/prediction/inference.py b/esAppMod_train/seq2seq_t5_simple/prediction/inference.py similarity index 100% rename from train/seq2seq_t5_simple/prediction/inference.py rename to esAppMod_train/seq2seq_t5_simple/prediction/inference.py diff --git a/train/seq2seq_t5_simple/prediction/output.txt b/esAppMod_train/seq2seq_t5_simple/prediction/output.txt similarity index 100% rename from train/seq2seq_t5_simple/prediction/output.txt rename to esAppMod_train/seq2seq_t5_simple/prediction/output.txt diff --git a/train/seq2seq_t5_simple/prediction/predict.py b/esAppMod_train/seq2seq_t5_simple/prediction/predict.py similarity index 100% rename from train/seq2seq_t5_simple/prediction/predict.py rename to esAppMod_train/seq2seq_t5_simple/prediction/predict.py diff --git a/train/seq2seq_t5_simple/train.py b/esAppMod_train/seq2seq_t5_simple/train.py similarity index 100% rename from train/seq2seq_t5_simple/train.py rename to esAppMod_train/seq2seq_t5_simple/train.py diff --git a/esAppMod_train/simple/.gitignore b/esAppMod_train/simple/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/esAppMod_train/simple/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/esAppMod_train/simple/dynamic_train.py b/esAppMod_train/simple/dynamic_train.py new file mode 100644 index 0000000..bed35d9 --- /dev/null +++ b/esAppMod_train/simple/dynamic_train.py @@ -0,0 +1,273 @@ +# %% +from torch.utils.data import Dataset, DataLoader + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + TrainerCallback +) +import evaluate +import numpy as np +import pandas as pd +from functools import partial +import warnings + +warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0') +warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.') + +# import matplotlib.pyplot as plt + + + +torch.set_float32_matmul_precision('high') + +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +# %% +# PARAMETERS +SAMPLES=20 + +# %% +################################################### +# import code +# import training file +data_path = '../../esAppMod_data_import/train.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +df["training_id"] = df["entity_id"].map(label2id) + +############################################################### +# regeneration code +# %% +# we want to sample n samples from each class +# sample_size refers to the number of samples per class +def sample_from_df(df, sample_size_per_class=5): + sampled_df = (df.groupby( "training_id")[['training_id', 'mention']] # explicit give column names + .apply(lambda x: x.sample(n=min(sample_size_per_class, len(x)))) + .reset_index(drop=True)) + + return sampled_df + + +# %% +# augment whole dataset +# for now, we just return the same df +def augment_data(df): + return df + +# %% +class DynamicDataset(Dataset): + def __init__(self, df, sample_size_per_class, tokenizer): + """ + Args: + df (pd.DataFrame): Original DataFrame with class (id) and data columns. + sample_size_per_class (int): Number of samples to draw per class for each epoch. + """ + self.df = df + self.sample_size_per_class = sample_size_per_class + self.tokenizer = tokenizer + self.current_data = None + self.regenerate_data() # Generate the initial dataset + + def regenerate_data(self): + """ + Generate a new sampled dataset for the current epoch. 
+ + dynamic callback function to regenerate data each time we call this + method, it updates the current_data we can: + + - re-sample the dataframe for a new set of n_samples + - generate fresh augmentations this effectively + + This allows us to re-sample and re-augment at the start of each epoch + """ + # Sample `sample_size_per_class` rows per class + sampled_df = sample_from_df(self.df, self.sample_size_per_class) + + # perform future edits here + sampled_df = augment_data(sampled_df) + + # perform tokenization here + # Batch tokenize the entire column of data + tokenized_batch = self.tokenizer( + sampled_df["mention"].to_list(), # Pass all text data at once + truncation=True, + # return_tensors="pt" # disabled because pt requires equal length tensors + ) + + # Store the tokenized data with labels + self.current_data = [ + { + "input_ids": torch.tensor(tokenized_batch["input_ids"][i]), + "attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]), + "labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label + } + for i in range(len(sampled_df)) + ] + + + def __len__(self): + return len(self.current_data) + + def __getitem__(self, idx): + return self.current_data[idx] + +# %% +class RegenerateDatasetCallback(TrainerCallback): + def __init__(self, dataset): + self.dataset = dataset + + def on_epoch_begin(self, args, state, control, **kwargs): + print(f"Epoch {state.epoch + 1}: Regenerating dataset") + self.dataset.regenerate_data() + + + +# %% +def custom_collate_fn(batch): + # Dynamically pad tensors to the longest sequence in the batch + input_ids = [item["input_ids"] for item in batch] + attention_masks = [item["attention_mask"] for item in batch] + labels = torch.stack([item["labels"] for item in batch]) + + # Pad inputs to the same length + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True) + attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True) + + return { + "input_ids": input_ids, + "attention_mask": attention_masks, + "labels": labels + } + + +########################################################################## +# training code +# %% +def train(): + + save_path = f'checkpoint' + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True) + + # make the dataset + + + # Define the callback + lean_df = df.drop(columns=['entity_name']) + dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer) + + # create the regeneration callback + regeneration_callback = RegenerateDatasetCallback(dynamic_dataset) + + # compute metrics + metric = evaluate.load("accuracy") + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + 
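+        # note (descriptive comment, assumption by the editor): evaluation stays off
+        # (eval_strategy='no') because no eval_dataset is passed to the Trainer below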
load_best_model_at_end=False, + learning_rate=5e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=120, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=dynamic_dataset, + tokenizer=tokenizer, + data_collator=custom_collate_fn, + compute_metrics=compute_metrics, + callbacks=[regeneration_callback] + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/esAppMod_train/simple/prediction/.gitignore b/esAppMod_train/simple/prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/esAppMod_train/simple/prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/esAppMod_train/simple/prediction/output.txt b/esAppMod_train/simple/prediction/output.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/esAppMod_train/simple/prediction/output.txt @@ -0,0 +1 @@ + diff --git a/esAppMod_train/simple/prediction/predict.py b/esAppMod_train/simple/prediction/predict.py new file mode 100644 index 0000000..12b1954 --- /dev/null +++ b/esAppMod_train/simple/prediction/predict.py @@ -0,0 +1,264 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% +# construct the target id list +# data_path = '../../../esAppMod_data_import/train.csv' +data_path = '../../../esAppMod_data_import/train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + + +# introduce pre-processing functions +def preprocess_text(text): + # 1. 
Make all uppercase + text = text.lower() + + # Substitute digits with '#' + # text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = row['mention'] + desc = preprocess_text(desc) + index = row['entity_id'] + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + return output_list + + +def create_dataset(): + # train + data_path = '../../../esAppMod_data_import/test.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + # combined_data = DatasetDict({ + # 'train': Dataset.from_list(process_df_to_dict(train_df)), + # }) + return Dataset.from_list(process_df_to_dict(test_df)) + + + +# %% + +def test(): + + test_dataset = create_dataset() + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + # additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 128 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + label_list = [id2label[id] for id in pred_labels] + df = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + + # we can save the t5 generation output here + df.to_csv(f"exports/result.csv", index=False) + + + + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + test() diff --git a/esAppMod_train/simple/train.py b/esAppMod_train/simple/train.py new file mode 100644 index 0000000..3085560 --- /dev/null +++ b/esAppMod_train/simple/train.py @@ -0,0 +1,232 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=5 + +# %% + +# import training file +data_path = '../../esAppMod_data_import/train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + parent_desc = preprocess_text(parent_desc) + + # unaugmented data + element = { + 'text' : parent_desc, + 'labels': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + data_path = '../../esAppMod_data_import/train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data + + +# %% + +def train(): + + save_path = f'checkpoint' + split_datasets = create_dataset() + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'prajjwal1/bert-small' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + truncation=True, + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(target_id_list), + id2label=id2label, + label2id=label2id) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=5e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + warmup_steps=400, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() + + +# %% diff --git a/reference_code/dynamic_dataset_generation.py b/reference_code/dynamic_dataset_generation.py new file mode 100644 index 0000000..2d179ad --- /dev/null +++ b/reference_code/dynamic_dataset_generation.py @@ -0,0 +1,188 @@ +# why? +# the existing huggingface library does not allow for flexibility in changing +# the training data between epochs + +# this code example illustrates the use of dataset regeneration to make changes +# to the training data between epochs +# %% +from torch.utils.data import Dataset, DataLoader + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +from functools import partial +# import matplotlib.pyplot as plt + + + +torch.set_float32_matmul_precision('high') + +def set_seed(seed): + """ + Set the random seed for reproducibility. 
+ """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +# %% +# PARAMETERS +SAMPLES=5 + +# %% +# import training file +data_path = '../../esAppMod_data_import/train.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# we want to sample n samples from each class +# sample_size refers to the number of samples per class +def sample_from_df(df, sample_size_per_class=5): + sampled_df = (df.groupby( "entity_id")[['entity_id', 'mention']] # explicit give column names + .apply(lambda x: x.sample(n=min(sample_size_per_class, len(x)))) + .reset_index(drop=True)) + + return sampled_df + + +# %% +# augment whole dataset +# for now, we just return the same df +def augment_data(df): + return df + +# %% +class DynamicDataset(Dataset): + def __init__(self, df, sample_size_per_class, tokenizer): + """ + Args: + df (pd.DataFrame): Original DataFrame with class (id) and data columns. + sample_size_per_class (int): Number of samples to draw per class for each epoch. + """ + self.df = df + self.sample_size_per_class = sample_size_per_class + self.tokenizer = tokenizer + self.current_data = None + self.regenerate_data() # Generate the initial dataset + + def regenerate_data(self): + """ + Generate a new sampled dataset for the current epoch. 
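+        Note: this reference version keeps the raw entity_id as the label;
+        dynamic_train.py remaps the ids to contiguous 0..N-1 values
+        ('training_id') before training.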
+ + dynamic callback function to regenerate data each time we call this + method, it updates the current_data we can: + + - re-sample the dataframe for a new set of n_samples + - generate fresh augmentations this effectively + + This allows us to re-sample and re-augment at the start of each epoch + """ + # Sample `sample_size_per_class` rows per class + sampled_df = sample_from_df(self.df, self.sample_size_per_class) + + # perform future augmentations here + sampled_df = augment_data(sampled_df) + + # perform tokenization here + # Batch tokenize the entire column of data + tokenized_batch = self.tokenizer( + sampled_df["mention"].to_list(), # Pass all text data at once + truncation=True, + # return_tensors="pt" # disabled because pt requires equal length tensors + ) + + # Store the tokenized data with labels + # we need to convert to torch tensors so that subsequent 'pad_sequence' + # and 'stack' operations can work + self.current_data = [ + { + "input_ids": torch.tensor(tokenized_batch["input_ids"][i]), + "attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]), + "labels": torch.tensor(sampled_df.iloc[i]["entity_id"]) # Include the label + } + for i in range(len(sampled_df)) + ] + + + def __len__(self): + return len(self.current_data) + + def __getitem__(self, idx): + return self.current_data[idx] + + +# %% +# Dynamic dataset +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces=False) +lean_df = df.drop(columns=['entity_name']) +dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer) + +# %% +# custom tokenization + +# %% +# Example usage of dynamic dataset +sample = dynamic_dataset[0] +print(sample) + + +# %% +def custom_collate_fn(batch): + # Dynamically pad tensors to the longest sequence in the batch + input_ids = [item["input_ids"] for item in batch] + attention_masks = [item["attention_mask"] for item in batch] + labels = torch.stack([item["labels"] for item in batch]) + + # Pad inputs to the same length + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True) + attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True) + + return { + "input_ids": input_ids, + "attention_mask": attention_masks, + "labels": labels + } + + +dataloader = DataLoader( + dynamic_dataset, + batch_size=32, + collate_fn=custom_collate_fn +) + +# %%
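# --------------------------------------------------------------------------
# Usage sketch (editor's addition, not part of the diff above). The reference
# file builds `dynamic_dataset` and a DataLoader but stops short of a training
# loop; the sketch below shows how regenerate_data() could be driven from a
# plain loop instead of the RegenerateDatasetCallback used in
# esAppMod_train/simple/dynamic_train.py. It assumes the DynamicDataset,
# custom_collate_fn and target_id_list defined earlier in the reference file,
# and that labels have already been remapped to 0..N-1 (e.g. a 'training_id'
# column, as dynamic_train.py does); everything else is illustrative.
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(target_id_list),   # target_id_list as built earlier in the file
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 3   # illustrative value
for epoch in range(num_epochs):
    # re-sample, re-augment and re-tokenize the pool, as the callback would
    dynamic_dataset.regenerate_data()
    loader = DataLoader(
        dynamic_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=custom_collate_fn,   # pads each batch to its longest sequence
    )
    model.train()
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)        # the 'labels' key makes the model return a loss
        outputs.loss.backward()
        optimizer.step()
        optimizer.zero_grad()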