diff --git a/analysis/categories/label_print.py b/analysis/categories/label_print.py
index 29de0ec..1968f8d 100644
--- a/analysis/categories/label_print.py
+++ b/analysis/categories/label_print.py
@@ -10,6 +10,12 @@ mdm_list = sorted(list((set(full_df['pattern']))))
 
 # %%
 full_df
+# %%
+mdm_list
+
+# %%
+mask = full_df['pattern'] == 'GE#Flow FGMassFlow'
+full_df[mask]
 # %%
 mask1 = full_df['thing'] == 'ME1TurboCharger1'
 mask2 = full_df['property'] == 'LOInletPress'
diff --git a/train/frozen_t5_decoder/.gitignore b/train/frozen_t5_decoder/.gitignore
new file mode 100644
index 0000000..d943a39
--- /dev/null
+++ b/train/frozen_t5_decoder/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
\ No newline at end of file
diff --git a/train/frozen_t5_decoder/custom_t5/.gitignore b/train/frozen_t5_decoder/custom_t5/.gitignore
new file mode 100644
index 0000000..ed8ebf5
--- /dev/null
+++ b/train/frozen_t5_decoder/custom_t5/.gitignore
@@ -0,0 +1 @@
+__pycache__
\ No newline at end of file
diff --git a/train/frozen_t5_decoder/custom_t5/modeling_t5.py b/train/frozen_t5_decoder/custom_t5/modeling_t5.py
new file mode 100644
index 0000000..40ca208
--- /dev/null
+++ b/train/frozen_t5_decoder/custom_t5/modeling_t5.py
@@ -0,0 +1,125 @@
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers import (
+    T5PreTrainedModel,
+    T5Model
+)
+
+from transformers.modeling_outputs import (
+    SequenceClassifierOutput,
+)
+
+
+def mean_pooling(encoder_outputs, attention_mask):
+    """
+    Perform mean pooling over encoder outputs, considering the attention mask.
+    """
+    hidden_states = encoder_outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)
+    mask = attention_mask.unsqueeze(-1)  # Shape: (batch_size, seq_length, 1)
+    masked_hidden_states = hidden_states * mask  # Zero out padding tokens
+    sum_hidden_states = masked_hidden_states.sum(dim=1)  # Sum over sequence length
+    sum_mask = mask.sum(dim=1)  # Sum the mask (number of non-padding tokens)
+    return sum_hidden_states / sum_mask  # Mean pooling
+
+
+class T5EncoderForSequenceClassification(T5PreTrainedModel):
+
+    def __init__(self, checkpoint, tokenizer, config, num_labels):
+        super().__init__(config)
+        self.num_labels = num_labels
+        self.config = config
+
+        # we force the loading of a pre-trained model here
+        self.t5 = T5Model.from_pretrained(checkpoint)
+        self.t5.resize_token_embeddings(len(tokenizer))
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in
+            `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed
+            (Mean-Square loss); if `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # encoder_outputs = self.t5.encoder(
+        #     input_ids,
+        #     attention_mask=attention_mask,
+        #     head_mask=head_mask,
+        #     inputs_embeds=inputs_embeds,
+        #     output_attentions=output_attentions,
+        #     output_hidden_states=output_hidden_states,
+        #     return_dict=return_dict,
+        # )
+
+        encoder_outputs = self.t5.encoder(input_ids, attention_mask=attention_mask)
+        # last_hidden_state = encoder_outputs.last_hidden_state
+        # use mean of hidden state
+        # pooled_output = mean_pooling(encoder_outputs, attention_mask)
+
+        # Use the hidden state of the first token as the sequence representation
+        pooled_output = encoder_outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
+
+        # pooled_output = encoder_outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + encoder_outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
\ No newline at end of file
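
Note on the class above: it reads `config.classifier_dropout` and `config.hidden_dropout_prob`, neither of which is a native `T5Config` field, so the caller is expected to set them before construction (`config.hidden_size` does resolve, via T5Config's attribute map to `d_model`). A minimal instantiation sketch, not part of this diff; the dropout value and `num_labels` are assumptions:

import torch
from transformers import T5Config, T5TokenizerFast
from custom_t5.modeling_t5 import T5EncoderForSequenceClassification

checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
config = T5Config.from_pretrained(checkpoint)
config.classifier_dropout = None   # fall through to hidden_dropout_prob below
config.hidden_dropout_prob = 0.1   # assumed value; not a native T5Config field
model = T5EncoderForSequenceClassification(checkpoint, tokenizer, config, num_labels=100)  # e.g. len(mdm_list)
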
diff --git a/train/frozen_t5_decoder/mapping_prediction/.gitignore b/train/frozen_t5_decoder/mapping_prediction/.gitignore
new file mode 100644
index 0000000..e9ebfc9
--- /dev/null
+++ b/train/frozen_t5_decoder/mapping_prediction/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+exports/
diff --git a/train/frozen_t5_decoder/mapping_prediction/inference.py b/train/frozen_t5_decoder/mapping_prediction/inference.py
new file mode 100644
index 0000000..9ea9c77
--- /dev/null
+++ b/train/frozen_t5_decoder/mapping_prediction/inference.py
@@ -0,0 +1,168 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = []
+            for _, row in df.iterrows():
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
+                element = {
+                    'input': f"{desc}{unit}",
+                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+                }
+                output_list.append(element)
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+    def generate(self):
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id + 1:end_id]
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode predictions into thing/property strings
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
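
A minimal usage sketch for the class above (the checkpoint path and the tag description are hypothetical; the dataframe needs the 'tag_description', 'unit', 'thing' and 'property' fields the docstring lists):

import pandas as pd
from inference import Inference

df = pd.DataFrame([{'tag_description': 'ME1 T/C LO INLET PRESS', 'unit': 'bar',   # hypothetical row
                    'thing': 'ME1TurboCharger1', 'property': 'LOInletPress'}])
infer = Inference("checkpoint_fold_1b/checkpoint-1000")  # hypothetical path
infer.prepare_dataloader(df, batch_size=512, max_length=128)
things, properties = infer.generate()
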
diff --git a/train/frozen_t5_decoder/mapping_prediction/output.txt b/train/frozen_t5_decoder/mapping_prediction/output.txt
new file mode 100644
index 0000000..58eca44
--- /dev/null
+++ b/train/frozen_t5_decoder/mapping_prediction/output.txt
@@ -0,0 +1,2 @@
+
+Accuracy for fold 1: 0.0
diff --git a/train/frozen_t5_decoder/mapping_prediction/predict.py b/train/frozen_t5_decoder/mapping_prediction/predict.py
new file mode 100644
index 0000000..6bb0650
--- /dev/null
+++ b/train/frozen_t5_decoder/mapping_prediction/predict.py
@@ -0,0 +1,74 @@
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../'
+
+BATCH_SIZE = 512
+
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
+
+    # get target data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # here we want to evaluate mapping accuracy within the valid in-MDM data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct / sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1]:
+    infer_and_select(fold)
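
The accuracy computed in infer_and_select reduces to a masked boolean mean; a toy check of the same arithmetic (values made up):

import pandas as pd

df = pd.DataFrame({
    'MDM':      [True, True, True, True],
    'thing':    ['A', 'B', 'C', 'D'],  'p_thing':    ['A', 'B', 'C', 'X'],
    'property': ['p', 'q', 'r', 's'],  'p_property': ['p', 'q', 'r', 's'],
})
correct = (df['p_thing'] == df['thing']) & (df['p_property'] == df['property']) & df['MDM']
print(sum(correct) / sum(df['MDM']))  # 0.75: one of four in-MDM rows has a wrong thing
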
diff --git a/train/frozen_t5_decoder/train_decoder.py b/train/frozen_t5_decoder/train_decoder.py
new file mode 100644
index 0000000..18f9e0f
--- /dev/null
+++ b/train/frozen_t5_decoder/train_decoder.py
@@ -0,0 +1,236 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from custom_t5.modeling_t5 import T5EncoderForSequenceClassification
+
+from safetensors.torch import load_file
+from transformers import (
+    T5Config,
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments,
+    T5ForConditionalGeneration,
+    T5Model
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+torch.set_float32_matmul_precision('high')
+
+
+# outputs a list of dictionaries
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+        element = {
+            'input': f"{desc}{unit}",
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+        'validation': Dataset.from_list(process_df_to_dict(validation_df)),
+    })
+    return combined_data
+
+
+# function to perform training for a given fold
+def train(fold):
+    save_path = f'checkpoint_fold_{fold}b'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            text_target=target,
+            max_length=max_length,
+            truncation=True,
+            padding="max_length"
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
+    # # Use glob to find matching paths
+    # # path is usually checkpoint_fold_1/checkpoint-
+    # # we are guaranteed to save only 1 checkpoint from training
+    # pattern = 'checkpoint-*'
+    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
+    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
+    # # Load the checkpoint
+    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
+    # checkpoint = load_file(checkpoint_path)
+    # # Filter out weights related to the classification head
+    # # given name format: t5.encoder.embed_tokens.weight
+    # # we want: encoder.embed_tokens.weight
+    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
+
+    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
+    # change the token embedding size to match the shape
+    model.resize_token_embeddings(len(tokenizer))
+
+    # model.load_state_dict(state_dict=t5_weights, strict=False)
+
+    # for key, param in model.state_dict().items():
+    #     if key in t5_weights:
+    #         print(f"{key}: Successfully overridden")
+    #     else:
+    #         print(f"{key}: Retained original weights")
+
+    # Freeze the decoder
+    for param in model.decoder.parameters():
+        param.requires_grad = False
+
+    # Freeze the shared embedding layer
+    for param in model.shared.parameters():
+        param.requires_grad = False
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds,
+                                               skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels,
+                                                skip_special_tokens=False)
+
+        # Remove padding tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        # Some simple post-processing
+        # decoded_preds = [pred.strip() for pred in decoded_preds]
+        # decoded_labels = [[label.strip()] for label in decoded_labels]
+        # print(decoded_preds, decoded_labels)
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+    # Generation Config
+    # from transformers import GenerationConfig
+    gen_config = model.generation_config
+    gen_config.max_length = 128
+
+    # compile
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+    # Trainer
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        # eval_strategy="epoch",
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        # save_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=1e-3,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+
+# execute training
+for fold in [1]:
+    print(fold)
+    train(fold)
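
A quick sanity check, not in the commit, that the freeze above behaves as intended: with the decoder and the shared embedding frozen, only encoder tensors should remain trainable (in the default T5 configuration the lm_head is tied to the shared embedding, so it is frozen as well):

trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(f"{len(trainable)} trainable tensors, e.g. {trainable[:3]}")
assert all(name.startswith("encoder.") for name in trainable)  # expected for this freeze
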
diff --git a/train/frozen_t5_encoder/.gitignore b/train/frozen_t5_encoder/.gitignore
new file mode 100644
index 0000000..d943a39
--- /dev/null
+++ b/train/frozen_t5_encoder/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
\ No newline at end of file
diff --git a/train/frozen_t5_encoder/mapping_prediction/.gitignore b/train/frozen_t5_encoder/mapping_prediction/.gitignore
new file mode 100644
index 0000000..e9ebfc9
--- /dev/null
+++ b/train/frozen_t5_encoder/mapping_prediction/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+exports/
diff --git a/train/frozen_t5_encoder/mapping_prediction/inference.py b/train/frozen_t5_encoder/mapping_prediction/inference.py
new file mode 100644
index 0000000..9ea9c77
--- /dev/null
+++ b/train/frozen_t5_encoder/mapping_prediction/inference.py
@@ -0,0 +1,168 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = []
+            for _, row in df.iterrows():
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
+                element = {
+                    'input': f"{desc}{unit}",
+                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+                }
+                output_list.append(element)
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+    def generate(self):
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id + 1:end_id]
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode predictions into thing/property strings
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
diff --git a/train/frozen_t5_encoder/mapping_prediction/output.txt b/train/frozen_t5_encoder/mapping_prediction/output.txt
new file mode 100644
index 0000000..52834ce
--- /dev/null
+++ b/train/frozen_t5_encoder/mapping_prediction/output.txt
@@ -0,0 +1,3 @@
+
+Accuracy for fold 1: 0.9342167534311405
+Accuracy for fold 2: 0.883177570093458
diff --git a/train/frozen_t5_encoder/mapping_prediction/predict.py b/train/frozen_t5_encoder/mapping_prediction/predict.py
new file mode 100644
index 0000000..29e45f8
--- /dev/null
+++ b/train/frozen_t5_encoder/mapping_prediction/predict.py
@@ -0,0 +1,74 @@
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../'
+
+BATCH_SIZE = 512
+
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
+
+    # get target data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # here we want to evaluate mapping accuracy within the valid in-MDM data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct / sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
diff --git a/train/frozen_t5_encoder/train_decoder.py b/train/frozen_t5_encoder/train_decoder.py
new file mode 100644
index 0000000..41c3a39
--- /dev/null
+++ b/train/frozen_t5_encoder/train_decoder.py
@@ -0,0 +1,235 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+
+from safetensors.torch import load_file
+from transformers import (
+    T5Config,
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments,
+    T5ForConditionalGeneration,
+    T5Model
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+torch.set_float32_matmul_precision('high')
+
+
+# outputs a list of dictionaries
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+        element = {
+            'input': f"{desc}{unit}",
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+        'validation': Dataset.from_list(process_df_to_dict(validation_df)),
+    })
+    return combined_data
+
+
+# function to perform training for a given fold
+def train(fold):
+    save_path = f'checkpoint_fold_{fold}b'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            text_target=target,
+            max_length=max_length,
+            truncation=True,
+            padding="max_length"
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
+    # # Use glob to find matching paths
+    # # path is usually checkpoint_fold_1/checkpoint-
+    # # we are guaranteed to save only 1 checkpoint from training
+    # pattern = 'checkpoint-*'
+    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
+    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
+    # # Load the checkpoint
+    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
+    # checkpoint = load_file(checkpoint_path)
+    # # Filter out weights related to the classification head
+    # # given name format: t5.encoder.embed_tokens.weight
+    # # we want: encoder.embed_tokens.weight
+    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
+
+    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
+    # change the token embedding size to match the shape
+    model.resize_token_embeddings(len(tokenizer))
+
+    # model.load_state_dict(state_dict=t5_weights, strict=False)
+
+    # for key, param in model.state_dict().items():
+    #     if key in t5_weights:
+    #         print(f"{key}: Successfully overridden")
+    #     else:
+    #         print(f"{key}: Retained original weights")
+
+    # Freeze the encoder
+    for param in model.encoder.parameters():
+        param.requires_grad = False
+
+    # Freeze the shared embedding layer
+    for param in model.shared.parameters():
+        param.requires_grad = False
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds,
+                                               skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels,
+                                                skip_special_tokens=False)
+
+        # Remove padding tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        # Some simple post-processing
+        # decoded_preds = [pred.strip() for pred in decoded_preds]
+        # decoded_labels = [[label.strip()] for label in decoded_labels]
+        # print(decoded_preds, decoded_labels)
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+    # Generation Config
+    # from transformers import GenerationConfig
+    gen_config = model.generation_config
+    gen_config.max_length = 128
+
+    # compile
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+    # Trainer
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        # eval_strategy="epoch",
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        # save_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=1e-3,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+
+# execute training
+for fold in [1,2,3,4,5]:
+    print(fold)
+    train(fold)
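
The same freeze check for this variant, where it is the encoder and the shared embedding that are frozen (a sketch, not part of the commit):

def n_trainable(module):
    return sum(p.numel() for p in module.parameters() if p.requires_grad)

print("encoder:", n_trainable(model.encoder))  # expected: 0
print("decoder:", n_trainable(model.decoder))  # expected: > 0 (decoder blocks still train)
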
diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt b/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt
index cf3b2d5..08326c5 100644
--- a/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt
+++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt
@@ -1,6 +1,2 @@
 
-Accuracy for fold 1: 0.9427354472314246
-Accuracy for fold 2: 0.8859813084112149
-Accuracy for fold 3: 0.9683734939759037
-Accuracy for fold 4: 0.9762131303520457
-Accuracy for fold 5: 0.907924874026569
+Accuracy for fold 1: 0.9398958826313298
diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py b/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py
index 7baf191..6bb0650 100644
--- a/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py
+++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py
@@ -13,6 +13,7 @@ def infer_and_select(fold):
     # import test data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
 
     # get target data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
@@ -69,5 +70,5 @@ def infer_and_select(fold):
 with open("output.txt", "w") as f:
     print('', file=f)
 
-for fold in [1,2,3,4,5]:
+for fold in [1]:
     infer_and_select(fold)
diff --git a/train/hybrid_t5_complete_desc_unit/train_decoder.py b/train/hybrid_t5_complete_desc_unit/train_decoder.py
index 7725033..31a3ee4 100644
--- a/train/hybrid_t5_complete_desc_unit/train_decoder.py
+++ b/train/hybrid_t5_complete_desc_unit/train_decoder.py
@@ -120,14 +120,23 @@ def train(fold):
     checkpoint_path = f"{prev_checkpoint}/model.safetensors"
     checkpoint = load_file(checkpoint_path)
     # Filter out weights related to the classification head
-    t5_weights = {key: value for key, value in checkpoint.items() if "classifier" not in key}
-
+    # given name format: t5.encoder.embed_tokens.weight
+    # we want: encoder.embed_tokens.weight
+    t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
 
     model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-    model.load_state_dict(state_dict=t5_weights, strict=False)
-    # important! after extending tokens vocab
+    # change the token embedding size to match the shape
     model.resize_token_embeddings(len(tokenizer))
 
+    model.load_state_dict(state_dict=t5_weights, strict=False)
+
+    for key, param in model.state_dict().items():
+        if key in t5_weights:
+            print(f"{key}: Successfully overridden")
+        else:
+            print(f"{key}: Retained original weights")
+
+
     # Freeze the encoder
     for param in model.encoder.parameters():
         param.requires_grad = False
@@ -194,7 +203,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=80,
+        num_train_epochs=40,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -221,7 +230,7 @@ def train(fold):
     trainer.train()
 
 # execute training
-for fold in [1,2,3,4,5]:
+for fold in [1]:
     print(fold)
     train(fold)
diff --git a/train/hybrid_t5_complete_desc_unit/train_encoder.py b/train/hybrid_t5_complete_desc_unit/train_encoder.py
index 6ce6a39..5fff07d 100644
--- a/train/hybrid_t5_complete_desc_unit/train_encoder.py
+++ b/train/hybrid_t5_complete_desc_unit/train_encoder.py
@@ -35,12 +35,13 @@ torch.set_float32_matmul_precision('high')
 # import the full mdm-only file
 data_path = '../../data_import/exports/data_mapping_mdm.csv'
 full_df = pd.read_csv(data_path, skipinitialspace=True)
-mdm_list = sorted(list((set(full_df['pattern']))))
+# mdm_list = sorted(list((set(full_df['pattern']))))
 
-# # rather than use pattern, we use the real thing and property
-# thing_property = full_df['thing'] + full_df['property']
-# thing_property = thing_property.to_list()
-# mdm_list = sorted(list(set(thing_property)))
+# rather than use pattern, we use the real thing and property
+thing_property = full_df['thing'] + full_df['property']
+thing_property = thing_property.to_list()
+mdm_list = sorted(list(set(thing_property)))
+print("number of classes: ", len(mdm_list))
 
 # %%
@@ -62,8 +63,8 @@ def process_df_to_dict(df, mdm_list):
     for _, row in df.iterrows():
         desc = f"<DESC>{row['tag_description']}<DESC>"
         unit = f"<UNIT>{row['unit']}<UNIT>"
-        # pattern = f"{row['thing'] + row['property']}"
-        pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}"
+        pattern = f"{row['thing'] + row['property']}"
+        # pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}"
         try:
             index = mdm_list.index(pattern)
         except ValueError:
@@ -137,7 +138,7 @@ def train(fold):
         remove_columns="text",
     )
 
-    # %% temp
+    # %% temp
     # t5_classify = T5Model.from_pretrained(prev_checkpoint)
     # tokenized_datasets['train'].rename_columns()
     # %%
@@ -192,7 +193,7 @@ def train(fold):
         per_device_train_batch_size=128,
         per_device_eval_batch_size=128,
         auto_find_batch_size=False,
-        ddp_find_unused_parameters=False,
+        ddp_find_unused_parameters=False,  # t5_classify = T5Model.from_pretrained(prev_checkpoint)
         weight_decay=0.01,
         save_total_limit=1,
         num_train_epochs=80,
@@ -220,7 +221,7 @@ def train(fold):
     trainer.train()
 
 # execute training
-for fold in [1,2,3,4,5]:
+for fold in [1]:
     print(fold)
     train(fold)
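
The key renaming introduced in the train_decoder.py hunk above is easy to verify in isolation; a self-contained sketch with made-up checkpoint keys:

checkpoint = {
    "t5.encoder.embed_tokens.weight": "...",
    "t5.decoder.block.0.layer.0.SelfAttention.q.weight": "...",
    "classifier.weight": "...",  # classification head, dropped by the filter
}
t5_weights = {key.replace("t5.", "", 1): value
              for key, value in checkpoint.items() if "classifier" not in key}
print(sorted(t5_weights))
# ['decoder.block.0.layer.0.SelfAttention.q.weight', 'encoder.embed_tokens.weight']
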
f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' + # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + # model.load_state_dict(state_dict=t5_weights, strict=False) + + # for key, param in model.state_dict().items(): + # if key in t5_weights: + # print(f"{key}: Successfully overridden") + # else: + # print(f"{key}: Retained original weights") + + + # Freeze the encoder + for param in model.encoder.parameters(): + param.requires_grad = False + + # Freeze the shared embedding layer + for param in model.shared.parameters(): + param.requires_grad = False + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + 
diff --git a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/.gitignore b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/.gitignore
new file mode 100644
index 0000000..e9ebfc9
--- /dev/null
+++ b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+exports/
diff --git a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/inference.py b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/inference.py
new file mode 100644
index 0000000..9ea9c77
--- /dev/null
+++ b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/inference.py
@@ -0,0 +1,168 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = []
+            for _, row in df.iterrows():
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
+                element = {
+                    'input': f"{desc}{unit}",
+                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+                }
+                output_list.append(element)
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+    def generate(self):
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id + 1:end_id]
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode predictions into thing/property strings
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
diff --git a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt
new file mode 100644
index 0000000..344ffc5
--- /dev/null
+++ b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9337434926644581
+Accuracy for fold 2: 0.914018691588785
+Accuracy for fold 3: 0.9623493975903614
+Accuracy for fold 4: 0.9738344433872502
+Accuracy for fold 5: 0.9042601923957856
diff --git a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/predict.py b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/predict.py
new file mode 100644
index 0000000..29e45f8
--- /dev/null
+++ b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/predict.py
@@ -0,0 +1,74 @@
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../'
+
+BATCH_SIZE = 512
+
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
+
+    # get target data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # here we want to evaluate mapping accuracy within the valid in-MDM data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct / sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
diff --git a/train/hybrid_t5_pattern_desc_unit/train_decoder.py b/train/hybrid_t5_pattern_desc_unit/train_decoder.py
new file mode 100644
index 0000000..ed95aa6
--- /dev/null
+++ b/train/hybrid_t5_pattern_desc_unit/train_decoder.py
@@ -0,0 +1,234 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from custom_t5.modeling_t5 import T5EncoderForSequenceClassification
+
+from safetensors.torch import load_file
+from transformers import (
+    T5Config,
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments,
+    T5ForConditionalGeneration,
+    T5Model
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+torch.set_float32_matmul_precision('high')
+
+
+# outputs a list of dictionaries
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+        element = {
+            'input': f"{desc}{unit}",
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+        'validation': Dataset.from_list(process_df_to_dict(validation_df)),
+    })
+    return combined_data
+
+
+# function to perform training for a given fold
+def train(fold):
+    save_path = f'checkpoint_fold_{fold}b'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            text_target=target,
+            max_length=max_length,
+            truncation=True,
+            padding="max_length"
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    directory = os.path.join(".", f'checkpoint_fold_{fold}a')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
+    # Load the checkpoint
+    checkpoint_path = f"{prev_checkpoint}/model.safetensors"
+    checkpoint = load_file(checkpoint_path)
+    # Filter out weights related to the classification head
+    # given name format: t5.encoder.embed_tokens.weight
+    # we want: encoder.embed_tokens.weight
+    t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
+
+    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
+    # change the token embedding size to match the shape
+    model.resize_token_embeddings(len(tokenizer))
+
+    model.load_state_dict(state_dict=t5_weights, strict=False)
+
+    for key, param in model.state_dict().items():
+        if key in t5_weights:
+            print(f"{key}: Successfully overridden")
+        else:
+            print(f"{key}: Retained original weights")
+
+    # Freeze the encoder
+    for param in model.encoder.parameters():
+        param.requires_grad = False
+
+    # Freeze the shared embedding layer
+    for param in model.shared.parameters():
+        param.requires_grad = False
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds,
+                                               skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels,
+                                                skip_special_tokens=False)
+
+        # Remove padding tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        # Some simple post-processing
+        # decoded_preds = [pred.strip() for pred in decoded_preds]
+        # decoded_labels = [[label.strip()] for label in decoded_labels]
+        # print(decoded_preds, decoded_labels)
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+    # Generation Config
+    # from transformers import GenerationConfig
+    gen_config = model.generation_config
+    gen_config.max_length = 128
+
+    # compile
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+    # Trainer
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        # eval_strategy="epoch",
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        # save_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=1e-3,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+
+# execute training
+for fold in [1]:
+    print(fold)
+    train(fold)
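
As an alternative to the print loop above, load_state_dict(strict=False) already reports what was and was not overridden; a sketch (same model and t5_weights as in the file):

result = model.load_state_dict(t5_weights, strict=False)
print("missing (kept original weights):", len(result.missing_keys))
print("unexpected (ignored):", len(result.unexpected_keys))
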
diff --git a/train/hybrid_t5_pattern_desc_unit/train_encoder.py b/train/hybrid_t5_pattern_desc_unit/train_encoder.py
new file mode 100644
index 0000000..eb31879
--- /dev/null
+++ b/train/hybrid_t5_pattern_desc_unit/train_encoder.py
@@ -0,0 +1,228 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from custom_t5.modeling_t5 import T5EncoderForSequenceClassification
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+    Trainer,
+    EarlyStoppingCallback,
+    TrainingArguments,
+    T5Config,
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+torch.set_float32_matmul_precision('high')
+
+# %%
+
+# we need to create the mdm_list
+# import the full mdm-only file
+data_path = '../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+mdm_list = sorted(list((set(full_df['pattern']))))
+
+# # rather than use pattern, we use the real thing and property
+# thing_property = full_df['thing'] + full_df['property']
+# thing_property = thing_property.to_list()
+# mdm_list = sorted(list(set(thing_property)))
+
+# %%
+id2label = {}
+label2id = {}
+for idx, val in enumerate(mdm_list):
+    id2label[idx] = val
+    label2id[val] = idx
+
+# %%
+
+# outputs a list of dictionaries
+# processes dataframe into lists of dictionaries
+# each element maps input to output
+# input: tag_description
+# output: class label
+def process_df_to_dict(df, mdm_list):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+        # pattern = f"{row['thing'] + row['property']}"
+        pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}"
+        try:
+            index = mdm_list.index(pattern)
+        except ValueError:
+            print("Error: value not found in MDM list")
+            index = -1
+        element = {
+            'text': f"{desc}{unit}",
+            'label': index,
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold, mdm_list):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
+        'validation': Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
+    })
+    return combined_data
+
+
+# %%
+
+# function to perform training for a given fold
+def train(fold):
+
+    save_path = f'checkpoint_fold_{fold}a'
+    split_datasets = create_split_dataset(fold, mdm_list)
+
+    #
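
A side note on process_df_to_dict above: mdm_list.index(pattern) is an O(n) scan per row, while the label2id dict built earlier in the file gives the same index in O(1). A sketch of the substitution (not what the commit does, and only valid when mdm_list is built from the same pattern strings):

index = label2id.get(pattern, -1)  # replaces the try/except around mdm_list.index
if index == -1:
    print("Error: value not found in MDM list")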
additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): 
+ p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt new file mode 100644 index 0000000..344ffc5 --- /dev/null +++ b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.9337434926644581 +Accuracy for fold 2: 0.914018691588785 +Accuracy for fold 3: 0.9623493975903614 +Accuracy for fold 4: 0.9738344433872502 +Accuracy for fold 5: 0.9042601923957856 diff --git a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/predict.py b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/predict.py new file mode 100644 index 0000000..29e45f8 --- /dev/null +++ b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/predict.py @@ -0,0 +1,74 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + 
pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/hybrid_t5_pattern_desc_unit/train_decoder.py b/train/hybrid_t5_pattern_desc_unit/train_decoder.py new file mode 100644 index 0000000..ed95aa6 --- /dev/null +++ b/train/hybrid_t5_pattern_desc_unit/train_decoder.py @@ -0,0 +1,234 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from custom_t5.modeling_t5 import T5EncoderForSequenceClassification + +from safetensors.torch import load_file +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + 
remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # Load the checkpoint + checkpoint_path = f"{prev_checkpoint}/model.safetensors" + checkpoint = load_file(checkpoint_path) + # Filter out weights related to the classification head + # given name format: t5.encoder.embed_tokens.weight + # we want: encoder.embed.tokens.weight + t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + model.load_state_dict(state_dict=t5_weights, strict=False) + + for key, param in model.state_dict().items(): + if key in t5_weights: + print(f"{key}: Successfully overridden") + else: + print(f"{key}: Retained original weights") + + # Freeze the encoder + for param in model.encoder.parameters(): + param.requires_grad = False + + # Freeze the shared embedding layer + for param in model.shared.parameters(): + param.requires_grad = False + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + 
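# predict_with_generate=True makes evaluation decode via model.generate() under
# the generation_config passed above, so compute_metrics receives generated
# token ids rather than logits; remove_unused_columns=False (next line) stops
# the Trainer from dropping dataset columns that do not match the model's
# forward() signature.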
remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1]: + print(fold) + train(fold) + diff --git a/train/hybrid_t5_pattern_desc_unit/train_encoder.py b/train/hybrid_t5_pattern_desc_unit/train_encoder.py new file mode 100644 index 0000000..eb31879 --- /dev/null +++ b/train/hybrid_t5_pattern_desc_unit/train_encoder.py @@ -0,0 +1,228 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from custom_t5.modeling_t5 import T5EncoderForSequenceClassification +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + T5Config, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['pattern'])))) + +# # rather than use pattern, we use the real thing and property +# thing_property = full_df['thing'] + full_df['property'] +# thing_property = thing_property.to_list() +# mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + # pattern = f"{row['thing'] + row['property']}" + pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}a' + split_datasets = create_split_dataset(fold, mdm_list) + + # 
prepare tokenizer + + # model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + model_checkpoint = "t5-small" + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + # model = AutoModelForSequenceClassification.from_pretrained( + # model_checkpoint, + # num_labels=len(mdm_list), + # id2label=id2label, + # label2id=label2id) + model = T5EncoderForSequenceClassification( + checkpoint=model_checkpoint, + tokenizer=tokenizer, + config=T5Config.from_pretrained(model_checkpoint), + num_labels=len(mdm_list) + ) + # important! 
after extending tokens vocab + # model.t5.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1]: + print(fold) + train(fold) + + +# %% diff --git a/train/modified_t5_decoder_12_layers/.gitignore b/train/modified_t5_decoder_12_layers/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/modified_t5_decoder_12_layers/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/.gitignore b/train/modified_t5_decoder_12_layers/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/modified_t5_decoder_12_layers/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/inference.py b/train/modified_t5_decoder_12_layers/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/modified_t5_decoder_12_layers/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch 
size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/output.txt 
b/train/modified_t5_decoder_12_layers/mapping_prediction/output.txt new file mode 100644 index 0000000..877665d --- /dev/null +++ b/train/modified_t5_decoder_12_layers/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.9403691433980123 +Accuracy for fold 2: 0.9046728971962616 +Accuracy for fold 3: 0.9678714859437751 +Accuracy for fold 4: 0.9695528068506185 +Accuracy for fold 5: 0.902427851580394 diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_12_layers/mapping_prediction/predict.py new file mode 100644 index 0000000..29e45f8 --- /dev/null +++ b/train/modified_t5_decoder_12_layers/mapping_prediction/predict.py @@ -0,0 +1,74 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/modified_t5_decoder_12_layers/train_decoder.py b/train/modified_t5_decoder_12_layers/train_decoder.py new file mode 100644 index 0000000..c1fd98b --- /dev/null +++ b/train/modified_t5_decoder_12_layers/train_decoder.py @@ -0,0 
+1,255 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch + +from safetensors.torch import load_file + +from transformers.models.t5.modeling_t5 import T5Block +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + + + +# %% + +# model_checkpoint = "t5-small" +# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) +# model.config + +# %% +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' 
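# What the active code below this commented-out block does, in short: it builds
# a fresh T5ForConditionalGeneration whose config asks for 12 decoder layers,
# reuses the pretrained shared embeddings and 6-layer encoder unchanged, and
# copies the 6 pretrained decoder blocks into slots 0-5; slots 6-11 keep their
# random initialization. A quick check of that claim (sketch, assuming the
# `model` and `pretrained_model` objects defined below):
#   import torch
#   a = model.decoder.block[0].state_dict()
#   b = pretrained_model.decoder.block[0].state_dict()
#   assert all(torch.equal(a[k], b[k]) for k in a)  # block 0: pretrained
#   # no such guarantee holds for model.decoder.block[6:]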
+ # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + + pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + + # Access the decoder stack + # config = T5Config("t5-small") + + config = pretrained_model.config + config.num_layers = 6 + config.num_decoder_layers = 12 # set new decoder layer count + + model = T5ForConditionalGeneration(config) + + model.shared = pretrained_model.shared + model.encoder = pretrained_model.encoder + + pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] + for i, layer in enumerate(pretrained_decoder_weights): + model.decoder.block[i].load_state_dict(layer) # Load pretrained weights + + + # print number of decoder blocks + print(f'Number of decoder blocks: {len(model.decoder.block)}') + print(f'num_layers: {model.config.num_layers}') + print(f'num_decoder_layers: {model.config.num_decoder_layers}') + + + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + 
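# eval_dataset is wired in here, but with eval_strategy="no" in the arguments
# above it is never evaluated during training; it only takes effect if
# trainer.evaluate() is invoked manually after the run.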
eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + diff --git a/train/modified_t5_decoder_1_layers/.gitignore b/train/modified_t5_decoder_1_layers/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/modified_t5_decoder_1_layers/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/train/modified_t5_decoder_1_layers/mapping_prediction/.gitignore b/train/modified_t5_decoder_1_layers/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/modified_t5_decoder_1_layers/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/modified_t5_decoder_1_layers/mapping_prediction/inference.py b/train/modified_t5_decoder_1_layers/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/modified_t5_decoder_1_layers/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + 
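# padding="max_length" pads every example to the fixed max_length, so the
# batched map yields rectangular arrays; set_format(type='torch') further down
# can then hand uniform input_ids/attention_mask/labels tensors straight to
# the DataLoader with the default collate.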
padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/modified_t5_decoder_1_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_1_layers/mapping_prediction/output.txt new file mode 100644 index 0000000..1607643 --- /dev/null +++ b/train/modified_t5_decoder_1_layers/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.8968291528632276 +Accuracy for fold 2: 0.8859813084112149 +Accuracy for fold 3: 0.9382530120481928 +Accuracy for fold 4: 0.9586108468125595 +Accuracy for fold 5: 0.8827301878149336 diff --git a/train/modified_t5_decoder_1_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_1_layers/mapping_prediction/predict.py new file mode 100644 index 0000000..29e45f8 --- /dev/null +++ b/train/modified_t5_decoder_1_layers/mapping_prediction/predict.py @@ -0,0 +1,74 @@ + +import pandas as pd +import os +import glob +from inference import Inference + 
+checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/modified_t5_decoder_1_layers/train_decoder.py b/train/modified_t5_decoder_1_layers/train_decoder.py new file mode 100644 index 0000000..7780901 --- /dev/null +++ b/train/modified_t5_decoder_1_layers/train_decoder.py @@ -0,0 +1,255 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch + +from safetensors.torch import load_file + +from transformers.models.t5.modeling_t5 import T5Block +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + + + +# %% + +# 
model_checkpoint = "t5-small" +# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) +# model.config + +# %% +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' + # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + + pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + + # Access the decoder stack + # config = T5Config("t5-small") + + config = pretrained_model.config + config.num_layers = 6 + config.num_decoder_layers = 1 # set new decoder 
layer count + + model = T5ForConditionalGeneration(config) + + model.shared = pretrained_model.shared + model.encoder = pretrained_model.encoder + + pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] + for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): + model.decoder.block[i].load_state_dict(layer) # Load pretrained weights + + + # print number of decoder blocks + print(f'Number of decoder blocks: {len(model.decoder.block)}') + print(f'num_layers: {model.config.num_layers}') + print(f'num_decoder_layers: {model.config.num_decoder_layers}') + + + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + diff --git a/train/modified_t5_decoder_2_layers/.gitignore b/train/modified_t5_decoder_2_layers/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/modified_t5_decoder_2_layers/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git 
a/train/modified_t5_decoder_2_layers/mapping_prediction/.gitignore b/train/modified_t5_decoder_2_layers/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/modified_t5_decoder_2_layers/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/modified_t5_decoder_2_layers/mapping_prediction/inference.py b/train/modified_t5_decoder_2_layers/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/modified_t5_decoder_2_layers/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + 
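# The marker ids 32100-32103 used in process_tensor_output below are the first
# four added special-token ids after t5-small's base vocabulary of 32100; by
# construction they bracket the thing span and the property span in the
# generated sequence (the literal token strings are not preserved in this
# patch text). A toy run of extract_seq with made-up ids:
#   import numpy as np
#   toks = np.array([0, 32100, 5, 6, 32101, 32102, 7, 32103, 1])
#   toks[np.where(toks == 32100)[0][0] + 1 : np.where(toks == 32101)[0][0]]
#   # -> array([5, 6]): the ids decoded into p_thing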
for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/modified_t5_decoder_2_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_2_layers/mapping_prediction/output.txt new file mode 100644 index 0000000..83c8528 --- /dev/null +++ b/train/modified_t5_decoder_2_layers/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.9318504495977283 +Accuracy for fold 2: 0.8859813084112149 +Accuracy for fold 3: 0.9678714859437751 +Accuracy for fold 4: 0.9738344433872502 +Accuracy for fold 5: 0.9015116811726981 diff --git a/train/modified_t5_decoder_2_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_2_layers/mapping_prediction/predict.py new file mode 100644 index 0000000..29e45f8 --- /dev/null +++ b/train/modified_t5_decoder_2_layers/mapping_prediction/predict.py @@ -0,0 +1,74 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, 
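# The 'b' suffix follows the two-stage checkpoint naming used across these
# experiments: checkpoint_fold_<n>a is the encoder-classifier stage saved by
# train_encoder.py, checkpoint_fold_<n>b the seq2seq stage evaluated here; in
# the modified-decoder variants the 'a' stage is commented out and the suffix
# appears to be kept for consistency.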
f'checkpoint_fold_{fold}b') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/modified_t5_decoder_2_layers/train_decoder.py b/train/modified_t5_decoder_2_layers/train_decoder.py new file mode 100644 index 0000000..fa96896 --- /dev/null +++ b/train/modified_t5_decoder_2_layers/train_decoder.py @@ -0,0 +1,255 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch + +from safetensors.torch import load_file + +from transformers.models.t5.modeling_t5 import T5Block +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + + + +# %% + +# model_checkpoint = "t5-small" +# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) +# model.config + +# %% +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, 
skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' + # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + + pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + + # Access the decoder stack + # config = T5Config("t5-small") + + config = pretrained_model.config + config.num_layers = 6 + config.num_decoder_layers = 2 # set new decoder layer count + + model = T5ForConditionalGeneration(config) + + model.shared = pretrained_model.shared + model.encoder = pretrained_model.encoder + + pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] + for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): + model.decoder.block[i].load_state_dict(layer) # Load pretrained weights + + + # print number of decoder blocks + print(f'Number of decoder blocks: {len(model.decoder.block)}') + print(f'num_layers: {model.config.num_layers}') + print(f'num_decoder_layers: {model.config.num_decoder_layers}') + + + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + + + data_collator = 
DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + diff --git a/train/modified_t5_decoder_3_layers/.gitignore b/train/modified_t5_decoder_3_layers/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/modified_t5_decoder_3_layers/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/.gitignore b/train/modified_t5_decoder_3_layers/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/modified_t5_decoder_3_layers/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/inference.py b/train/modified_t5_decoder_3_layers/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/modified_t5_decoder_3_layers/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm 
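+# The decoding step further down assumes generation wraps each predicted field
+# in sentinel tokens (ids 32100/32101 for the thing span, 32102/32103 for the
+# property span; see extract_seq below). Hypothetical usage sketch -- the
+# checkpoint path here is an example, not a real artifact of this repo:
+#   infer = Inference("../checkpoint_fold_1b/checkpoint-1000")
+#   infer.prepare_dataloader(test_df, batch_size=512, max_length=128)
+#   thing_preds, property_preds = infer.generate()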
+from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle 
+            # this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id+1:end_id]
+
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if thing_seq is not None:
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if property_seq is not None:
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode predicted token sequences into (thing, property) string pairs
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
+
diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_3_layers/mapping_prediction/output.txt
new file mode 100644
index 0000000..539366c
--- /dev/null
+++ b/train/modified_t5_decoder_3_layers/mapping_prediction/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9427354472314246
+Accuracy for fold 2: 0.9098130841121496
+Accuracy for fold 3: 0.964859437751004
+Accuracy for fold 4: 0.9719314938154139
+Accuracy for fold 5: 0.9070087036188731
diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_3_layers/mapping_prediction/predict.py
new file mode 100644
index 0000000..29e45f8
--- /dev/null
+++ b/train/modified_t5_decoder_3_layers/mapping_prediction/predict.py
@@ -0,0 +1,74 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../'
+
+BATCH_SIZE = 512
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # load test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
+
+    # get target data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find the saved checkpoint directory
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    # the path is usually checkpoint_fold_1/checkpoint-<step>
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the prediction lists to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # 
df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/modified_t5_decoder_3_layers/train_decoder.py b/train/modified_t5_decoder_3_layers/train_decoder.py new file mode 100644 index 0000000..d4d3170 --- /dev/null +++ b/train/modified_t5_decoder_3_layers/train_decoder.py @@ -0,0 +1,255 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch + +from safetensors.torch import load_file + +from transformers.models.t5.modeling_t5 import T5Block +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + + + +# %% + +# model_checkpoint = "t5-small" +# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) +# model.config + +# %% +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": 
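+    # the added sentinel tokens are appended after t5-small's base vocabulary,
+    # so (assuming a base size of 32100) they receive ids 32100, 32101, ...;
+    # the decoding side relies on these ids, and model.resize_token_embeddings
+    # further down must be called so the embedding matrix matches the new size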
additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' + # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + + pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + + # Access the decoder stack + # config = T5Config("t5-small") + + config = pretrained_model.config + config.num_layers = 6 + config.num_decoder_layers = 3 # set new decoder layer count + + model = T5ForConditionalGeneration(config) + + model.shared = pretrained_model.shared + model.encoder = pretrained_model.encoder + + pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] + for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): + model.decoder.block[i].load_state_dict(layer) # Load pretrained weights + + + # print number of decoder blocks + print(f'Number of decoder blocks: {len(model.decoder.block)}') + print(f'num_layers: {model.config.num_layers}') + print(f'num_decoder_layers: {model.config.num_decoder_layers}') + + + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in 
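+        # sacrebleu expects one *list* of reference strings per prediction,
+        # hence each label is wrapped in its own single-element list here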
decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + diff --git a/train/modified_t5_decoder_4_layers/.gitignore b/train/modified_t5_decoder_4_layers/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/modified_t5_decoder_4_layers/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/train/modified_t5_decoder_4_layers/mapping_prediction/.gitignore b/train/modified_t5_decoder_4_layers/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/modified_t5_decoder_4_layers/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/modified_t5_decoder_4_layers/mapping_prediction/inference.py b/train/modified_t5_decoder_4_layers/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/modified_t5_decoder_4_layers/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def 
_load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for 
tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/modified_t5_decoder_4_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_4_layers/mapping_prediction/output.txt new file mode 100644 index 0000000..7f5c6e2 --- /dev/null +++ b/train/modified_t5_decoder_4_layers/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.9503076194983436 +Accuracy for fold 2: 0.9135514018691588 +Accuracy for fold 3: 0.9698795180722891 +Accuracy for fold 4: 0.9790675547098002 +Accuracy for fold 5: 0.907924874026569 diff --git a/train/modified_t5_decoder_4_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_4_layers/mapping_prediction/predict.py new file mode 100644 index 0000000..29e45f8 --- /dev/null +++ b/train/modified_t5_decoder_4_layers/mapping_prediction/predict.py @@ -0,0 +1,74 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + 
+########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/modified_t5_decoder_4_layers/train_decoder.py b/train/modified_t5_decoder_4_layers/train_decoder.py new file mode 100644 index 0000000..155cdfd --- /dev/null +++ b/train/modified_t5_decoder_4_layers/train_decoder.py @@ -0,0 +1,255 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch + +from safetensors.torch import load_file + +from transformers.models.t5.modeling_t5 import T5Block +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + + + +# %% + +# model_checkpoint = "t5-small" +# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) +# model.config + +# %% +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # 
https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' + # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + + pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + + # Access the decoder stack + # config = T5Config("t5-small") + + config = pretrained_model.config + config.num_layers = 6 + config.num_decoder_layers = 4 # set new decoder layer count + + model = T5ForConditionalGeneration(config) + + model.shared = pretrained_model.shared + model.encoder = pretrained_model.encoder + + pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] + for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): + model.decoder.block[i].load_state_dict(layer) # Load pretrained weights + + + # print number of decoder blocks + print(f'Number of decoder blocks: {len(model.decoder.block)}') + print(f'num_layers: {model.config.num_layers}') + print(f'num_decoder_layers: {model.config.num_decoder_layers}') + + + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + 
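+        # logging is per epoch while evaluation stays off during training;
+        # test-set accuracy is computed afterwards by mapping_prediction/predict.py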
logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + diff --git a/train/modified_t5_decoder_8_layers/.gitignore b/train/modified_t5_decoder_8_layers/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/modified_t5_decoder_8_layers/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/train/modified_t5_decoder_8_layers/mapping_prediction/.gitignore b/train/modified_t5_decoder_8_layers/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/modified_t5_decoder_8_layers/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/modified_t5_decoder_8_layers/mapping_prediction/inference.py b/train/modified_t5_decoder_8_layers/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/modified_t5_decoder_8_layers/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = 
f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/modified_t5_decoder_8_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_8_layers/mapping_prediction/output.txt new file mode 100644 index 0000000..37ce896 --- /dev/null +++ b/train/modified_t5_decoder_8_layers/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.9441552295314718 +Accuracy for fold 2: 
0.9121495327102803 +Accuracy for fold 3: 0.963855421686747 +Accuracy for fold 4: 0.9752616555661275 +Accuracy for fold 5: 0.907924874026569 diff --git a/train/modified_t5_decoder_8_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_8_layers/mapping_prediction/predict.py new file mode 100644 index 0000000..29e45f8 --- /dev/null +++ b/train/modified_t5_decoder_8_layers/mapping_prediction/predict.py @@ -0,0 +1,74 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/modified_t5_decoder_8_layers/train_decoder.py b/train/modified_t5_decoder_8_layers/train_decoder.py new file mode 100644 index 0000000..7f4e233 --- /dev/null +++ b/train/modified_t5_decoder_8_layers/train_decoder.py @@ -0,0 +1,255 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "3" + +import torch + +from 
safetensors.torch import load_file + +from transformers.models.t5.modeling_t5 import T5Block +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + + + +# %% + +# model_checkpoint = "t5-small" +# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) +# model.config + +# %% +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' + # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out 
weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + + pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + + # Access the decoder stack + # config = T5Config("t5-small") + + config = pretrained_model.config + config.num_layers = 6 + config.num_decoder_layers = 8 # set new decoder layer count + + model = T5ForConditionalGeneration(config) + + model.shared = pretrained_model.shared + model.encoder = pretrained_model.encoder + + pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] + for i, layer in enumerate(pretrained_decoder_weights): + model.decoder.block[i].load_state_dict(layer) # Load pretrained weights + + + # print number of decoder blocks + print(f'Number of decoder blocks: {len(model.decoder.block)}') + print(f'num_layers: {model.config.num_layers}') + print(f'num_decoder_layers: {model.config.num_decoder_layers}') + + + # change the token embedding size to match the shape + model.resize_token_embeddings(len(tokenizer)) + + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 
'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + diff --git a/train/modified_t5_decoder_9_layers/.gitignore b/train/modified_t5_decoder_9_layers/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/modified_t5_decoder_9_layers/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/.gitignore b/train/modified_t5_decoder_9_layers/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/modified_t5_decoder_9_layers/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/inference.py b/train/modified_t5_decoder_9_layers/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/modified_t5_decoder_9_layers/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, 
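+            # batched=True lets the tokenizer process whole slices at once;
+            # a single worker is enough since inference preprocessing is cheap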
+ num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_9_layers/mapping_prediction/output.txt new file mode 100644 index 0000000..37ce896 --- /dev/null +++ b/train/modified_t5_decoder_9_layers/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.9441552295314718 +Accuracy for fold 2: 0.9121495327102803 +Accuracy for fold 3: 0.963855421686747 +Accuracy for fold 4: 0.9752616555661275 +Accuracy for fold 5: 0.907924874026569 diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py new file mode 100644 index 0000000..29e45f8 --- /dev/null +++ b/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py @@ -0,0 +1,74 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = 
diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py
new file mode 100644
index 0000000..29e45f8
--- /dev/null
+++ b/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py
@@ -0,0 +1,74 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../'
+
+BATCH_SIZE = 512
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
+
+    # get target data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+    ##########################################
+    # run inference
+    # checkpoint: use glob to find the matching path,
+    # which is usually checkpoint_fold_1/checkpoint-<step>;
+    # we are guaranteed to save only 1 checkpoint from training
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the prediction lists to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # save the raw t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # evaluate mapping accuracy within the valid in-MDM data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct/sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
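The accuracy written to output.txt counts a row as correct only when both the thing and the property predictions match, restricted to rows flagged MDM. A toy sketch of that computation; the frame below uses made-up values, not project data:

# Toy sketch of the in-MDM accuracy computation in predict.py (hypothetical values).
import pandas as pd

df = pd.DataFrame({
    'MDM':        [True,   True,   False],
    'thing':      ['ME1',  'GE1',  'BLR1'],
    'property':   ['Press', 'Temp', 'Flow'],
    'p_thing':    ['ME1',  'GE1',  'BLR2'],
    'p_property': ['Press', 'Flow', 'Flow'],
})

in_mdm = df['MDM']
correct = (df['p_thing'] == df['thing']) & (df['p_property'] == df['property'])
accuracy = sum(correct & in_mdm) / sum(in_mdm)  # 1 of 2 MDM rows fully correct -> 0.5
print(accuracy)

Note that infer_and_select already filters the test frame to MDM rows, so in_mdm is all-True there; the mask only matters if that filter is ever removed.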
f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + # directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # # Use glob to find matching paths + # # path is usually checkpoint_fold_1/checkpoint- + # # we are guaranteed to save only 1 checkpoint from training + # pattern = 'checkpoint-*' + # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # # Load the checkpoint + # checkpoint_path = f"{prev_checkpoint}/model.safetensors" + # checkpoint = load_file(checkpoint_path) + # # Filter out weights related to the classification head + # # given name format: t5.encoder.embed_tokens.weight + # # we want: encoder.embed.tokens.weight + # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} + + + pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + + # Access the decoder stack + # config = T5Config("t5-small") + + config = pretrained_model.config + config.num_layers = 6 + config.num_decoder_layers = 9 # set new decoder layer count + + model = T5ForConditionalGeneration(config) + + model.shared = pretrained_model.shared + model.encoder = pretrained_model.encoder + + pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] + for i, layer in 
diff --git a/train/random_t5_encoder/.gitignore b/train/random_t5_encoder/.gitignore
new file mode 100644
index 0000000..d943a39
--- /dev/null
+++ b/train/random_t5_encoder/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
\ No newline at end of file
diff --git a/train/random_t5_encoder/mapping_prediction/.gitignore b/train/random_t5_encoder/mapping_prediction/.gitignore
new file mode 100644
index 0000000..e9ebfc9
--- /dev/null
+++ b/train/random_t5_encoder/mapping_prediction/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+exports/
diff --git a/train/random_t5_encoder/mapping_prediction/inference.py b/train/random_t5_encoder/mapping_prediction/inference.py
new file mode 100644
index 0000000..9ea9c77
--- /dev/null
+++ b/train/random_t5_encoder/mapping_prediction/inference.py
@@ -0,0 +1,168 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+    def _create_tokenizer(self):
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model from the checkpoint directory
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = []
+            for _, row in df.iterrows():
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
+                element = {
+                    'input': f"{desc}{unit}",
+                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+                }
+                output_list.append(element)
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+        # map applies the function to each "row" in the dataset,
+        # i.e. the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+    def generate(self):
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and collect the results
+            pred_generations.extend(outputs.to("cpu"))
+
+        # extract the token sequence between a start and end marker
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+            return tokens[start_id+1:end_id]
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode prediction labels
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
diff --git a/train/random_t5_encoder/mapping_prediction/output.txt b/train/random_t5_encoder/mapping_prediction/output.txt
new file mode 100644
index 0000000..f60f90f
--- /dev/null
+++ b/train/random_t5_encoder/mapping_prediction/output.txt
@@ -0,0 +1,2 @@
+
+Accuracy for fold 1: 0.9342167534311405
diff --git a/train/random_t5_encoder/mapping_prediction/predict.py b/train/random_t5_encoder/mapping_prediction/predict.py
new file mode 100644
index 0000000..29e45f8
--- /dev/null
+++ b/train/random_t5_encoder/mapping_prediction/predict.py
@@ -0,0 +1,74 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../'
+
+BATCH_SIZE = 512
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
+
+    # get target data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+    ##########################################
+    # run inference
+    # checkpoint: use glob to find the matching path,
+    # which is usually checkpoint_fold_1/checkpoint-<step>;
+    # we are guaranteed to save only 1 checkpoint from training
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the prediction lists to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # save the raw t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # evaluate mapping accuracy within the valid in-MDM data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct/sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
diff --git a/train/random_t5_encoder/train_decoder.py b/train/random_t5_encoder/train_decoder.py
new file mode 100644
index 0000000..17874d9
--- /dev/null
+++ b/train/random_t5_encoder/train_decoder.py
@@ -0,0 +1,246 @@
+# %%
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from safetensors.torch import load_file
+from transformers import (
+    T5Config,
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments,
+    T5ForConditionalGeneration,
+    T5Model
+)
+import evaluate
+import numpy as np
+import pandas as pd
+from datasets import Dataset, DatasetDict
+
+torch.set_float32_matmul_precision('high')
+
+# outputs a list of dictionaries
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+        element = {
+            'input': f"{desc}{unit}",
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+        output_list.append(element)
+    return output_list
+
+
+def create_split_dataset(fold):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+        'validation': Dataset.from_list(process_df_to_dict(validation_df)),
+    })
+    return combined_data
+
+
+# function to perform training for a given fold
+def train(fold):
+    save_path = f'checkpoint_fold_{fold}b'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "SIG", "UNIT", "DATA_TYPE"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            text_target=target,
+            max_length=max_length,
+            truncation=True,
+            padding="max_length"
+        )
+        return model_inputs
+
+    # map applies the function to each "row" in the dataset,
+    # i.e. the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
+    # # Use glob to find matching paths
+    # # path is usually checkpoint_fold_1/checkpoint-<step>
+    # # we are guaranteed to save only 1 checkpoint from training
+    # pattern = 'checkpoint-*'
+    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
+    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
+    # # Load the checkpoint
+    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
+    # checkpoint = load_file(checkpoint_path)
+    # # Filter out weights related to the classification head
+    # # given name format: t5.encoder.embed_tokens.weight
+    # # we want: encoder.embed_tokens.weight
+    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
+
+    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
+    # change the token embedding size to match the extended tokenizer
+    model.resize_token_embeddings(len(tokenizer))
+
+    # Create a T5 model with random weights
+    config = T5Config.from_pretrained("t5-small")  # Use T5 configuration
+    random_model = T5ForConditionalGeneration(config)  # Model initialized with random weights
+    random_model.resize_token_embeddings(len(tokenizer))
+
+    # swap the randomly initialized encoder and shared embedding into the pretrained model
+    model.encoder = random_model.encoder
+    model.shared = random_model.shared
+
+    # Freeze the encoder
+    for param in model.encoder.parameters():
+        param.requires_grad = False
+
+    # Freeze the shared embedding layer
+    for param in model.shared.parameters():
+        param.requires_grad = False
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False)
+
+        # Remove <pad> tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+    # Generation Config
+    gen_config = model.generation_config
+    gen_config.max_length = 128
+
+    # Trainer
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=1e-3,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+# execute training
+for fold in [1,2,3,4,5]:
+    print(fold)
+    train(fold)
+
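The random_t5_encoder run above swaps a randomly initialized encoder and embedding into a pretrained t5-small and freezes both, so gradient updates are meant to reach only the decoder. One wrinkle: T5 wires decoder.embed_tokens and the tied lm_head to the original shared embedding at construction time, and a plain attribute assignment does not re-tie them, so the pretrained embedding can stay trainable even after the swapped-in model.shared is frozen. A sketch, mirroring the setup above, that prints the actual split rather than assuming it:

# Sketch: inspect what stays trainable after the freeze in train_decoder.py.
# Mirrors the setup above; assumes t5-small and the same freezing logic.
from transformers import T5Config, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
random_model = T5ForConditionalGeneration(T5Config.from_pretrained("t5-small"))
model.encoder = random_model.encoder
model.shared = random_model.shared

for param in model.encoder.parameters():
    param.requires_grad = False
for param in model.shared.parameters():
    param.requires_grad = False

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} of {total:,}")

# Per-submodule view: encoder should report 0 trainable parameters
for name in ['shared', 'encoder', 'decoder', 'lm_head']:
    module = getattr(model, name)
    n = sum(p.numel() for p in module.parameters() if p.requires_grad)
    print(f"{name}: {n:,} trainable")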