diff --git a/.gitignore b/.gitignore index e69de29..6f66c74 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +*.zip \ No newline at end of file diff --git a/end_to_end/.gitignore b/end_to_end/.gitignore index a1bff18..a499704 100644 --- a/end_to_end/.gitignore +++ b/end_to_end/.gitignore @@ -1,4 +1,5 @@ models raw_data.csv train_all.csv -__pycache__ \ No newline at end of file +__pycache__ +*.pt diff --git a/end_to_end/deduplication.py b/end_to_end/deduplication.py index b7fa37c..37073ad 100644 --- a/end_to_end/deduplication.py +++ b/end_to_end/deduplication.py @@ -31,8 +31,9 @@ class BertEmbedder: model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device = "cpu" - model.to(self.device) - self.model = model.eval() + self.model = model.to(self.device) + self.model = self.model.eval() + # self.model = torch.compile(self.model) def make_embedding(self, batch_size=128): @@ -75,10 +76,11 @@ class T5Embedder: self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) - self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device = "cpu" model.to(self.device) self.model = model.eval() + self.model = torch.compile(self.model) @@ -251,10 +253,26 @@ def run_deduplication( # generate your embeddings checkpoint_path = 'models/bert_model' + + # cache embeddings + file_path = "train_embeds.pt" + if os.path.exists(file_path): + # Load the tensor if the file exists + tensor = torch.load(file_path, weights_only=True) + print("Loaded tensor") + else: + # Create and save the tensor if the file doesn't exist + print('generate train embeddings') + train_embedder = Embedder(input_df=train_df, batch_size=batch_size) + tensor = train_embedder.make_embedding(checkpoint_path) + torch.save(tensor, file_path, weights_only=True) + print("Tensor saved to file.") + + train_embeds = tensor + + + # if we can, we can cache the train embeddings and load directly # we can generate the train embeddings once and re-use for every ship - print('generate train embeddings') - train_embedder = Embedder(input_df=train_df, batch_size=batch_size) - train_embeds = train_embedder.make_embedding(checkpoint_path) # generate new embeddings for each ship print('generate test embeddings') diff --git a/end_to_end/mapper.py b/end_to_end/mapper.py index c9685fb..d9dcd56 100644 --- a/end_to_end/mapper.py +++ b/end_to_end/mapper.py @@ -3,11 +3,14 @@ from torch.utils.data import DataLoader from transformers import ( T5TokenizerFast, AutoModelForSeq2SeqLM, + StoppingCriteria, + StoppingCriteriaList, ) import os from tqdm import tqdm from datasets import Dataset import numpy as np +from torch.nn.utils.rnn import pad_sequence os.environ['TOKENIZERS_PARALLELISM'] = 'false' @@ -35,9 +38,11 @@ class Mapper(): # load model # Define the directory and the pattern model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) - model = torch.compile(model) # set model to eval self.model = model.eval() + self.model.cuda() + # self.model = torch.compile(self.model) + @@ -59,8 +64,8 @@ class Mapper(): desc = f"{row['tag_description']}" unit = f"{row['unit']}" element = { - 'input' : f"{desc}{unit}", - 'output': f"{row['thing']}{row['property']}", + 'input' : f"{desc}{unit}" + # 'output': f"{row['thing']}{row['property']}", } output_list.append(element) @@ -68,16 +73,16 @@ class Mapper(): def _preprocess_function(example): input = example['input'] - target = example['output'] + # target = example['output'] # text_target sets the corresponding label to inputs # there is no need to create a separate 'labels' model_inputs = self.tokenizer( input, - text_target=target, - max_length=max_length, - return_tensors="pt", - padding='max_length', + # text_target=target, + # max_length=max_length, + padding=True, truncation=True, + return_tensors="pt", ) return model_inputs @@ -93,19 +98,55 @@ class Mapper(): remove_columns=test_dataset.column_names, ) # datasets = _preprocess_function(test_dataset) - datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask']) + + def _custom_collate_fn(batch): + # Extract data and targets separately if needed + inputs = [item['input_ids'] for item in batch] + attention_masks = [item['attention_mask'] for item in batch] + + # Pad data to the same length + padded_inputs = pad_sequence(inputs, batch_first=True) + padded_attention_masks = pad_sequence(attention_masks, batch_first=True) + + return {'input_ids': padded_inputs, 'attention_mask': padded_attention_masks} + # create dataloader - self.dataloader = DataLoader(datasets, batch_size=batch_size) + self.dataloader = DataLoader( + datasets, + batch_size=batch_size, + collate_fn=_custom_collate_fn + ) def generate(self): device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - MAX_GENERATE_LENGTH = 128 + MAX_GENERATE_LENGTH = 120 pred_generations = [] - pred_labels = [] - self.model.cuda() + # pred_labels = [] + # self.model already assigned to device + # self.model.cuda() + + # introduce early stopping so that it doesn't have to generate max length + class StopOnEndToken(StoppingCriteria): + def __init__(self, end_token_id): + self.end_token_id = end_token_id + + def __call__(self, input_ids, scores, **kwargs): + # Check if the last token in any sequence is the end token + batch_stopped = input_ids[:, -1] == self.end_token_id + # only stop if all have reached end token + if batch_stopped.all(): + return True # Stop generation for the entire batch + return False + + # Define the end token ID (e.g., the ID for ) + end_token_id = 32103 # property end token + + # Use the stopping criteria + stopping_criteria = StoppingCriteriaList([StopOnEndToken(end_token_id)]) print("start generation") for batch in tqdm(self.dataloader): @@ -113,7 +154,7 @@ class Mapper(): input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] # save labels too - pred_labels.extend(batch['labels']) + # pred_labels.extend(batch['labels']) # Move to GPU if available @@ -126,7 +167,11 @@ class Mapper(): with torch.no_grad(): outputs = self.model.generate(input_ids, attention_mask=attention_mask, - max_length=MAX_GENERATE_LENGTH) + max_length=MAX_GENERATE_LENGTH, + # eos_token_id=32103, + # early_stopping=True, + use_cache=True, + stopping_criteria=stopping_criteria) # Decode the output and print the results pred_generations.extend(outputs.to("cpu")) diff --git a/end_to_end/run.py b/end_to_end/run.py index 8182821..df921ac 100644 --- a/end_to_end/run.py +++ b/end_to_end/run.py @@ -7,8 +7,8 @@ from preprocess import Abbreviator from deduplication import run_deduplication # global config -BATCH_SIZE = 1024 -SHIPS_LIST = [1000,1001,1003] +BATCH_SIZE = 512 +SHIPS_LIST = [1000,1001,1003,1004] # %% # START: we import the raw data csv and extract only a few ships from it to simulate incoming json @@ -17,7 +17,8 @@ full_df = pd.read_csv(data_path, skipinitialspace=True) # subset ships only to that found in SHIPS_LIST df = full_df[full_df['ships_idx'].isin(SHIPS_LIST)].reset_index(drop=True) -num_rows = 2000 +# test parameters +num_rows = len(df) - 1 df = df[:num_rows] print(len(df)) @@ -58,7 +59,7 @@ df = run_deduplication( test_df=df, train_df=train_df, batch_size=BATCH_SIZE, - threshold=0.9, + threshold=0.85, diagnostic=True) # %% diff --git a/train/hybrid_t5_complete_desc_unit/.gitignore b/train/hybrid_t5_complete_desc_unit/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/train/hybrid_t5_complete_desc_unit/custom_t5/.gitignore b/train/hybrid_t5_complete_desc_unit/custom_t5/.gitignore new file mode 100644 index 0000000..ed8ebf5 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/custom_t5/.gitignore @@ -0,0 +1 @@ +__pycache__ \ No newline at end of file diff --git a/train/hybrid_t5_complete_desc_unit/custom_t5/modeling_t5.py b/train/hybrid_t5_complete_desc_unit/custom_t5/modeling_t5.py new file mode 100644 index 0000000..40ca208 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/custom_t5/modeling_t5.py @@ -0,0 +1,125 @@ +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers import ( + T5PreTrainedModel, + T5Model + +) + +from transformers.modeling_outputs import ( + SequenceClassifierOutput, +) + +def mean_pooling(encoder_outputs, attention_mask): + """ + Perform mean pooling over encoder outputs, considering the attention mask. + """ + hidden_states = encoder_outputs.last_hidden_state # Shape: (batch_size, seq_length, hidden_size) + mask = attention_mask.unsqueeze(-1) # Shape: (batch_size, seq_length, 1) + masked_hidden_states = hidden_states * mask # Zero out padding tokens + sum_hidden_states = masked_hidden_states.sum(dim=1) # Sum over sequence length + sum_mask = mask.sum(dim=1) # Sum the mask (number of non-padding tokens) + return sum_hidden_states / sum_mask # Mean pooling + + +class T5EncoderForSequenceClassification(T5PreTrainedModel): + + def __init__(self, checkpoint, tokenizer, config, num_labels): + super().__init__(config) + self.num_labels = num_labels + self.config = config + + # we force the loading of a pre-trained model here + self.t5 = T5Model.from_pretrained(checkpoint) + self.t5.resize_token_embeddings(len(tokenizer)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + + # encoder_outputs = self.t5.encoder( + # input_ids, + # attention_mask=attention_mask, + # head_mask=head_mask, + # inputs_embeds=inputs_embeds, + # output_attentions=output_attentions, + # output_hidden_states=output_hidden_states, + # return_dict=return_dict, + # ) + + + encoder_outputs = self.t5.encoder(input_ids, attention_mask=attention_mask) + # last_hidden_state = encoder_outputs.last_hidden_state + # use mean of hidden state + # pooled_output = mean_pooling(encoder_outputs, attention_mask) + + # Use the hidden state of the first token as the sequence representation + pooled_output = encoder_outputs.last_hidden_state[:, 0, :] # Shape: (batch_size, hidden_size) + + # pooled_output = encoder_outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + encoder_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) \ No newline at end of file diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/.gitignore b/train/hybrid_t5_complete_desc_unit/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/inference.py b/train/hybrid_t5_complete_desc_unit/mapping_prediction/inference.py new file mode 100644 index 0000000..9ea9c77 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt b/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt new file mode 100644 index 0000000..cf3b2d5 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt @@ -0,0 +1,6 @@ + +Accuracy for fold 1: 0.9427354472314246 +Accuracy for fold 2: 0.8859813084112149 +Accuracy for fold 3: 0.9683734939759037 +Accuracy for fold 4: 0.9762131303520457 +Accuracy for fold 5: 0.907924874026569 diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py b/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py new file mode 100644 index 0000000..7baf191 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py @@ -0,0 +1,73 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +BATCH_SIZE = 512 + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/hybrid_t5_complete_desc_unit/train_decoder.py b/train/hybrid_t5_complete_desc_unit/train_decoder.py new file mode 100644 index 0000000..7725033 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/train_decoder.py @@ -0,0 +1,227 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from custom_t5.modeling_t5 import T5EncoderForSequenceClassification + +from safetensors.torch import load_file +from transformers import ( + T5Config, + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments, + T5ForConditionalGeneration, + T5Model +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}b' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + directory = os.path.join(".", f'checkpoint_fold_{fold}a') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] + # t5_classify = T5Model.from_pretrained(prev_checkpoint) + # Load the checkpoint + checkpoint_path = f"{prev_checkpoint}/model.safetensors" + checkpoint = load_file(checkpoint_path) + # Filter out weights related to the classification head + t5_weights = {key: value for key, value in checkpoint.items() if "classifier" not in key} + + + model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) + model.load_state_dict(state_dict=t5_weights, strict=False) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # Freeze the encoder + for param in model.encoder.parameters(): + param.requires_grad = False + + # Freeze the shared embedding layer + for param in model.shared.parameters(): + param.requires_grad = False + + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 128 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + diff --git a/train/hybrid_t5_complete_desc_unit/train_encoder.py b/train/hybrid_t5_complete_desc_unit/train_encoder.py new file mode 100644 index 0000000..6ce6a39 --- /dev/null +++ b/train/hybrid_t5_complete_desc_unit/train_encoder.py @@ -0,0 +1,228 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from custom_t5.modeling_t5 import T5EncoderForSequenceClassification +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + T5Config, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['pattern'])))) + +# # rather than use pattern, we use the real thing and property +# thing_property = full_df['thing'] + full_df['property'] +# thing_property = thing_property.to_list() +# mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + # pattern = f"{row['thing'] + row['property']}" + pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}a' + split_datasets = create_split_dataset(fold, mdm_list) + + # prepare tokenizer + + # model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-cased' + model_checkpoint = "t5-small" + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding="max_length" + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + # model = AutoModelForSequenceClassification.from_pretrained( + # model_checkpoint, + # num_labels=len(mdm_list), + # id2label=id2label, + # label2id=label2id) + model = T5EncoderForSequenceClassification( + checkpoint=model_checkpoint, + tokenizer=tokenizer, + config=T5Config.from_pretrained(model_checkpoint), + num_labels=len(mdm_list) + ) + # important! after extending tokens vocab + # model.t5.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %%