Feat: add modified layer-size decoder variants
- add frozen encoder/decoder variants
This commit is contained in:
parent c5760d127d
commit 6072e4408c
@@ -10,6 +10,12 @@ mdm_list = sorted(list((set(full_df['pattern']))))
 # %%
 full_df
 
+# %%
+mdm_list
+
+# %%
+mask = full_df['pattern'] == 'GE#Flow FGMassFlow'
+full_df[mask]
 # %%
 mask1 = full_df['thing'] == 'ME1TurboCharger1'
 mask2 = full_df['property'] == 'LOInletPress'
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
__pycache__
@@ -0,0 +1,125 @@
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers import (
    T5PreTrainedModel,
    T5Model,
)

from transformers.modeling_outputs import (
    SequenceClassifierOutput,
)


def mean_pooling(encoder_outputs, attention_mask):
    """
    Perform mean pooling over the encoder outputs, ignoring padding positions
    indicated by the attention mask.
    """
    hidden_states = encoder_outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)
    mask = attention_mask.unsqueeze(-1)  # (batch_size, seq_length, 1)
    masked_hidden_states = hidden_states * mask  # zero out padding tokens
    sum_hidden_states = masked_hidden_states.sum(dim=1)  # sum over sequence length
    sum_mask = mask.sum(dim=1)  # number of non-padding tokens per sequence
    return sum_hidden_states / sum_mask  # masked mean
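
# Worked example (illustrative): for hidden_states of shape (1, 3, H) and
# attention_mask [[1, 1, 0]], the padded third position is zeroed out and the
# sum over the sequence is divided by 2 rather than 3, giving a true masked
# mean instead of a plain average over max_length.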


class T5EncoderForSequenceClassification(T5PreTrainedModel):

    def __init__(self, checkpoint, tokenizer, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.config = config

        # we force the loading of a pre-trained model here
        self.t5 = T5Model.from_pretrained(checkpoint)
        self.t5.resize_token_embeddings(len(tokenizer))
        # note: T5Config has no hidden_dropout_prob, so fall back to
        # dropout_rate when classifier_dropout is unset
        classifier_dropout = (
            config.classifier_dropout
            if getattr(config, "classifier_dropout", None) is not None
            else config.dropout_rate
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in
            `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed
            (Mean-Square loss); if `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # full encoder call with all optional kwargs, kept for reference:
        # encoder_outputs = self.t5.encoder(
        #     input_ids,
        #     attention_mask=attention_mask,
        #     head_mask=head_mask,
        #     inputs_embeds=inputs_embeds,
        #     output_attentions=output_attentions,
        #     output_hidden_states=output_hidden_states,
        #     return_dict=return_dict,
        # )
        encoder_outputs = self.t5.encoder(input_ids, attention_mask=attention_mask)

        # alternative: masked mean of the hidden states
        # pooled_output = mean_pooling(encoder_outputs, attention_mask)

        # Use the hidden state of the first token as the sequence representation
        pooled_output = encoder_outputs.last_hidden_state[:, 0, :]  # (batch_size, hidden_size)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + encoder_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
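A minimal usage sketch for the classifier above. The import path matches how the training script imports this module; the checkpoint name, label count, and input string are illustrative assumptions, not part of the commit:

from transformers import T5Config, T5TokenizerFast
from custom_t5.modeling_t5 import T5EncoderForSequenceClassification

tokenizer = T5TokenizerFast.from_pretrained("t5-small")
config = T5Config.from_pretrained("t5-small")
model = T5EncoderForSequenceClassification("t5-small", tokenizer, config, num_labels=4)

batch = tokenizer(["FG mass flow sensor"], return_tensors="pt", padding=True)
out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
print(out.logits.shape)  # torch.Size([1, 4])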
@@ -0,0 +1,2 @@
__pycache__
exports/
@@ -0,0 +1,168 @@
import torch
from torch.utils.data import DataLoader
from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
)
import os
from tqdm import tqdm
from datasets import Dataset
import numpy as np

os.environ['TOKENIZERS_PARALLELISM'] = 'false'


class Inference():
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader

    def __init__(self, checkpoint_path):
        self._create_tokenizer()
        self._load_model(checkpoint_path)

    def _create_tokenizer(self):
        # load tokenizer
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # Define additional special tokens; these must match the training
        # tokenizer exactly so that the added token IDs line up
        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
        # Add the additional special tokens to the tokenizer
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
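
        # Sanity check (illustrative): the stock t5-small vocabulary has
        # 32,100 entries (including the 100 <extra_id_*> sentinels), so the
        # nine specials added above land at IDs 32100-32108 in the order
        # given. generate() below hard-codes 32100-32103 for the THING and
        # PROPERTY markers, so this ordering must not change.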

    def _load_model(self, checkpoint_path: str):
        # load the fine-tuned seq2seq model from the checkpoint directory
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        # set model to eval
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
        - batch_size: the batch size of the dataloader output
        - max_length: maximum length of the tokenizer output
        """
        print("preparing dataloader")

        # convert each dataframe row into a dictionary;
        # outputs a list of dictionaries
        def _process_df(df):
            output_list = []
            for _, row in df.iterrows():
                desc = f"<DESC>{row['tag_description']}<DESC>"
                unit = f"<UNIT>{row['unit']}<UNIT>"
                element = {
                    'input' : f"{desc}{unit}",
                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                }
                output_list.append(element)

            return output_list

        def _preprocess_function(example):
            input = example['input']
            target = example['output']
            # text_target sets the corresponding label to inputs;
            # there is no need to create a separate 'labels' field
            model_inputs = self.tokenizer(
                input,
                text_target=target,
                max_length=max_length,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
            )
            return model_inputs

        test_dataset = Dataset.from_list(_process_df(input_df))

        # map applies the function to each "row" in the dataset,
        # i.e. the data in the immediate nesting
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        # create dataloader
        self.dataloader = DataLoader(datasets, batch_size=batch_size)
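
        # Each batch yielded by the dataloader is a dict of tensors, each of
        # shape (batch_size, max_length):
        #   batch['input_ids'], batch['attention_mask'], batch['labels']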

    def generate(self):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        MAX_GENERATE_LENGTH = 128

        pred_generations = []
        pred_labels = []

        # move the model to the target device once, before the loop
        self.model.to(device)

        print("start generation")
        for batch in tqdm(self.dataloader):
            # Inference in batches
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            # save labels too
            pred_labels.extend(batch['labels'])

            # Move inputs to GPU if available
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Perform inference
            with torch.no_grad():
                outputs = self.model.generate(input_ids,
                                        attention_mask=attention_mask,
                                        max_length=MAX_GENERATE_LENGTH)

                # collect the generated token IDs on the CPU
                pred_generations.extend(outputs.to("cpu"))

        # extract the token IDs between a start/end marker pair
        def extract_seq(tokens, start_value, end_value):
            if start_value not in tokens or end_value not in tokens:
                return None  # Or handle this case according to your requirements
            start_id = np.where(tokens == start_value)[0][0]
            end_id = np.where(tokens == end_value)[0][0]

            return tokens[start_id+1:end_id]

        def process_tensor_output(tokens):
            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
            p_thing = None
            p_property = None
            if thing_seq is not None:
                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
            if property_seq is not None:
                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
            return p_thing, p_property

        # decode prediction labels
        def decode_preds(tokens_list):
            thing_prediction_list = []
            property_prediction_list = []
            for tokens in tokens_list:
                p_thing, p_property = process_tensor_output(tokens)
                thing_prediction_list.append(p_thing)
                property_prediction_list.append(p_property)
            return thing_prediction_list, property_prediction_list

        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
        return thing_prediction_list, property_prediction_list
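A minimal driver for the Inference class above, mirroring what the accompanying run script does; the checkpoint path and dataframe values are illustrative assumptions:

import pandas as pd
from inference import Inference

df = pd.DataFrame({
    'tag_description': ['NO1 GE FG MASS FLOW'],
    'unit': ['kg/h'],
    'thing': ['GE1Flow'],
    'property': ['FGMassFlow'],
})
infer = Inference("checkpoint_fold_1b/checkpoint-1000")  # illustrative path
infer.prepare_dataloader(df, batch_size=32, max_length=128)
thing_preds, property_preds = infer.generate()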
@@ -0,0 +1,2 @@

Accuracy for fold 1: 0.0
@@ -0,0 +1,74 @@
import pandas as pd
import os
import glob
from inference import Inference

checkpoint_directory = '../'

BATCH_SIZE = 512

def infer_and_select(fold):
    print(f"Inference for fold {fold}")
    # import test data
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)
    df = df[df['MDM']].reset_index(drop=True)

    # get target data
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)
    # processing to help with selection later
    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']

    ##########################################
    # run inference
    # Use glob to find the checkpoint path; it is usually
    # checkpoint_fold_<fold>b/checkpoint-<step number>, and training is
    # guaranteed to save only one checkpoint
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
    pattern = 'checkpoint-*'
    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]

    infer = Inference(checkpoint_path)
    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
    thing_prediction_list, property_prediction_list = infer.generate()

    # Convert the prediction lists to a Pandas DataFrame
    df_out = pd.DataFrame({
        'p_thing': thing_prediction_list,
        'p_property': property_prediction_list
    })
    df = pd.concat([df, df_out], axis=1)

    # save the raw t5 generation output here
    df.to_csv(f"exports/result_group_{fold}.csv", index=False)

    # evaluate mapping accuracy only on rows that are valid MDM entries
    in_mdm = df['MDM']
    condition_correct_thing = df['p_thing'] == df['thing']
    condition_correct_property = df['p_property'] == df['property']
    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
    pred_correct_proportion = prediction_mdm_correct / sum(in_mdm)

    # append the result to output.txt
    with open("output.txt", "a") as f:
        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)

###########################################
# Execute for all folds

# reset file before writing to it
with open("output.txt", "w") as f:
    print('', file=f)

for fold in [1]:
    infer_and_select(fold)
@@ -0,0 +1,236 @@
# %%
# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch
from custom_t5.modeling_t5 import T5EncoderForSequenceClassification

from safetensors.torch import load_file
from transformers import (
    T5Config,
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Model
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

torch.set_float32_matmul_precision('high')

# convert each dataframe row into a dictionary;
# outputs a list of dictionaries
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = f"<DESC>{row['tag_description']}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        element = {
            'input' : f"{desc}{unit}",
            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
        }
        output_list.append(element)

    return output_list
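
# For a row with tag_description 'NO1 GE FG MASS FLOW', unit 'kg/h',
# thing 'GE1Flow' and property 'FGMassFlow' (values illustrative), the element
# built above would look like:
# {'input': '<DESC>NO1 GE FG MASS FLOW<DESC><UNIT>kg/h<UNIT>',
#  'output': '<THING_START>GE1Flow<THING_END><PROPERTY_START>FGMassFlow<PROPERTY_END>'}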


def create_split_dataset(fold):
    # train
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
        'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
    })
    return combined_data


# function to perform training for a given fold
def train(fold):
    save_path = f'checkpoint_fold_{fold}b'
    split_datasets = create_split_dataset(fold)

    # prepare tokenizer
    model_checkpoint = "t5-small"
    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['input']
        target = example['output']
        # text_target sets the corresponding label to inputs;
        # there is no need to create a separate 'labels' field
        model_inputs = tokenizer(
            input,
            text_target=target,
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
        return model_inputs

    # map applies the function to each "row" in the dataset,
    # i.e. the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns=split_datasets["train"].column_names,
    )

    # https://github.com/huggingface/transformers/pull/28414
    # model_checkpoint = "google/t5-efficient-tiny"
    # device_map set to auto to force it to load contiguous weights
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')

    # Optional warm start from the stage-a classifier checkpoint:
    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
    # # path is usually checkpoint_fold_1/checkpoint-<step number>;
    # # we are guaranteed to save only 1 checkpoint from training
    # pattern = 'checkpoint-*'
    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
    # checkpoint = load_file(checkpoint_path)
    # # Filter out weights related to the classification head;
    # # given name format: t5.encoder.embed_tokens.weight
    # # we want: encoder.embed_tokens.weight
    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}

    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
    # resize the token embeddings to match the extended tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))

    # model.load_state_dict(state_dict=t5_weights, strict=False)
    # for key, param in model.state_dict().items():
    #     if key in t5_weights:
    #         print(f"{key}: Successfully overridden")
    #     else:
    #         print(f"{key}: Retained original weights")

    # Freeze the decoder
    for param in model.decoder.parameters():
        param.requires_grad = False

    # Freeze the shared embedding layer
    for param in model.shared.parameters():
        param.requires_grad = False
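
    # Quick sanity check (an illustrative addition, not in the original
    # script): with the decoder and shared embeddings frozen, only encoder
    # parameters should remain trainable.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable:,} / {total:,}")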

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # In case the model returns more than the prediction logits
        if isinstance(preds, tuple):
            preds = preds[0]

        decoded_preds = tokenizer.batch_decode(preds,
                                               skip_special_tokens=False)

        # Replace -100s in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=False)

        # Remove pad tokens from decoded predictions and labels
        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    # Generation Config
    gen_config = model.generation_config
    gen_config.max_length = 128

    # compile
    # model = torch.compile(model, backend="inductor", dynamic=True)

    # Trainer
    args = Seq2SeqTrainingArguments(
        f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        predict_with_generate=True,
        bf16=True,
        push_to_hub=False,
        generation_config=gen_config,
        remove_unused_columns=False,
    )

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to resume training from a checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
for fold in [1]:
    print(fold)
    train(fold)
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1,2 @@
__pycache__
exports/
@@ -0,0 +1,168 @@
import torch
from torch.utils.data import DataLoader
from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
)
import os
from tqdm import tqdm
from datasets import Dataset
import numpy as np

os.environ['TOKENIZERS_PARALLELISM'] = 'false'


class Inference():
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader

    def __init__(self, checkpoint_path):
        self._create_tokenizer()
        self._load_model(checkpoint_path)

    def _create_tokenizer(self):
        # load tokenizer
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # Define additional special tokens; these must match the training
        # tokenizer exactly so that the added token IDs line up
        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
        # Add the additional special tokens to the tokenizer
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    def _load_model(self, checkpoint_path: str):
        # load the fine-tuned seq2seq model from the checkpoint directory
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        # set model to eval
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
        - batch_size: the batch size of the dataloader output
        - max_length: maximum length of the tokenizer output
        """
        print("preparing dataloader")

        # convert each dataframe row into a dictionary;
        # outputs a list of dictionaries
        def _process_df(df):
            output_list = []
            for _, row in df.iterrows():
                desc = f"<DESC>{row['tag_description']}<DESC>"
                unit = f"<UNIT>{row['unit']}<UNIT>"
                element = {
                    'input' : f"{desc}{unit}",
                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                }
                output_list.append(element)

            return output_list

        def _preprocess_function(example):
            input = example['input']
            target = example['output']
            # text_target sets the corresponding label to inputs;
            # there is no need to create a separate 'labels' field
            model_inputs = self.tokenizer(
                input,
                text_target=target,
                max_length=max_length,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
            )
            return model_inputs

        test_dataset = Dataset.from_list(_process_df(input_df))

        # map applies the function to each "row" in the dataset,
        # i.e. the data in the immediate nesting
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        # create dataloader
        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    def generate(self):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        MAX_GENERATE_LENGTH = 128

        pred_generations = []
        pred_labels = []

        # move the model to the target device once, before the loop
        self.model.to(device)

        print("start generation")
        for batch in tqdm(self.dataloader):
            # Inference in batches
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            # save labels too
            pred_labels.extend(batch['labels'])

            # Move inputs to GPU if available
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Perform inference
            with torch.no_grad():
                outputs = self.model.generate(input_ids,
                                        attention_mask=attention_mask,
                                        max_length=MAX_GENERATE_LENGTH)

                # collect the generated token IDs on the CPU
                pred_generations.extend(outputs.to("cpu"))

        # extract the token IDs between a start/end marker pair
        def extract_seq(tokens, start_value, end_value):
            if start_value not in tokens or end_value not in tokens:
                return None  # Or handle this case according to your requirements
            start_id = np.where(tokens == start_value)[0][0]
            end_id = np.where(tokens == end_value)[0][0]

            return tokens[start_id+1:end_id]

        def process_tensor_output(tokens):
            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
            p_thing = None
            p_property = None
            if thing_seq is not None:
                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
            if property_seq is not None:
                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
            return p_thing, p_property

        # decode prediction labels
        def decode_preds(tokens_list):
            thing_prediction_list = []
            property_prediction_list = []
            for tokens in tokens_list:
                p_thing, p_property = process_tensor_output(tokens)
                thing_prediction_list.append(p_thing)
                property_prediction_list.append(p_property)
            return thing_prediction_list, property_prediction_list

        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
        return thing_prediction_list, property_prediction_list
@@ -0,0 +1,3 @@

Accuracy for fold 1: 0.9342167534311405
Accuracy for fold 2: 0.883177570093458
@@ -0,0 +1,74 @@
import pandas as pd
import os
import glob
from inference import Inference

checkpoint_directory = '../'

BATCH_SIZE = 512

def infer_and_select(fold):
    print(f"Inference for fold {fold}")
    # import test data
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)
    df = df[df['MDM']].reset_index(drop=True)

    # get target data
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)
    # processing to help with selection later
    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']

    ##########################################
    # run inference
    # Use glob to find the checkpoint path; it is usually
    # checkpoint_fold_<fold>b/checkpoint-<step number>, and training is
    # guaranteed to save only one checkpoint
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
    pattern = 'checkpoint-*'
    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]

    infer = Inference(checkpoint_path)
    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
    thing_prediction_list, property_prediction_list = infer.generate()

    # Convert the prediction lists to a Pandas DataFrame
    df_out = pd.DataFrame({
        'p_thing': thing_prediction_list,
        'p_property': property_prediction_list
    })
    df = pd.concat([df, df_out], axis=1)

    # save the raw t5 generation output here
    df.to_csv(f"exports/result_group_{fold}.csv", index=False)

    # evaluate mapping accuracy only on rows that are valid MDM entries
    in_mdm = df['MDM']
    condition_correct_thing = df['p_thing'] == df['thing']
    condition_correct_property = df['p_property'] == df['property']
    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
    pred_correct_proportion = prediction_mdm_correct / sum(in_mdm)

    # append the result to output.txt
    with open("output.txt", "a") as f:
        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)

###########################################
# Execute for all folds

# reset file before writing to it
with open("output.txt", "w") as f:
    print('', file=f)

for fold in [1,2,3,4,5]:
    infer_and_select(fold)
@@ -0,0 +1,235 @@
# %%
# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch

from safetensors.torch import load_file
from transformers import (
    T5Config,
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Model
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

torch.set_float32_matmul_precision('high')

# convert each dataframe row into a dictionary;
# outputs a list of dictionaries
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = f"<DESC>{row['tag_description']}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        element = {
            'input' : f"{desc}{unit}",
            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
        }
        output_list.append(element)

    return output_list


def create_split_dataset(fold):
    # train
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
        'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
    })
    return combined_data


# function to perform training for a given fold
def train(fold):
    save_path = f'checkpoint_fold_{fold}b'
    split_datasets = create_split_dataset(fold)

    # prepare tokenizer
    model_checkpoint = "t5-small"
    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['input']
        target = example['output']
        # text_target sets the corresponding label to inputs;
        # there is no need to create a separate 'labels' field
        model_inputs = tokenizer(
            input,
            text_target=target,
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
        return model_inputs

    # map applies the function to each "row" in the dataset,
    # i.e. the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns=split_datasets["train"].column_names,
    )

    # https://github.com/huggingface/transformers/pull/28414
    # model_checkpoint = "google/t5-efficient-tiny"
    # device_map set to auto to force it to load contiguous weights
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')

    # Optional warm start from the stage-a classifier checkpoint:
    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
    # # path is usually checkpoint_fold_1/checkpoint-<step number>;
    # # we are guaranteed to save only 1 checkpoint from training
    # pattern = 'checkpoint-*'
    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
    # checkpoint = load_file(checkpoint_path)
    # # Filter out weights related to the classification head;
    # # given name format: t5.encoder.embed_tokens.weight
    # # we want: encoder.embed_tokens.weight
    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}

    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
    # resize the token embeddings to match the extended tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))

    # model.load_state_dict(state_dict=t5_weights, strict=False)
    # for key, param in model.state_dict().items():
    #     if key in t5_weights:
    #         print(f"{key}: Successfully overridden")
    #     else:
    #         print(f"{key}: Retained original weights")

    # Freeze the encoder
    for param in model.encoder.parameters():
        param.requires_grad = False

    # Freeze the shared embedding layer
    for param in model.shared.parameters():
        param.requires_grad = False
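
    # This variant mirrors the decoder-frozen one: here the encoder and the
    # shared embeddings are frozen and only the decoder is updated. Assuming
    # T5's default tied word embeddings, lm_head.weight is model.shared.weight,
    # so the output projection stays frozen as well.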

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # In case the model returns more than the prediction logits
        if isinstance(preds, tuple):
            preds = preds[0]

        decoded_preds = tokenizer.batch_decode(preds,
                                               skip_special_tokens=False)

        # Replace -100s in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=False)

        # Remove pad tokens from decoded predictions and labels
        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    # Generation Config
    gen_config = model.generation_config
    gen_config.max_length = 128

    # compile
    # model = torch.compile(model, backend="inductor", dynamic=True)

    # Trainer
    args = Seq2SeqTrainingArguments(
        f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        predict_with_generate=True,
        bf16=True,
        push_to_hub=False,
        generation_config=gen_config,
        remove_unused_columns=False,
    )

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to resume training from a checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
for fold in [1,2,3,4,5]:
    print(fold)
    train(fold)
@@ -1,6 +1,2 @@
 
-Accuracy for fold 1: 0.9427354472314246
-Accuracy for fold 2: 0.8859813084112149
-Accuracy for fold 3: 0.9683734939759037
-Accuracy for fold 4: 0.9762131303520457
-Accuracy for fold 5: 0.907924874026569
+Accuracy for fold 1: 0.9398958826313298
@@ -13,6 +13,7 @@ def infer_and_select(fold):
     # import test data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
+    df = df[df['MDM']].reset_index(drop=True)
 
     # get target data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
@@ -69,5 +70,5 @@ def infer_and_select(fold):
 with open("output.txt", "w") as f:
     print('', file=f)
 
-for fold in [1,2,3,4,5]:
+for fold in [1]:
     infer_and_select(fold)
|  | @ -120,14 +120,23 @@ def train(fold): | ||||||
|     checkpoint_path = f"{prev_checkpoint}/model.safetensors" |     checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|     checkpoint = load_file(checkpoint_path) |     checkpoint = load_file(checkpoint_path) | ||||||
|     # Filter out weights related to the classification head |     # Filter out weights related to the classification head | ||||||
|     t5_weights = {key: value for key, value in checkpoint.items() if "classifier" not in key} |     # given name format: t5.encoder.embed_tokens.weight | ||||||
| 
 |     # we want: encoder.embed_tokens.weight | ||||||
|  |     t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
| 
 | 
 | ||||||
|     model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) |     model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|     model.load_state_dict(state_dict=t5_weights, strict=False) |     # change the token embedding size to match the shape | ||||||
|     # important! after extending tokens vocab |  | ||||||
|     model.resize_token_embeddings(len(tokenizer)) |     model.resize_token_embeddings(len(tokenizer)) | ||||||
| 
 | 
 | ||||||
|  |     model.load_state_dict(state_dict=t5_weights, strict=False) | ||||||
|  | 
 | ||||||
|  |     for key, param in model.state_dict().items(): | ||||||
|  |         if key in t5_weights: | ||||||
|  |             print(f"{key}: Successfully overridden") | ||||||
|  |         else: | ||||||
|  |             print(f"{key}: Retained original weights") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|     # Freeze the encoder |     # Freeze the encoder | ||||||
|     for param in model.encoder.parameters(): |     for param in model.encoder.parameters(): | ||||||
|         param.requires_grad = False |         param.requires_grad = False | ||||||
|  | @ -194,7 +203,7 @@ def train(fold): | ||||||
|         ddp_find_unused_parameters=False, |         ddp_find_unused_parameters=False, | ||||||
|         weight_decay=0.01, |         weight_decay=0.01, | ||||||
|         save_total_limit=1, |         save_total_limit=1, | ||||||
|         num_train_epochs=80, |         num_train_epochs=40, | ||||||
|         predict_with_generate=True, |         predict_with_generate=True, | ||||||
|         bf16=True, |         bf16=True, | ||||||
|         push_to_hub=False, |         push_to_hub=False, | ||||||
|  | @ -221,7 +230,7 @@ def train(fold): | ||||||
|     trainer.train() |     trainer.train() | ||||||
| 
 | 
 | ||||||
| # execute training | # execute training | ||||||
| for fold in [1,2,3,4,5]: | for fold in [1]: | ||||||
|     print(fold) |     print(fold) | ||||||
|     train(fold) |     train(fold) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -35,12 +35,13 @@ torch.set_float32_matmul_precision('high') | ||||||
| # import the full mdm-only file | # import the full mdm-only file | ||||||
| data_path = '../../data_import/exports/data_mapping_mdm.csv' | data_path = '../../data_import/exports/data_mapping_mdm.csv' | ||||||
| full_df = pd.read_csv(data_path, skipinitialspace=True) | full_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| mdm_list = sorted(list((set(full_df['pattern'])))) | # mdm_list = sorted(list((set(full_df['pattern'])))) | ||||||
| 
 | 
 | ||||||
| # # rather than use pattern, we use the real thing and property | # # rather than use pattern, we use the real thing and property | ||||||
| # thing_property = full_df['thing'] + full_df['property'] | thing_property = full_df['thing'] + full_df['property'] | ||||||
| # thing_property = thing_property.to_list() | thing_property = thing_property.to_list() | ||||||
| # mdm_list = sorted(list(set(thing_property))) | mdm_list = sorted(list(set(thing_property))) | ||||||
|  | print("number of classes: ", len(mdm_list)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
|  | @ -62,8 +63,8 @@ def process_df_to_dict(df, mdm_list): | ||||||
|     for _, row in df.iterrows(): |     for _, row in df.iterrows(): | ||||||
|         desc = f"<DESC>{row['tag_description']}<DESC>" |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|         unit = f"<UNIT>{row['unit']}<UNIT>" |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|         # pattern = f"{row['thing'] + row['property']}" |         pattern = f"{row['thing'] + row['property']}" | ||||||
|         pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}" |         # pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}" | ||||||
|         try: |         try: | ||||||
|             index = mdm_list.index(pattern) |             index = mdm_list.index(pattern) | ||||||
|         except ValueError: |         except ValueError: | ||||||
|  | @ -137,7 +138,7 @@ def train(fold): | ||||||
|         remove_columns="text", |         remove_columns="text", | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     # %% temp |     # %% temp | ||||||
|     # tokenized_datasets['train'].rename_columns() |     # tokenized_datasets['train'].rename_columns() | ||||||
| 
 | 
 | ||||||
|     # %% |     # %% | ||||||
|  | @ -192,7 +193,7 @@ def train(fold): | ||||||
|         per_device_train_batch_size=128, |         per_device_train_batch_size=128, | ||||||
|         per_device_eval_batch_size=128, |         per_device_eval_batch_size=128, | ||||||
|         auto_find_batch_size=False, |         auto_find_batch_size=False, | ||||||
|         ddp_find_unused_parameters=False, |         ddp_find_unused_parameters=False, | ||||||
|         weight_decay=0.01, |         weight_decay=0.01, | ||||||
|         save_total_limit=1, |         save_total_limit=1, | ||||||
|         num_train_epochs=80, |         num_train_epochs=80, | ||||||
|  | @ -220,7 +221,7 @@ def train(fold): | ||||||
|     trainer.train() |     trainer.train() | ||||||
| 
 | 
 | ||||||
| # execute training | # execute training | ||||||
| for fold in [1,2,3,4,5]: | for fold in [1]: | ||||||
|     print(fold) |     print(fold) | ||||||
|     train(fold) |     train(fold) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | __pycache__ | ||||||
|  | @ -0,0 +1,125 @@ | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from typing import List, Optional, Tuple, Union | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | import torch.utils.checkpoint | ||||||
|  | from torch import nn | ||||||
|  | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss | ||||||
|  | 
 | ||||||
|  | from transformers import ( | ||||||
|  |     T5PreTrainedModel, | ||||||
|  |     T5Model | ||||||
|  |      | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | from transformers.modeling_outputs import ( | ||||||
|  |     SequenceClassifierOutput, | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | def mean_pooling(encoder_outputs, attention_mask): | ||||||
|  |     """ | ||||||
|  |     Perform mean pooling over encoder outputs, considering the attention mask. | ||||||
|  |     """ | ||||||
|  |     hidden_states = encoder_outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size) | ||||||
|  |     mask = attention_mask.unsqueeze(-1)  # Shape: (batch_size, seq_length, 1) | ||||||
|  |     masked_hidden_states = hidden_states * mask  # Zero out padding tokens | ||||||
|  |     sum_hidden_states = masked_hidden_states.sum(dim=1)  # Sum over sequence length | ||||||
|  |     sum_mask = mask.sum(dim=1)  # Sum the mask (number of non-padding tokens) | ||||||
|  |     return sum_hidden_states / sum_mask  # Mean pooling | ||||||
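|  | # illustrative sketch (assumes only torch and types.SimpleNamespace): a quick | ||||||
|  | # shape check for mean_pooling; any object exposing `.last_hidden_state` works: | ||||||
|  | #   from types import SimpleNamespace | ||||||
|  | #   hs = torch.randn(2, 4, 8)                        # (batch, seq, hidden) | ||||||
|  | #   am = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 0 marks padding | ||||||
|  | #   out = mean_pooling(SimpleNamespace(last_hidden_state=hs), am) | ||||||
|  | #   assert out.shape == (2, 8) | ||||||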
|  | 
 | ||||||
|  | 
 | ||||||
|  | class T5EncoderForSequenceClassification(T5PreTrainedModel): | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint, tokenizer, config, num_labels): | ||||||
|  |         super().__init__(config) | ||||||
|  |         self.num_labels = num_labels | ||||||
|  |         self.config = config | ||||||
|  | 
 | ||||||
|  |         # we force the loading of a pre-trained model here | ||||||
|  |         self.t5 = T5Model.from_pretrained(checkpoint) | ||||||
|  |         self.t5.resize_token_embeddings(len(tokenizer)) | ||||||
|  |         classifier_dropout = ( | ||||||
|  |             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob | ||||||
|  |         ) | ||||||
|  |         self.dropout = nn.Dropout(classifier_dropout) | ||||||
|  |         self.classifier = nn.Linear(config.hidden_size, self.num_labels) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def forward( | ||||||
|  |         self, | ||||||
|  |         input_ids: Optional[torch.Tensor] = None, | ||||||
|  |         attention_mask: Optional[torch.Tensor] = None, | ||||||
|  |         token_type_ids: Optional[torch.Tensor] = None, | ||||||
|  |         position_ids: Optional[torch.Tensor] = None, | ||||||
|  |         head_mask: Optional[torch.Tensor] = None, | ||||||
|  |         inputs_embeds: Optional[torch.Tensor] = None, | ||||||
|  |         labels: Optional[torch.Tensor] = None, | ||||||
|  |         output_attentions: Optional[bool] = None, | ||||||
|  |         output_hidden_states: Optional[bool] = None, | ||||||
|  |         return_dict: Optional[bool] = None, | ||||||
|  |     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: | ||||||
|  |         r""" | ||||||
|  |         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): | ||||||
|  |             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., | ||||||
|  |             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If | ||||||
|  |             `config.num_labels > 1` a classification loss is computed (Cross-Entropy). | ||||||
|  |         """ | ||||||
|  |         return_dict = return_dict if return_dict is not None else self.config.use_return_dict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # encoder_outputs = self.t5.encoder( | ||||||
|  |         #     input_ids, | ||||||
|  |         #     attention_mask=attention_mask, | ||||||
|  |         #     head_mask=head_mask, | ||||||
|  |         #     inputs_embeds=inputs_embeds, | ||||||
|  |         #     output_attentions=output_attentions, | ||||||
|  |         #     output_hidden_states=output_hidden_states, | ||||||
|  |         #     return_dict=return_dict, | ||||||
|  |         # ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         encoder_outputs = self.t5.encoder(input_ids, attention_mask=attention_mask) | ||||||
|  |         # last_hidden_state = encoder_outputs.last_hidden_state | ||||||
|  |         # use mean of hidden state | ||||||
|  |         # pooled_output = mean_pooling(encoder_outputs, attention_mask) | ||||||
|  | 
 | ||||||
|  |         # Use the hidden state of the first token as the sequence representation | ||||||
|  |         pooled_output = encoder_outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size) | ||||||
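|  |         # note: unlike BERT there is no [CLS] token in T5, so this pools on | ||||||
|  |         # whatever token comes first in the input sequence | ||||||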
|  | 
 | ||||||
|  |         # pooled_output = encoder_outputs[1] | ||||||
|  | 
 | ||||||
|  |         pooled_output = self.dropout(pooled_output) | ||||||
|  |         logits = self.classifier(pooled_output) | ||||||
|  | 
 | ||||||
|  |         loss = None | ||||||
|  |         if labels is not None: | ||||||
|  |             if self.config.problem_type is None: | ||||||
|  |                 if self.num_labels == 1: | ||||||
|  |                     self.config.problem_type = "regression" | ||||||
|  |                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): | ||||||
|  |                     self.config.problem_type = "single_label_classification" | ||||||
|  |                 else: | ||||||
|  |                     self.config.problem_type = "multi_label_classification" | ||||||
|  | 
 | ||||||
|  |             if self.config.problem_type == "regression": | ||||||
|  |                 loss_fct = MSELoss() | ||||||
|  |                 if self.num_labels == 1: | ||||||
|  |                     loss = loss_fct(logits.squeeze(), labels.squeeze()) | ||||||
|  |                 else: | ||||||
|  |                     loss = loss_fct(logits, labels) | ||||||
|  |             elif self.config.problem_type == "single_label_classification": | ||||||
|  |                 loss_fct = CrossEntropyLoss() | ||||||
|  |                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) | ||||||
|  |             elif self.config.problem_type == "multi_label_classification": | ||||||
|  |                 loss_fct = BCEWithLogitsLoss() | ||||||
|  |                 loss = loss_fct(logits, labels) | ||||||
|  |         if not return_dict: | ||||||
|  |             output = (logits,) + encoder_outputs[2:] | ||||||
|  |             return ((loss,) + output) if loss is not None else output | ||||||
|  | 
 | ||||||
|  |         return SequenceClassifierOutput( | ||||||
|  |             loss=loss, | ||||||
|  |             logits=logits, | ||||||
|  |             hidden_states=encoder_outputs.hidden_states, | ||||||
|  |             attentions=encoder_outputs.attentions, | ||||||
|  |         ) | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load model | ||||||
|  |         # Define the directory and the pattern | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # text_target sets the corresponding label to inputs | ||||||
|  |             # there is no need to create a separate 'labels' | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map maps function to each "row" in the dataset | ||||||
|  |         # aka the data in the immediate nesting | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
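|  |         # note: shuffle defaults to False, so batches follow the input dataframe's | ||||||
|  |         # row order; callers rely on this when concatenating predictions back | ||||||
|  |         # onto the dataframe | ||||||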
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device) | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # Decode the output and print the results | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # Or handle this case according to your requirements | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
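|  |         # worked example: extract_seq(np.array([5, 32100, 8, 9, 32101]), 32100, 32101) | ||||||
|  |         # returns array([8, 9]) -- the tokens strictly between the two markers | ||||||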
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
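|  |             # note: these hard-coded IDs assume t5-small's base vocab ends at | ||||||
|  |             # 32099 and that the special tokens were added in the order listed above | ||||||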
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if (thing_seq is not None): | ||||||
|  |                 p_thing =  self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if (property_seq is not None): | ||||||
|  |                 p_property =  self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9337434926644581 | ||||||
|  | Accuracy for fold 2: 0.914018691588785 | ||||||
|  | Accuracy for fold 3: 0.9623493975903614 | ||||||
|  | Accuracy for fold 4: 0.9738344433872502 | ||||||
|  | Accuracy for fold 5: 0.9042601923957856 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory =  '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # checkpoint | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # here we want to evaluate mapping accuracy within the valid in mdm data only | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
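|  |     # note: df was already filtered to MDM rows above, so in_mdm is all-True | ||||||
|  |     # here and this reduces to plain accuracy on the test set | ||||||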
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset file before writing to it | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,234 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | from custom_t5.modeling_t5 import T5EncoderForSequenceClassification | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # text_target sets the corresponding label to inputs | ||||||
|  |         # there is no need to create a separate 'labels' | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map maps function to each "row" in the dataset | ||||||
|  |     # aka the data in the immediate nesting | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # Load the checkpoint | ||||||
|  |     checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     checkpoint = load_file(checkpoint_path) | ||||||
|  |     # Filter out weights related to the classification head | ||||||
|  |     # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # we want: encoder.embed_tokens.weight | ||||||
|  |     t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
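|  |     # note: replace("t5.", "", 1) strips the first occurrence anywhere in the | ||||||
|  |     # key; str.removeprefix("t5.") (Python 3.9+) would be the stricter choice | ||||||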
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  |     # change the token embedding size to match the shape | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  |     model.load_state_dict(state_dict=t5_weights, strict=False) | ||||||
|  | 
 | ||||||
|  |     for key, param in model.state_dict().items(): | ||||||
|  |         if key in t5_weights: | ||||||
|  |             print(f"{key}: Successfully overridden") | ||||||
|  |         else: | ||||||
|  |             print(f"{key}: Retained original weights") | ||||||
|  | 
 | ||||||
|  |     # Freeze the encoder | ||||||
|  |     for param in model.encoder.parameters(): | ||||||
|  |         param.requires_grad = False | ||||||
|  | 
 | ||||||
|  |     # Freeze the shared embedding layer | ||||||
|  |     for param in model.shared.parameters(): | ||||||
|  |         param.requires_grad = False | ||||||
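|  |     # illustrative sketch: with the encoder and shared embeddings frozen, roughly | ||||||
|  |     # only the decoder stack stays trainable (lm_head is tied to the frozen shared | ||||||
|  |     # embeddings in t5-small) -- a quick sanity check: | ||||||
|  |     #   n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) | ||||||
|  |     #   n_total = sum(p.numel() for p in model.parameters()) | ||||||
|  |     #   print(f"trainable params: {n_trainable} / {n_total}") | ||||||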
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,228 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | from custom_t5.modeling_t5 import T5EncoderForSequenceClassification | ||||||
|  | from transformers import ( | ||||||
|  |     AutoTokenizer, | ||||||
|  |     AutoModelForSequenceClassification, | ||||||
|  |     DataCollatorWithPadding, | ||||||
|  |     Trainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     TrainingArguments, | ||||||
|  |     T5Config, | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # we need to create the mdm_list | ||||||
|  | # import the full mdm-only file | ||||||
|  | data_path = '../../data_import/exports/data_mapping_mdm.csv' | ||||||
|  | full_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | mdm_list = sorted(set(full_df['pattern'])) | ||||||
|  | 
 | ||||||
|  | # # rather than use pattern, we use the real thing and property | ||||||
|  | # thing_property = full_df['thing'] + full_df['property'] | ||||||
|  | # thing_property = thing_property.to_list() | ||||||
|  | # mdm_list = sorted(list(set(thing_property))) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | id2label = {} | ||||||
|  | label2id = {} | ||||||
|  | for idx, val in enumerate(mdm_list): | ||||||
|  |     id2label[idx] = val | ||||||
|  |     label2id[val] = idx | ||||||
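|  | # worked example: mdm_list = ['A', 'B'] gives | ||||||
|  | # id2label = {0: 'A', 1: 'B'} and label2id = {'A': 0, 'B': 1} | ||||||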
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | # processes dataframe into lists of dictionaries | ||||||
|  | # each element maps input to output | ||||||
|  | # input: tag_description | ||||||
|  | # output: class label | ||||||
|  | def process_df_to_dict(df, mdm_list): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         # pattern = f"{row['thing'] + row['property']}" | ||||||
|  |         pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}" | ||||||
|  |         try: | ||||||
|  |             index = mdm_list.index(pattern) | ||||||
|  |         except ValueError: | ||||||
|  |             print("Error: value not found in MDM list") | ||||||
|  |             index = -1 | ||||||
|  |         element = { | ||||||
|  |             'text' : f"{desc}{unit}", | ||||||
|  |             'label': index, | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold, mdm_list): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  | 
 | ||||||
|  |     save_path = f'checkpoint_fold_{fold}a' | ||||||
|  |     split_datasets = create_split_dataset(fold, mdm_list) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  | 
 | ||||||
|  |     # model_checkpoint = "distilbert/distilbert-base-uncased" | ||||||
|  |     # model_checkpoint = 'google-bert/bert-base-cased' | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['text'] | ||||||
|  |         # text_target sets the corresponding label to inputs | ||||||
|  |         # there is no need to create a separate 'labels' | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map maps function to each "row" in the dataset | ||||||
|  |     # aka the data in the immediate nesting | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns="text", | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # %% temp | ||||||
|  |     # tokenized_datasets['train'].rename_columns() | ||||||
|  | 
 | ||||||
|  |     # %% | ||||||
|  |     # create data collator | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorWithPadding(tokenizer=tokenizer) | ||||||
|  | 
 | ||||||
|  |     # %% | ||||||
|  |     # compute metrics | ||||||
|  |     metric = evaluate.load("accuracy") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         preds = np.argmax(preds, axis=1) | ||||||
|  |         return metric.compute(predictions=preds, references=labels) | ||||||
|  | 
 | ||||||
|  |     # %% | ||||||
|  |     # create id2label and label2id | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # %% | ||||||
|  |     # model = AutoModelForSequenceClassification.from_pretrained( | ||||||
|  |     #     model_checkpoint, | ||||||
|  |     #     num_labels=len(mdm_list), | ||||||
|  |     #     id2label=id2label, | ||||||
|  |     #     label2id=label2id) | ||||||
|  |     model = T5EncoderForSequenceClassification( | ||||||
|  |         checkpoint=model_checkpoint, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         config=T5Config.from_pretrained(model_checkpoint), | ||||||
|  |         num_labels=len(mdm_list) | ||||||
|  |     ) | ||||||
|  |     # no resize needed here: T5EncoderForSequenceClassification already calls | ||||||
|  |     # model.t5.resize_token_embeddings(len(tokenizer)) in its __init__ | ||||||
|  | 
 | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # %% | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     training_args = TrainingArguments( | ||||||
|  |         output_dir=f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=128, | ||||||
|  |         per_device_eval_batch_size=128, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Trainer( | ||||||
|  |         model, | ||||||
|  |         training_args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load model | ||||||
|  |         # Define the directory and the pattern | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # text_target sets the corresponding label to inputs | ||||||
|  |             # there is no need to create a separate 'labels' | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map maps function to each "row" in the dataset | ||||||
|  |         # aka the data in the immediate nesting | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device) | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # Decode the output and print the results | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # Or handle this case according to your requirements | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if (thing_seq is not None): | ||||||
|  |                 p_thing =  self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if (property_seq is not None): | ||||||
|  |                 p_property =  self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9403691433980123 | ||||||
|  | Accuracy for fold 2: 0.9046728971962616 | ||||||
|  | Accuracy for fold 3: 0.9678714859437751 | ||||||
|  | Accuracy for fold 4: 0.9695528068506185 | ||||||
|  | Accuracy for fold 5: 0.902427851580394 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory =  '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # locate the saved training checkpoint | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy on MDM rows only (df was already filtered to MDM, so every row counts toward the denominator) | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset file before writing to it | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,255 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
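|  | # NCCL P2P and InfiniBand transports disabled, presumably to avoid multi-GPU hangs on this host | ||||||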
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers.models.t5.modeling_t5 import T5Block | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
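|  | # 'high' enables TF32 matmul kernels on supported GPUs: faster training at slightly reduced float32 precision | ||||||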
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # model_checkpoint = "t5-small" | ||||||
|  | # model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | # model.config | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # passing text_target makes the tokenizer fill the 'labels' field directly, | ||||||
|  |         # so no separate tokenization pass is needed for the targets | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map applies the preprocessing function to every row of each split | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed.tokens.weight | ||||||
|  |     # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | 
 | ||||||
|  |     # derive a modified config (resized decoder stack) from the pretrained model's config | ||||||
|  | 
 | ||||||
|  |     config = pretrained_model.config | ||||||
|  |     config.num_layers = 6 | ||||||
|  |     config.num_decoder_layers = 12  # set new decoder layer count | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration(config) | ||||||
|  | 
 | ||||||
|  |     model.shared = pretrained_model.shared | ||||||
|  |     model.encoder = pretrained_model.encoder | ||||||
|  | 
 | ||||||
|  |     pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] | ||||||
|  |     for i, layer in enumerate(pretrained_decoder_weights): | ||||||
|  |         model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights | ||||||
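|  |     # t5-small provides only 6 pretrained decoder blocks, so blocks 6-11 of the enlarged decoder keep their random initialization | ||||||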
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # print number of decoder blocks | ||||||
|  |     print(f'Number of decoder blocks: {len(model.decoder.block)}') | ||||||
|  |     print(f'num_layers: {model.config.num_layers}') | ||||||
|  |     print(f'num_decoder_layers: {model.config.num_decoder_layers}') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # resize token embeddings so the vocabulary covers the added special tokens | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
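|  |         # sacrebleu expects a list of reference strings per prediction, hence the nested lists above | ||||||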
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load the fine-tuned seq2seq model from the checkpoint path | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
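|  |         # torch.compile adds one-off compilation overhead on the first batch, then speeds up repeated forward passes | ||||||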
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # passing text_target makes the tokenizer fill the 'labels' field directly, | ||||||
|  |             # so no separate tokenization pass is needed for the targets | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map applies the preprocessing function to every row of the dataset | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device) | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # collect the generated token sequences on CPU for decoding later | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
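|  |         # e.g. extract_seq([0, 32100, 7, 12, 32101], 32100, 32101) returns [7, 12] | ||||||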
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # marker pair absent: caller treats None as a failed extraction | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
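|  |             # ids 32100-32103 follow from t5-small's 32100-token base vocab: the special tokens added at tokenizer setup start at id 32100, in definition order | ||||||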
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if thing_seq is not None: | ||||||
|  |                 p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if property_seq is not None: | ||||||
|  |                 p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.8968291528632276 | ||||||
|  | Accuracy for fold 2: 0.8859813084112149 | ||||||
|  | Accuracy for fold 3: 0.9382530120481928 | ||||||
|  | Accuracy for fold 4: 0.9586108468125595 | ||||||
|  | Accuracy for fold 5: 0.8827301878149336 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory =  '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # locate the saved training checkpoint | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy on MDM rows only (df was already filtered to MDM, so every row counts toward the denominator) | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset file before writing to it | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,255 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
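|  | # NCCL P2P and InfiniBand transports disabled, presumably to avoid multi-GPU hangs on this host | ||||||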
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers.models.t5.modeling_t5 import T5Block | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
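|  | # 'high' enables TF32 matmul kernels on supported GPUs: faster training at slightly reduced float32 precision | ||||||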
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # model_checkpoint = "t5-small" | ||||||
|  | # model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | # model.config | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # passing text_target makes the tokenizer fill the 'labels' field directly, | ||||||
|  |         # so no separate tokenization pass is needed for the targets | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map applies the preprocessing function to every row of each split | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed.tokens.weight | ||||||
|  |     # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | 
 | ||||||
|  |     # derive a modified config (resized decoder stack) from the pretrained model's config | ||||||
|  | 
 | ||||||
|  |     config = pretrained_model.config | ||||||
|  |     config.num_layers = 6 | ||||||
|  |     config.num_decoder_layers = 1  # set new decoder layer count | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration(config) | ||||||
|  | 
 | ||||||
|  |     model.shared = pretrained_model.shared | ||||||
|  |     model.encoder = pretrained_model.encoder | ||||||
|  | 
 | ||||||
|  |     pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] | ||||||
|  |     for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): | ||||||
|  |         model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights | ||||||
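|  |     # the slice copies only as many pretrained blocks as the shrunken decoder holds (here 1 of 6) | ||||||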
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # print number of decoder blocks | ||||||
|  |     print(f'Number of decoder blocks: {len(model.decoder.block)}') | ||||||
|  |     print(f'num_layers: {model.config.num_layers}') | ||||||
|  |     print(f'num_decoder_layers: {model.config.num_decoder_layers}') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # resize token embeddings so the vocabulary covers the added special tokens | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
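|  |         # sacrebleu expects a list of reference strings per prediction, hence the nested lists above | ||||||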
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load the fine-tuned seq2seq model from the checkpoint path | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
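|  |         # torch.compile adds one-off compilation overhead on the first batch, then speeds up repeated forward passes | ||||||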
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # passing text_target makes the tokenizer fill the 'labels' field directly, | ||||||
|  |             # so no separate tokenization pass is needed for the targets | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map applies the preprocessing function to every row of the dataset | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device) | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # collect the generated token sequences on CPU for decoding later | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
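|  |         # e.g. extract_seq([0, 32100, 7, 12, 32101], 32100, 32101) returns [7, 12] | ||||||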
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # marker pair absent: caller treats None as a failed extraction | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
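|  |             # ids 32100-32103 follow from t5-small's 32100-token base vocab: the special tokens added at tokenizer setup start at id 32100, in definition order | ||||||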
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if thing_seq is not None: | ||||||
|  |                 p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if property_seq is not None: | ||||||
|  |                 p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9318504495977283 | ||||||
|  | Accuracy for fold 2: 0.8859813084112149 | ||||||
|  | Accuracy for fold 3: 0.9678714859437751 | ||||||
|  | Accuracy for fold 4: 0.9738344433872502 | ||||||
|  | Accuracy for fold 5: 0.9015116811726981 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory =  '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # locate the saved training checkpoint | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy on MDM rows only (df was already filtered to MDM, so every row counts toward the denominator) | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset file before writing to it | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,255 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
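|  | # NCCL P2P and InfiniBand transports disabled, presumably to avoid multi-GPU hangs on this host | ||||||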
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers.models.t5.modeling_t5 import T5Block | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
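|  | # 'high' enables TF32 matmul kernels on supported GPUs: faster training at slightly reduced float32 precision | ||||||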
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # model_checkpoint = "t5-small" | ||||||
|  | # model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | # model.config | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # passing text_target makes the tokenizer emit the 'labels' field directly, | ||||||
|  |         # so there is no need to build a separate 'labels' entry | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
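|  |     # after mapping, each record holds 'input_ids', 'attention_mask' and 'labels' | ||||||
|  |     # (from text_target), each padded/truncated to max_length (sketch of a record): | ||||||
|  |     # {'input_ids': [...], 'attention_mask': [...], 'labels': [...]} | ||||||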
|  |     # map applies the function to each row of the dataset, | ||||||
|  |     # i.e. the records at the immediate nesting level | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed_tokens.weight | ||||||
|  |     # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | 
 | ||||||
|  |     # derive a reduced-decoder config from the pretrained model | ||||||
|  |     # config = T5Config("t5-small") | ||||||
|  | 
 | ||||||
|  |     config = pretrained_model.config | ||||||
|  |     config.num_layers = 6 | ||||||
|  |     config.num_decoder_layers = 2  # set new decoder layer count | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration(config) | ||||||
|  | 
 | ||||||
|  |     model.shared = pretrained_model.shared | ||||||
|  |     model.encoder = pretrained_model.encoder | ||||||
|  | 
 | ||||||
|  |     pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] | ||||||
|  |     for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): | ||||||
|  |         model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights | ||||||
|  | 
 | ||||||
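|  |     # minimal sanity check (illustrative addition, not in the original script): | ||||||
|  |     # after the copy above, the truncated decoder's first block should hold the | ||||||
|  |     # same tensors as the pretrained stack | ||||||
|  |     copied = model.decoder.block[0].state_dict() | ||||||
|  |     original = pretrained_model.decoder.block[0].state_dict() | ||||||
|  |     for key in copied: | ||||||
|  |         assert torch.equal(copied[key], original[key]) | ||||||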
|  | 
 | ||||||
|  |     # print number of decoder blocks | ||||||
|  |     print(f'Number of decoder blocks: {len(model.decoder.block)}') | ||||||
|  |     print(f'num_layers: {model.config.num_layers}') | ||||||
|  |     print(f'num_decoder_layers: {model.config.num_decoder_layers}') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # resize the token embeddings to cover the added special tokens | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load the model weights from the given checkpoint path | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # passing text_target makes the tokenizer emit the 'labels' field directly, | ||||||
|  |             # so there is no need to build a separate 'labels' entry | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map applies the function to each row of the dataset, | ||||||
|  |         # i.e. the records at the immediate nesting level | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device) | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # collect the generated token ids for decoding later | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # Or handle this case according to your requirements | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if thing_seq is not None: | ||||||
|  |                 p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if property_seq is not None: | ||||||
|  |                 p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
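|  |         # illustrative walk-through (filler ids 123/456/789 are hypothetical): | ||||||
|  |         # tokens = np.array([32100, 123, 456, 32101, 32102, 789, 32103, 0]) | ||||||
|  |         # extract_seq(tokens, 32100, 32101) -> array([123, 456])  # the 'thing' span | ||||||
|  |         # extract_seq(tokens, 32102, 32103) -> array([789])       # the 'property' span | ||||||
|  | 
 | ||||||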
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9427354472314246 | ||||||
|  | Accuracy for fold 2: 0.9098130841121496 | ||||||
|  | Accuracy for fold 3: 0.964859437751004 | ||||||
|  | Accuracy for fold 4: 0.9719314938154139 | ||||||
|  | Accuracy for fold 5: 0.9070087036188731 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory = '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # checkpoint | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy only on rows that are valid in the MDM | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
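|  |     # e.g. (hypothetical numbers) 1000 MDM rows with 950 exact thing+property | ||||||
|  |     # matches gives pred_correct_proportion = 950/1000 = 0.95 | ||||||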
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset file before writing to it | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,255 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers.models.t5.modeling_t5 import T5Block | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # model_checkpoint = "t5-small" | ||||||
|  | # model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | # model.config | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # passing text_target makes the tokenizer emit the 'labels' field directly, | ||||||
|  |         # so there is no need to build a separate 'labels' entry | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map applies the function to each row of the dataset, | ||||||
|  |     # i.e. the records at the immediate nesting level | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed_tokens.weight | ||||||
|  |     # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | 
 | ||||||
|  |     # derive a reduced-decoder config from the pretrained model | ||||||
|  |     # config = T5Config("t5-small") | ||||||
|  | 
 | ||||||
|  |     config = pretrained_model.config | ||||||
|  |     config.num_layers = 6 | ||||||
|  |     config.num_decoder_layers = 3  # set new decoder layer count | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration(config) | ||||||
|  | 
 | ||||||
|  |     model.shared = pretrained_model.shared | ||||||
|  |     model.encoder = pretrained_model.encoder | ||||||
|  | 
 | ||||||
|  |     pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] | ||||||
|  |     for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): | ||||||
|  |         model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights | ||||||
|  | 
 | ||||||
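|  |     # rough scale check (illustrative addition): with 3 of the 6 pretrained | ||||||
|  |     # decoder blocks retained, the decoder block parameter count should drop | ||||||
|  |     # to roughly half of the full stack | ||||||
|  |     n_decoder_params = sum(p.numel() for p in model.decoder.block.parameters()) | ||||||
|  |     print(f'decoder block parameters: {n_decoder_params}') | ||||||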
|  | 
 | ||||||
|  |     # print number of decoder blocks | ||||||
|  |     print(f'Number of decoder blocks: {len(model.decoder.block)}') | ||||||
|  |     print(f'num_layers: {model.config.num_layers}') | ||||||
|  |     print(f'num_decoder_layers: {model.config.num_decoder_layers}') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # resize the token embeddings to cover the added special tokens | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load the model weights from the given checkpoint path | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # passing text_target makes the tokenizer emit the 'labels' field directly, | ||||||
|  |             # so there is no need to build a separate 'labels' entry | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map applies the function to each row of the dataset, | ||||||
|  |         # i.e. the records at the immediate nesting level | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device) | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # collect the generated token ids for decoding later | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # Or handle this case according to your requirements | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if thing_seq is not None: | ||||||
|  |                 p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if property_seq is not None: | ||||||
|  |                 p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9503076194983436 | ||||||
|  | Accuracy for fold 2: 0.9135514018691588 | ||||||
|  | Accuracy for fold 3: 0.9698795180722891 | ||||||
|  | Accuracy for fold 4: 0.9790675547098002 | ||||||
|  | Accuracy for fold 5: 0.907924874026569 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory = '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # checkpoint | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy only on rows that are valid in the MDM | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset file before writing to it | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,255 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers.models.t5.modeling_t5 import T5Block | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # model_checkpoint = "t5-small" | ||||||
|  | # model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | # model.config | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # passing text_target makes the tokenizer emit the 'labels' field directly, | ||||||
|  |         # so there is no need to build a separate 'labels' entry | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map applies the function to each row of the dataset, | ||||||
|  |     # i.e. the records at the immediate nesting level | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed_tokens.weight | ||||||
|  |     # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | 
 | ||||||
|  |     # derive a reduced-decoder config from the pretrained model | ||||||
|  |     # config = T5Config("t5-small") | ||||||
|  | 
 | ||||||
|  |     config = pretrained_model.config | ||||||
|  |     config.num_layers = 6 | ||||||
|  |     config.num_decoder_layers = 4  # set new decoder layer count | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration(config) | ||||||
|  | 
 | ||||||
|  |     model.shared = pretrained_model.shared | ||||||
|  |     model.encoder = pretrained_model.encoder | ||||||
|  | 
 | ||||||
|  |     pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] | ||||||
|  |     for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]): | ||||||
|  |         model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights | ||||||
|  | 
 | ||||||
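|  |     # optional smoke test (illustrative, commented out to keep training unchanged): | ||||||
|  |     # the 4-layer decoder should still generate on a dummy input, e.g. | ||||||
|  |     # dummy = torch.zeros((1, 4), dtype=torch.long) | ||||||
|  |     # _ = model.generate(dummy, max_length=8) | ||||||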
|  | 
 | ||||||
|  |     # print number of decoder blocks | ||||||
|  |     print(f'Number of decoder blocks: {len(model.decoder.block)}') | ||||||
|  |     print(f'num_layers: {model.config.num_layers}') | ||||||
|  |     print(f'num_decoder_layers: {model.config.num_decoder_layers}') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # resize the token embeddings to cover the added special tokens | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
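|  |     # note: with eval_strategy="no", the validation split and compute_metrics | ||||||
|  |     # above are wired up but never run during training; switch eval_strategy | ||||||
|  |     # back to "epoch" to activate them | ||||||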
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens (bracketed, to match the training tokenizer) | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load model | ||||||
|  |         # Define the directory and the pattern | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # text_target sets the corresponding label to inputs | ||||||
|  |             # there is no need to create a separate 'labels' | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map maps function to each "row" in the dataset | ||||||
|  |         # aka the data in the immediate nesting | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
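|  |         # the default collate_fn suffices here: every example is already | ||||||
|  |         # padded to max_length, so fields stack into uniform batch tensors | ||||||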
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device)  # effectively a no-op after the first batch; could be hoisted above the loop | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # Decode the output and print the results | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # Or handle this case according to your requirements | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if (thing_seq is not None): | ||||||
|  |                 p_thing =  self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if (property_seq is not None): | ||||||
|  |                 p_property =  self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
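|  |         # the magic numbers above assume t5-small's 32100-token base | ||||||
|  |         # vocabulary plus the insertion order of the added specials; a | ||||||
|  |         # sketch that avoids hard-coding (untested) would look the IDs up: | ||||||
|  |         # thing_start = self.tokenizer.convert_tokens_to_ids("<THING_START>") | ||||||
|  |         # thing_end = self.tokenizer.convert_tokens_to_ids("<THING_END>") | ||||||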
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9441552295314718 | ||||||
|  | Accuracy for fold 2: 0.9121495327102803 | ||||||
|  | Accuracy for fold 3: 0.963855421686747 | ||||||
|  | Accuracy for fold 4: 0.9752616555661275 | ||||||
|  | Accuracy for fold 5: 0.907924874026569 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory =  '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # locate this fold's checkpoint directory | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy only on rows that are valid MDM entries | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
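|  |     # df was pre-filtered to MDM rows, so sum(in_mdm) == len(df); the score | ||||||
|  |     # is exact-match accuracy over both thing and property | ||||||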
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset the file before appending to it (print('') leaves a leading blank line) | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,255 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers.models.t5.modeling_t5 import T5Block | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # model_checkpoint = "t5-small" | ||||||
|  | # model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | # model.config | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # text_target sets the corresponding label to inputs | ||||||
|  |         # there is no need to create a separate 'labels' | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map maps function to each "row" in the dataset | ||||||
|  |     # aka the data in the immediate nesting | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed_tokens.weight | ||||||
|  |     # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | 
 | ||||||
|  |     # Access the decoder stack | ||||||
|  |     # config = T5Config.from_pretrained("t5-small") | ||||||
|  | 
 | ||||||
|  |     config = pretrained_model.config | ||||||
|  |     config.num_layers = 6 | ||||||
|  |     config.num_decoder_layers = 8  # set new decoder layer count | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration(config) | ||||||
|  | 
 | ||||||
|  |     model.shared = pretrained_model.shared | ||||||
|  |     model.encoder = pretrained_model.encoder | ||||||
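|  |     # the shared embedding table and the encoder are reused by reference, | ||||||
|  |     # so the new model starts from the pretrained encoder unchanged | ||||||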
|  | 
 | ||||||
|  |     pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] | ||||||
|  |     for i, layer in enumerate(pretrained_decoder_weights): | ||||||
|  |         model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights | ||||||
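|  |     # only the six pretrained blocks are copied; blocks 6..num_decoder_layers-1 | ||||||
|  |     # keep their random initialization. A possible (untested) alternative is to | ||||||
|  |     # seed the extra blocks from the last pretrained block: | ||||||
|  |     # for i in range(len(pretrained_decoder_weights), len(model.decoder.block)): | ||||||
|  |     #     model.decoder.block[i].load_state_dict(pretrained_decoder_weights[-1]) | ||||||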
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # print number of decoder blocks | ||||||
|  |     print(f'Number of decoder blocks: {len(model.decoder.block)}') | ||||||
|  |     print(f'num_layers: {model.config.num_layers}') | ||||||
|  |     print(f'num_decoder_layers: {model.config.num_decoder_layers}') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # resize the token embeddings to match the extended tokenizer vocabulary | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels; each | ||||||
|  |         # label is wrapped in a list because sacrebleu expects a list of references per prediction | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens (bracketed, to match the training tokenizer) | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load model | ||||||
|  |         # Define the directory and the pattern | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # text_target sets the corresponding label to inputs | ||||||
|  |             # there is no need to create a separate 'labels' | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map maps function to each "row" in the dataset | ||||||
|  |         # aka the data in the immediate nesting | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device)  # effectively a no-op after the first batch; could be hoisted above the loop | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # Decode the output and print the results | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # Or handle this case according to your requirements | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if (thing_seq is not None): | ||||||
|  |                 p_thing =  self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if (property_seq is not None): | ||||||
|  |                 p_property =  self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9441552295314718 | ||||||
|  | Accuracy for fold 2: 0.9121495327102803 | ||||||
|  | Accuracy for fold 3: 0.963855421686747 | ||||||
|  | Accuracy for fold 4: 0.9752616555661275 | ||||||
|  | Accuracy for fold 5: 0.907924874026569 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory =  '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # locate this fold's checkpoint directory | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy only on rows that are valid MDM entries | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset the file before appending to it (print('') leaves a leading blank line) | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,255 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers.models.t5.modeling_t5 import T5Block | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # model_checkpoint = "t5-small" | ||||||
|  | # model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | # model.config | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # text_target sets the corresponding label to inputs | ||||||
|  |         # there is no need to create a separate 'labels' | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # map maps function to each "row" in the dataset | ||||||
|  |     # aka the data in the immediate nesting | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed_tokens.weight | ||||||
|  |     # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  | 
 | ||||||
|  |     # Access the decoder stack | ||||||
|  |     # config = T5Config.from_pretrained("t5-small") | ||||||
|  | 
 | ||||||
|  |     config = pretrained_model.config | ||||||
|  |     config.num_layers = 6 | ||||||
|  |     config.num_decoder_layers = 9  # set new decoder layer count | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration(config) | ||||||
|  | 
 | ||||||
|  |     model.shared = pretrained_model.shared | ||||||
|  |     model.encoder = pretrained_model.encoder | ||||||
|  | 
 | ||||||
|  |     pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block] | ||||||
|  |     for i, layer in enumerate(pretrained_decoder_weights): | ||||||
|  |         model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights | ||||||
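|  |     # as above, blocks beyond the six pretrained ones (here blocks 6-8) | ||||||
|  |     # keep their random initialization | ||||||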
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # print number of decoder blocks | ||||||
|  |     print(f'Number of decoder blocks: {len(model.decoder.block)}') | ||||||
|  |     print(f'num_layers: {model.config.num_layers}') | ||||||
|  |     print(f'num_decoder_layers: {model.config.num_decoder_layers}') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # resize the token embeddings to match the extended tokenizer vocabulary | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels; each | ||||||
|  |         # label is wrapped in a list because sacrebleu expects a list of references per prediction | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | __pycache__ | ||||||
|  | exports/ | ||||||
|  | @ -0,0 +1,168 @@ | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | from transformers import ( | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  | ) | ||||||
|  | import os | ||||||
|  | from tqdm import tqdm | ||||||
|  | from datasets import Dataset | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
|  | os.environ['TOKENIZERS_PARALLELISM'] = 'false' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Inference(): | ||||||
|  |     tokenizer: T5TokenizerFast | ||||||
|  |     model: torch.nn.Module | ||||||
|  |     dataloader: DataLoader | ||||||
|  | 
 | ||||||
|  |     def __init__(self, checkpoint_path): | ||||||
|  |         self._create_tokenizer() | ||||||
|  |         self._load_model(checkpoint_path) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def _create_tokenizer(self): | ||||||
|  |         # %% | ||||||
|  |         # load tokenizer | ||||||
|  |         self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |         # Define additional special tokens (bracketed, to match the training tokenizer) | ||||||
|  |         additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |         # Add the additional special tokens to the tokenizer | ||||||
|  |         self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
|  | 
 | ||||||
|  |     def _load_model(self, checkpoint_path: str): | ||||||
|  |         # load model | ||||||
|  |         # Define the directory and the pattern | ||||||
|  |         model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) | ||||||
|  |         model = torch.compile(model) | ||||||
|  |         # set model to eval | ||||||
|  |         self.model = model.eval() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def prepare_dataloader(self, input_df, batch_size, max_length): | ||||||
|  |         """ | ||||||
|  |         *arguments* | ||||||
|  |         - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' | ||||||
|  |         - batch_size: the batch size of dataloader output | ||||||
|  |         - max_length: length of tokenizer output | ||||||
|  |         """ | ||||||
|  |         print("preparing dataloader") | ||||||
|  |         # convert each dataframe row into a dictionary | ||||||
|  |         # outputs a list of dictionaries | ||||||
|  | 
 | ||||||
|  |         def _process_df(df): | ||||||
|  |             output_list = [] | ||||||
|  |             for _, row in df.iterrows(): | ||||||
|  |                 desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |                 unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |                 element = { | ||||||
|  |                     'input' : f"{desc}{unit}", | ||||||
|  |                     'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |             return output_list | ||||||
|  | 
 | ||||||
|  |         def _preprocess_function(example): | ||||||
|  |             input = example['input'] | ||||||
|  |             target = example['output'] | ||||||
|  |             # text_target sets the corresponding label to inputs | ||||||
|  |             # there is no need to create a separate 'labels' | ||||||
|  |             model_inputs = self.tokenizer( | ||||||
|  |                 input, | ||||||
|  |                 text_target=target,  | ||||||
|  |                 max_length=max_length, | ||||||
|  |                 return_tensors="pt", | ||||||
|  |                 padding="max_length", | ||||||
|  |                 truncation=True, | ||||||
|  |             ) | ||||||
|  |             return model_inputs | ||||||
|  | 
 | ||||||
|  |         test_dataset = Dataset.from_list(_process_df(input_df)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # map maps function to each "row" in the dataset | ||||||
|  |         # aka the data in the immediate nesting | ||||||
|  |         datasets = test_dataset.map( | ||||||
|  |             _preprocess_function, | ||||||
|  |             batched=True, | ||||||
|  |             num_proc=1, | ||||||
|  |             remove_columns=test_dataset.column_names, | ||||||
|  |         ) | ||||||
|  |         # datasets = _preprocess_function(test_dataset) | ||||||
|  |         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) | ||||||
|  | 
 | ||||||
|  |         # create dataloader | ||||||
|  |         self.dataloader = DataLoader(datasets, batch_size=batch_size) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def generate(self): | ||||||
|  |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|  |         MAX_GENERATE_LENGTH = 128 | ||||||
|  | 
 | ||||||
|  |         pred_generations = [] | ||||||
|  |         pred_labels = [] | ||||||
|  | 
 | ||||||
|  |         print("start generation") | ||||||
|  |         for batch in tqdm(self.dataloader): | ||||||
|  |             # Inference in batches | ||||||
|  |             input_ids = batch['input_ids'] | ||||||
|  |             attention_mask = batch['attention_mask'] | ||||||
|  |             # save labels too | ||||||
|  |             pred_labels.extend(batch['labels']) | ||||||
|  |              | ||||||
|  | 
 | ||||||
|  |             # Move to GPU if available | ||||||
|  |             input_ids = input_ids.to(device) | ||||||
|  |             attention_mask = attention_mask.to(device) | ||||||
|  |             self.model.to(device)  # effectively a no-op after the first batch; could be hoisted above the loop | ||||||
|  | 
 | ||||||
|  |             # Perform inference | ||||||
|  |             with torch.no_grad(): | ||||||
|  |                 outputs = self.model.generate(input_ids, | ||||||
|  |                                         attention_mask=attention_mask, | ||||||
|  |                                         max_length=MAX_GENERATE_LENGTH) | ||||||
|  |                  | ||||||
|  |                 # collect the raw generated token ids; decoding happens later | ||||||
|  |                 pred_generations.extend(outputs.to("cpu")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         # %% | ||||||
|  |         # extract sequence and decode | ||||||
|  |         def extract_seq(tokens, start_value, end_value): | ||||||
|  |             if start_value not in tokens or end_value not in tokens: | ||||||
|  |                 return None  # markers missing: treated as no prediction downstream | ||||||
|  |             start_id = np.where(tokens == start_value)[0][0] | ||||||
|  |             end_id = np.where(tokens == end_value)[0][0] | ||||||
|  | 
 | ||||||
|  |             return tokens[start_id+1:end_id] | ||||||
|  | 
 | ||||||
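|  |         # Worked example (sketch): for tokens = np.array([0, 32100, 7, 9, 32101, 1]), | ||||||
|  |         # extract_seq(tokens, 32100, 32101) -> array([7, 9]): the ids strictly | ||||||
|  |         # between the first start and end markers | ||||||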
|  | 
 | ||||||
|  |         def process_tensor_output(tokens): | ||||||
|  |             thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END> | ||||||
|  |             property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END> | ||||||
|  |             p_thing = None | ||||||
|  |             p_property = None | ||||||
|  |             if thing_seq is not None: | ||||||
|  |                 p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) | ||||||
|  |             if property_seq is not None: | ||||||
|  |                 p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) | ||||||
|  |             return p_thing, p_property | ||||||
|  | 
 | ||||||
|  |         # decode prediction labels | ||||||
|  |         def decode_preds(tokens_list): | ||||||
|  |             thing_prediction_list = [] | ||||||
|  |             property_prediction_list = [] | ||||||
|  |             for tokens in tokens_list: | ||||||
|  |                 p_thing, p_property = process_tensor_output(tokens) | ||||||
|  |                 thing_prediction_list.append(p_thing) | ||||||
|  |                 property_prediction_list.append(p_property) | ||||||
|  |             return thing_prediction_list, property_prediction_list  | ||||||
|  | 
 | ||||||
|  |         thing_prediction_list, property_prediction_list = decode_preds(pred_generations) | ||||||
|  |         return thing_prediction_list, property_prediction_list | ||||||
|  | 
 | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold 1: 0.9342167534311405 | ||||||
|  | @ -0,0 +1,74 @@ | ||||||
|  | 
 | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | from inference import Inference | ||||||
|  | 
 | ||||||
|  | checkpoint_directory = '../' | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 512 | ||||||
|  | 
 | ||||||
|  | def infer_and_select(fold): | ||||||
|  |     print(f"Inference for fold {fold}") | ||||||
|  |     # import test data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" | ||||||
|  |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     df = df[df['MDM']].reset_index(drop=True) | ||||||
|  | 
 | ||||||
|  |     # get target data | ||||||
|  |     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  |     # processing to help with selection later | ||||||
|  |     train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     ########################################## | ||||||
|  |     # run inference | ||||||
|  |     # checkpoint | ||||||
|  |     directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b') | ||||||
|  |     # Use glob to find matching paths | ||||||
|  |     # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     pattern = 'checkpoint-*' | ||||||
|  |     checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     infer = Inference(checkpoint_path) | ||||||
|  |     infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128) | ||||||
|  |     thing_prediction_list, property_prediction_list = infer.generate() | ||||||
|  | 
 | ||||||
|  |     # add labels too | ||||||
|  |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|  |     # Convert the list to a Pandas DataFrame | ||||||
|  |     df_out = pd.DataFrame({ | ||||||
|  |         'p_thing': thing_prediction_list,  | ||||||
|  |         'p_property': property_prediction_list | ||||||
|  |     }) | ||||||
|  |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|  |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|  |     df = pd.concat([df, df_out], axis=1) | ||||||
|  | 
 | ||||||
|  |     # we can save the t5 generation output here | ||||||
|  |     df.to_csv(f"exports/result_group_{fold}.csv", index=False) | ||||||
|  | 
 | ||||||
|  |     # evaluate mapping accuracy only on rows that are valid in-MDM entries | ||||||
|  |     # (df was filtered to MDM rows above, so this mask is all True here) | ||||||
|  |     in_mdm = df['MDM'] | ||||||
|  |     condition_correct_thing = df['p_thing'] == df['thing'] | ||||||
|  |     condition_correct_property = df['p_property'] == df['property'] | ||||||
|  |     prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) | ||||||
|  |     pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) | ||||||
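|  |     # e.g. (hypothetical numbers) 934 rows where both thing and property | ||||||
|  |     # match, out of 1000 MDM rows, gives pred_correct_proportion = 0.934 | ||||||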
|  | 
 | ||||||
|  |     # write output to file output.txt | ||||||
|  |     with open("output.txt", "a") as f: | ||||||
|  |         print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) | ||||||
|  | 
 | ||||||
|  | ###########################################   | ||||||
|  | # Execute for all folds | ||||||
|  | 
 | ||||||
|  | # reset file before writing to it | ||||||
|  | with open("output.txt", "w") as f: | ||||||
|  |     print('', file=f) | ||||||
|  | 
 | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     infer_and_select(fold) | ||||||
|  | @ -0,0 +1,246 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | 
 | ||||||
|  | from safetensors.torch import load_file | ||||||
|  | 
 | ||||||
|  | from transformers import ( | ||||||
|  |     T5Config, | ||||||
|  |     T5TokenizerFast, | ||||||
|  |     AutoModelForSeq2SeqLM, | ||||||
|  |     DataCollatorForSeq2Seq, | ||||||
|  |     Seq2SeqTrainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     Seq2SeqTrainingArguments, | ||||||
|  |     T5ForConditionalGeneration, | ||||||
|  |     T5Model | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | def process_df_to_dict(df): | ||||||
|  |     output_list = [] | ||||||
|  |     for _, row in df.iterrows(): | ||||||
|  |         desc = f"<DESC>{row['tag_description']}<DESC>" | ||||||
|  |         unit = f"<UNIT>{row['unit']}<UNIT>" | ||||||
|  |         element = { | ||||||
|  |             'input' : f"{desc}{unit}", | ||||||
|  |             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", | ||||||
|  |         } | ||||||
|  |         output_list.append(element) | ||||||
|  | 
 | ||||||
|  |     return output_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_split_dataset(fold): | ||||||
|  |     # train  | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" | ||||||
|  |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     # valid | ||||||
|  |     data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" | ||||||
|  |     validation_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  |     combined_data = DatasetDict({ | ||||||
|  |         'train': Dataset.from_list(process_df_to_dict(train_df)), | ||||||
|  |         'validation' : Dataset.from_list(process_df_to_dict(validation_df)), | ||||||
|  |     }) | ||||||
|  |     return combined_data | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # function to perform training for a given fold | ||||||
|  | def train(fold): | ||||||
|  |     save_path = f'checkpoint_fold_{fold}b' | ||||||
|  |     split_datasets = create_split_dataset(fold) | ||||||
|  | 
 | ||||||
|  |     # prepare tokenizer | ||||||
|  |     model_checkpoint = "t5-small" | ||||||
|  |     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|  |     # Define additional special tokens | ||||||
|  |     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] | ||||||
|  |     # Add the additional special tokens to the tokenizer | ||||||
|  |     tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) | ||||||
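|  |     # These are the ids the inference code hardcodes (32100-32103): t5-small's | ||||||
|  |     # tokenizer has len(tokenizer) == 32100, so the added tokens receive | ||||||
|  |     # consecutive ids in list order (sketch): | ||||||
|  |     # tokenizer.convert_tokens_to_ids("<THING_START>")   # -> 32100 | ||||||
|  |     # tokenizer.convert_tokens_to_ids("<PROPERTY_END>")  # -> 32103 | ||||||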
|  | 
 | ||||||
|  |     max_length = 120 | ||||||
|  | 
 | ||||||
|  |     # given a dataset entry, run it through the tokenizer | ||||||
|  |     def preprocess_function(example): | ||||||
|  |         input = example['input'] | ||||||
|  |         target = example['output'] | ||||||
|  |         # text_target tokenizes the target and stores it under the | ||||||
|  |         # 'labels' key, so there is no need to build labels separately | ||||||
|  |         model_inputs = tokenizer( | ||||||
|  |             input, | ||||||
|  |             text_target=target,  | ||||||
|  |             max_length=max_length, | ||||||
|  |             truncation=True, | ||||||
|  |             padding="max_length" | ||||||
|  |         ) | ||||||
|  |         return model_inputs | ||||||
|  | 
 | ||||||
|  |     # Dataset.map applies the function across the dataset; with | ||||||
|  |     # batched=True it receives batches of rows rather than single examples | ||||||
|  |     tokenized_datasets = split_datasets.map( | ||||||
|  |         preprocess_function, | ||||||
|  |         batched=True, | ||||||
|  |         num_proc=8, | ||||||
|  |         remove_columns=split_datasets["train"].column_names, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # https://github.com/huggingface/transformers/pull/28414 | ||||||
|  |     # model_checkpoint = "google/t5-efficient-tiny" | ||||||
|  |     # device_map set to auto to force it to load contiguous weights  | ||||||
|  |     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') | ||||||
|  | 
 | ||||||
|  |     # directory = os.path.join(".", f'checkpoint_fold_{fold}a') | ||||||
|  |     # # Use glob to find matching paths | ||||||
|  |     # # path is usually checkpoint_fold_1/checkpoint-<step number> | ||||||
|  |     # # we are guaranteed to save only 1 checkpoint from training | ||||||
|  |     # pattern = 'checkpoint-*' | ||||||
|  |     # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0] | ||||||
|  |     # # t5_classify = T5Model.from_pretrained(prev_checkpoint) | ||||||
|  |     # # Load the checkpoint | ||||||
|  |     # checkpoint_path = f"{prev_checkpoint}/model.safetensors" | ||||||
|  |     # checkpoint = load_file(checkpoint_path) | ||||||
|  |     # # Filter out weights related to the classification head | ||||||
|  |     # # given name format: t5.encoder.embed_tokens.weight | ||||||
|  |     # # we want: encoder.embed_tokens.weight | ||||||
|  |     # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key} | ||||||
|  | 
 | ||||||
|  |     model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) | ||||||
|  |     # change the token embedding size to match the shape | ||||||
|  |     model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Create a T5 model with random weights | ||||||
|  |     config = T5Config.from_pretrained("t5-small")  # Use T5 configuration | ||||||
|  |     random_model = T5ForConditionalGeneration(config)  # Model initialized with random weights | ||||||
|  |     random_model.resize_token_embeddings(len(tokenizer)) | ||||||
|  | 
 | ||||||
|  |     model.encoder = random_model.encoder | ||||||
|  |     model.shared = random_model.shared | ||||||
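|  |     # Intent of this variant: train with a randomly initialized (and, below, | ||||||
|  |     # frozen) encoder while keeping the pretrained decoder. Caveat (sketch of | ||||||
|  |     # the mechanics): T5 wires decoder.embed_tokens to the original shared | ||||||
|  |     # module at construction, so reassigning model.shared alone may not | ||||||
|  |     # rewire the decoder's embeddings. | ||||||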
|  | 
 | ||||||
|  |     # model.load_state_dict(state_dict=t5_weights, strict=False) | ||||||
|  | 
 | ||||||
|  |     # for key, param in model.state_dict().items(): | ||||||
|  |     #     if key in t5_weights: | ||||||
|  |     #         print(f"{key}: Successfully overridden") | ||||||
|  |     #     else: | ||||||
|  |     #         print(f"{key}: Retained original weights") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Freeze the encoder | ||||||
|  |     for param in model.encoder.parameters(): | ||||||
|  |         param.requires_grad = False | ||||||
|  | 
 | ||||||
|  |     # Freeze the shared embedding layer | ||||||
|  |     for param in model.shared.parameters(): | ||||||
|  |         param.requires_grad = False | ||||||
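|  |     # Sanity check (sketch): confirm only decoder-side weights stay trainable | ||||||
|  |     # trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) | ||||||
|  |     # total = sum(p.numel() for p in model.parameters()) | ||||||
|  |     # print(f"trainable {trainable:,} / {total:,}") | ||||||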
|  | 
 | ||||||
|  | 
 | ||||||
|  |     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) | ||||||
|  |     metric = evaluate.load("sacrebleu") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def compute_metrics(eval_preds): | ||||||
|  |         preds, labels = eval_preds | ||||||
|  |         # In case the model returns more than the prediction logits | ||||||
|  |         if isinstance(preds, tuple): | ||||||
|  |             preds = preds[0] | ||||||
|  | 
 | ||||||
|  |         decoded_preds = tokenizer.batch_decode(preds,  | ||||||
|  |                                             skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Replace -100s in the labels as we can't decode them | ||||||
|  |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id) | ||||||
|  |         decoded_labels = tokenizer.batch_decode(labels, | ||||||
|  |                                                 skip_special_tokens=False) | ||||||
|  | 
 | ||||||
|  |         # Remove <PAD> tokens from decoded predictions and labels | ||||||
|  |         decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] | ||||||
|  |         decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] | ||||||
|  | 
 | ||||||
|  |         # Some simple post-processing | ||||||
|  |         # decoded_preds = [pred.strip() for pred in decoded_preds] | ||||||
|  |         # decoded_labels = [[label.strip()] for label in decoded_labels] | ||||||
|  |         # print(decoded_preds, decoded_labels) | ||||||
|  | 
 | ||||||
|  |         result = metric.compute(predictions=decoded_preds, references=decoded_labels) | ||||||
|  |         return {"bleu": result["score"]} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Generation Config | ||||||
|  |     # from transformers import GenerationConfig | ||||||
|  |     gen_config = model.generation_config | ||||||
|  |     gen_config.max_length = 128 | ||||||
|  | 
 | ||||||
|  |     # compile | ||||||
|  |     # model = torch.compile(model, backend="inductor", dynamic=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     # Trainer | ||||||
|  | 
 | ||||||
|  |     args = Seq2SeqTrainingArguments( | ||||||
|  |         f"{save_path}", | ||||||
|  |         # eval_strategy="epoch", | ||||||
|  |         eval_strategy="no", | ||||||
|  |         logging_dir="tensorboard-log", | ||||||
|  |         logging_strategy="epoch", | ||||||
|  |         # save_strategy="epoch", | ||||||
|  |         load_best_model_at_end=False, | ||||||
|  |         learning_rate=1e-3, | ||||||
|  |         per_device_train_batch_size=64, | ||||||
|  |         per_device_eval_batch_size=64, | ||||||
|  |         auto_find_batch_size=False, | ||||||
|  |         ddp_find_unused_parameters=False, | ||||||
|  |         weight_decay=0.01, | ||||||
|  |         save_total_limit=1, | ||||||
|  |         num_train_epochs=40, | ||||||
|  |         predict_with_generate=True, | ||||||
|  |         bf16=True, | ||||||
|  |         push_to_hub=False, | ||||||
|  |         generation_config=gen_config, | ||||||
|  |         remove_unused_columns=False, | ||||||
|  |     ) | ||||||
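|  |     # note: with eval_strategy="no", the validation split and | ||||||
|  |     # compute_metrics above are wired up but not invoked during this run | ||||||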
|  | 
 | ||||||
|  | 
 | ||||||
|  |     trainer = Seq2SeqTrainer( | ||||||
|  |         model, | ||||||
|  |         args, | ||||||
|  |         train_dataset=tokenized_datasets["train"], | ||||||
|  |         eval_dataset=tokenized_datasets["validation"], | ||||||
|  |         data_collator=data_collator, | ||||||
|  |         tokenizer=tokenizer, | ||||||
|  |         compute_metrics=compute_metrics, | ||||||
|  |         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # uncomment to load training from checkpoint | ||||||
|  |     # checkpoint_path = 'default_40_1/checkpoint-5600' | ||||||
|  |     # trainer.train(resume_from_checkpoint=checkpoint_path) | ||||||
|  | 
 | ||||||
|  |     trainer.train() | ||||||
|  | 
 | ||||||
|  | # execute training | ||||||
|  | for fold in [1,2,3,4,5]: | ||||||
|  |     print(fold) | ||||||
|  |     train(fold) | ||||||
|  | 
 | ||||||