Feat: implement find-back for analysis in find_closest.py

Feat: implement bert classification
2024-11-08 20:50:41 +09:00 · 2024-11-08 20:50:41 +09:00 · 59bbf1f403
parent 22429ea536
commit 59bbf1f403
5 changed files with 455 additions and 11 deletions
--- a/analysis/find_closest.py
+++ b/analysis/find_closest.py
@ -107,7 +107,7 @@ def find_closest(cos_sim_matrix, condition_source, condition_target):
    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
    # we select top k here
    # Get the indices of the top k maximum values along axis 1
-    top_k = 5
+    top_k = 3
    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:]  # Get indices of top k values
    # note that top_k_indices is a nested list because of the 2d nature of the matrix
    # the result is flipped
@ -135,15 +135,20 @@ def find_back_element_with_print(select_idx):
        condition_target=condition_target)
    training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
    training_desc_list = train_df.iloc[top_k_indices[0]]['tag_description'].to_list()
    test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list()
-    predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + test_df[test_df.index == select_idx]['p_property']
+    test_desc_list = test_df[test_df.index == select_idx]['tag_description'].to_list()
    predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + ' ' + test_df[test_df.index == select_idx]['p_property']
    predicted_test_data = predicted_test_data.to_list()[0]
    print("*" * 80)
    print("idx:", select_idx)
-    print(training_data_pattern_list)
+    print("train desc", training_desc_list)
-    print(test_data_pattern_list)
+    print("train thing+property", training_data_pattern_list)
-    print(predicted_test_data)
+    print("test desc", test_desc_list)
    print("test thing+property", test_data_pattern_list)
    print("predicted thing+property", predicted_test_data)
    test_pattern = test_data_pattern_list[0]
@ -154,7 +159,7 @@ def find_back_element_with_print(select_idx):
    else:
        return False
-find_back_element_with_print(2884)
+find_back_element_with_print(0)
 # %%
 def find_back_element(select_idx):
@ -194,15 +199,13 @@ for select_idx in error_thing_df.index:
    print("status:", result)
    pattern_in_train.append(result)
 # %%
 sum(pattern_in_train)/len(pattern_in_train)
 ###
 # for error property
 # %%
 pattern_in_train = []
 for select_idx in error_property_df.index:
-    result = find_back_element(select_idx)
+    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
 # %%
--- a/train/classification_bert/.gitignore
+++ b/train/classification_bert/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/train/classification_bert/classification_prediction/predict.py
+++ b/train/classification_bert/classification_prediction/predict.py
@ -0,0 +1,228 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 # %%
 # class vocabulary: every distinct pattern in the full mdm-only export,
 # sorted so that a pattern's list position can be used as its class id
 data_path = '../../../data_import/exports/data_mapping_mdm.csv'
 full_df = pd.read_csv(data_path, skipinitialspace=True)
 mdm_list = sorted(set(full_df['pattern']))
 # %%
 # lookup tables in both directions between class id and pattern
 id2label = dict(enumerate(mdm_list))
 label2id = {pattern_name: class_id for class_id, pattern_name in enumerate(mdm_list)}
 # %%
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df, mdm_list):
    """Convert dataframe rows into ``{'text', 'label'}`` records.

    Args:
        df: DataFrame providing ``tag_description`` and ``pattern`` columns.
        mdm_list: sequence of known patterns; a pattern's position in this
            sequence is its class label.

    Returns:
        One dict per row, in row order. ``text`` is the tag description
        wrapped in ``<DESC>`` markers; ``label`` is the index of the row's
        pattern in ``mdm_list``, or -1 when the pattern is not listed.
    """
    # build the pattern -> index table once instead of scanning mdm_list for
    # every row; setdefault keeps the first position, matching list.index
    label_lookup = {}
    for position, known_pattern in enumerate(mdm_list):
        label_lookup.setdefault(known_pattern, position)
    output_list = []
    for _, row in df.iterrows():
        output_list.append({
            'text': f"<DESC>{row['tag_description']}<DESC>",
            'label': label_lookup.get(row['pattern'], -1),
        })
    return output_list
 def create_dataset(fold, mdm_list):
    """Build the test Dataset for one fold.

    Reads ``group_{fold}/test_all.csv``, keeps the rows selected by the
    ``MDM`` column, renumbers them from 0 and converts them to text/label
    records via ``process_df_to_dict``.
    """
    csv_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    all_rows = pd.read_csv(csv_path, skipinitialspace=True)
    # we only use the mdm subset
    mdm_mask = all_rows['MDM']
    mdm_rows = all_rows[mdm_mask].reset_index(drop=True)
    records = process_df_to_dict(mdm_rows, mdm_list)
    return Dataset.from_list(records)
 # %%
 # function to perform training for a given fold
 # def train(fold):
 fold = 1
 test_dataset = create_dataset(fold, mdm_list)
 # locate the trained checkpoint for this fold
 checkpoint_directory = f'../checkpoint_fold_{fold}'
 # path is usually checkpoint_fold_1/checkpoint-<step number>
 pattern = 'checkpoint-*'
 # sort so the choice is reproducible should more than one directory match
 # NOTE(review): training uses save_total_limit=1 together with
 # load_best_model_at_end=True, which may leave two checkpoints on disk
 # (best and last) - confirm the first match is the one you want to evaluate
 checkpoint_matches = sorted(glob.glob(os.path.join(checkpoint_directory, pattern)))
 if not checkpoint_matches:
    # report the searched location instead of failing with a bare IndexError
    raise FileNotFoundError(
        f"no '{pattern}' entry found under '{checkpoint_directory}'")
 model_checkpoint = checkpoint_matches[0]
 # prepare tokenizer (loaded from the same checkpoint as the model)
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
 # Define additional special tokens
 # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
 # Add the additional special tokens to the tokenizer
 # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
 # %%
 # compute max token length
 # (no truncation, special tokens included; 0 when there are no samples)
 max_length = max(
    (
        len(tokenizer(text, truncation=False, add_special_tokens=True)["input_ids"])
        for text in test_dataset['text']
    ),
    default=0,
 )
 print(max_length)
 # %%
 max_length = 64
 # given a dataset entry, run it through the tokenizer
 def preprocess_function(example):
    """Tokenize the ``text`` field of a dataset entry (or batch of entries).

    Every sequence is padded up to the module-level ``max_length``; the
    tokenizer output is returned unchanged.

    NOTE(review): truncation is not requested, so texts longer than
    ``max_length`` tokens are presumably left over-length - confirm that
    this is intended.
    """
    return tokenizer(
        example['text'],
        max_length=max_length,
        # truncation=True,
        padding='max_length',
    )
 # tokenize the whole test set: map() hands preprocess_function batches of
 # rows (batched=True) using 8 worker processes, and drops the raw "text"
 # column from the result
 datasets = test_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns="text",
 )
 # return only the model inputs and the label, formatted as torch tensors
 datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
 # %% temp
 # tokenized_datasets['train'].rename_columns()
 # %%
 # create data collator
 data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
 # %%
 # compute metrics
 # metric = evaluate.load("accuracy")
 # 
 # 
 # def compute_metrics(eval_preds):
 #     preds, labels = eval_preds
 #     preds = np.argmax(preds, axis=1)
 #     return metric.compute(predictions=preds, references=labels)
 # sequence classifier with one output class per entry in mdm_list
 model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(mdm_list),
    id2label=id2label,
    label2id=label2id)
 # keep the embedding table the same size as the tokenizer vocabulary
 # (matters when tokens were added to the tokenizer)
 model.resize_token_embeddings(len(tokenizer))
 model = model.eval()
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device)
 pred_labels = []
 actual_labels = []
 BATCH_SIZE = 64
 # shuffle=False keeps predictions aligned with the dataset row order
 dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
 for batch in tqdm(dataloader):
    # keep the reference labels as plain ints, the same type as the
    # predictions, so both metric inputs are homogeneous
    actual_labels.extend(batch['label'].tolist())
    # move the model inputs to the same device as the model
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # forward pass only; gradients are not needed for prediction
    with torch.no_grad():
        logits = model(
            input_ids,
            attention_mask).logits
    # highest-scoring class per sample, copied back to host as ints
    pred_labels.extend(logits.argmax(dim=1).tolist())
 # %%
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
 # reference labels and model predictions gathered by the inference loop
 y_true = actual_labels
 y_pred = pred_labels
 # Compute metrics
 # f1 / precision / recall use average='macro': each class contributes
 # equally regardless of how many samples it has
 accuracy = accuracy_score(y_true, y_pred)
 f1 = f1_score(y_true, y_pred, average='macro')
 precision = precision_score(y_true, y_pred, average='macro')
 recall = recall_score(y_true, y_pred, average='macro')
 # Print the results
 # (values are rounded to two decimals)
 print(f'Accuracy: {accuracy:.2f}')
 print(f'F1 Score: {f1:.2f}')
 print(f'Precision: {precision:.2f}')
 print(f'Recall: {recall:.2f}')
 # %%
--- a/train/classification_bert/train.py
+++ b/train/classification_bert/train.py
@ -0,0 +1,211 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 # class vocabulary: every distinct pattern in the full mdm-only export,
 # sorted so that a pattern's list position can be used as its class id
 data_path = '../../data_import/exports/data_mapping_mdm.csv'
 full_df = pd.read_csv(data_path, skipinitialspace=True)
 mdm_list = sorted(set(full_df['pattern']))
 # %%
 # lookup tables in both directions between class id and pattern
 id2label = dict(enumerate(mdm_list))
 label2id = {pattern_name: class_id for class_id, pattern_name in enumerate(mdm_list)}
 # %%
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df, mdm_list):
    """Convert dataframe rows into ``{'text', 'label'}`` records.

    Args:
        df: DataFrame providing ``tag_description`` and ``pattern`` columns.
        mdm_list: sequence of known patterns; a pattern's position in this
            sequence is its class label.

    Returns:
        One dict per row, in row order. ``text`` is the tag description
        wrapped in ``<DESC>`` markers; ``label`` is the index of the row's
        pattern in ``mdm_list``, or -1 when the pattern is not listed.

    NOTE(review): -1 lies outside the ``0 .. len(mdm_list) - 1`` class range;
    confirm the training/validation data never contains unlisted patterns.
    """
    # build the pattern -> index table once instead of scanning mdm_list for
    # every row; setdefault keeps the first position, matching list.index
    label_lookup = {}
    for position, known_pattern in enumerate(mdm_list):
        label_lookup.setdefault(known_pattern, position)
    output_list = []
    for _, row in df.iterrows():
        output_list.append({
            'text': f"<DESC>{row['tag_description']}<DESC>",
            'label': label_lookup.get(row['pattern'], -1),
        })
    return output_list
 def create_split_dataset(fold, mdm_list):
    """Load one fold's train and validation CSVs into a DatasetDict.

    Each CSV is read with pandas and converted to text/label records via
    ``process_df_to_dict``; the result has the splits ``train`` and
    ``validation``.
    """
    csv_paths = {
        'train': f"../../data_preprocess/exports/dataset/group_{fold}/train.csv",
        'validation': f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv",
    }
    # read both files up front, then convert each split
    frames = {
        split: pd.read_csv(path, skipinitialspace=True)
        for split, path in csv_paths.items()
    }
    return DatasetDict({
        split: Dataset.from_list(process_df_to_dict(frame, mdm_list))
        for split, frame in frames.items()
    })
 # %%
 # single-fold training run; the commented-out header below is the start of
 # a per-fold train(fold) function that is not enabled yet
 # def train(fold):
 fold = 1
 # directory name used as the trainer's output_dir for this fold
 save_path = f'checkpoint_fold_{fold}'
 # DatasetDict with 'train' and 'validation' splits of text/label records
 split_datasets = create_split_dataset(fold, mdm_list)
 # prepare tokenizer
 # (base model identifier; the classifier below is loaded from the same one)
 model_checkpoint = "distilbert/distilbert-base-uncased"
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
 # Define additional special tokens
 # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
 # Add the additional special tokens to the tokenizer
 # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
 max_length = 120
 # given a dataset entry, run it through the tokenizer
 def preprocess_function(example):
    """Tokenize the ``text`` field of a dataset entry (or batch of entries).

    Sequences are truncated to the module-level ``max_length`` and padded
    (``padding=True``); the tokenizer output is returned unchanged.
    """
    return tokenizer(
        example['text'],
        max_length=max_length,
        truncation=True,
        padding=True,
    )
 # tokenize every split: map() hands preprocess_function batches of rows
 # (batched=True) using 8 worker processes, and drops the raw "text" column
 # from the result
 tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns="text",
 )
 # %% temp
 # tokenized_datasets['train'].rename_columns()
 # %% temp
 tokenized_datasets['train']['input_ids']
 # %%
 # create data collator
 data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 # %%
 # compute metrics
 metric = evaluate.load("accuracy")
 def compute_metrics(eval_preds):
    """Score arg-max predictions against the reference labels.

    ``eval_preds`` unpacks into (per-class scores, labels); the class with
    the highest score along axis 1 is taken as the prediction and the result
    of the module-level ``metric`` is returned.
    """
    scores, references = eval_preds
    return metric.compute(
        predictions=np.argmax(scores, axis=1),
        references=references,
    )
 # %%
 # create id2label and label2id
 # %%
 # sequence classifier with one output class per entry in mdm_list;
 # id2label / label2id record the class-id <-> pattern mapping in the model config
 model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(mdm_list),
    id2label=id2label,
    label2id=label2id)
 # important! after extending tokens vocab
 # NOTE(review): the add_special_tokens call is currently commented out, so
 # this resize presumably leaves the embedding size unchanged - confirm
 model.resize_token_embeddings(len(tokenizer))
 # model = torch.compile(model, backend="inductor", dynamic=True)
 # %%
 # Trainer
 # evaluation, logging and checkpoint saving all happen once per epoch
 training_args = TrainingArguments(
    output_dir=f"{save_path}",
    eval_strategy="epoch",
    logging_dir="tensorboard-log",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    auto_find_batch_size=False,
    ddp_find_unused_parameters=False,
    weight_decay=0.01,
    # NOTE(review): combined with load_best_model_at_end=True this may still
    # leave two checkpoints on disk (best and last) - confirm before relying
    # on "exactly one checkpoint" downstream
    save_total_limit=1,
    num_train_epochs=40,
    bf16=True,
    push_to_hub=False,
    remove_unused_columns=False,
 )
 # model and training_args are passed positionally, everything else by keyword
 trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # early stopping is currently disabled; training runs all num_train_epochs
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
 )
 # uncomment to load training from checkpoint
 # checkpoint_path = 'default_40_1/checkpoint-5600'
 # trainer.train(resume_from_checkpoint=checkpoint_path)
 # start training from scratch
 trainer.train()
 # # execute training
 # for fold in [1,2,3,4,5]:
 #     print(fold)
 #     train(fold)
 # %%
--- a/train/mapping_baseline/mapping_prediction/inference.py
+++ b/train/mapping_baseline/mapping_prediction/inference.py
@ -90,7 +90,7 @@ class Inference():
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        # create dataloader
-        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+        self.dataloader = DataLoader(datasets, batch_size=batch_size, shuffle=False)
    def generate(self):