Feat: implement find-back for analysis in find_closest.py

Feat: implement bert classification
Richard Wong 2024-11-08 20:50:41 +09:00
parent 22429ea536
commit 59bbf1f403
5 changed files with 455 additions and 11 deletions

@@ -107,7 +107,7 @@ def find_closest(cos_sim_matrix, condition_source, condition_target):
subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
# we select top k here
# Get the indices of the top 5 maximum values along axis 1
top_k = 5
top_k = 3
top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values
# note that top_k_indices is a nested list because of the 2d nature of the matrix
# the result is flipped
@@ -135,15 +135,20 @@ def find_back_element_with_print(select_idx):
condition_target=condition_target)
training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
training_desc_list = train_df.iloc[top_k_indices[0]]['tag_description'].to_list()
test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list()
predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + test_df[test_df.index == select_idx]['p_property']
test_desc_list = test_df[test_df.index == select_idx]['tag_description'].to_list()
predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + ' ' + test_df[test_df.index == select_idx]['p_property']
predicted_test_data = predicted_test_data.to_list()[0]
print("*" * 80)
print("idx:", select_idx)
print(training_data_pattern_list)
print(test_data_pattern_list)
print(predicted_test_data)
print("train desc", training_desc_list)
print("train thing+property", training_data_pattern_list)
print("test desc", test_desc_list)
print("test thing+property", test_data_pattern_list)
print("predicted thing+property", predicted_test_data)
test_pattern = test_data_pattern_list[0]
@@ -154,7 +159,7 @@ def find_back_element_with_print(select_idx):
else:
return False
find_back_element_with_print(2884)
find_back_element_with_print(0)
# %%
def find_back_element(select_idx):
@@ -194,15 +199,13 @@ for select_idx in error_thing_df.index:
print("status:", result)
pattern_in_train.append(result)
# %%
sum(pattern_in_train)/len(pattern_in_train)
###
# for error property
# %%
pattern_in_train = []
for select_idx in error_property_df.index:
result = find_back_element(select_idx)
result = find_back_element_with_print(select_idx)
print("status:", result)
pattern_in_train.append(result)
# %%

train/classification_bert/.gitignore (new file, +2 lines)

@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log

@@ -0,0 +1,228 @@
# %%
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from torch.utils.data import DataLoader
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from tqdm import tqdm
torch.set_float32_matmul_precision('high')
# %%
# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['pattern']))))
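# sorting the de-duplicated patterns gives a deterministic label order,
# so the id2label/label2id mappings below line up with the ones used at training time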
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
id2label[idx] = val
label2id[val] = idx
# %%
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df, mdm_list):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
pattern = row['pattern']
try:
index = mdm_list.index(pattern)
except ValueError:
index = -1
element = {
'text' : f"{desc}",
'label': index,
}
output_list.append(element)
return output_list
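# a label of -1 marks a pattern that is missing from mdm_list; the MDM-only filter
# applied below should keep that from happening, but it avoids a hard .index() crash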
def create_dataset(fold, mdm_list):
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
test_df = pd.read_csv(data_path, skipinitialspace=True)
# we only use the mdm subset
test_df = test_df[test_df['MDM']].reset_index(drop=True)
test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
return test_dataset
# %%
# function to perform training for a given fold
# def train(fold):
fold = 1
test_dataset = create_dataset(fold, mdm_list)
# prepare tokenizer
checkpoint_directory = f'../checkpoint_fold_{fold}'
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
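# training keeps a single checkpoint (save_total_limit=1), so taking the first
# glob match is assumed to pick up the right directory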
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# %%
# compute max token length
max_length = 0
for sample in test_dataset['text']:
# Tokenize the sample and get the length
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
length = len(input_ids)
# Update max_length if this sample is longer
if length > max_length:
max_length = length
print(max_length)
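# the measured maximum is only printed for inspection; a fixed max_length is set below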
# %%
max_length = 64
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
# truncation=True,
padding='max_length'
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
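# set_format exposes these columns as torch tensors so the Dataset can be fed
# directly to a torch DataLoader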
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
# %%
# compute metrics
# metric = evaluate.load("accuracy")
#
#
# def compute_metrics(eval_preds):
# preds, labels = eval_preds
# preds = np.argmax(preds, axis=1)
# return metric.compute(predictions=preds, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
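# with the special-token block above commented out this resize is effectively a no-op,
# but it keeps the script consistent with the training setup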
model = model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
pred_labels = []
actual_labels = []
BATCH_SIZE = 64
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
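# shuffle=False keeps predictions aligned with the row order of test_df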
for batch in tqdm(dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
actual_labels.extend(batch['label'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Perform inference
with torch.no_grad():
logits = model(
input_ids,
attention_mask).logits
predicted_class_ids = logits.argmax(dim=1).to("cpu")
pred_labels.extend(predicted_class_ids)
pred_labels = [tensor.item() for tensor in pred_labels]
# %%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
y_true = actual_labels
y_pred = pred_labels
# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
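# macro averaging weights every class equally, regardless of how many samples it has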
# Print the results
print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
# %%

@@ -0,0 +1,211 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['pattern']))))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
id2label[idx] = val
label2id[val] = idx
# %%
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df, mdm_list):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
pattern = row['pattern']
try:
index = mdm_list.index(pattern)
except ValueError:
index = -1
element = {
'text' : f"{desc}",
'label': index,
}
output_list.append(element)
return output_list
def create_split_dataset(fold, mdm_list):
# train
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# valid
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
validation_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
})
return combined_data
# %%
# function to perform training for a given fold
# def train(fold):
fold = 1
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold, mdm_list)
# prepare tokenizer
model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
truncation=True,
padding=True
)
return model_inputs
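# padding=True only pads within each map() batch; the data collator below re-pads
# every training batch to its longest sequence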
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
# %% temp
# tokenized_datasets['train'].rename_columns()
# %% temp
tokenized_datasets['train']['input_ids']
# %%
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# %%
# compute metrics
metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
preds, labels = eval_preds
preds = np.argmax(preds, axis=1)
return metric.compute(predictions=preds, references=labels)
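# eval_preds carries raw logits, so argmax over the class axis recovers the
# predicted ids before handing them to the accuracy metric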
# %%
# create id2label and label2id
# %%
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
# model = torch.compile(model, backend="inductor", dynamic=True)
# %%
# Trainer
training_args = TrainingArguments(
output_dir=f"{save_path}",
eval_strategy="epoch",
logging_dir="tensorboard-log",
logging_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
learning_rate=2e-5,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
bf16=True,
push_to_hub=False,
remove_unused_columns=False,
)
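# save_total_limit=1 with load_best_model_at_end=True is what the prediction script
# relies on when it assumes a single surviving checkpoint under checkpoint_fold_{fold}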
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# # execute training
# for fold in [1,2,3,4,5]:
# print(fold)
# train(fold)
# %%

@@ -90,7 +90,7 @@ class Inference():
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# create dataloader
self.dataloader = DataLoader(datasets, batch_size=batch_size)
self.dataloader = DataLoader(datasets, batch_size=batch_size, shuffle=False)
def generate(self):