# Goal: end to end inference and evaluation

given a csv, make predictions and evaluate predictions, then return results in a csv

In [1]:
import pandas as pd
import os
import json

# Read the mode from the JSON file
with open("mode.json", "r") as json_file:
    mode_dict = json.load(json_file)


# Set the mode variable from the JSON content
mode = mode_dict.get("mode", "none")  # 'default_value' is a fallback if 'mode' is not found
model_name = mode_dict.get("model", "none")  # 'default_value' is a fallback if 'mode' is not found
train_epochs = mode_dict.get("train_epochs", "none")  # 'default_value' is a fallback if 'mode' is not found
fold_group = mode_dict.get("fold_group", "none")  # 'default_value' is a fallback if 'mode' is not found

print(f"The mode has been set to: {mode} {model_name}")

# Define the base directory where checkpoints are stored
base_dir = f"train_{fold_group}_{model_name}_{mode}_{train_epochs}"

# List all subdirectories in the base directory
subdirectories = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]

# Filter for checkpoint directories that match the pattern "checkpoint-"
checkpoints = [d for d in subdirectories if d.startswith("checkpoint-")]

# Select the latest checkpoint (the one with the highest number)
if checkpoints:
    latest_checkpoint = checkpoints[0]
    model_checkpoint = os.path.join(base_dir, latest_checkpoint)
    print(f"Using model checkpoint: {model_checkpoint}")
else:
    print("No checkpoints were found.")
    model_checkpoint = None  # Handle this case as needed

# Load the data
data_path = f"../../data_preprocess/dataset/{fold_group}/test.csv"  # Adjust the CSV file path as necessary

try:
    df = pd.read_csv(data_path)
except UnicodeDecodeError:
    df = pd.read_csv(data_path, encoding='ISO-8859-1')


# Drop rows where 'tag_description' is NaN and reset the index
df = df.dropna(subset=['tag_description']).reset_index(drop=True)

# Preserve df_org
df_org = df.copy()

# Print the column names of df_org
print("Columns in df_org:")
print(df_org.columns.tolist())

selected_columns = ['thing', 'property', 'tag_description', 'min', 'max', 'MDM', 'pattern']
df[selected_columns] = df[selected_columns].astype("string")


The mode has been set to: tn_td_unit t5-base
Using model checkpoint: train_1_t5-base_tn_td_unit_80/checkpoint-3840
Columns in df_org:
['thing', 'property', 'ships_idx', 'tag_name', 'tag_description', 'signal_type', 'min', 'max', 'unit', 'data_type', 'thing_pattern', 'property_pattern', 'pattern', 'MDM', 'org_tag_description']


In [2]:
from datasets import Dataset

def process_df(df, mode='only_td'):
    output_list = []
    for _, row in df.iterrows():
        try:
            if mode == 'only_td':
                input_str = f"<TD_START>{str(row['tag_description'])}<TD_END>"
            elif mode == 'tn_td':
                input_str = f"<TN_START>{str(row['tag_name'])}<TN_END><TD_START>{str(row['tag_description'])}<TD_END>"
            elif mode == 'tn_td_min_max':
                input_str = f"<TN_START>{str(row['tag_name'])}<TN_END><TD_START>{str(row['tag_description'])}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>"
            elif mode == 'td_min_max':
                input_str = f"<TD_START>{str(row['tag_description'])}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>"  
            elif mode == 'td_unit':
                input_str = f"<TD_START>{str(row['tag_description'])}<TD_END><UNIT_START>{str(row['unit'])}<UNIT_END>"            
            elif mode == 'tn_td_unit':
                input_str = f"<TN_START>{str(row['tag_name'])}<TN_END><TD_START>{str(row['tag_description'])}<TD_END><UNIT_START>{str(row['unit'])}<UNIT_END>"      
            else:
                raise ValueError("Invalid mode specified")

            output_list.append({
                'translation': {
                    'ships_idx': row['ships_idx'],
                    'input': input_str,
                    'thing_property': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                    'answer_thing': f"{row['thing']}",
                    'answer_property': f"{row['property']}",
                    'MDM': f"{row['MDM']}",
                }
            })
        except Exception as e:
            print(f"Error processing row: {row}")
            print(f"Exception: {e}")
    return output_list


# Process the DataFrame
processed_data = process_df(df, mode=mode)

# Create a Dataset object
test_dataset = Dataset.from_list(processed_data)

# Print the number of items in the dataset
print(f"The test_dataset contains {len(test_dataset)} items.")


The test_dataset contains 14718 items.


In [3]:
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline
from tqdm import tqdm
import os
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors="pt")
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>"]
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<TN_START>", "<TN_END>", "<TD_START>", "<TD_END>", "<MIN_START>", "<MIN_END>", "<MAX_START>", "<MAX_END>", "<UNIT_START>", "<UNIT_END>"]

# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# tokenizer.add_special_tokens({'sep_token': "<SEP>"})


pipe = pipeline("translation_XX_to_YY", model=model_checkpoint, tokenizer=tokenizer, return_tensors=True, max_length=128, device=0)

# check what token-ids the special tokens are
# tokenizer.encode("<THING_START><THING_END><PROPERTY_START><PROPERTY_END>")

def extract_seq(tokens, start_value, end_value):
    if start_value not in tokens or end_value not in tokens:
        return None  # Or handle this case according to your requirements
    start_id = tokens.index(start_value)
    end_id = tokens.index(end_value)

    return tokens[start_id+1:end_id]

# problem, what if end tokens are not in?
def process_tensor_output(output):
    tokens = output[0]['translation_token_ids'].tolist()
    thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END>
    property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
    p_thing = None
    p_property = None
    if (thing_seq is not None):
        p_thing =  tokenizer.decode(thing_seq)
    if (property_seq is not None):
        p_property =  tokenizer.decode(property_seq)
    return p_thing, p_property

In [4]:
p_thing_list = []
p_property_list = []
print("making inference on test set")
for out in tqdm(pipe(KeyDataset(test_dataset["translation"], "input"), batch_size=256)):
    p_thing, p_property = process_tensor_output(out)
    p_thing_list.append(p_thing)
    p_property_list.append(p_property)
print("inference done")

making inference on test set


14718it [00:44, 330.24it/s]                   

inference done





In [5]:
answer_thing = [item['answer_thing'] for item in test_dataset["translation"]]
answer_property = [item['answer_property'] for item in test_dataset["translation"]]
mdm_list = [item['MDM'] for item in test_dataset["translation"]]

mdm_count = 0
for i in range(len(mdm_list)):
    if(mdm_list[i] == "True"):mdm_count = mdm_count + 1    

def correctness_test(input, reference, mdm_list):
    assert(len(input) == len(reference))
    correctness_list = []
    for i in range(len(input)):
        if(mdm_list[i] == "True"):
            correctness_list.append(input[i] == reference[i])
        else:correctness_list.append(False)
    return correctness_list

# Compare with answer to evaluate correctness
thing_correctness = correctness_test(p_thing_list, answer_thing, mdm_list)
property_correctness = correctness_test(p_property_list, answer_property, mdm_list)

correctness_mdm = []
for i in range(len(mdm_list)):
    if(thing_correctness[i] & property_correctness[i]):
        correctness_mdm.append(True)
    else:            
        correctness_mdm.append(False)
        
    
# Calculate accuracy
thing_accuracy = sum(thing_correctness) / mdm_count
property_accuracy = sum(property_correctness) / mdm_count
total_accuracy = sum(correctness_mdm) / mdm_count

# Count True/False values
thing_true_count = thing_correctness.count(True)
thing_false_count = 0
for i in range(len(thing_correctness)):
    if mdm_list[i] == "True" and thing_correctness[i] == False:
        thing_false_count += 1

property_true_count = property_correctness.count(True)
property_false_count = property_correctness.count(False)
total_true_count = correctness_mdm.count(True)
total_false_count = mdm_count - correctness_mdm.count(True)

# Print results
print("Thing prediction accuracy:", thing_accuracy)
print(f"Correct thing predictions: {thing_true_count}, Incorrect thing predictions: {thing_false_count}")
print("Property prediction accuracy:", property_accuracy)
print(f"Correct property predictions: {property_true_count}, Incorrect property predictions: {property_false_count}")
print("total accuracy:", total_accuracy)
print(f"Correct total predictions: {total_true_count}, Incorrect total predictions: {total_false_count}")

# Create a DataFrame with the results
dict = {
    'p_thing': p_thing_list,
    'p_property': p_property_list,
    'p_thing_correct': thing_correctness,
    'p_property_correct': property_correctness
}

df_pred = pd.DataFrame(dict)

# Read the mode from the JSON file
with open("mode.json", "r") as json_file:
    mode_dict = json.load(json_file)

# Add the model key to the dictionary
mode_dict["model"] = model_name
mode_dict["train_epochs"] = train_epochs

# Save the updated dictionary back to the JSON file
with open("mode.json", "w") as json_file:
    json.dump(mode_dict, json_file)


# Check if the file exists and is not empty
if os.path.exists("results.json") and os.path.getsize("results.json") > 0:
    # Read the existing results.json file
    with open("results.json", "r") as json_file:
        try:
            results_dict = json.load(json_file)
        except json.JSONDecodeError:
            results_dict = {}
else:
    results_dict = {}

# Add the new model_checkpoint key with the accuracy values as an object

model_key = model_checkpoint 

results_dict[model_key] = {
    "thing_accuracy": thing_accuracy,
    "thing_true": thing_true_count,
    "thing_false": thing_false_count,
    "property_accuracy": property_accuracy,
    "property_true": property_true_count,
    "property_false": property_false_count,
    "total_accuracy": total_accuracy,
    "total_true": total_true_count,
    "total_false": total_false_count    
}

# Save the updated dictionary back to the results.json file
with open("results.json", "w") as json_file:
    json.dump(results_dict, json_file, indent=4)

Thing prediction accuracy: 0.9895314057826521
Correct thing predictions: 1985, Incorrect thing predictions: 21
Property prediction accuracy: 0.9661016949152542
Correct property predictions: 1938, Incorrect property predictions: 12780
total accuracy: 0.9596211365902293
Correct total predictions: 1925, Incorrect total predictions: 81


In [7]:
import os

# Create a DataFrame with the results
df_pred = pd.DataFrame({
    'p_thing': p_thing_list,
    'p_property': p_property_list,
    'p_thing_correct': thing_correctness,
    'p_property_correct': property_correctness,
})

# Merge predictions with the original DataFrame (df_org)
df_org['p_thing'] = df_pred['p_thing']
df_org['p_property'] = df_pred['p_property']
df_org['p_thing_correct'] = df_pred['p_thing_correct']
df_org['p_property_correct'] = df_pred['p_property_correct']
df_org['p_correct'] = df_pred['p_thing_correct'] & df_org['p_property_correct']

df_master = pd.read_csv('../../data_import/data_model_master_export.csv')

df_org['pattern'] = df_org['thing'].str.replace(r'\d', '#', regex=True) + " " + df_org['property'].str.replace(r'\d', '#', regex=True)
df_org['p_pattern'] = df_org['p_thing'].str.replace(r'\d', '#', regex=True) + " " + df_org['p_property'].str.replace(r'\d', '#', regex=True)
df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']

# Create a set of unique patterns from master for fast lookup
master_patterns = set(df_master['master_pattern'])
df_org['p_MDM'] =  df_org['p_pattern'].apply(lambda x: x in master_patterns)


output_path = f"../0.result/{fold_group}/test_p.csv"
debug_output_path = f"0.dresult/{fold_group}/test_p.csv"

# 폴더가 없으면 생성
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_org.to_csv(output_path, index=False, encoding='utf-8-sig')

os.makedirs(os.path.dirname(debug_output_path), exist_ok=True)
df_org.to_csv(debug_output_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_path}")

Updated data saved to ../0.result/1/test_p.csv
