# Goal: end to end inference and evaluation

given a csv, make predictions and evaluate predictions, then return results in a csv

In [1]:
import pandas as pd
import os
import json

with open("mode.json", "r") as json_file:
    mode_dict = json.load(json_file)

mode = mode_dict.get("mode", "none")
model_name = mode_dict.get("model", "none")
train_epochs = mode_dict.get("train_epochs", "none")
fold_group = mode_dict.get("fold_group", "none")

base_dir = f"train_{fold_group}_{model_name}_{mode}_{train_epochs}"
checkpoints = [d for d in os.listdir(base_dir) if d.startswith("checkpoint-")]

model_checkpoint = os.path.join(base_dir, checkpoints[0]) if checkpoints else None

data_path = f"../../data_preprocess/dataset/{fold_group}/test.csv"

try:
    df = pd.read_csv(data_path)
except UnicodeDecodeError:
    df = pd.read_csv(data_path, encoding='ISO-8859-1')

df = df.dropna(subset=['tag_description']).reset_index(drop=True)

df_org = df.copy()
df[['thing', 'property', 'tag_description', 'min', 'max', 'MDM', 'pattern']] = df[['thing', 'property', 'tag_description', 'min', 'max', 'MDM', 'pattern']].astype("string")

from datasets import Dataset

def process_df(df, mode='only_td'):
    output_list = []
    for _, row in df.iterrows():
        try:
            if mode == 'only_td':
                input_str = f"<TD_START>{str(row['tag_description'])}<TD_END>"
            elif mode == 'tn_td':
                input_str = f"<TN_START>{str(row['tag_name'])}<TN_END><TD_START>{str(row['tag_description'])}<TD_END>"
            elif mode == 'tn_td_min_max':
                input_str = f"<TN_START>{str(row['tag_name'])}<TN_END><TD_START>{str(row['tag_description'])}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>"
            elif mode == 'td_min_max':
                input_str = f"<TD_START>{str(row['tag_description'])}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>"
            elif mode == 'td_unit':
                input_str = f"<TD_START>{str(row['tag_description'])}<TD_END><UNIT_START>{str(row['unit'])}<UNIT_END>"
            elif mode == 'tn_td_unit':
                input_str = f"<TN_START>{str(row['tag_name'])}<TN_END><TD_START>{str(row['tag_description'])}<TD_END><UNIT_START>{str(row['unit'])}<UNIT_END>"
            elif mode == 'td_min_max_unit':
                input_str = f"<TD_START>{str(row['tag_description'])}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END><UNIT_START>{str(row['unit'])}<UNIT_END>"
            else:
                raise ValueError("Invalid mode specified")

            output_list.append({
                'translation': {
                    'ships_idx': row['ships_idx'],
                    'input': input_str,
                    'thing_property': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                    'answer_thing': row['thing'],
                    'answer_property': row['property'],
                    'MDM': row['MDM'],
                }
            })
        except Exception as e:
            print(f"Error processing row: {e}")
    return output_list

processed_data = process_df(df, mode=mode)
test_dataset = Dataset.from_list(processed_data)
print(f"The test_dataset contains {len(test_dataset)} items.")

from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline, BartTokenizer
from tqdm import tqdm

# Use BartTokenizer for BART inference
tokenizer = BartTokenizer.from_pretrained(model_name, return_tensors="pt")
additional_special_tokens = [
    "<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", 
    "<TN_START>", "<TN_END>", "<TD_START>", "<TD_END>", 
    "<MIN_START>", "<MIN_END>", "<MAX_START>", "<MAX_END>", 
    "<UNIT_START>", "<UNIT_END>"
]
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

# Use BART model for inference
pipe = pipeline("text2text-generation", model=model_checkpoint, tokenizer=tokenizer, return_tensors=True, max_length=128, device=0)

# Check what token-ids the special tokens are
thing_start_id = tokenizer.convert_tokens_to_ids("<THING_START>")
thing_end_id = tokenizer.convert_tokens_to_ids("<THING_END>")
property_start_id = tokenizer.convert_tokens_to_ids("<PROPERTY_START>")
property_end_id = tokenizer.convert_tokens_to_ids("<PROPERTY_END>")

def extract_seq(tokens, start_value, end_value):
    if start_value in tokens and end_value in tokens:
        return tokens[tokens.index(start_value)+1:tokens.index(end_value)]
    return None

def extract_seq_from_output(output):
    tokens = output[0]["generated_token_ids"].tolist()
    p_thing = tokenizer.decode(extract_seq(tokens, thing_start_id, thing_end_id)) if thing_start_id in tokens and thing_end_id in tokens else None
    p_property = tokenizer.decode(extract_seq(tokens, property_start_id, property_end_id)) if property_start_id in tokens and property_end_id in tokens else None
    return p_thing, p_property

# Inference and storing predictions
p_thing_list = []
p_property_list = []
print("Making inference on test set")

# Process the test set through the pipeline and generate predictions
for out in tqdm(pipe(KeyDataset(test_dataset["translation"], "input"), batch_size=256)):
    p_thing, p_property = extract_seq_from_output(out)
    p_thing_list.append(p_thing)
    p_property_list.append(p_property)

print("Inference done.")


The test_dataset contains 12938 items.
Making inference on test set


12938it [02:37, 82.28it/s]                    

Inference done.





In [2]:
answer_thing = [item['answer_thing'] for item in test_dataset["translation"]]
answer_property = [item['answer_property'] for item in test_dataset["translation"]]
mdm_list = [item['MDM'] for item in test_dataset["translation"]]

mdm_count = 0
for i in range(len(mdm_list)):
    if(mdm_list[i] == "True"):mdm_count = mdm_count + 1    

def correctness_test(input, reference, mdm_list):
    assert(len(input) == len(reference))
    correctness_list = []
    for i in range(len(input)):
        if(mdm_list[i] == "True"):
            correctness_list.append(input[i] == reference[i])
        else:correctness_list.append(False)
    return correctness_list

# Compare with answer to evaluate correctness
thing_correctness = correctness_test(p_thing_list, answer_thing, mdm_list)
property_correctness = correctness_test(p_property_list, answer_property, mdm_list)

correctness_mdm = []
for i in range(len(mdm_list)):
    if(thing_correctness[i] & property_correctness[i]):
        correctness_mdm.append(True)
    else:            
        correctness_mdm.append(False)
        
    
# Calculate accuracy
thing_accuracy = sum(thing_correctness) / mdm_count
property_accuracy = sum(property_correctness) / mdm_count
total_accuracy = sum(correctness_mdm) / mdm_count

# Count True/False values
thing_true_count = thing_correctness.count(True)
thing_false_count = 0
for i in range(len(thing_correctness)):
    if mdm_list[i] == "True" and thing_correctness[i] == False:
        thing_false_count += 1

property_true_count = property_correctness.count(True)
property_false_count = property_correctness.count(False)
total_true_count = correctness_mdm.count(True)
total_false_count = mdm_count - correctness_mdm.count(True)

# Print results
print("Thing prediction accuracy:", thing_accuracy)
print(f"Correct thing predictions: {thing_true_count}, Incorrect thing predictions: {thing_false_count}")
print("Property prediction accuracy:", property_accuracy)
print(f"Correct property predictions: {property_true_count}, Incorrect property predictions: {property_false_count}")
print("total accuracy:", total_accuracy)
print(f"Correct total predictions: {total_true_count}, Incorrect total predictions: {total_false_count}")

# Create a DataFrame with the results
dict = {
    'p_thing': p_thing_list,
    'p_property': p_property_list,
    'p_thing_correct': thing_correctness,
    'p_property_correct': property_correctness
}

df_pred = pd.DataFrame(dict)

# Read the mode from the JSON file
with open("mode.json", "r") as json_file:
    mode_dict = json.load(json_file)

# Add the model key to the dictionary
mode_dict["model"] = model_name
mode_dict["train_epochs"] = train_epochs

# Save the updated dictionary back to the JSON file
with open("mode.json", "w") as json_file:
    json.dump(mode_dict, json_file)


# Check if the file exists and is not empty
if os.path.exists("results.json") and os.path.getsize("results.json") > 0:
    # Read the existing results.json file
    with open("results.json", "r") as json_file:
        try:
            results_dict = json.load(json_file)
        except json.JSONDecodeError:
            results_dict = {}
else:
    results_dict = {}

# Add the new model_checkpoint key with the accuracy values as an object

model_key = model_checkpoint 

results_dict[model_key] = {
    "thing_accuracy": thing_accuracy,
    "thing_true": thing_true_count,
    "thing_false": thing_false_count,
    "property_accuracy": property_accuracy,
    "property_true": property_true_count,
    "property_false": property_false_count,
    "total_accuracy": total_accuracy,
    "total_true": total_true_count,
    "total_false": total_false_count    
}

# Save the updated dictionary back to the results.json file
with open("results.json", "w") as json_file:
    json.dump(results_dict, json_file, indent=4)

Thing prediction accuracy: 0.9793861658268438
Correct thing predictions: 2138, Incorrect thing predictions: 45
Property prediction accuracy: 0.9752633989922126
Correct property predictions: 2129, Incorrect property predictions: 10809
total accuracy: 0.9601465872652314
Correct total predictions: 2096, Incorrect total predictions: 87
