added more augmentations to finally beat SOTA
- class_bert_augmentation is now the reference training code

This commit is contained in:
parent e90bc69ea9
commit 5312cfa06f
@@ -0,0 +1,41 @@
# %%
import random
import string


def corrupt_word(word):
    """Corrupt a single word using random corruption techniques."""
    if len(word) <= 1:  # Skip corruption for single-character words
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "delete":
        # Randomly delete a character
        idx = random.randint(0, len(word) - 1)
        word = word[:idx] + word[idx + 1:]

    elif corruption_type == "swap":
        # Swap two adjacent characters
        if len(word) > 1:
            idx = random.randint(0, len(word) - 2)
            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])

    return word


def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word in the string with a given probability."""
    words = sentence.split()
    corrupted_words = [
        corrupt_word(word) if random.random() < corruption_probability else word
        for word in words
    ]
    return " ".join(corrupted_words)


# Example usage
sentence = "This is a simple string for testing"
corrupted_sentence = corrupt_string(sentence, corruption_probability=0.1)
print("Original:", sentence)
print("Corrupted:", corrupted_sentence)

# %%
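# %%
# A minimal reproducibility check (illustrative sketch, not part of the pipeline):
# corrupt_string draws from Python's global `random` state, so seeding it makes
# the corruption deterministic across runs.
import random

random.seed(0)
first = corrupt_string("switching to deterministic corruption", corruption_probability=0.5)
random.seed(0)
second = corrupt_string("switching to deterministic corruption", corruption_probability=0.5)
assert first == second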
@@ -1,95 +0,0 @@
# %%
import json
import pandas as pd

##########################################
# %%

# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Initialize an empty list to store the rows
rows = []

# %%
# Loop through all entities in the JSON
for entity in data["data"].items():
    entity_data = entity[1]
    entity_id = entity_data['entity_id']
    entity_name = entity_data['entity_name']
    entity_type_id = entity_data['entity_type_id']
    entity_type_name = entity_data['entity_type_name']

    # Add each mention and its entity_id to the rows list
    rows.append(
        {
            'id': entity_id,
            'name': entity_name,
            'type_id': entity_type_id,
            'type_name': entity_type_name
        })

# Create a DataFrame from the rows
df = pd.DataFrame(rows)

# %%
# df.to_csv('entity.csv', index=False)
df

# %%
df['type_name'].value_counts()
# %%
df['type_id'].value_counts()

# %%
name_list = df['name'].to_list()
# %%
name_list

# %%
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import numpy as np

# %%
# Define labels
labels = name_list

# Create a prefix-based distance matrix
def prefix_distance(label1, label2):
    prefix1 = label1.split()
    prefix2 = label2.split()
    # Find common prefix length
    common_prefix_length = len([w1 for w1, w2 in zip(prefix1, prefix2) if w1 == w2])
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)

# Create a pairwise distance matrix
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        distance_matrix[i, j] = prefix_distance(labels[i], labels[j])

# Perform hierarchical clustering
linkage_matrix = linkage(distance_matrix, method='average')

# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()

# %%
linkage_matrix
# %%
# Extract flat clusters with a distance threshold
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# Display clusters
for i, cluster_id in enumerate(clusters):
    print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")

# %%
@@ -3,53 +3,55 @@ import pandas as pd

 # %%
 # import training file
-data_path = '../data_import/train.csv'
+data_path = '../esAppMod_data_import/train.csv'
+# data_path = '../esAppMod_data_import/parent_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)


 # import test file
-data_path = '../data_import/test.csv'
+data_path = '../esAppMod_data_import/test.csv'
+# data_path = '../esAppMod_data_import/parent_test.csv'
 test_df = pd.read_csv(data_path, skipinitialspace=True)

 # import entity file
-data_path = '../data_import/entity.csv'
+data_path = '../esAppMod_data_import/entity.csv'
 entity_df = pd.read_csv(data_path, skipinitialspace=True)
 id2label = {}
 for _, row in entity_df.iterrows():
     id2label[row['id']] = row['name']

 # %%
 train_df.sort_values(by=['entity_id']).to_markdown('out.md')

 # %%
-data_path = '../train/class_bert_process/prediction/exports/result.csv'
+data_path = '../train/class_bert_augmentation/prediction/exports/result.csv'
 prediction_df = pd.read_csv(data_path)

 # %%
 predicted_entity_list = []
 for element in prediction_df['class_prediction']:
     predicted_entity_list.append(id2label[element])

 prediction_df['predicted_name'] = predicted_entity_list
 # %%
 new_df = pd.concat((test_df, prediction_df), axis=1)

 # %%
 mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
 mismatch_df = new_df[mismatch_mask]

 # %%
 len(mismatch_df)

 # %%
 # print the top 10 offending classes
+# mask1 = mismatch_df['entity_id'] != 434
+# mask2 = mismatch_df['entity_id'] != 451
+# mask3 = mismatch_df['entity_id'] != 452
+# mask = mask1 & mask2 & mask3
+# masked_df = mismatch_df[mask]
+# print(masked_df['entity_id'].value_counts()[:10])
 print(mismatch_df['entity_id'].value_counts()[:10])
+masked_df = mismatch_df


 # %%
 # Convert the whole dataframe to a string and display it
 # print the mismatch_df
-print(mismatch_df.sort_values(by=['entity_id']).to_markdown())
+print(masked_df.sort_values(by=['entity_id']).to_markdown())

 # %%
 mismatch_df.to_csv('error.csv')

@@ -62,14 +64,9 @@ mismatch_df[select_mask]

 # %%
 # let us see the train mentions
-select_value = 452
+select_value = 130
 select_mask = train_df['entity_id'] == select_value
 train_df[select_mask]


-# %%
-mismatch_df[select_mask]['class_prediction'].to_list()
-
-# %%

 # %%
@@ -0,0 +1,62 @@
# %%
import pandas as pd
import re

# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)


# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

# import entity file
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {}
for _, row in entity_df.iterrows():
    id2label[row['id']] = row['name']


# %%
train_df
# %%

def extract_acronym_mapping(names):
    mapping = {}
    for name in names:
        # Find acronym in parentheses
        match = re.search(r"\((\w+)\)", name)
        if match:
            acronym = match.group(1)

            # Remove unrelated prepended terms
            core_term = re.sub(r"^([\w\s]+)\s*\(\w+\)$", r"\1", name).strip()

            # Add to dictionary
            mapping[acronym] = core_term
    return mapping

names = set(train_df['entity_name'].to_list())

# Extract mappings
acronym_mapping = extract_acronym_mapping(names)
print(acronym_mapping)
# %%
del acronym_mapping['E']  # too many false matches
acronym_mapping = {key.lower(): value.lower() for key, value in acronym_mapping.items()}

abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}

# %%
abbrev_to_term
# %%
term_to_abbrev

# %%
acronym_mapping
# %%
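# %%
# A minimal sketch of what extract_acronym_mapping produces; the entity name
# below is a hypothetical example, not taken from the training data:
# extract_acronym_mapping({'Customer Information Control System (CICS)'})
# -> {'CICS': 'Customer Information Control System'}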
@@ -0,0 +1,5 @@
out.md
parent_test.csv
parent_train.csv
test_seq.csv
train_seq.csv
@@ -0,0 +1,124 @@
# %%
import json
import pandas as pd

##########################################
# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# import entity file
# Keep only one row per unique entity name
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
id2label = {}
for _, row in unique_df.iterrows():
    id2label[row['entity_id']] = row['entity_name']

inverse_dict = {value: key for key, value in id2label.items()}
# %%
# Create a new dictionary with sorted keys
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}

# %%
sorted_dict

# %%
rule_set = {
    '.NET': [497,482,484,487,485,486,483],
    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
    'C++': [583,306],
    'CA': [290,22,23,24,25],
    'CSS': [307,377],
    'Cisco': [28,420,29],
    'Citrix': [563,565,31,292,291,564,32,30],
    'coldfusion': [311,37],
    'eclipse': [46,622,641,456],
    'xml': [596, 318],
    'xsl': [319,320],
    'HP': [59,293,60,61,58],
    'http': [505,543],
    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
    'IBM BigFix': [62,457],
    'IBM ILOG': [253,255,254,256,252],
    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
    'IBM WebSphere': [80,82,83,81],
    'IBM i': [424,329],
    'IDMS': [667,668],
    'IIS': [609,490,489,491],
    'JBoss': [268,492,493],
    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
    'KVS': [549,550,551],
    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
    'MVS': [577,440,441],
    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
    'Oracle WebLogic': [600,233],
    'Oracle Application Server': [610,494],
    'Oracle Database': [134,474,475,478],
    'Oracle Hyperion': [607,138,139],
    'Oracle WebCenter': [276,495],
    'Pascal': [599,346],
    'Perl': [585,348,417,349],
    'ProjectWise': [161,162],
    'Rational': [166,167],
    'SAP': [173,175,695,176,676,178,179],
    'SAP ERP': [174,476,477],
    'SAP NetWeaver': [279,496,177],
    'Sybase SQL Server': [190,479,480],
    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
    'TIBCO': [218,219],
    'TIBCO Business Works': [217,481],
    'Tivoli': [220,251],
    'Tortoise': [221,222],
    'Unix': [578,445,579,447,602,590,448,449],
    'VB': [368,369],
    'VMware': [568,569,229,230,231],
    'Visual Basic': [370,371,372],
    'WebSphere': [234,285,235,286,284,601,287],
    'Windows': [580,238,239,451,452],
    'z': [598,608,591]
}

# %%
# iterate through the whole training set
new_df = train_df.copy()
for idx, row in train_df.iterrows():
    # we iterate through each rule list, replacing any matching id in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            new_df.loc[idx, 'entity_id'] = rule_set[key][0]
# %%
len(set(new_df['entity_id'].to_list()))

# %%
new_df.to_csv('parent_train.csv')

# %%
# now do the same for the test data
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

new_df = test_df.copy()
for idx, row in test_df.iterrows():
    # we iterate through each rule list, replacing any matching id in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            new_df.loc[idx, 'entity_id'] = rule_set[key][0]

# %%
new_df

# %%
new_df.to_csv('parent_test.csv')
# %%
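# %%
# An equivalent vectorized sketch (assumes the same rule_set as above): build a
# child-id -> parent-id lookup once, then map it over the column instead of the
# nested loops. Ids not covered by any rule list are kept unchanged.
child_to_parent = {child: ids[0] for ids in rule_set.values() for child in ids}
# new_df['entity_id'] = train_df['entity_id'].map(child_to_parent).fillna(train_df['entity_id']).astype(int)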
@@ -0,0 +1,129 @@
# %%
import json
import pandas as pd

##########################################
# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# import entity file
# Keep only one row per unique entity name
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
id2label = {}
for _, row in unique_df.iterrows():
    id2label[row['entity_id']] = row['entity_name']

inverse_dict = {value: key for key, value in id2label.items()}
# %%
# Create a new dictionary with sorted keys
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}

# %%
sorted_dict

# %%
rule_set = {
    '.NET': [497,482,484,487,485,486,483],
    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
    'C++': [583,306],
    'CA': [290,22,23,24,25],
    'CSS': [307,377],
    'Cisco': [28,420,29],
    'Citrix': [563,565,31,292,291,564,32,30],
    'coldfusion': [311,37],
    'eclipse': [46,622,641,456],
    'xml': [596, 318],
    'xsl': [319,320],
    'HP': [59,293,60,61,58],
    'http': [505,543],
    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
    'IBM BigFix': [62,457],
    'IBM ILOG': [253,255,254,256,252],
    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
    'IBM WebSphere': [80,82,83,81],
    'IBM i': [424,329],
    'IDMS': [667,668],
    'IIS': [609,490,489,491],
    'JBoss': [268,492,493],
    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
    'KVS': [549,550,551],
    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
    'MVS': [577,440,441],
    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
    'Oracle WebLogic': [600,233],
    'Oracle Application Server': [610,494],
    'Oracle Database': [134,474,475,478],
    'Oracle Hyperion': [607,138,139],
    'Oracle WebCenter': [276,495],
    'Pascal': [599,346],
    'Perl': [585,348,417,349],
    'ProjectWise': [161,162],
    'Rational': [166,167],
    'SAP': [173,175,695,176,676,178,179],
    'SAP ERP': [174,476,477],
    'SAP NetWeaver': [279,496,177],
    'Sybase SQL Server': [190,479,480],
    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
    'TIBCO': [218,219],
    'TIBCO Business Works': [217,481],
    'Tivoli': [220,251],
    'Tortoise': [221,222],
    'Unix': [578,445,579,447,602,590,448,449],
    'VB': [368,369],
    'VMware': [568,569,229,230,231],
    'Visual Basic': [370,371,372],
    'WebSphere': [234,285,235,286,284,601,287],
    'Windows': [580,238,239,451,452],
    'z': [598,608,591]
}

# %%
# iterate through the whole training set
new_df = train_df.copy()
for idx, row in train_df.iterrows():
    # for each matching rule list, encode the id as "<stem>_<leaf>", where the
    # stem is the first element of the list and the leaf is the id's position
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            stem = rule_set[key][0]
            leaf = rule_set[key].index(id)
            new_df.loc[idx, 'entity_seq'] = f"{stem}_{leaf}"
# %%
len(set(new_df['entity_seq'].to_list()))

# %%
new_df.to_csv('train_seq.csv')

# %%
# now do the same for the test data
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

new_df = test_df.copy()
for idx, row in test_df.iterrows():
    # for each matching rule list, encode the id as "<stem>_<leaf>", where the
    # stem is the first element of the list and the leaf is the id's position
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            stem = rule_set[key][0]
            leaf = rule_set[key].index(id)
            new_df.loc[idx, 'entity_seq'] = f"{stem}_{leaf}"


# %%
new_df

# %%
new_df.to_csv('test_seq.csv')
# %%
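# %%
# Illustration of the 'entity_seq' encoding (values read off the rule_set above):
# id 484 sits at position 2 of the '.NET' list [497, 482, 484, ...], so it becomes
# stem 497 with leaf 2, i.e. the string "497_2". Ids outside every rule list are
# left unset (NaN) in the new column.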
@@ -1,6 +1,6 @@

 *******************************************************************************
-Accuracy: 0.77655
-F1 Score: 0.79605
-Precision: 0.85637
-Recall: 0.77655
+Accuracy: 0.80197
+F1 Score: 0.81948
+Precision: 0.88067
+Recall: 0.80197
@@ -32,6 +32,8 @@ torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256

 # %%
+# construct the target id list
+# data_path = '../../../esAppMod_data_import/train.csv'
 data_path = '../../../esAppMod_data_import/train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # rather than use pattern, we use the real thing and property
@@ -52,19 +54,8 @@ def preprocess_text(text):
     # 1. Make all lowercase
     text = text.lower()

-    # Remove any non-alphanumeric character
-    # text = re.sub(r'[^\w\s]', ' ', text)  # Retains only alphanumerics and spaces
-    text = re.sub(r"[-;:]", " ", text)
-
-    # Add space between a digit followed by a letter
-    text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
-
-    # Add space between a letter followed by a digit
-    text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
-
     # Substitute digits with '#'
-    text = re.sub(r'\d+', 'x', text)
+    # text = re.sub(r'\d+', '#', text)

     # standardize spacing
     text = re.sub(r'\s+', ' ', text).strip()
@@ -0,0 +1,562 @@
# %%

# from datasets import load_from_disk
import os

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import random

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict


torch.set_float32_matmul_precision('high')

# %%
def set_seed(seed):
    """
    Set the random seed for reproducibility.
    """
    random.seed(seed)  # Python random module
    np.random.seed(seed)  # NumPy random
    torch.manual_seed(seed)  # PyTorch CPU
    torch.cuda.manual_seed(seed)  # PyTorch GPU
    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Disable optimization for reproducibility

set_seed(42)

SHUFFLES = 10

# %%

# import training file
data_path = '../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx

# %%
# introduce pre-processing functions
def preprocess_text(text):

    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#'
    # text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def generate_random_shuffles(text, n):
    """
    Generate n strings with randomly shuffled words from the input text.

    Args:
        text (str): The input text.
        n (int): The number of random variations to generate.

    Returns:
        list: A list of strings with shuffled words.
    """
    words = text.split()  # Split the input into words
    shuffled_variations = []

    for _ in range(n):
        shuffled = words[:]  # Copy the word list to avoid in-place modification
        random.shuffle(shuffled)  # Randomly shuffle the words
        shuffled_variations.append(" ".join(shuffled))  # Join the words back into a string

    return shuffled_variations


# generate n more shuffled examples
def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return the input string together with n random shuffles of it.

    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate for the string.

    Returns:
        list: A list containing the original string and its shuffled variants.
    """
    all_processed = []
    # add the original text
    all_processed.append(text)

    # Generate random shuffles
    shuffled_variations = generate_random_shuffles(text, n_shuffles)
    all_processed.extend(shuffled_variations)

    return all_processed
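# A quick note on augmentation volume (illustrative, not from the commit):
# shuffle_text returns the original string plus n_shuffles permutations, so the
# default SHUFFLES = 10 yields 11 variants per mention before deduplication.
# len(shuffle_text("ibm websphere application server"))  # -> 11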
acronym_mapping = {
    'hpsa': 'hp server automation',
    'tam': 'tivoli access manager',
    'adf': 'application development facility',
    'html': 'hypertext markup language',
    'wff': 'microsoft web farm framework',
    'jsp': 'javaserver pages',
    'bw': 'business works',
    'ssrs': 'sql server reporting services',
    'cl': 'control language',
    'vba': 'visual basic for applications',
    'esapi': 'enterprise security api',
    'gwt': 'google web toolkit',
    'pki': 'perkin elmer informatics',
    'rtd': 'oracle realtime decisions',
    'jms': 'java message service',
    'db': 'database',
    'soa': 'service oriented architecture',
    'xsl': 'extensible stylesheet language',
    'com': 'component object model',
    'ldap': 'lightweight directory access protocol',
    'odm': 'ibm operational decision manager',
    'soql': 'salesforce object query language',
    'oms': 'order management system',
    'cfml': 'coldfusion markup language',
    'nas': 'netscape application server',
    'sql': 'structured query language',
    'bde': 'borland database engine',
    'imap': 'internet message access protocol',
    'uws': 'ultidev web server',
    'birt': 'business intelligence and reporting tools',
    'mdw': 'model driven workflow',
    'tws': 'tivoli workload scheduler',
    'jre': 'java runtime environment',
    'wcs': 'websphere commerce suite',
    'was': 'websphere application server',
    'ssis': 'sql server integration services',
    'xhtml': 'extensible hypertext markup language',
    'soap': 'simple object access protocol',
    'san': 'storage area network',
    'elk': 'elastic stack',
    'arr': 'application request routing',
    'xlst': 'extensible stylesheet language transformations',
    'sccm': 'microsoft endpoint configuration manager',
    'ejb': 'enterprise java beans',
    'css': 'cascading style sheets',
    'hpoo': 'hp operations orchestration',
    'xml': 'extensible markup language',
    'esb': 'enterprise service bus',
    'edi': 'electronic data interchange',
    'imsva': 'interscan messaging security virtual appliance',
    'wtx': 'ibm websphere transformation extender',
    'cgi': 'common gateway interface',
    'bal': 'ibm basic assembly language',
    'issow': 'integrated safe system of work',
    'dcl': 'data control language',
    'jdom': 'java document object model',
    'fim': 'microsoft forefront identity manager',
    'npl': 'niakwa programming language',
    'wf': 'windows workflow foundation',
    'lm': 'etap license manager',
    'wts': 'windows terminal server',
    'asp': 'active server pages',
    'jil': 'job information language',
    'mvc': 'model view controller',
    'rmi': 'remote method invocation',
    'ad': 'active directory',
    'owb': 'oracle warehouse builder',
    'rest': 'representational state transfer',
    'jdk': 'java development kit',
    'ids': 'integrated data store',
    'bms': 'batch management software',
    'vsx': 'vmware solution exchange',
    'ssas': 'sql server analysis services',
    'atl': 'atlas transformation language',
    'ice': 'infobright community edition',
    'esql': 'extended structured query language',
    'corba': 'common object request broker architecture',
    'dpe': 'device provisioning engines',
    'rac': 'oracle real application clusters',
    'iemt': 'iis easy migration tool',
    'mes': 'manufacturing execution system',
    'odbc': 'open database connectivity',
    'lms': 'lan management solution',
    'wcf': 'windows communication foundation',
    'nes': 'netscape enterprise server',
    'jsf': 'javaserver faces',
    'alm': 'application lifecycle management',
    'hlasm': 'high level assembler',
    'cmod': 'content manager ondemand'}

external_source = {
    'vb.net': 'visual basic dot net',
    'jes': 'job entry subsystem',
    'svn': 'subversion',
    'vcs': 'version control system',
    'lims': 'laboratory information management system',
    'ide': 'integrated development environment',
    'sdk': 'software development kit',
    'mq': 'message queue',
    'ims': 'information management system',
    'isa': 'internet security and acceleration',
    'vs': 'visual studio',
    'esr': 'extended support release',
    'ff': 'firefox',
    'vb': 'visual basic',
    'rhel': 'red hat enterprise linux',
    'iis': 'internet information server',
    'api': 'application programming interface',
    'se': 'standard edition',
    r'\.net': 'dot net',
    'c#': 'c sharp'
}

# synonyms = {
#     'windows server': 'windows nt',
#     'windows 7': 'windows desktop',
#     'windows 8': 'windows desktop',
#     'windows 10': 'windows desktop'
# }


# add more information
acronym_mapping.update(external_source)


# raw strings are required here: in a plain f-string, '\b' is a backspace
# character, not the regex word-boundary anchor
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}

def replace_terms_with_abbreviations(text):
    for input, replacement in term_to_abbrev.items():
        text = re.sub(input, replacement, text)
    return text

def replace_abbreviations_with_terms(text):
    for input, replacement in abbrev_to_term.items():
        text = re.sub(input, replacement, text)
    return text

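# A small boundary check (hypothetical input): with raw-string patterns such as
# r'\bvb\b', only the standalone token is rewritten, so a longer word that merely
# contains the abbreviation is left alone.
# replace_abbreviations_with_terms('vb and vbscript')  # -> 'visual basic and vbscript'
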
######################################

# augmentation by text corruption

def corrupt_word(word):
    """Corrupt a single word using random corruption techniques."""
    if len(word) <= 1:  # Skip corruption for single-character words
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "delete":
        # Randomly delete a character
        idx = random.randint(0, len(word) - 1)
        word = word[:idx] + word[idx + 1:]

    elif corruption_type == "swap":
        # Swap two adjacent characters
        if len(word) > 1:
            idx = random.randint(0, len(word) - 2)
            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])

    return word


def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word in the string with a given probability."""
    words = sentence.split()
    corrupted_words = [
        corrupt_word(word) if random.random() < corruption_probability else word
        for word in words
    ]
    return " ".join(corrupted_words)


# outputs a list of dictionaries
# processes the dataframe into a list of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
label_flag_list = []

def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        # produce shuffling
        index = row['entity_id']
        parent_desc = row['mention']
        parent_desc = preprocess_text(parent_desc)

        # Split the string into words
        words = parent_desc.split()

        # Count the number of words
        word_count = len(words)

        # short sequences are rare, and we must compensate by including more examples;
        # mutation of other longer sequences might drown out rare short sequences
        if word_count < 3:
            for _ in range(10):
                element = {
                    'text': parent_desc,
                    'label': label2id[index],
                }
                output_list.append(element)

        # check if label is in label_flag_list
        if index not in label_flag_list:

            entity_name = row['entity_name']
            # add the "entity_name" label as a mention
            element = {
                'text': entity_name,
                'label': label2id[index],
            }
            output_list.append(element)

            # remove all non-alphanumerics
            desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumerics and spaces
            if (desc != parent_desc):
                element = {
                    'text': desc,
                    'label': label2id[index],  # ensure labels start from 0
                }
                output_list.append(element)

            # add shuffles of the original entity name
            no_of_shuffles = SHUFFLES
            processed_descs = shuffle_text(entity_name, n_shuffles=no_of_shuffles)
            for desc in processed_descs:
                if (desc != parent_desc):
                    element = {
                        'text': desc,
                        'label': label2id[index],  # ensure labels start from 0
                    }
                    output_list.append(element)

            label_flag_list.append(index)

        # add shuffled strings
        processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
        for desc in processed_descs:
            if (desc != parent_desc):
                element = {
                    'text': desc,
                    'label': label2id[index],  # ensure labels start from 0
                }
                output_list.append(element)

        # corrupt string
        desc = corrupt_string(parent_desc, corruption_probability=0.1)
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)

        # augmentation
        # remove all non-alphanumerics
        desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumerics and spaces
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)

        # # augmentation
        # # perform term to abbrev
        # temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumerics and spaces
        # desc = replace_terms_with_abbreviations(temp_desc)
        # if (desc != temp_desc):
        #     element = {
        #         'text': desc,
        #         'label': label2id[index],  # ensure labels start from 0
        #     }
        #     output_list.append(element)

        # augmentation
        # expand abbreviations into their full terms
        desc = replace_abbreviations_with_terms(parent_desc)
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)

    return output_list


def create_dataset():
    # train
    data_path = '../../esAppMod_data_import/train.csv'
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
    })
    return combined_data


# %%

def train():

    save_path = 'checkpoint'
    split_datasets = create_dataset()

    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<DESC>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # tokenize the mention text; the numeric 'label' column is used as-is,
        # so there is no need to create a separate 'labels' field
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map applies the function to each batch of rows in the dataset
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # id2label and label2id are created at module level above

    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending the token vocab
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)

    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to resume training from a checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
train()

# %%
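# %%
# Rough sizing note (an estimate, not stated in the commit): each mention can
# yield up to the original + 10 shuffles + 1 corruption + 1 punctuation-stripped
# + 1 abbreviation-expanded variant, plus extra copies for short mentions and
# first-seen entity names, so the augmented train set is roughly an order of
# magnitude larger than the raw mention list.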
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,11 @@

*******************************************************************************
Accuracy: 0.71956
F1 Score: 0.74142
Precision: 0.81529
Recall: 0.71956
********************************************************************************
Accuracy: 0.71710
F1 Score: 0.74095
Precision: 0.82181
Recall: 0.71710
@@ -0,0 +1,6 @@

*******************************************************************************
Accuracy: 0.81591
F1 Score: 0.82162
Precision: 0.85519
Recall: 0.81591
@@ -0,0 +1,6 @@

*******************************************************************************
Accuracy: 0.59943
F1 Score: 0.60266
Precision: 0.66956
Recall: 0.59943
@@ -0,0 +1,265 @@
# %%

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from tqdm import tqdm

torch.set_float32_matmul_precision('high')


BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx


# introduce pre-processing functions
def preprocess_text(text):
    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#'
    text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text


# outputs a list of dictionaries
# processes the dataframe into a list of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        desc = preprocess_text(desc)
        index = row['entity_id']
        element = {
            'text': desc,
            'label': label2id[index],  # ensure labels start from 0
        }
        output_list.append(element)

    return output_list


def create_dataset():
    # test
    # data_path = '../../../esAppMod_data_import/test.csv'
    data_path = '../../../esAppMod_data_import/parent_test.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)

    # combined_data = DatasetDict({
    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
    # })
    return Dataset.from_list(process_df_to_dict(test_df))


# %%

def test():

    test_dataset = create_dataset()

    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint_part1-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%

    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # tokenize the mention text; the numeric 'label' column is used as-is
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map applies the function to each batch of rows in the dataset
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator
    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending the token vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []


    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])

        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]


    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_1.txt", "a") as f:

        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result
    label_list = [id2label[id] for id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # save the per-mention class predictions here
    df.to_csv("exports/result_1.csv", index=False)


# %%
# reset the file before writing to it
with open("output_1.txt", "w") as f:
    print('', file=f)

test()
@@ -0,0 +1,265 @@
# %%
# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from tqdm import tqdm

torch.set_float32_matmul_precision('high')


BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx


# introduce pre-processing functions
def preprocess_text(text):
    # 1. make everything lowercase
    text = text.lower()

    # substitute digits with '#'
    text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text


# processes dataframe into a list of dictionaries
# each element maps input to output
# input: mention text
# output: class label
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        desc = preprocess_text(desc)
        index = row['entity_id']
        element = {
            'text' : desc,
            'label': label2id[index],  # ensure labels start from 0
        }
        output_list.append(element)

    return output_list


def create_dataset():
    # test
    # data_path = '../../../esAppMod_data_import/test.csv'
    data_path = '../../../esAppMod_data_import/test.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)

    # combined_data = DatasetDict({
    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
    # })
    return Dataset.from_list(process_df_to_dict(test_df))


# %%
def test():

    test_dataset = create_dataset()

    # prepare tokenizer
    checkpoint_directory = f'../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%
    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map applies the function to each "row" in the dataset
    # aka the data in the immediate nesting
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator
    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []

    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])

        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]

    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_2.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result
    label_list = [id2label[id] for id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # save the classifier predictions here
    df.to_csv(f"exports/result_2.csv", index=False)


# %%
# reset file before writing to it
with open("output_2.txt", "w") as f:
    print('', file=f)

test()
@@ -45,7 +45,7 @@ def set_seed(seed):

 set_seed(42)

-SHUFFLES=2
+SHUFFLES=5

 # %%
@@ -56,37 +56,6 @@ train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(list(set(entity_ids)))

-def compute_normalized_class_weights(class_counts, max_resamples=SHUFFLES):
-    """
-    Compute class weights inversely proportional to class counts.
-    The weights are normalized so that the largest weight is 1.
-
-    Args:
-        class_counts (array-like): An array or list where each element represents the count of samples for a class.
-
-    Returns:
-        numpy.ndarray: An array of resample counts for each class.
-    """
-    class_counts = np.array(class_counts)
-    total_samples = np.sum(class_counts)
-    class_weights = total_samples / class_counts
-    # so that highest weight is 1
-    normalized_weights = class_weights / np.max(class_weights)
-    # Scale weights such that the highest weight corresponds to `max_resamples`
-    resample_counts = normalized_weights * max_resamples
-    # Round resamples to nearest integer
-    resample_counts = np.round(resample_counts).astype(int)
-    return resample_counts
-
-# %%
-id_counts = train_df['entity_id'].value_counts()
-id_weights = compute_normalized_class_weights(id_counts, max_resamples=SHUFFLES)
-id_index = id_counts.index
-label2weight = {}
-for idx, label in enumerate(id_index):
-    label2weight[label] = id_weights[idx]
-
 # %%
 id2label = {}
 label2id = {}
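For reference, a standalone sketch of what the removed resampling logic computed; the class counts below are made up for illustration, not taken from the dataset:

# %%
import numpy as np

def resample_counts(class_counts, max_resamples=5):
    # inverse-frequency weights, scaled so the rarest class gets max_resamples
    class_counts = np.array(class_counts)
    weights = class_counts.sum() / class_counts
    weights = weights / weights.max()
    return np.round(weights * max_resamples).astype(int)

print(resample_counts([100, 10, 2]))  # rarer classes get more resamples: [0 1 5]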
@@ -101,20 +70,8 @@ def preprocess_text(text):
     # 1. make everything lowercase
     text = text.lower()

-    # Remove any non alphanumeric character
-    # text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces
-    # replace dashes
-    text = re.sub(r"[-;:]", " ", text)
-
-    # Add space between digit followed by a letter
-    text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
-
-    # Add space between letter followed by a digit
-    text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
-
-    # Substitute digits with 'x'
-    text = re.sub(r'\d+', 'x', text)
+    # Substitute digits with '#'
+    text = re.sub(r'\d+', '#', text)

     # standardize spacing
     text = re.sub(r'\s+', ' ', text).strip()
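A quick sanity check of the new preprocess_text behaviour (expected output shown as a comment; assumes the function as defined in this hunk):

# %%
print(preprocess_text('Windows  Server 2008'))  # -> 'windows server #'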
@@ -165,35 +122,143 @@ def shuffle_text(text, n_shuffles=SHUFFLES):

     return all_processed

-term_to_abbrev = {
-    r'job entry system': 'jes',
-    r'subversion': 'svn',
-    r'borland database engine': 'bde',
-    r'business intelligence and reporting tools': 'birt',
-    r'lan management solution': 'lms',
-    r'laboratory information management system': 'lims',
-    r'ibm database 2': 'db/2',
-    r'integrated development environment': 'ide',
-    r'software development kit': 'sdk',
-    r'hp operations orchestration': 'hpoo',
-    r'hp server automation': 'hpsa',
-    r'internet information server': 'iis',
-    r'release 2': 'r2',
-    r'red hat enterprise linux': 'rhel',
-    r'oracle enterprise linux': 'oel',
-    r'websphere application server': 'was',
-    r'application development facility': 'adf',
-    r'server analysis services': 'ssas'
-}
-
-abbrev_to_term = {rf'\b{value}\b': key for key, value in term_to_abbrev.items()}
+acronym_mapping = {
+    'hpsa': 'hp server automation',
+    'tam': 'tivoli access manager',
+    'adf': 'application development facility',
+    'html': 'hypertext markup language',
+    'wff': 'microsoft web farm framework',
+    'jsp': 'javaserver pages',
+    'bw': 'business works',
+    'ssrs': 'sql server reporting services',
+    'cl': 'control language',
+    'vba': 'visual basic for applications',
+    'esapi': 'enterprise security api',
+    'gwt': 'google web toolkit',
+    'pki': 'perkin elmer informatics',
+    'rtd': 'oracle realtime decisions',
+    'jms': 'java message service',
+    'db': 'database',
+    'soa': 'service oriented architecture',
+    'xsl': 'extensible stylesheet language',
+    'com': 'compopent object model',
+    'ldap': 'lightweight directory access protocol',
+    'odm': 'ibm operational decision manager',
+    'soql': 'salesforce object query language',
+    'oms': 'order management system',
+    'cfml': 'coldfusion markup language',
+    'nas': 'netscape application server',
+    'sql': 'structured query language',
+    'bde': 'borland database engine',
+    'imap': 'internet message access protocol',
+    'uws': 'ultidev web server',
+    'birt': 'business intelligence and reporting tools',
+    'mdw': 'model driven workflow',
+    'tws': 'tivoli workload scheduler',
+    'jre': 'java runtime environment',
+    'wcs': 'websphere commerce suite',
+    'was': 'websphere application server',
+    'ssis': 'sql server integration services',
+    'xhtml': 'extensible hypertext markup language',
+    'soap': 'simple object access protocol',
+    'san': 'storage area network',
+    'elk': 'elastic stack',
+    'arr': 'application request routing',
+    'xlst': 'extensible stylesheet language transformations',
+    'sccm': 'microsoft endpoint configuration manager',
+    'ejb': 'enterprise java beans',
+    'css': 'cascading style sheets',
+    'hpoo': 'hp operations orchestration',
+    'xml': 'extensible markup language',
+    'esb': 'enterprise service bus',
+    'edi': 'electronic data interchange',
+    'imsva': 'interscan messaging security virtual appliance',
+    'wtx': 'ibm websphere transformation extender',
+    'cgi': 'common gateway interface',
+    'bal': 'ibm basic assembly language',
+    'issow': 'integrated safe system of work',
+    'dcl': 'data control language',
+    'jdom': 'java document object model',
+    'fim': 'microsoft forefront identity manager',
+    'npl': 'niakwa programming language',
+    'wf': 'windows workflow foundation',
+    'lm': 'etap license manager',
+    'wts': 'windows terminal server',
+    'asp': 'active server pages',
+    'jil': 'job information language',
+    'mvc': 'model view controller',
+    'rmi': 'remote method invocation',
+    'ad': 'active directory',
+    'owb': 'oracle warehouse builder',
+    'rest': 'representational state transfer',
+    'jdk': 'java development kit',
+    'ids': 'integrated data store',
+    'bms': 'batch management software',
+    'vsx': 'vmware solution exchange',
+    'ssas': 'sql server analysis services',
+    'atl': 'atlas transformation language',
+    'ice': 'infobright community edition',
+    'esql': 'extended structured query language',
+    'corba': 'common object request broker architecture',
+    'dpe': 'device provisioning engines',
+    'rac': 'oracle real application clusters',
+    'iemt': 'iis easy migration tool',
+    'mes': 'manufacturing execution system',
+    'odbc': 'open database connectivity',
+    'lms': 'lan management solution',
+    'wcf': 'windows communication foundation',
+    'nes': 'netscape enterprise server',
+    'jsf': 'javaserver faces',
+    'alm': 'application lifecycle management',
+    'hlasm': 'high level assembler',
+    'cmod': 'content manager ondemand'}
+
+external_source = {
+    'vb.net': 'visual basic dot net',
+    'jes': 'job entry subsystem',
+    'svn': 'subversion',
+    'vcs': 'version control system',
+    'lims': 'laboratory information management system',
+    'ide': 'integrated development environment',
+    'sdk': 'software development kit',
+    'mq': 'message queue',
+    'ims': 'information management system',
+    'isa': 'internet security and acceleration',
+    'vs': 'visual studio',
+    'esr': 'extended support release',
+    'ff': 'firefox',
+    'vb': 'visual basic',
+    'rhel': 'red hat enterprise linux',
+    'iis': 'internet information server',
+    'api': 'application programming interface',
+    'se': 'standard edition',
+    r'\.net': 'dot net',
+    'c#': 'c sharp',
+    'ms': 'microsoft'
+}
+
+# synonyms = {
+#     'windows server': 'windows nt',
+#     'windows 7': 'windows desktop',
+#     'windows 8': 'windows desktop',
+#     'windows 10': 'windows desktop'
+# }
+
+# add more information
+acronym_mapping.update(external_source)
+
+abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
+term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}

 def replace_terms_with_abbreviations(text):
     for input, replacement in term_to_abbrev.items():
         text = re.sub(input, replacement, text)
     return text

-def replace_abbreivations_with_terms(text):
+def replace_abbreviations_with_terms(text):
     for input, replacement in abbrev_to_term.items():
         text = re.sub(input, replacement, text)
     return text
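Note on the comprehensions above: the \b word-boundary anchor only survives in a raw string; in a plain f-string, \b is a backspace character and the resulting pattern never matches, which is why rf'...' is used here. A minimal standalone check:

# %%
import re
print(re.search(rf'\b{"sql"}\b', 'microsoft sql server') is not None)  # True: regex word boundary
print(re.search(f'\b{"sql"}\b', 'microsoft sql server') is not None)   # False: literal backspace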
@@ -218,8 +283,8 @@ def process_df_to_dict(df):
         # no_of_shuffles = label2weight[index] + 1
         no_of_shuffles = SHUFFLES
         processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)

         for desc in processed_descs:
+            if (desc != parent_desc):
             element = {
                 'text' : desc,
                 'label': label2id[index],  # ensure labels start from 0
@@ -227,24 +292,32 @@ def process_df_to_dict(df):
             output_list.append(element)

+        # augmentation
+        # remove all non-alphanumerics
+        desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # retains only alphanumerics and spaces
+        if (desc != parent_desc):
+            element = {
+                'text' : desc,
+                'label': label2id[index],  # ensure labels start from 0
+            }
+            output_list.append(element)
+
         # augmentation
         # perform abbrev_to_term
-        desc = replace_terms_with_abbreviations(parent_desc)
-        no_of_shuffles = SHUFFLES
-        processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
-
-        for desc in processed_descs:
+        temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # retains only alphanumerics and spaces
+        desc = replace_terms_with_abbreviations(temp_desc)
+        if (desc != temp_desc):
             element = {
                 'text' : desc,
                 'label': label2id[index],  # ensure labels start from 0
             }
             output_list.append(element)

         # augmentation
         # perform term to abbrev
-        desc = replace_abbreivations_with_terms(parent_desc)
-        no_of_shuffles = SHUFFLES
-        processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
-
-        for desc in processed_descs:
+        desc = replace_abbreviations_with_terms(parent_desc)
+        if (desc != parent_desc):
             element = {
                 'text' : desc,
                 'label': label2id[index],  # ensure labels start from 0
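Taken together, the hunk above now adds each augmented variant only when it actually differs from its source string; a comment-only walk-through for one hypothetical mention, assuming the acronym maps defined earlier:

# %%
# e.g. parent_desc = 'websphere application server #'
# 1. strip non-alphanumerics -> unchanged here, so no extra row
# 2. term -> abbreviation    -> 'was #', differs, so one extra row
# 3. abbreviation -> term    -> unchanged here, so no extra row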
@@ -257,7 +330,7 @@ def process_df_to_dict(df):

 def create_dataset():
     # train
-    data_path = '../../esAppMod_data_import/train.csv'
+    data_path = '../../esAppMod_data_import/parent_train.csv'
     train_df = pd.read_csv(data_path, skipinitialspace=True)
@@ -271,13 +344,13 @@ def create_dataset():

 def train():

-    save_path = f'checkpoint'
+    save_path = f'checkpoint_part1'
     split_datasets = create_dataset()

     # prepare tokenizer

     model_checkpoint = "distilbert/distilbert-base-uncased"
-    # model_checkpoint = 'google-bert/bert-base-cased'
+    # model_checkpoint = 'google-bert/bert-base-uncased'
     # model_checkpoint = 'prajjwal1/bert-small'
     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
     # Define additional special tokens
@@ -348,7 +421,6 @@ def train():

     training_args = TrainingArguments(
         output_dir=f"{save_path}",
-        # eval_strategy="epoch",
         eval_strategy="no",
         logging_dir="tensorboard-log",
         logging_strategy="epoch",
@@ -0,0 +1,469 @@
# %%
# from datasets import load_from_disk
import os

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import random
import glob

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict


torch.set_float32_matmul_precision('high')

# %%
def set_seed(seed):
    """
    Set the random seed for reproducibility.
    """
    random.seed(seed)                 # Python random module
    np.random.seed(seed)              # NumPy random
    torch.manual_seed(seed)           # PyTorch CPU
    torch.cuda.manual_seed(seed)      # PyTorch GPU
    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False     # Disable optimization for reproducibility

set_seed(42)

SHUFFLES=0

# %%
# import training file
data_path = '../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))

# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx

# %%
# introduce pre-processing functions
def preprocess_text(text):

    # 1. make everything lowercase
    text = text.lower()

    # substitute digits with '#'
    text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def generate_random_shuffles(text, n):
    """
    Generate n strings with randomly shuffled words from the input text.

    Args:
        text (str): The input text.
        n (int): The number of random variations to generate.

    Returns:
        list: A list of strings with shuffled words.
    """
    words = text.split()  # Split the input into words
    shuffled_variations = []

    for _ in range(n):
        shuffled = words[:]  # Copy the word list to avoid in-place modification
        random.shuffle(shuffled)  # Randomly shuffle the words
        shuffled_variations.append(" ".join(shuffled))  # Join the words back into a string

    return shuffled_variations


# generate n more shuffled examples
def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Preprocess a string and add n randomly shuffled variations of it.

    Args:
        text (str): An input string.
        n_shuffles (int): Number of random shuffles to generate for the string.

    Returns:
        list: A list of preprocessed and shuffled strings.
    """
    all_processed = []
    all_processed.append(text)

    # Generate random shuffles
    shuffled_variations = generate_random_shuffles(text, n_shuffles)
    all_processed.extend(shuffled_variations)

    return all_processed

acronym_mapping = {
    'hpsa': 'hp server automation',
    'tam': 'tivoli access manager',
    'adf': 'application development facility',
    'html': 'hypertext markup language',
    'wff': 'microsoft web farm framework',
    'jsp': 'javaserver pages',
    'bw': 'business works',
    'ssrs': 'sql server reporting services',
    'cl': 'control language',
    'vba': 'visual basic for applications',
    'esapi': 'enterprise security api',
    'gwt': 'google web toolkit',
    'pki': 'perkin elmer informatics',
    'rtd': 'oracle realtime decisions',
    'jms': 'java message service',
    'db': 'database',
    'soa': 'service oriented architecture',
    'xsl': 'extensible stylesheet language',
    'com': 'compopent object model',
    'ldap': 'lightweight directory access protocol',
    'odm': 'ibm operational decision manager',
    'soql': 'salesforce object query language',
    'oms': 'order management system',
    'cfml': 'coldfusion markup language',
    'nas': 'netscape application server',
    'sql': 'structured query language',
    'bde': 'borland database engine',
    'imap': 'internet message access protocol',
    'uws': 'ultidev web server',
    'birt': 'business intelligence and reporting tools',
    'mdw': 'model driven workflow',
    'tws': 'tivoli workload scheduler',
    'jre': 'java runtime environment',
    'wcs': 'websphere commerce suite',
    'was': 'websphere application server',
    'ssis': 'sql server integration services',
    'xhtml': 'extensible hypertext markup language',
    'soap': 'simple object access protocol',
    'san': 'storage area network',
    'elk': 'elastic stack',
    'arr': 'application request routing',
    'xlst': 'extensible stylesheet language transformations',
    'sccm': 'microsoft endpoint configuration manager',
    'ejb': 'enterprise java beans',
    'css': 'cascading style sheets',
    'hpoo': 'hp operations orchestration',
    'xml': 'extensible markup language',
    'esb': 'enterprise service bus',
    'edi': 'electronic data interchange',
    'imsva': 'interscan messaging security virtual appliance',
    'wtx': 'ibm websphere transformation extender',
    'cgi': 'common gateway interface',
    'bal': 'ibm basic assembly language',
    'issow': 'integrated safe system of work',
    'dcl': 'data control language',
    'jdom': 'java document object model',
    'fim': 'microsoft forefront identity manager',
    'npl': 'niakwa programming language',
    'wf': 'windows workflow foundation',
    'lm': 'etap license manager',
    'wts': 'windows terminal server',
    'asp': 'active server pages',
    'jil': 'job information language',
    'mvc': 'model view controller',
    'rmi': 'remote method invocation',
    'ad': 'active directory',
    'owb': 'oracle warehouse builder',
    'rest': 'representational state transfer',
    'jdk': 'java development kit',
    'ids': 'integrated data store',
    'bms': 'batch management software',
    'vsx': 'vmware solution exchange',
    'ssas': 'sql server analysis services',
    'atl': 'atlas transformation language',
    'ice': 'infobright community edition',
    'esql': 'extended structured query language',
    'corba': 'common object request broker architecture',
    'dpe': 'device provisioning engines',
    'rac': 'oracle real application clusters',
    'iemt': 'iis easy migration tool',
    'mes': 'manufacturing execution system',
    'odbc': 'open database connectivity',
    'lms': 'lan management solution',
    'wcf': 'windows communication foundation',
    'nes': 'netscape enterprise server',
    'jsf': 'javaserver faces',
    'alm': 'application lifecycle management',
    'hlasm': 'high level assembler',
    'cmod': 'content manager ondemand'}

external_source = {
    'vb.net': 'visual basic dot net',
    'jes': 'job entry subsystem',
    'svn': 'subversion',
    'vcs': 'version control system',
    'lims': 'laboratory information management system',
    'ide': 'integrated development environment',
    'sdk': 'software development kit',
    'mq': 'message queue',
    'ims': 'information management system',
    'isa': 'internet security and acceleration',
    'vs': 'visual studio',
    'esr': 'extended support release',
    'ff': 'firefox',
    'vb': 'visual basic',
    'rhel': 'red hat enterprise linux',
    'iis': 'internet information server',
    'api': 'application programming interface',
    'se': 'standard edition',
    r'\.net': 'dot net',
    'c#': 'c sharp',
    'ms': 'microsoft'
}

# synonyms = {
#     'windows server': 'windows nt',
#     'windows 7': 'windows desktop',
#     'windows 8': 'windows desktop',
#     'windows 10': 'windows desktop'
# }

# add more information
acronym_mapping.update(external_source)

abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}

def replace_terms_with_abbreviations(text):
    for input, replacement in term_to_abbrev.items():
        text = re.sub(input, replacement, text)
    return text

def replace_abbreviations_with_terms(text):
    for input, replacement in abbrev_to_term.items():
        text = re.sub(input, replacement, text)
    return text


# processes dataframe into a list of dictionaries
# each element maps input to output
# input: mention text
# output: class label
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        # produce shuffling
        index = row['entity_id']
        parent_desc = row['mention']
        parent_desc = preprocess_text(parent_desc)
        # ensure at least 1 shuffle
        # no_of_shuffles = label2weight[index] + 1
        no_of_shuffles = SHUFFLES
        processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
        for desc in processed_descs:
            if (desc != parent_desc):
                element = {
                    'text' : desc,
                    'label': label2id[index],  # ensure labels start from 0
                }
                output_list.append(element)

        # augmentation
        # remove all non-alphanumerics
        desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # retains only alphanumerics and spaces
        if (desc != parent_desc):
            element = {
                'text' : desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)

        # augmentation
        # perform abbrev_to_term
        temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # retains only alphanumerics and spaces
        desc = replace_terms_with_abbreviations(temp_desc)
        if (desc != temp_desc):
            element = {
                'text' : desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)

        # augmentation
        # perform term to abbrev
        desc = replace_abbreviations_with_terms(parent_desc)
        if (desc != parent_desc):
            element = {
                'text' : desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)

    return output_list


def create_dataset():
    # train
    data_path = '../../esAppMod_data_import/train.csv'
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
    })
    return combined_data


# %%
def train():

    save_path = f'checkpoint'
    split_datasets = create_dataset()

    # prepare tokenizer
    pattern = 'checkpoint_part1-*'
    checkpoint_directory = 'checkpoint'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    # model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-uncased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<DESC>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map applies the function to each "row" in the dataset
    # aka the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # create id2label and label2id


    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)

    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=300,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
train()


# %%
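A note on the checkpoint wiring above, as I read it: the part-1 trainer saves under save_path 'checkpoint_part1' as checkpoint_part1/checkpoint-<step>, while this script globs 'checkpoint/checkpoint_part1-*'; if the directories do not line up on disk, the glob is the place to adjust (illustrative sketch, depends on the actual layout):

# %%
# stage 1: part-1 training  -> checkpoint_part1/checkpoint-<step>
# stage 2: this script loads glob.glob(os.path.join('checkpoint', 'checkpoint_part1-*'))[0]
# if stage 1 wrote checkpoint_part1/checkpoint-*, point the glob there instead:
# model_checkpoint = glob.glob(os.path.join('checkpoint_part1', 'checkpoint-*'))[0]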
@@ -1,2 +0,0 @@
-
-Accuracy for fold: 0.5846658466584665
@@ -57,10 +57,10 @@ class Inference():
        output_list = []
        for _, row in df.iterrows():
            desc = row['mention']
-            label = row['entity_name']
+            label = row['entity_seq']
            element = {
                'input' : desc,
-                'output': label
+                'output': f'{label}'
            }

            output_list.append(element)
@@ -101,7 +101,7 @@ class Inference():


    def generate(self):
-        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        MAX_GENERATE_LENGTH = 128

        pred_generations = []
@@ -0,0 +1,2 @@

Accuracy for fold: 0.5022550225502255
@@ -11,7 +11,7 @@ BATCH_SIZE = 512
 def infer():
    print(f"Inference for data")
    # import test data
-    data_path = '../../../data_import/test.csv'
+    data_path = '../../../esAppMod_data_import/test_seq.csv'
    df = pd.read_csv(data_path, skipinitialspace=True)
@@ -35,18 +35,19 @@ def infer():
    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
    # Convert the list to a Pandas DataFrame
    df_out = pd.DataFrame({
-        'predictions': prediction_list
+        'class_prediction': prediction_list
    })
    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
-    df = pd.concat([df, df_out], axis=1)
+    # df = pd.concat([df, df_out], axis=1)

    # we can save the t5 generation output here
-    df.to_csv(f"exports/result.csv", index=False)
+    df_out.to_csv(f"exports/result.csv", index=False)

    # here we want to evaluate mapping accuracy within the valid in mdm data only
-    condition_correct = df['predictions'] == df['entity_name']
-    pred_correct_proportion = sum(condition_correct)/len(df)
+    # predictions = pd.to_numeric(df_out['class_prediction'], errors="coerce")
+    condition_correct = df_out['class_prediction'] == df['entity_seq']
+    pred_correct_proportion = sum(condition_correct)/len(df_out)

    # write output to file output.txt
    with open("output.txt", "a") as f:
@@ -33,10 +33,10 @@ def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
-        label = row['entity_name']
+        label = row['entity_seq']
        element = {
            'input' : desc,
-            'output': label
+            'output': f'{label}'
        }
        output_list.append(element)
@@ -45,7 +45,7 @@ def process_df_to_dict(df):

 def create_dataset():
    # train
-    data_path = f"../../data_import/train.csv"
+    data_path = f"../../esAppMod_data_import/train_seq.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
@@ -3,8 +3,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 # %%
 # Load model and tokenizer
-# model_name = "bigscience/bloom-7b1" # Replace with your model
-model_name = "bigscience/bloomz-1b1"
+model_name = "bigscience/bloom-7b1" # Replace with your model
+# model_name = "bigscience/bloomz-1b1"
 tokenizer = AutoTokenizer.from_pretrained(model_name)

 # Automatically map model layers to available GPUs
@@ -26,13 +26,12 @@ outputs = model.generate(inputs["input_ids"], max_length=50)
 # Decode and print result
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 # %%
-# %%
 # Prepare input

 def generate(text):

    # Define prompt
-    prompt = f"Answer Concisely: Give me a mapping between the acronym and descriptor in the format '(acronym: description): '{text}'"
+    prompt = f"Give me past product names relating to: '{text}'"

    # Generate acronym
    inputs = tokenizer(prompt, return_tensors="pt")
@@ -45,7 +44,7 @@ def generate(text):

 # Example usage
 # text = "Advanced Data Analytics Platform"
-text = 'ColdFusion Markup Language (CFML)'
+text = 'windows server'
 acronym = generate(text)
-print(f"Acronym: {acronym}")
+print(f"Generation: {acronym}")
 # %%
@@ -0,0 +1,21 @@
# %%
import requests

def get_related_terms(term, language="en", limit=10):
    url = f"http://api.conceptnet.io/c/{language}/{term}"
    response = requests.get(url).json()

    # Extract related terms
    related_terms = []
    for edge in response.get("edges", []):
        related = edge.get("end", {}).get("label", None)
        if related and related.lower() != term.lower():
            related_terms.append(related)
            if len(related_terms) >= limit:
                break
    return related_terms

# Example
related_terms = get_related_terms("windows_server")
print("Related Terms:", related_terms)
# %%
@@ -0,0 +1,38 @@
# %%
from SPARQLWrapper import SPARQLWrapper, JSON

# %%
sparql = SPARQLWrapper("https://dbpedia.org/sparql")
sparql.setQuery("""
SELECT ?altLabel WHERE {
    ?item rdfs:label "Windows Server"@en.
    ?item skos:altLabel ?altLabel.
    FILTER (LANG(?altLabel) = "en")
}
LIMIT 10
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print(result["altLabel"]["value"])  # the query only binds ?altLabel
# %%
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
SELECT ?itemLabel ?altLabel WHERE {
    ?item ?label "Windows Server"@en.
    OPTIONAL { ?item skos:altLabel ?altLabel. FILTER (LANG(?altLabel) = "en") }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 10
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print("Label:", result["itemLabel"]["value"])
    if "altLabel" in result:
        print("Alias:", result["altLabel"]["value"])
# %%
@ -0,0 +1,626 @@
|
||||||
|
,mention,entity_id,entity_name,class_prediction,predicted_name
|
||||||
|
0,DOT NET,497,.NET Framework,579,Unix|BSD|*
|
||||||
|
2,Dot net - FW 4,497,.NET Framework,368,VB.NET
|
||||||
|
3,.Net 4.7.1 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||||
|
11,.NET,497,.NET Framework,579,Unix|BSD|*
|
||||||
|
13,.Net 4.5.2 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||||
|
40,APACHE LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||||
|
41,LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||||
|
42,Magik,484,.NET Framework|Magick.NET,533,YAML
|
||||||
|
43,WCF,485,.NET Framework|Windows Communication Foundation (WCF),486,.NET Framework|Windows Workflow Foundation (WF)
|
||||||
|
45,WWF,486,.NET Framework|Windows Workflow Foundation (WF),443,OS/2
|
||||||
|
47,Ejes,1,(E)JES,101,Microsoft Dynamics AX
|
||||||
|
48,(UNIRITA) A-AUTO 7.2.2,2,A-Auto Job Scheduling Software,299,AutoIt
|
||||||
|
50,Active Directoy,498,Active Directory (AD),40,Connect Direct
|
||||||
|
54,APSX,592,Active Server Pages (ASP)|*,609,IIS|*
|
||||||
|
69,Andriod,418,Android,586,PHP|*
|
||||||
|
71,Apache Active Queue,6,Apache ActiveMQ,259,Apache HTTP Server
|
||||||
|
72,MQ Apache Active Queue,6,Apache ActiveMQ,81,IBM Websphere MQ
|
||||||
|
75,cordova-android,501,Apache Cordova,418,Android
|
||||||
|
77,Hive,8,Apache Hive,177,SAP NetWeaver Business Warehouse
|
||||||
|
99,solr,11,Apache Solr,375,Apache Lucene
|
||||||
|
135,ADF,13,Application Development Facility (ADF),130,Oracle ADF
|
||||||
|
144,WLS 10.2,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
149,BEA WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
152,Weblogic 12c,600,Oracle WebLogic Server|*,582,C#|*
|
||||||
|
160,WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||||
|
168,Web Logic,600,Oracle WebLogic Server|*,97,MarkLogic DB
|
||||||
|
174,BEA WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||||
|
175,WLS 10,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
176,WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
189,brain script,302,Brainscript,369,VBScript
|
||||||
|
190,BRAINScript,302,Brainscript,367,TypeScript
|
||||||
|
191,Business Intelligence and Reporting Tools,21,Business Intelligence and Reporting Tools (BIRT),133,Oracle Business Intelligence
|
||||||
|
192,Actuate Report Server,21,Business Intelligence and Reporting Tools (BIRT),42,Crystal Reports
|
||||||
|
194,CSHARP,582,C#|*,87,Informatica PowerCenter
|
||||||
|
218,WinFrame,30,Citrix Virtual Apps and Desktops,443,OS/2
|
||||||
|
221,METAFRAME,30,Citrix Virtual Apps and Desktops,406,JavaScript|AngularJS
|
||||||
|
225,Presentation Server,30,Citrix Virtual Apps and Desktops,541,File Server
|
||||||
|
226,NETSCALER-1.5,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||||
|
227,NETSCALER-11.,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||||
|
228,Citrix SD-WAN,563,Citrix ADC,30,Citrix Virtual Apps and Desktops
|
||||||
|
229,NetScaler SD-WAN,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||||
|
231,NetScaler ADC,563,Citrix ADC,272,Netscape Application Server (NAS)
|
||||||
|
236,NetScaler SDX,291,Citrix ADC SDX,273,Netscape Enterprise Server (NES)
|
||||||
|
240,Provisioning Services 7.15.8,32,Citrix Provisioning,538,Device Provisioning Engines (DPE)
|
||||||
|
241,Citrix PVS,32,Citrix Provisioning,30,Citrix Virtual Apps and Desktops
|
||||||
|
243,CLISTS,309,CLIST,329,IBM i Control Language (CL)
|
||||||
|
253,CFML,311,ColdFusion Markup Language (CFML),316,eXtensible HyperText Markup Language (XHTML)
|
||||||
|
254,ColdFusion Markup Language,311,ColdFusion Markup Language (CFML),37,Coldfusion
|
||||||
|
255,Sterling Connect,40,Connect Direct,542,General Ledger
|
||||||
|
264,Cormerstone,41,Cornerstone software,516,Compopent Object Model (COM)
|
||||||
|
265,Cornerstone,41,Cornerstone software,370,Visual Basic
|
||||||
|
279,DB2 UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||||
|
282,DB2-UDB,43,DB2,365,TCL
|
||||||
|
291,DB2/UDB,43,DB2,365,TCL
|
||||||
|
292,IBM DB2 ENTERPRISE SERVER EDITION PVU OPTION 10.5,43,DB2,163,PVCS Version Manager
|
||||||
|
300,IBM - IBM DB2 Advanced Enterprise Server Edition PVU Option 10.5,43,DB2,72,IBM Mobile Foundation
|
||||||
|
301,UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||||
|
302,IBM - IBM DB2 Enterprise Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||||
|
306,IBM - IBM DB2 Workgroup Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||||
|
313,EZTriev,314,Easytrieve,296,Intel Xeon Processor
|
||||||
|
314,Eztrieve,314,Easytrieve,296,Intel Xeon Processor
|
||||||
|
321,PrestoSoft - ExamDiff Application 1.6,49,ExamDiff,346,Pascal|Object Pascal
|
||||||
|
322,PrestoSoft - ExamDiff Application,49,ExamDiff,346,Pascal|Object Pascal
|
||||||
|
323,ExamDiff Application,49,ExamDiff,467,MS SQL Server|Log Reader Agent
|
||||||
|
324,Expect Scripts,315,Expect,109,Microsoft MQ
|
||||||
|
329,Microsoft - MSXML 4.0 SP2 4.2,318,Extensible Markup Language (XML)|MSXML,316,eXtensible HyperText Markup Language (XHTML)
|
||||||
|
331,XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||||
|
332,JAVA-XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||||
|
335,ServerCA Access GatewayF5,50,F5 Secure Web Gateway Services,290,CA API Gateway
347,HP C++,58,HP aC++ compiler,59,HP C/ANSI C compiler
350,HP C++ 10.20,58,HP aC++ compiler,59,HP C/ANSI C compiler
351,HPC 11.11,59,HP C/ANSI C compiler,58,HP aC++ compiler
358,HFS,505,HTTP File Server,486,.NET Framework|Windows Workflow Foundation (WF)
359,www.rejetto.com - HttpFileServer 2.3,505,HTTP File Server,55,Google Chrome
360,HttpFileServer,505,HTTP File Server,522,Application Web Server
367,IBM - IBM BigFix Platform Client Deploy Tool 9.5,457,IBM BigFix Platform|Client Deploy Tool,62,IBM BigFix Platform
369,IBM BPM,64,IBM Business Process Manager,328,IBM High Level Assembler (HLASM)
375,Data Power,294,IBM DataPower Gateway,295,IBM Power Systems
376,IDG.7.5.2.19hp,294,IBM DataPower Gateway,449,Unix|HP-UX
380,hlasm,328,IBM High Level Assembler (HLASM),438,macOS
383,IHS,265,IBM HTTP Server,424,IBM i
386,WebSphere and IHS,265,IBM HTTP Server,67,IBM InfoSphere DataStage
387,WebSphere http,265,IBM HTTP Server,284,Websphere Application Server (WAS)
391,IBM Websphere HTTP Server,265,IBM HTTP Server,285,WebSphere Liberty
393,WebSphere IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
394,WebSphere -IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
397,OS400 V7R1,424,IBM i,443,OS/2
398,OS400,424,IBM i,443,OS/2
399,OS/400,424,IBM i,443,OS/2
408,IIB,68,IBM Integration Bus,370,Visual Basic
411,Extended Structured Query Language,458,IBM Integration Bus|Extended Structured Query Language (ESQL),572,Structured Query Language (SQL)
415,IBM WorkLight,72,IBM Mobile Foundation,649,IBM Cloud
417,ILOG JRules,73,IBM Operational Decision Manager (ODM),601,Websphere ILOG JRules BRMS
420,Decision Center 8.0.1.0,73,IBM Operational Decision Manager (ODM),252,IBM ILOG Views
423,AS400,295,IBM Power Systems,443,OS/2
424,AS/400,295,IBM Power Systems,443,OS/2
426,System i,295,IBM Power Systems,424,IBM i
427,P-series,295,IBM Power Systems,81,IBM Websphere MQ
428,IBM iSeries/AS400 system Model 520,295,IBM Power Systems,443,OS/2
439,Tivoli Asset Discovery for Distributed,459,IBM Tivoli Asset Management|Asset Discovery for Distributed,606,IBM Tivoli Asset Management|*
447,Database MS SQL Agent,77,IBM Tivoli Monitoring,469,MS SQL Server|SQL Server Database Engine
448,Linux OS Agent,77,IBM Tivoli Monitoring,576,Linux|*
449,Database DB2 Agent,77,IBM Tivoli Monitoring,520,Database (DB)
452,Windows OS Agent,77,IBM Tivoli Monitoring,580,Windows|*
454,IBM - IBM TSM FCM,604,IBM Tivoli Storage Manager|*,460,IBM Tivoli Storage Manager|TSM API
459,Databases Data Protection for Microsoft SQL,604,IBM Tivoli Storage Manager|*,572,Structured Query Language (SQL)
461,IBM - IBM Spectrum Protect Data Protection for Microsoft SQL Server 8.1,604,IBM Tivoli Storage Manager|*,469,MS SQL Server|SQL Server Database Engine
462,IBM Spectrum Protect Data Protection,604,IBM Tivoli Storage Manager|*,312,Data Language Interface (DL/I)
463,IBM - IBM Spectrum Protect API 7.1,460,IBM Tivoli Storage Manager|TSM API,294,IBM DataPower Gateway
464,IBM - IBM Spectrum Protect Client,461,IBM Tivoli Storage Manager|TSM Client,294,IBM DataPower Gateway
465,IBM - IBM Tivoli Storage Manager Client,461,IBM Tivoli Storage Manager|TSM Client,604,IBM Tivoli Storage Manager|*
467,VSS Requestor configured 8.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
468,VSS Requestor 7.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
469,TWS-WS,79,IBM Tivoli Workload Scheduler (TWS),239,Windows Terminal Server (WTS)
472,wbia 2.6,80,IBM WebSphere Business Integration Adaptor,627,XtraDB
473,IBM WBIA 2.6.0.12,80,IBM WebSphere Business Integration Adaptor,424,IBM i
475,MQ,81,IBM Websphere MQ,248,ZeroMQ
476,MQ 9.1,81,IBM Websphere MQ,248,ZeroMQ
479,MQ 7,81,IBM Websphere MQ,248,ZeroMQ
480,MQ 6,81,IBM Websphere MQ,248,ZeroMQ
481,MQ 9.0,81,IBM Websphere MQ,248,ZeroMQ
482,MQ 5.3,81,IBM Websphere MQ,248,ZeroMQ
483,MQ 7.01,81,IBM Websphere MQ,248,ZeroMQ
484,MQ 7.5,81,IBM Websphere MQ,248,ZeroMQ
485,MQSeries 8.0,81,IBM Websphere MQ,248,ZeroMQ
488,WSMQ 8.0,81,IBM Websphere MQ,248,ZeroMQ
489,MQ 9.0.5,81,IBM Websphere MQ,248,ZeroMQ
491,WTX,83,IBM WebSphere Transformation Extender (WTX),274,Nginx
505,Microsoft Internet Inf,609,IIS|*,130,Oracle ADF
508,Microsoft Internet Informat,609,IIS|*,330,IBM Informix-4GL
550,Microsoft - IIS 6.0 Migration Tool 1,489,IIS|Easy Migration Tool (IEMT),609,IIS|*
558,Infozip 6,85,Info-ZIP,677,Git
559,Infozip,85,Info-ZIP,677,Git
578,IMSVA 9.1,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
580,IMSVA,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
581,Java 1.8,584,Java|*,334,Java|Java Standard Edition (Java SE)
582,Java 7,584,Java|*,334,Java|Java Standard Edition (Java SE)
583,Java on Weblogic server,584,Java|*,600,Oracle WebLogic Server|*
584,Java5,584,Java|*,334,Java|Java Standard Edition (Java SE)
585,Java 6,584,Java|*,334,Java|Java Standard Edition (Java SE)
586,Java 6.0,584,Java|*,334,Java|Java Standard Edition (Java SE)
587,Java 7 Update 25,584,Java|*,334,Java|Java Standard Edition (Java SE)
589,Java (open source),584,Java|*,397,Java|Servlet
590,Java 5,584,Java|*,334,Java|Java Standard Edition (Java SE)
591,Java 1.5,584,Java|*,334,Java|Java Standard Edition (Java SE)
593,Java 1.8.0_92,584,Java|*,334,Java|Java Standard Edition (Java SE)
594,Java 1.6,584,Java|*,334,Java|Java Standard Edition (Java SE)
595,J2EE 6,584,Java|*,333,Java|Java Enterprise Edition (Java EE)
596,Java (J2EE),584,Java|*,333,Java|Java Enterprise Edition (Java EE)
598,JRE,506,Java Runtime Environment (JRE),84,IMS DB
629,JEE,333,Java|Java Enterprise Edition (Java EE),1,(E)JES
639,JSF,391,Java|JavaServer Faces (JSF),334,Java|Java Standard Edition (Java SE)
643,JSP Scriptlets,336,Java|JavaServer Pages (JSP)|Scriptlets,335,Java|JavaServer Pages (JSP)
644,Java Scriplet,336,Java|JavaServer Pages (JSP)|Scriptlets,88,Ingres
645,Core 9.2.0.0,393,Java|JRuby Core,583,C++|*
647,Java RMI,396,Java|Remote Method Invocation (RMI),584,Java|*
650,Java Servlets,397,Java|Servlet,453,Linux|Fedora
651,Java 6 Servlets,397,Java|Servlet,453,Linux|Fedora
652,J2EE Servlets,397,Java|Servlet,443,OS/2
653,Servlets,397,Java|Servlet,420,Cisco IOS
654,Servlets v2.3,397,Java|Servlet,370,Visual Basic
656,Spring BOOT,399,Java|Spring|Spring Boot,398,Java|Spring
657,Springboot,399,Java|Spring|Spring Boot,398,Java|Spring
661,javasript,589,JavaScript|*,335,Java|JavaServer Pages (JSP)
662,JS,589,JavaScript|*,507,Node.js
664,Java Script,589,JavaScript|*,584,Java|*
671,Sencha 4.2.0,409,JavaScript|Ext JS,589,JavaScript|*
674,jqueryui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
675,jquery-ui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
679,Scriptaculous,414,JavaScript|script.aculo.us,582,C#|*
684,EAP,268,JBoss|*,174,SAP ERP
685,JBOSS-EAP,268,JBoss|*,493,JBoss|Wildfly
686,JBoss Application Server 4,268,JBoss|*,493,JBoss|Wildfly
687,JBoss Application Server 7,268,JBoss|*,493,JBoss|Wildfly
688,JBoss Application Server 5,268,JBoss|*,493,JBoss|Wildfly
689,JBoss Application Server,268,JBoss|*,493,JBoss|Wildfly
690,Enterprise Application Platform,268,JBoss|*,670,EAServer
692,JBOSS 5.1.2 EAP,268,JBoss|*,493,JBoss|Wildfly
693,server: Jboss,268,JBoss|*,493,JBoss|Wildfly
694,JBOSS 6.3.2 EAP,268,JBoss|*,493,JBoss|Wildfly
695,JBoss EAP 4.3,268,JBoss|*,493,JBoss|Wildfly
700,Job Information Language,339,Job Information Language (JIL),338,JCL
703,JoinIT by Acayosoft,91,joinIT,4,Adobe Acrobat Reader
704,Acayosoft JoinIT,91,joinIT,4,Adobe Acrobat Reader
705,JoinIT by Acayosoft v 9.0.8,91,joinIT,4,Adobe Acrobat Reader
706,LifeFlow Tool,92,LifeFlow,486,.NET Framework|Windows Workflow Foundation (WF)
707,Linux 2.6.32-696.28.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
709,Linux 2.6.32-696.30.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
710,Linux 2.6.9,576,Linux|*,437,Linux|zLinux
711,Linux 2.6.32-642.3.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
712,Linux - 2.6.18-371.1.2.el5,576,Linux|*,437,Linux|zLinux
713,Linux 2.6.32-696.23.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
749,Gaia Kernel version 2.7,428,Linux|Check Point,432,Linux|Oracle Linux
752,Gaia Kernel version 2.6,428,Linux|Check Point,432,Linux|Oracle Linux
766,OEL,432,Linux|Oracle Linux,449,Unix|HP-UX
778,Oracle Enterprise Server 7.5,432,Linux|Oracle Linux,134,Oracle Database
780,OEL6.7 - 3.8.13-68.3.4.el6uek.x86_64,432,Linux|Oracle Linux,449,Unix|HP-UX
792,VMware Photon,433,Linux|Photon OS,569,VMware Server
793,VMware Photon 1,433,Linux|Photon OS,569,VMware Server
809,Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
818,Redhat - Redhat Linux 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
819,Linux RH6,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
865,Redhat - Redhat Linux 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
870,Redhat - RHEL 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
874,Red Hat Entreprise Linux 6.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
882,Redhat 6 64-Bit,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
893,RED HAT ADVANCED SERVER 5,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
910,redhat6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
912,Redhat - Redhat Linux 6.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
913,Linux RH,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
916,Redhat - Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
920,Linux RH7,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
926,Red Hat V6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
932,Linux RH5,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
934,rehl5.9,434,Linux|Red Hat Enterprise Linux,43,DB2
964,Red Hat 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
979,red hat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
991,Redhat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
996,RedHat 7.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
998,LINUX RED HAT 5 EL,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
1003,SUSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1006,Linux SuSE12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1011,SUSE10,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1012,SUSE Linux 12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1017,SUSELinux Enterprise 11.x,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1023,SUSE Linux 11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1024,SUSE Linux 11 SP3,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1029,Linux SuSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1030,SUSE,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1038,SuseLinux,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
1051,domino8.5,270,Lotus Domino,93,Lotus Notes
1052,Domino 8.x,270,Lotus Domino,93,Lotus Notes
1054,Lucee 5.2.6.60,271,Lucee,375,Apache Lucene
1056,Darwin,438,macOS,117,Mozilla Firefox
1061,Memcache,98,Memcached,18,BMC Control-M
1062,ACCDB,99,Microsoft Access,525,Open Database Connectivity (ODBC)
1070,ConfigMgr,102,Microsoft Endpoint Configuration Manager (SCCM),21,Business Intelligence and Reporting Tools (BIRT)
1080,FIM SQL Development Server,105,Microsoft Forefront Identity Manager (FIM),572,Structured Query Language (SQL)
1082,Microsoft - Internet Explor,107,Microsoft Internet Explorer,356,Rexx
1084,Internet Explor,107,Microsoft Internet Explorer,356,Rexx
1090,SCEP for Linux,110,Microsoft System Center Endpoint Protection,437,Linux|zLinux
1094,SCEP for Mac,110,Microsoft System Center Endpoint Protection,438,macOS
1101,msdeploy,112,Microsoft Web Deploy,56,Greenplum DB
1106,WebPI,114,Microsoft Web Platform Installer,522,Application Web Server
1109,Web PI,114,Microsoft Web Platform Installer,531,Simple Object Access Protocol (SOAP)
1111,MDW Framework,115,Model Driven Workflow (MDW),406,JavaScript|AngularJS
1115,Mango DB,116,MongoDB,43,DB2
1117,MangoDB,116,MongoDB,43,DB2
1125,O365,119,MS Office 365,424,IBM i
1141,MICROSOFT SQL SERVER 2012 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
1153,MICROSOFT SQL SERVER 2012 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
1154,MS SQL Server 2008 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
1156,MICROSOFT SQL SERVER 2008 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
1167,MSSQL Database Server,581,MS SQL Server|*,122,MySQL
1173,MSSQL2008,581,MS SQL Server|*,122,MySQL
1192,Microsoft SQL Server Standard Edition,581,MS SQL Server|*,121,MS SQL Server Compact
1201,SQLServer,581,MS SQL Server|*,572,Structured Query Language (SQL)
1226,MICROSOFT SQL SERVER 2012 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
1233,MICROSOFT SQL SERVER 2005 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
1234,SQLSVR2008,581,MS SQL Server|*,352,PL/SQL
1235,MICROSOFT SQL SERVER 2008 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
1239,MICROSOFT SQL SERVER 2008 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
1244,MS SQL Server 2012 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
1253,Microsoft - SQL Server Express LocalDB 2014,581,MS SQL Server|*,469,MS SQL Server|SQL Server Database Engine
1256,MSSQL,581,MS SQL Server|*,122,MySQL
1280,Microsoft - Microsoft SQL Server Analysis Services 2012 for Microsoft SQL Server 2012 Standard Edition 2012,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
1281,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
1283,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
1290,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
1293,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
1295,SQL Server Integration Services,470,MS SQL Server|SQL Server Integration Services (SSIS),473,MS SQL Server|SQL Server Reporting Services (SSRS)
1316,ZOS Base 1.12,441,MVS|z/OS,437,Linux|zLinux
1335,NAS,272,Netscape Application Server (NAS),443,OS/2
1337,NES,273,Netscape Enterprise Server (NES),443,OS/2
1349,Node.js 0.10 (Linux),507,Node.js,437,Linux|zLinux
1361,Node.js 4 (Linux),507,Node.js,437,Linux|zLinux
1371,Symas OpenLDAP,126,OpenLDAP,178,SAP SQL Anywhere
1377,OAM 12c,129,Oracle Access Management,303,C
1378,ADF 12c,130,Oracle ADF,343,Objective C
1381,OHS,610,Oracle Application Server|*,122,MySQL
1383,Oracle HTTP Server powered by Apache,610,Oracle Application Server|*,259,Apache HTTP Server
1384,j2eeoracleca,610,Oracle Application Server|*,453,Linux|Fedora
1385,Oracle HTTP,610,Oracle Application Server|*,134,Oracle Database
1389,9i AS server,610,Oracle Application Server|*,227,Virtual I/O Server
1391,Oracle Application R12.1.3,610,Oracle Application Server|*,134,Oracle Database
1394,Weblogic BI Publisher,132,Oracle BI Publisher,600,Oracle WebLogic Server|*
1396,OBI,133,Oracle Business Intelligence,343,Objective C
1397,OBIEE,133,Oracle Business Intelligence,343,Objective C
1398,OBI Reporting,133,Oracle Business Intelligence,343,Objective C
1401,Oracle 12.2 Client,134,Oracle Database,610,Oracle Application Server|*
1406,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0,134,Oracle Database,610,Oracle Application Server|*
1407,Oracle 11.2 (Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit) RAC,134,Oracle Database,610,Oracle Application Server|*
1415,Oracle 11 on AIX,134,Oracle Database,445,Unix|AIX
1416,Oracle Database 10g Enterprise Edition Release 10.1.0.4.0 - 64bit,134,Oracle Database,610,Oracle Application Server|*
1431,Oracle Database 10g Release 10.2.0.4.0 - 64bit Production,134,Oracle Database,298,Oracle Exadata
1432,Oarcle 11G,134,Oracle Database,218,TIBCO InConcert
1443,DB - Oracle inbuilt,134,Oracle Database,158,Powerbuilder
1460,Oracle Server,134,Oracle Database,610,Oracle Application Server|*
1475,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 - 64bit Production,134,Oracle Database,610,Oracle Application Server|*
1480,Oracle 12C on linux,134,Oracle Database,303,C
1484,Oracle9i Enterprise Edition Release 9.2.0.5.0,134,Oracle Database,610,Oracle Application Server|*
1486,Oracle 11g on linux,134,Oracle Database,432,Linux|Oracle Linux
1487,Oracle 11gEssbase,134,Oracle Database,298,Oracle Exadata
1490,JServer Release 9.2.0.5.0,474,Oracle Database|Jserver,335,Java|JavaServer Pages (JSP)
1492,Designer 6i,135,Oracle Designer,516,Compopent Object Model (COM)
1493,Enterprise Manager 12.2.1.1,136,Oracle Enterprise Manager,427,Linux|CentOS
1494,Enterprise Manager 12.2.1.2,136,Oracle Enterprise Manager,427,Linux|CentOS
1495,Enterprise Manager 11.1.1.7,136,Oracle Enterprise Manager,427,Linux|CentOS
1501,"Oracle, Nets",140,Oracle Net Services,273,Netscape Enterprise Server (NES)
1502,Oracle RAC,141,Oracle Real Application Clusters (RAC),134,Oracle Database
1504,ORPOS 13.3.3,142,Oracle Retail Point-of-Service,609,IIS|*
1505,ORPOS 13.3.5,142,Oracle Retail Point-of-Service,609,IIS|*
1506,ORPOS 13.3.4,142,Oracle Retail Point-of-Service,609,IIS|*
1509,OSB Servers,143,Oracle Service Bus,443,OS/2
1514,Oracle TT,147,Oracle TimesTen In-Memory Database,134,Oracle Database
1517,OWB 10g,148,Oracle Warehouse Builder (OWB),300,AWK
1524,Clascal,346,Pascal|Object Pascal,307,Cascading Style Sheets (CSS)
1526,Oracle-HR-9.2,151,PeopleSoft,134,Oracle Database
1529,ActiveState Tool Corp. - ActivePerl 5.12,348,Perl|ActivePerl,500,ActiveX
1530,ActiveState Tool Corp. - ActivePerl 5.8,348,Perl|ActivePerl,500,ActiveX
1531,ORAPERL,417,Perl|Oraperl,242,WinRAR
1532,REX,349,Perl|Rex,356,Rexx
1536,TCServer V6,277,Pivotal tc Server,365,TCL
1537,IBM PKWARE PKZip 2,155,PKZIP,387,Java|IBM SDK
1541,PLQSL,352,PL/SQL,351,PL/I
1542,Oracle - SQL,352,PL/SQL,581,MS SQL Server|*
1544,Oracle SQL,352,PL/SQL,134,Oracle Database
1545,PLSQL;,352,PL/SQL,351,PL/I
1547,Oracle PLSQL,352,PL/SQL,351,PL/I
1548,plsql,352,PL/SQL,351,PL/I
1551,Projectplace,156,Planview,21,Business Intelligence and Reporting Tools (BIRT)
1558,Power Builder,158,Powerbuilder,151,PeopleSoft
1560,Power Builder 6.5,158,Powerbuilder,27,Chef Automate
1565,ProjectWise Oracle Server,161,ProjectWise,162,ProjectWise Web Server
1576,RMQ,165,RabbitMQ,355,R
1579,Clearquest,167,Rational ClearQuest,455,Clarify|Clear Basic
1581,Remedy ARS,169,Remedy,322,Fortran
1584,RightFax client 10,171,RightFax,118,MQ Client
1585,SOQL,359,Salesforce Object Query Language (SOQL),621,ArangoDB
1587,SAP Business Objects,173,SAP BusinessObjects BI server,177,SAP NetWeaver Business Warehouse
1588,Business Objects 12,173,SAP BusinessObjects BI server,488,ActiveX|ADO
1590,SAP BI 4.2 Sp5,173,SAP BusinessObjects BI server,174,SAP ERP
1593,SAP HANA ON SUSEOracle 11g on Linux,175,SAP HANA DB,435,Linux|SUSE Linux Enterprise Server
1596,NetWeaver,279,SAP NetWeaver App Server,431,Linux|openSUSE
1605,SCSS,361,Sass,102,Microsoft Endpoint Configuration Manager (SCCM)
1606,Scalla,362,Scala,664,Forte
1609,Microsoft SPS 2010,603,SharePoint|*,577,MVS|*
1613,SQL Server SP2013 Database Server,603,SharePoint|*,581,MS SQL Server|*
1615,Siebel IP 2015,182,Siebel,583,C++|*
1616,Siebel 7.8.2.16,182,Siebel,43,DB2
1617,Siebel CRM,182,Siebel,583,C++|*
1619,Techsmith Corporation - SnagIt 8,184,SnagIt,183,SNA Manager
1620,Solid development server,185,solidDB,600,Oracle WebLogic Server|*
1622,Sixty-Five Software - SpaceMonger 1.4,187,SpaceMonger,296,Intel Xeon Processor
1623,SQLPlus,478,Oracle Database|SQL*Plus,572,Structured Query Language (SQL)
1625,SQLIO 1.0,189,SQLIO,178,SAP SQL Anywhere
1630,SunOne,281,Oracle iPlanet Web Server,448,Unix|BSD|SunOS
1637,SAP - Sybase Central 4.3,479,Sybase SQL Server|Sybase Central,190,Sybase SQL Server
1639,Sysncsort,191,Syncsort,178,SAP SQL Anywhere
1640,syncsort,191,Syncsort,98,Memcached
1641,Sysinternals LLC - AccessEnum 1 1,194,Sysinternal Tools|AccessEnum,124,Nexus Repository OSS
1642,Sysinternals LLC - ClockRes 2,195,Sysinternal Tools|ClockRes,374,Xbase++
1643,Sysinternals LLC - Coreinfo 3.21,196,Sysinternal Tools|Coreinfo,670,EAServer
1644,Sysinternals LLC - DiskExt 1.1,197,Sysinternal Tools|DiskExt,374,Xbase++
1645,Sysinternals LLC - DiskMon 2.01,198,Sysinternal Tools|DiskMon,670,EAServer
1647,Sysinternals LLC - Junction 1.6,200,Sysinternal Tools|Junction,374,Xbase++
1648,Sysinternals LLC - LDMDump 1.02,201,Sysinternal Tools|LDMDump,178,SAP SQL Anywhere
1649,Sysinternals LLC - LoadOrder 1,202,Sysinternal Tools|LoadOrder,374,Xbase++
1650,Sysinternals LLC - PipeList 1.01,203,Sysinternal Tools|PipeList,670,EAServer
1651,Sysinternals LLC - Process Explorer 16.5,204,Sysinternal Tools|Process Explorer,464,Microsoft Exchange Server|Veeam Explorer
1652,Sysinternals LLC - PsKill 1.15,205,Sysinternal Tools|PsKill,151,PeopleSoft
1653,Sysinternals LLC - PsPasswd 1.23,206,Sysinternal Tools|PsPasswd,231,VMware vCenter
1654,Sysinternals LLC - SDelete 1.61,207,Sysinternal Tools|SDelete,670,EAServer
1655,Sysinternals LLC - ShareEnum 1.6,208,Sysinternal Tools|ShareEnum,603,SharePoint|*
1656,Sysinternals LLC - Sync 2.2,209,Sysinternal Tools|Sync,374,Xbase++
1657,Sysinternals LLC - Sysinternals TCPView 3.5,210,Sysinternal Tools|TCPView,365,TCL
1658,Sysinternals LLC - VMMap 3.11,211,Sysinternal Tools|VMMap,176,SAP MaxDB
1659,Sysinternals LLC - Whois 1.11,212,Sysinternal Tools|Whois,178,SAP SQL Anywhere
1664,TERADATA QUERY SCHEDULER SERVER VERSION 15,216,Teradata QS Server,215,Teradata
1667,BusinessWorks,217,TIBCO Business Works (BW),111,Microsoft Visual Studio
1668,Tibco-IM,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
1669,Tibco Integration Manager,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
1674,TSQL,366,Transact-SQL,621,ArangoDB
1675,Trasact SQL,366,Transact-SQL,352,PL/SQL
1746,Solaris 11.2 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
1747,Solaris UNIX,448,Unix|BSD|SunOS,578,Unix|*
1748,Unix Servers (Solaris,448,Unix|BSD|SunOS,578,Unix|*
1749,Oracle Solaris 11.3 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
1753,Solaris 5.10 (Generic_150400-61),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1754,Solaris 5.10 (Generic_150400-62),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1756,Solaris 5.10 (Generic_150400-55),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1760,Oracle Solaris,448,Unix|BSD|SunOS,134,Oracle Database
1762,Solaris 1 (SPARC),448,Unix|BSD|SunOS,375,Apache Lucene
1765,SunSolaris 10.0,448,Unix|BSD|SunOS,430,Linux|Junos OS
1771,Oracle Solaris 10,448,Unix|BSD|SunOS,134,Oracle Database
1800,VIO 2.2.0.10,227,Virtual I/O Server,159,Primavera P6
1801,VIOS,227,Virtual I/O Server,443,OS/2
1802,visibroker,228,Visibroker,420,Cisco IOS
1803,VB6,370,Visual Basic,368,VB.NET
1804,VB 6.0,370,Visual Basic,368,VB.NET
1805,visualbasic,370,Visual Basic,306,C++|Visual C++
1808,Visual Basic 6.0,370,Visual Basic,368,VB.NET
1811,VBA,371,Visual Basic for Applications (VBA),370,Visual Basic
1812,Access VB,371,Visual Basic for Applications (VBA),99,Microsoft Access
1813,vfoxpro,372,Visual FoxPro,117,Mozilla Firefox
1827,VMware Appliance,569,VMware Server,559,Virtual Appliance
1828,VSX,229,VMware Solution Exchange Marketplace (VSX),111,Microsoft Visual Studio
1830,VMware - VMware Tools 10.2,230,VMware Tools,569,VMware Server
1832,VXML,373,VoiceXML,316,eXtensible HyperText Markup Language (XHTML)
1833,Web Focus,232,WebFOCUS,321,FOCUS
1834,FOCEXEC,232,WebFOCUS,495,Oracle WebCenter Content Server|Idoc Script
1836,WLI 8,233,WebLogic Integration,442,OpenVMS
1842,IBM WEBSPHERE APPLICATION SERVER VERSION 6.1.0,284,Websphere Application Server (WAS),285,WebSphere Liberty
1848,"IBM WebSphere Application Server Network Deployment, 8.0.0.5",284,Websphere Application Server (WAS),285,WebSphere Liberty
1850,IBM WebSphere Application Server Network Deployment 7,284,Websphere Application Server (WAS),285,WebSphere Liberty
1858,IBM WebSphere 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1861,IBM - WebSphere Application Server - Base 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1865,Websphere AS (JVM),284,Websphere Application Server (WAS),285,WebSphere Liberty
1872,IBM WebSphere,284,Websphere Application Server (WAS),285,WebSphere Liberty
1875,IBM WebSphere Application Server 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1877,IBM WebSphere Application,284,Websphere Application Server (WAS),285,WebSphere Liberty
1878,WAS 6.x,284,Websphere Application Server (WAS),521,Electronic Data Interchange (EDI)
1880,IBM OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
1882,Open Liberty,285,WebSphere Liberty,397,Java|Servlet
1883,IBM Open Liberty,285,WebSphere Liberty,62,IBM BigFix Platform
1887,WAS Liberty,285,WebSphere Liberty,397,Java|Servlet
1889,OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
1891,WMB 6.1,235,WebSphere Message Broker,486,.NET Framework|Windows Workflow Foundation (WF)
1892,WebSphere Message Broker v6.0,235,WebSphere Message Broker,285,WebSphere Liberty
1899,WebSphere Portal Extend Limited Use 6.1,286,WebSphere Portal Server,285,WebSphere Liberty
1901,Windchill 11.1,237,Windchill,17,Bluebeam|Bluebeam Q
1908,Window,580,Windows|*,637,Microsoft Azure
1914,Windows Terminal Server,239,Windows Terminal Server (WTS),452,Windows|Windows Server
1915,Windows 7 Standard,451,Windows|Windows Desktop,580,Windows|*
1916,WINDOWS 10 SERVER STANDARD EDITION X64,451,Windows|Windows Desktop,452,Windows|Windows Server
1917,Microsoft Windows 7 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1918,Microsoft Windows XP Professional (32-bit),451,Windows|Windows Desktop,580,Windows|*
1919,Windows 7 Professional x64,451,Windows|Windows Desktop,580,Windows|*
1920,Microsoft Microsoft Windows Entreprise,451,Windows|Windows Desktop,580,Windows|*
1921,Microsoft Windows 2000,451,Windows|Windows Desktop,580,Windows|*
1922,Microsoft Windows 10,451,Windows|Windows Desktop,580,Windows|*
1923,MS Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
1924,Microsoft Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
1925,Microsoft Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1926,Microsoft Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1927,Win Desktop,451,Windows|Windows Desktop,560,Webtop
1928,Windows 10 Pro,451,Windows|Windows Desktop,580,Windows|*
1929,Windows 10,451,Windows|Windows Desktop,580,Windows|*
1930,Windows 7 Ultimate,451,Windows|Windows Desktop,580,Windows|*
1931,Microsoft Windows 8 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1932,Microsoft Windows XP,451,Windows|Windows Desktop,580,Windows|*
1933,Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1934,Windows XP,451,Windows|Windows Desktop,580,Windows|*
1935,Windows 10 Professional,451,Windows|Windows Desktop,580,Windows|*
1936,Windows 7,451,Windows|Windows Desktop,580,Windows|*
1937,Microsoft Windows 10 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1938,Win 7,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
1939,windowsxp,451,Windows|Windows Desktop,580,Windows|*
1940,Microsoft Windows Unknown,451,Windows|Windows Desktop,580,Windows|*
1941,Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1942,Windows XP Professional,451,Windows|Windows Desktop,580,Windows|*
1943,Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
1944,Window XP,451,Windows|Windows Desktop,580,Windows|*
1945,Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1946,Microsoft Windows 7 - SOE,451,Windows|Windows Desktop,580,Windows|*
1947,Windows 7 Enterprise Edition,451,Windows|Windows Desktop,452,Windows|Windows Server
1948,Windows 8,451,Windows|Windows Desktop,580,Windows|*
1949,Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
1950,Microsoft Windows 7 (32-bit),451,Windows|Windows Desktop,580,Windows|*
1951,Windows Embedded Standard 7,451,Windows|Windows Desktop,580,Windows|*
1952,Win10,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
1953,Windows 2003,451,Windows|Windows Desktop,580,Windows|*
1955,Windows 2003 Standard,452,Windows|Windows Server,580,Windows|*
1956,Windows 2008 Enterprise R2 x64,452,Windows|Windows Server,580,Windows|*
1960,WINDOWS 2008R2,452,Windows|Windows Server,580,Windows|*
1961,Microsoft Windows Server 2008 Standard Editio,452,Windows|Windows Server,121,MS SQL Server Compact
1962,MICROSOFT WINDOWS NT 2003,452,Windows|Windows Server,580,Windows|*
1967,Microsoft Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
1979,Windows 2008 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
1982,Windows 2003 R2,452,Windows|Windows Server,580,Windows|*
1983,Windows 2008 R2 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
1988,Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
1989,Windows 2012 Standard,452,Windows|Windows Server,580,Windows|*
1992,Windows 2008 R2 Standard 6.1.7601 Service Pack 1,452,Windows|Windows Server,580,Windows|*
1994,Windows 2008 Standard x64,452,Windows|Windows Server,580,Windows|*
1998,Windows 2012 R2 Standard 64-Bit,452,Windows|Windows Server,580,Windows|*
2007,w2k12,452,Windows|Windows Server,582,C#|*
2008,WINDOWS 2013,452,Windows|Windows Server,580,Windows|*
2009,WINDOWS 2016 SE 64 BIT,452,Windows|Windows Server,580,Windows|*
2011,Microsoft - Windows 2012,452,Windows|Windows Server,580,Windows|*
2019,MICROSOFT WINDOWS 2008 TPM,452,Windows|Windows Server,580,Windows|*
2021,MICROSOFT WINDOWS STD 2008,452,Windows|Windows Server,580,Windows|*
2025,Windows 2008 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2028,MICROSOFT WINDOWS STD 2008 TPM,452,Windows|Windows Server,580,Windows|*
2030,Windows 2012 64 Bit,452,Windows|Windows Server,580,Windows|*
2031,MICROSOFT WINDOWS NT 2003 ENT,452,Windows|Windows Server,580,Windows|*
2034,MICROSOFT WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
2036,Windows 2003 Standard5.2.3790,452,Windows|Windows Server,580,Windows|*
2040,Windows 2012 R,452,Windows|Windows Server,580,Windows|*
2044,Windows 2008 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
2045,MICROSOFT WINDOWS 2008 ENT,452,Windows|Windows Server,580,Windows|*
2047,Windows 2012 R2 Standard 6.3.9600,452,Windows|Windows Server,580,Windows|*
2053,Windows 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
2055,Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
2061,Windows 2016 Datacenter10.0.14393,452,Windows|Windows Server,637,Microsoft Azure
2065,windows6.3.9600,452,Windows|Windows Server,580,Windows|*
2066,Windows 2012 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2069,Windows 2008 Enterprise,452,Windows|Windows Server,580,Windows|*
2080,Windows 2008 Standard without Hyper-V6.0.6003,452,Windows|Windows Server,580,Windows|*
2084,Windows 2012 R2 Datacenter,452,Windows|Windows Server,110,Microsoft System Center Endpoint Protection
2089,Windows 2008 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
2096,Windows 2000,452,Windows|Windows Server,580,Windows|*
2097,W2K8R2 Standard 64 BIT,452,Windows|Windows Server,303,C
2099,Windows 2008 Standard6.0.6003,452,Windows|Windows Server,580,Windows|*
2100,Windows2008 R2 Enterprise 64bit,452,Windows|Windows Server,580,Windows|*
2105,Win2008R2,452,Windows|Windows Server,355,R
2107,Windows 2008 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2109,Windows Server 2003 Appliance,452,Windows|Windows Server,559,Virtual Appliance
2111,Windows 2008 ENT R2 (64 bits),452,Windows|Windows Server,355,R
2114,WIN2008R2 6.1.7601,452,Windows|Windows Server,355,R
2116,microsoft windows std 2012 tpm,452,Windows|Windows Server,580,Windows|*
2118,microsoft windows 2008,452,Windows|Windows Server,580,Windows|*
2120,Windows 2008 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
2121,Microsoft Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
2126,Window2008 R2,452,Windows|Windows Server,355,R
2130,Windows 2008 Standard,452,Windows|Windows Server,580,Windows|*
2134,WS03,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
2136,Windows 2008 Enterprise x64,452,Windows|Windows Server,580,Windows|*
2141,Windows 2008 R2 Enterprise,452,Windows|Windows Server,580,Windows|*
2142,Windows Server 2003 Std 32-bit,452,Windows|Windows Server,580,Windows|*
2143,Windows 2008 R2 Standard 64bit,452,Windows|Windows Server,580,Windows|*
2146,Microsoft Windows 2003 R2 Standard,452,Windows|Windows Server,580,Windows|*
2148,MICROSOFT WINDOWS NT 2003 TPM,452,Windows|Windows Server,580,Windows|*
2149,Win Server 2008,452,Windows|Windows Server,569,VMware Server
2150,Windows 2003 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2152,WIN2014,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2156,Win 2012 R2,452,Windows|Windows Server,355,R
2160,Win Server,452,Windows|Windows Server,12,Apache Subversion
2161,Windows 2008 Standard R2 x64,452,Windows|Windows Server,580,Windows|*
2163,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 6196 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
2164,Windows2012,452,Windows|Windows Server,580,Windows|*
2165,Windows 2008 R2 Standard6.1.7601,452,Windows|Windows Server,580,Windows|*
2166,Windows 2016,452,Windows|Windows Server,580,Windows|*
2167,Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
2179,Windows Server 2003 Std 64-bit,452,Windows|Windows Server,580,Windows|*
2180,Windows 2012 R2,452,Windows|Windows Server,580,Windows|*
2181,Wintel,452,Windows|Windows Server,461,IBM Tivoli Storage Manager|TSM Client
2191,Windows 2003 Enterprise5.2.3790,452,Windows|Windows Server,580,Windows|*
2192,WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
2193,Windows 2008 R2 OS,452,Windows|Windows Server,580,Windows|*
2196,Windows 2003 Standard R2,452,Windows|Windows Server,580,Windows|*
2197,Windows 2008 R2 Enterprise6.1.7601,452,Windows|Windows Server,580,Windows|*
2198,Windows 2003 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
2199,WINDOWS SERVER 2003 APPLIANCE 5.2,452,Windows|Windows Server,559,Virtual Appliance
2201,WS08R2,452,Windows|Windows Server,355,R
2204,Windows 2008 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
2213,w2k8r2sp1,452,Windows|Windows Server,355,R
2217,Win 2003,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2222,Windows 2012 R2 Standard,452,Windows|Windows Server,580,Windows|*
2226,Windows 2008 R2 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
2228,Windows 2003 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
2230,Windows 2012 Storage R2,452,Windows|Windows Server,580,Windows|*
2231,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 4096 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
2235,MICROSOFT WINDOWS NT 2003 ENT TPM,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
2237,Windows 2016 Standard10.0.14393,452,Windows|Windows Server,580,Windows|*
2240,MICROSOFT WINDOWS 2003,452,Windows|Windows Server,580,Windows|*
2242,Windows 2012 Standard R2,452,Windows|Windows Server,580,Windows|*
2246,Win Server 2008 R2,452,Windows|Windows Server,355,R
2248,MICROSOFT WINDOWS STD 2012 TPM,452,Windows|Windows Server,580,Windows|*
2249,Windows 2003 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
2250,Windows 2008 Enterprise R2,452,Windows|Windows Server,580,Windows|*
2251,Windows 2008,452,Windows|Windows Server,580,Windows|*
2252,Microsoft Microsoft Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
2257,Win Server 2012,452,Windows|Windows Server,569,VMware Server
2258,Windows 2016 Standard,452,Windows|Windows Server,580,Windows|*
2264,Windows 2008 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
2267,Windows 2003 Standard 5.2.3790 Service Pack 2,452,Windows|Windows Server,580,Windows|*
2268,Windows 2012 Standard6.2.9200,452,Windows|Windows Server,580,Windows|*
2269,MICROSOFT WINDOWS 2016 TPM,452,Windows|Windows Server,580,Windows|*
2272,Windows 2003 Enterprise,452,Windows|Windows Server,580,Windows|*
2275,Windows 2008 R2 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
2277,Windows 2012 R2 Standard6.3.9600,452,Windows|Windows Server,580,Windows|*
2286,Windows 2008 Standard R2,452,Windows|Windows Server,580,Windows|*
2287,MicrosoftWindows Server 2008 R2 (64-bit),452,Windows|Windows Server,443,OS/2
2288,windows6.3,452,Windows|Windows Server,580,Windows|*
2290,Windows 2016 64 Bit,452,Windows|Windows Server,580,Windows|*
2296,Windows 2008 Enterprise6.0.6003,452,Windows|Windows Server,580,Windows|*
2301,Win 2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2302,Win2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2303,Win2012R2,452,Windows|Windows Server,355,R
2305,win2008,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2306,Windows 2003 Standard x64,452,Windows|Windows Server,580,Windows|*
2315,WINDOWS 2016 STANDARD EDITION,452,Windows|Windows Server,580,Windows|*
2325,WinSCP.net - WinSCP 5.11,243,WinSCP,178,SAP SQL Anywhere
2332,Zerto Vritual Appliance,249,Zerto Virtual Replication,559,Virtual Appliance
2333,Oracle RTD,289,Oracle Real-Time Decisions (RTD),134,Oracle Database
2338,OMNIbus,251,Tivoli Netcool/OMNIbus,582,C#|*
2347,ALM,511,Application Lifecycle Management (ALM),421,DART
2349,BMS,513,Batch Management Software (BMS),442,OpenVMS
2354,COM,516,Compopent Object Model (COM),661,COM+
2357,CORBA Interface Definition Language,518,CORBA Interface Definition Language (CORBA IDL),517,Common Object Request Broker Architecture (CORBA)
2359,Data Control Language,519,Data Control Language (DCL),329,IBM i Control Language (CL)
2361,Database,520,Database (DB),43,DB2
2362,DB,520,Database (DB),43,DB2
2365,Electronic Data Interchange,521,Electronic Data Interchange (EDI),104,Microsoft Exchange Server
2369,JDOM,523,Java-based Document Object Model for XML (JDOM),84,IMS DB
2381,Simple Object Access Protocol,531,Simple Object Access Protocol (SOAP),547,Internet Message Access Protocol (IMAP)
2383,SQL,572,Structured Query Language (SQL),581,MS SQL Server|*
2386,DPE,538,Device Provisioning Engines (DPE),661,COM+
2388,ESB,540,Enterprise Service Bus(ESB),370,Visual Basic
2395,MES,553,Manufacturing Execution System (MES),623,Amazon S3
2401,Z/Virtual System Environment,591,z/VSE,441,MVS|z/OS
2403,DOS/VSE,591,z/VSE,597,DOS/360
2404,Microsoft Disk Operating System,593,MS-DOS,443,OS/2
2407,VME/B,595,VME,368,VB.NET
2408,Virtual Machine Environment,595,VME,111,Microsoft Visual Studio
2409,VME 2900,595,VME,107,Microsoft Internet Explorer
2410,OpenVME,595,VME,442,OpenVMS
2411,Disk Operating System/360,597,DOS/360,443,OS/2
2413,Transaction Processing Facility,598,z/TPF,572,Structured Query Language (SQL)
2419,NPL,653,Natural Programming Language,342,Niakwa Programming Language (NPL)
2426,IDMS/DB Data Manipulation Language,668,IDMS DML,312,Data Language Interface (DL/I)
2433,Basic Mapping Supprt,689,BMS Map,21,Business Intelligence and Reporting Tools (BIRT)
2434,DB/400,690,DB400,43,DB2
2435,IBM ISAM,693,ISAM,73,IBM Operational Decision Manager (ODM)
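The dump above carries no header row. For quick sanity checks it can be loaded with pandas; a minimal sketch, assuming the block is saved as "mismatch.csv" and that the six columns are row id, raw mention, gold entity id/name, and predicted entity id/name (the column meanings are my reading of the data, not stated anywhere in the commit):

# %%
import pandas as pd

# Column names are assumptions; the dump itself has no header.
cols = ["row_id", "mention", "gold_id", "gold_name", "pred_id", "pred_name"]
df = pd.read_csv("mismatch.csv", header=None, names=cols)

# Which gold/predicted label pairs are confused most often?
top_confusions = (
    df.groupby(["gold_name", "pred_name"])
      .size()
      .sort_values(ascending=False)
      .head(20)
)
print(top_confusions)
# %%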
@ -32,21 +32,25 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 def generate_acronym(text):
 
     # Define prompt
-    prompt = f"Answer concisely: make a possible acronym from the following: '{text}'"
+    # prompt = f"Imagine you are a diverse database. Given the following: '{text}', please suggest to me 5 possible variations. Give 5."
+    prompt = f"Give me a list of 10 historical product names related to: '{text}'. Format the output in a list, like this 1. Item, 2. Item, 3. ..."
 
     # Generate acronym
     inputs = tokenizer(prompt, return_tensors="pt")
     inputs = inputs.to("cuda")
     outputs = model.generate(
         inputs["input_ids"],
-        max_length=100,
-        no_repeat_ngram_size=3)
+        max_length=200,
+        do_sample=True,
+        top_k=50,
+        temperature=0.8)
+        # no_repeat_ngram_size=3)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # %%
 # Example usage
 # text = "Advanced Data Analytics Platform"
-text = "red hat enterprise linux"
+text = "windows desktop"
 acronym = generate_acronym(text)
-print(f"Acronym: {acronym}")
+print(f"Generation: {acronym}")
 # %%