added more augmentations to finally beat SOTA
- class_bert_augmentation is now the reference training code
This commit is contained in:
parent
e90bc69ea9
commit
5312cfa06f
@@ -0,0 +1,41 @@
# %%
import random
import string


def corrupt_word(word):
    """Corrupt a single word using random corruption techniques."""
    if len(word) <= 1:  # Skip corruption for single-character words
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "delete":
        # Randomly delete a character
        idx = random.randint(0, len(word) - 1)
        word = word[:idx] + word[idx + 1:]

    elif corruption_type == "swap":
        # Swap two adjacent characters
        if len(word) > 1:
            idx = random.randint(0, len(word) - 2)
            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])

    return word


def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word in the string with a given probability."""
    words = sentence.split()
    corrupted_words = [
        corrupt_word(word) if random.random() < corruption_probability else word
        for word in words
    ]
    return " ".join(corrupted_words)


# Example usage
sentence = "This is a simple string for testing"
corrupted_sentence = corrupt_string(sentence, corruption_probability=0.1)
print("Original:", sentence)
print("Corrupted:", corrupted_sentence)

# %%
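A minimal reproducibility sketch (not part of this commit): because corrupt_word and corrupt_string draw from the global random module, seeding it makes an augmentation run repeatable.

# %%
# sketch: seed the global RNG so repeated corruption runs are identical
random.seed(42)
first = corrupt_string("websphere application server", corruption_probability=0.5)
random.seed(42)
second = corrupt_string("websphere application server", corruption_probability=0.5)
assert first == second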
@@ -1,95 +0,0 @@
# %%
import json
import pandas as pd

##########################################
# %%

# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Initialize an empty list to store the rows
rows = []

# %%
# Loop through all entities in the JSON
for entity in data["data"].items():
    entity_data = entity[1]
    entity_id = entity_data['entity_id']
    entity_name = entity_data['entity_name']
    entity_type_id = entity_data['entity_type_id']
    entity_type_name = entity_data['entity_type_name']

    # Add each mention and its entity_id to the rows list
    rows.append(
        {
            'id': entity_id,
            'name': entity_name,
            'type_id': entity_type_id,
            'type_name': entity_type_name
        })

# Create a DataFrame from the rows
df = pd.DataFrame(rows)

# %%
# df.to_csv('entity.csv', index=False)
df

# %%
df['type_name'].value_counts()
# %%
df['type_id'].value_counts()

# %%
name_list = df['name'].to_list()
# %%
name_list

# %%
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import numpy as np

# %%
# Define labels
labels = name_list

# Create a prefix-based distance matrix
def prefix_distance(label1, label2):
    prefix1 = label1.split()
    prefix2 = label2.split()
    # Find common prefix length
    common_prefix_length = len([w1 for w1, w2 in zip(prefix1, prefix2) if w1 == w2])
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)

# Create a pairwise distance matrix
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        distance_matrix[i, j] = prefix_distance(labels[i], labels[j])

# Perform hierarchical clustering
linkage_matrix = linkage(distance_matrix, method='average')

# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()

# %%
linkage_matrix
# %%
# Extract flat clusters with a distance threshold
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# Display clusters
for i, cluster_id in enumerate(clusters):
    print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")

# %%
@@ -3,53 +3,55 @@ import pandas as pd

# %%
# import training file
data_path = '../data_import/train.csv'
data_path = '../esAppMod_data_import/train.csv'
# data_path = '../esAppMod_data_import/parent_train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)


# import test file
data_path = '../data_import/test.csv'
data_path = '../esAppMod_data_import/test.csv'
# data_path = '../esAppMod_data_import/parent_test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

# import entity file
data_path = '../data_import/entity.csv'
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {}
for _, row in entity_df.iterrows():
    id2label[row['id']] = row['name']

# %%
train_df.sort_values(by=['entity_id']).to_markdown('out.md')

# %%
data_path = '../train/class_bert_process/prediction/exports/result.csv'
data_path = '../train/class_bert_augmentation/prediction/exports/result.csv'
prediction_df = pd.read_csv(data_path)

# %%
predicted_entity_list = []
for element in prediction_df['class_prediction']:
    predicted_entity_list.append(id2label[element])

prediction_df['predicted_name'] = predicted_entity_list
# %%
new_df = pd.concat((test_df, prediction_df), axis=1)

# %%
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
mismatch_df = new_df[mismatch_mask]

# %%
len(mismatch_df)

# %%
# print the top 10 offending classes
# mask1 = mismatch_df['entity_id'] != 434
# mask2 = mismatch_df['entity_id'] != 451
# mask3 = mismatch_df['entity_id'] != 452
# mask = mask1 & mask2 & mask3
# masked_df = mismatch_df[mask]
# print(masked_df['entity_id'].value_counts()[:10])
print(mismatch_df['entity_id'].value_counts()[:10])
masked_df = mismatch_df


# %%
# Convert the whole dataframe to a string and display
# print the mismatch_df
print(mismatch_df.sort_values(by=['entity_id']).to_markdown())
print(masked_df.sort_values(by=['entity_id']).to_markdown())

# %%
mismatch_df.to_csv('error.csv')
@@ -62,14 +64,9 @@ mismatch_df[select_mask]

# %%
# let us see the train mentions
select_value = 452
select_value = 130
select_mask = train_df['entity_id'] == select_value
train_df[select_mask]



# %%
mismatch_df[select_mask]['class_prediction'].to_list()

# %%
# %%
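A quick sanity-check sketch (not part of this commit): the mismatch count above implies an accuracy figure that should agree with the numbers in the result logs.

# %%
# sketch: accuracy implied by the mismatch count
accuracy = 1 - len(mismatch_df) / len(new_df)
print(f"implied accuracy: {accuracy:.5f}")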
@@ -0,0 +1,62 @@
# %%
import pandas as pd
import re

# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)


# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

# import entity file
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {}
for _, row in entity_df.iterrows():
    id2label[row['id']] = row['name']


# %%
train_df
# %%

def extract_acronym_mapping(names):
    mapping = {}
    for name in names:
        # Find acronym in parentheses
        match = re.search(r"\((\w+)\)", name)
        if match:
            acronym = match.group(1)

            # Strip the parenthesized acronym to recover the core term
            core_term = re.sub(r"^([\w\s]+)\s*\(\w+\)$", r"\1", name).strip()

            # Add to dictionary
            mapping[acronym] = core_term
    return mapping

names = set(train_df['entity_name'].to_list())

# Extract mappings
acronym_mapping = extract_acronym_mapping(names)
print(acronym_mapping)
# %%
del acronym_mapping['E']  # too many false matches
acronym_mapping = {key.lower(): value.lower() for key, value in acronym_mapping.items()}

abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}


# %%
abbrev_to_term
# %%
term_to_abbrev

# %%
acronym_mapping
# %%
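A hypothetical usage sketch (not in this commit) of the regex maps built above. Note that short acronyms such as 'db' match common substrings of English text, the same false-positive problem that forced the `del acronym_mapping['E']` line.

# %%
# sketch: expand abbreviations in a mention using the abbrev_to_term map
def expand_abbreviations(text):
    for pattern, replacement in abbrev_to_term.items():
        text = re.sub(pattern, replacement, text)
    return text

print(expand_abbreviations('html and sql'))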
@ -0,0 +1,5 @@
|
|||
out.md
|
||||
parent_test.csv
|
||||
parent_train.csv
|
||||
test_seq.csv
|
||||
train_seq.csv
|
|
@@ -0,0 +1,124 @@
# %%
import json
import pandas as pd

##########################################
# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# import entity file
# Keep only one row per unique value in 'entity_name'
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
id2label = {}
for _, row in unique_df.iterrows():
    id2label[row['entity_id']] = row['entity_name']

inverse_dict = {value: key for key, value in id2label.items()}
# %%
# Create a new dictionary with sorted keys
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}

# %%
sorted_dict

# %%
rule_set = {
    '.NET': [497,482,484,487,485,486,483],
    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
    'C++': [583,306],
    'CA': [290,22,23,24,25],
    'CSS': [307,377],
    'Cisco': [28,420,29],
    'Citrix': [563,565,31,292,291,564,32,30],
    'coldfusion': [311,37],
    'eclipse': [46,622,641,456],
    'xml': [596, 318],
    'xsl': [319,320],
    'HP': [59,293,60,61,58],
    'http': [505,543],
    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
    'IBM BigFix': [62,457],
    'IBM ILOG': [253,255,254,256,252],
    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
    'IBM WebSphere': [80,82,83,81],
    'IBM i': [424,329],
    'IDMS': [667,668],
    'IIS': [609,490,489,491],
    'JBoss': [268,492,493],
    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
    'KVS': [549,550,551],
    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
    'MVS': [577,440,441],
    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
    'Oracle WebLogic': [600,233],
    'Oracle Application Server': [610,494],
    'Oracle Database': [134,474,475,478],
    'Oracle Hyperion': [607,138,139],
    'Oracle WebCenter': [276,495],
    'Pascal': [599,346],
    'Perl': [585,348,417,349],
    'ProjectWise': [161,162],
    'Rational': [166,167],
    'SAP': [173,175,695,176,676,178,179],
    'SAP ERP': [174,476,477],
    'SAP NetWeaver': [279,496,177],
    'Sybase SQL Server': [190,479,480],
    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
    'TIBCO': [218,219],
    'TIBCO Business Works': [217,481],
    'Tivoli': [220,251],
    'Tortoise': [221,222],
    'Unix': [578,445,579,447,602,590,448,449],
    'VB': [368,369],
    'VMware': [568,569,229,230,231],
    'Visual Basic': [370,371,372],
    'WebSphere': [234,285,235,286,284,601,287],
    'Windows': [580,238,239,451,452],
    'z': [598,608,591]
}

# %%
# iterate through the whole training set
new_df = train_df.copy()
for idx, row in train_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            new_df.loc[idx, 'entity_id'] = rule_set[key][0]
# %%
len(set(new_df['entity_id'].to_list()))

# %%
new_df.to_csv('parent_train.csv')

# %%
# now do the same for the test data
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

new_df = test_df.copy()
for idx, row in test_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            new_df.loc[idx, 'entity_id'] = rule_set[key][0]

# %%
new_df

# %%
new_df.to_csv('parent_test.csv')
# %%
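An optional refactor sketch (assumption, not part of this commit): inverting the rule set once gives a constant-time lookup per row instead of scanning every rule list for every row.

# %%
# sketch: invert rule_set into a child-id -> parent-id map, then vectorize
id_to_parent = {child: ids[0] for ids in rule_set.values() for child in ids}
new_df = train_df.copy()
new_df['entity_id'] = train_df['entity_id'].map(lambda i: id_to_parent.get(i, i))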
@@ -0,0 +1,129 @@
# %%
import json
import pandas as pd

##########################################
# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# import entity file
# Keep only one row per unique value in 'entity_name'
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
id2label = {}
for _, row in unique_df.iterrows():
    id2label[row['entity_id']] = row['entity_name']

inverse_dict = {value: key for key, value in id2label.items()}
# %%
# Create a new dictionary with sorted keys
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}

# %%
sorted_dict

# %%
rule_set = {
    '.NET': [497,482,484,487,485,486,483],
    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
    'C++': [583,306],
    'CA': [290,22,23,24,25],
    'CSS': [307,377],
    'Cisco': [28,420,29],
    'Citrix': [563,565,31,292,291,564,32,30],
    'coldfusion': [311,37],
    'eclipse': [46,622,641,456],
    'xml': [596, 318],
    'xsl': [319,320],
    'HP': [59,293,60,61,58],
    'http': [505,543],
    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
    'IBM BigFix': [62,457],
    'IBM ILOG': [253,255,254,256,252],
    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
    'IBM WebSphere': [80,82,83,81],
    'IBM i': [424,329],
    'IDMS': [667,668],
    'IIS': [609,490,489,491],
    'JBoss': [268,492,493],
    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
    'KVS': [549,550,551],
    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
    'MVS': [577,440,441],
    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
    'Oracle WebLogic': [600,233],
    'Oracle Application Server': [610,494],
    'Oracle Database': [134,474,475,478],
    'Oracle Hyperion': [607,138,139],
    'Oracle WebCenter': [276,495],
    'Pascal': [599,346],
    'Perl': [585,348,417,349],
    'ProjectWise': [161,162],
    'Rational': [166,167],
    'SAP': [173,175,695,176,676,178,179],
    'SAP ERP': [174,476,477],
    'SAP NetWeaver': [279,496,177],
    'Sybase SQL Server': [190,479,480],
    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
    'TIBCO': [218,219],
    'TIBCO Business Works': [217,481],
    'Tivoli': [220,251],
    'Tortoise': [221,222],
    'Unix': [578,445,579,447,602,590,448,449],
    'VB': [368,369],
    'VMware': [568,569,229,230,231],
    'Visual Basic': [370,371,372],
    'WebSphere': [234,285,235,286,284,601,287],
    'Windows': [580,238,239,451,452],
    'z': [598,608,591]
}

# %%
# iterate through the whole training set
new_df = train_df.copy()
for idx, row in train_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            stem = rule_set[key][0]
            leaf = rule_set[key].index(id)
            new_df.loc[idx, 'entity_seq'] = f"{stem}_{leaf}"
# %%
len(set(new_df['entity_seq'].to_list()))

# %%
new_df.to_csv('train_seq.csv')

# %%
# now do the same for the test data
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

new_df = test_df.copy()
for idx, row in test_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            stem = rule_set[key][0]
            leaf = rule_set[key].index(id)
            new_df.loc[idx, 'entity_seq'] = f"{stem}_{leaf}"


# %%
new_df

# %%
new_df.to_csv('test_seq.csv')
# %%
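The same inversion idea applies to the sequence labels (sketch, not in the commit): precompute a child-id -> "stem_leaf" map once instead of the nested loop.

# %%
# sketch: child-id -> "stem_leaf" map equivalent to the loop above
id_to_seq = {
    child: f"{ids[0]}_{pos}"
    for ids in rule_set.values()
    for pos, child in enumerate(ids)
}
new_df['entity_seq'] = test_df['entity_id'].map(id_to_seq)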
@@ -1,6 +1,6 @@

*******************************************************************************
Accuracy: 0.77655
F1 Score: 0.79605
Precision: 0.85637
Recall: 0.77655
Accuracy: 0.80197
F1 Score: 0.81948
Precision: 0.88067
Recall: 0.80197
@@ -32,6 +32,8 @@ torch.set_float32_matmul_precision('high')
BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
@@ -52,19 +54,8 @@ def preprocess_text(text):
    # 1. Make all lowercase
    text = text.lower()

    # Remove any non alphanumeric character
    # text = re.sub(r'[^\w\s]', ' ', text)  # Retains only alphanumeric and spaces
    text = re.sub(r"[-;:]", " ", text)

    # Add space between digit followed by a letter
    text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)

    # Add space between letter followed by a digit
    text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)


    # Substitute digits with 'x'
    text = re.sub(r'\d+', 'x', text)
    # text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()
@@ -0,0 +1,562 @@
# %%

# from datasets import load_from_disk
import os

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import random

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict


torch.set_float32_matmul_precision('high')

# %%
def set_seed(seed):
    """
    Set the random seed for reproducibility.
    """
    random.seed(seed)  # Python random module
    np.random.seed(seed)  # NumPy random
    torch.manual_seed(seed)  # PyTorch CPU
    torch.cuda.manual_seed(seed)  # PyTorch GPU
    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Disable optimization for reproducibility

set_seed(42)

SHUFFLES=10

# %%

# import training file
data_path = '../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx

# %%
# introduce pre-processing functions
def preprocess_text(text):

    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#' (disabled)
    # text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def generate_random_shuffles(text, n):
    """
    Generate n strings with randomly shuffled words from the input text.

    Args:
        text (str): The input text.
        n (int): The number of random variations to generate.

    Returns:
        list: A list of strings with shuffled words.
    """
    words = text.split()  # Split the input into words
    shuffled_variations = []

    for _ in range(n):
        shuffled = words[:]  # Copy the word list to avoid in-place modification
        random.shuffle(shuffled)  # Randomly shuffle the words
        shuffled_variations.append(" ".join(shuffled))  # Join the words back into a string

    return shuffled_variations


# generate n more shuffled examples
def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Preprocess a string and add n random shuffles of it.

    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate for the string.

    Returns:
        list: A list containing the original string and its shuffled variants.
    """
    all_processed = []
    # add the original text
    all_processed.append(text)

    # Generate random shuffles
    shuffled_variations = generate_random_shuffles(text, n_shuffles)
    all_processed.extend(shuffled_variations)

    return all_processed

acronym_mapping = {
    'hpsa': 'hp server automation',
    'tam': 'tivoli access manager',
    'adf': 'application development facility',
    'html': 'hypertext markup language',
    'wff': 'microsoft web farm framework',
    'jsp': 'javaserver pages',
    'bw': 'business works',
    'ssrs': 'sql server reporting services',
    'cl': 'control language',
    'vba': 'visual basic for applications',
    'esapi': 'enterprise security api',
    'gwt': 'google web toolkit',
    'pki': 'perkin elmer informatics',
    'rtd': 'oracle realtime decisions',
    'jms': 'java message service',
    'db': 'database',
    'soa': 'service oriented architecture',
    'xsl': 'extensible stylesheet language',
    'com': 'component object model',
    'ldap': 'lightweight directory access protocol',
    'odm': 'ibm operational decision manager',
    'soql': 'salesforce object query language',
    'oms': 'order management system',
    'cfml': 'coldfusion markup language',
    'nas': 'netscape application server',
    'sql': 'structured query language',
    'bde': 'borland database engine',
    'imap': 'internet message access protocol',
    'uws': 'ultidev web server',
    'birt': 'business intelligence and reporting tools',
    'mdw': 'model driven workflow',
    'tws': 'tivoli workload scheduler',
    'jre': 'java runtime environment',
    'wcs': 'websphere commerce suite',
    'was': 'websphere application server',
    'ssis': 'sql server integration services',
    'xhtml': 'extensible hypertext markup language',
    'soap': 'simple object access protocol',
    'san': 'storage area network',
    'elk': 'elastic stack',
    'arr': 'application request routing',
    'xlst': 'extensible stylesheet language transformations',
    'sccm': 'microsoft endpoint configuration manager',
    'ejb': 'enterprise java beans',
    'css': 'cascading style sheets',
    'hpoo': 'hp operations orchestration',
    'xml': 'extensible markup language',
    'esb': 'enterprise service bus',
    'edi': 'electronic data interchange',
    'imsva': 'interscan messaging security virtual appliance',
    'wtx': 'ibm websphere transformation extender',
    'cgi': 'common gateway interface',
    'bal': 'ibm basic assembly language',
    'issow': 'integrated safe system of work',
    'dcl': 'data control language',
    'jdom': 'java document object model',
    'fim': 'microsoft forefront identity manager',
    'npl': 'niakwa programming language',
    'wf': 'windows workflow foundation',
    'lm': 'etap license manager',
    'wts': 'windows terminal server',
    'asp': 'active server pages',
    'jil': 'job information language',
    'mvc': 'model view controller',
    'rmi': 'remote method invocation',
    'ad': 'active directory',
    'owb': 'oracle warehouse builder',
    'rest': 'representational state transfer',
    'jdk': 'java development kit',
    'ids': 'integrated data store',
    'bms': 'batch management software',
    'vsx': 'vmware solution exchange',
    'ssas': 'sql server analysis services',
    'atl': 'atlas transformation language',
    'ice': 'infobright community edition',
    'esql': 'extended structured query language',
    'corba': 'common object request broker architecture',
    'dpe': 'device provisioning engines',
    'rac': 'oracle real application clusters',
    'iemt': 'iis easy migration tool',
    'mes': 'manufacturing execution system',
    'odbc': 'open database connectivity',
    'lms': 'lan management solution',
    'wcf': 'windows communication foundation',
    'nes': 'netscape enterprise server',
    'jsf': 'javaserver faces',
    'alm': 'application lifecycle management',
    'hlasm': 'high level assembler',
    'cmod': 'content manager ondemand'}

external_source = {
    'vb.net': 'visual basic dot net',
    'jes': 'job entry subsystem',
    'svn': 'subversion',
    'vcs': 'version control system',
    'lims': 'laboratory information management system',
    'ide': 'integrated development environment',
    'sdk': 'software development kit',
    'mq': 'message queue',
    'ims': 'information management system',
    'isa': 'internet security and acceleration',
    'vs': 'visual studio',
    'esr': 'extended support release',
    'ff': 'firefox',
    'vb': 'visual basic',
    'rhel': 'red hat enterprise linux',
    'iis': 'internet information server',
    'api': 'application programming interface',
    'se': 'standard edition',
    r'\.net': 'dot net',
    'c#': 'c sharp'
}


# synonyms = {
#     'windows server': 'windows nt',
#     'windows 7': 'windows desktop',
#     'windows 8': 'windows desktop',
#     'windows 10': 'windows desktop'
# }


# add more information
acronym_mapping.update(external_source)


# use raw strings so \b is a regex word boundary, not a backspace character
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}

def replace_terms_with_abbreviations(text):
    for input, replacement in term_to_abbrev.items():
        text = re.sub(input, replacement, text)
    return text

def replace_abbreviations_with_terms(text):
    for input, replacement in abbrev_to_term.items():
        text = re.sub(input, replacement, text)
    return text

######################################

# augmentation by text corruption

def corrupt_word(word):
    """Corrupt a single word using random corruption techniques."""
    if len(word) <= 1:  # Skip corruption for single-character words
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "delete":
        # Randomly delete a character
        idx = random.randint(0, len(word) - 1)
        word = word[:idx] + word[idx + 1:]

    elif corruption_type == "swap":
        # Swap two adjacent characters
        if len(word) > 1:
            idx = random.randint(0, len(word) - 2)
            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])

    return word

def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word in the string with a given probability."""
    words = sentence.split()
    corrupted_words = [
        corrupt_word(word) if random.random() < corruption_probability else word
        for word in words
    ]
    return " ".join(corrupted_words)




# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
label_flag_list = []

def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        # produce shuffling
        index = row['entity_id']
        parent_desc = row['mention']
        parent_desc = preprocess_text(parent_desc)

        # Split the string into words
        words = parent_desc.split()

        # Count the number of words
        word_count = len(words)

        # short sequences are rare, and we must compensate by including more examples
        # mutation of other longer sequences might drown out rare short sequences
        if word_count < 3:
            for _ in range(10):
                element = {
                    'text': parent_desc,
                    'label': label2id[index],
                }
                output_list.append(element)


        # check if label is in label_flag_list
        if index not in label_flag_list:

            entity_name = row['entity_name']
            # add the "entity_name" label as a mention
            element = {
                'text': entity_name,
                'label': label2id[index],
            }
            output_list.append(element)

            # remove all non-alphanumerics
            desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces
            if (desc != parent_desc):
                element = {
                    'text': desc,
                    'label': label2id[index],  # ensure labels start from 0
                }
                output_list.append(element)


            # add shuffles of the original entity name
            no_of_shuffles = SHUFFLES
            processed_descs = shuffle_text(entity_name, n_shuffles=no_of_shuffles)
            for desc in processed_descs:
                if (desc != parent_desc):
                    element = {
                        'text': desc,
                        'label': label2id[index],  # ensure labels start from 0
                    }
                    output_list.append(element)

            label_flag_list.append(index)



        # add shuffled strings
        processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
        for desc in processed_descs:
            if (desc != parent_desc):
                element = {
                    'text': desc,
                    'label': label2id[index],  # ensure labels start from 0
                }
                output_list.append(element)

        # corrupt string
        desc = corrupt_string(parent_desc, corruption_probability=0.1)
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)


        # augmentation
        # remove all non-alphanumerics
        desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)


        # # augmentation
        # # perform abbrev_to_term
        # temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces
        # desc = replace_terms_with_abbreviations(temp_desc)
        # if (desc != temp_desc):
        #     element = {
        #         'text': desc,
        #         'label': label2id[index],  # ensure labels start from 0
        #     }
        #     output_list.append(element)

        # augmentation
        # expand abbreviations into their full terms
        desc = replace_abbreviations_with_terms(parent_desc)
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)


    return output_list


def create_dataset():
    # train
    data_path = '../../esAppMod_data_import/train.csv'
    train_df = pd.read_csv(data_path, skipinitialspace=True)


    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
    })
    return combined_data


# %%

def train():

    save_path = 'checkpoint'
    split_datasets = create_dataset()

    # prepare tokenizer

    model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<DESC>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")


    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # create id2label and label2id


    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)


    # %%
    # Trainer

    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )


    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
train()


# %%
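A rough volume check (sketch, not part of the commit): process_df_to_dict multiplies each mention into shuffles, corruptions, and abbreviation expansions, so the blow-up factor is worth knowing before training. The function mutates the module-level label_flag_list, so reset that list before measuring.

# %%
# sketch: measure augmentation blow-up (reset the flag list first)
label_flag_list.clear()
examples = process_df_to_dict(train_df)
print(f"{len(train_df)} mentions -> {len(examples)} training examples")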
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,11 @@

*******************************************************************************
Accuracy: 0.71956
F1 Score: 0.74142
Precision: 0.81529
Recall: 0.71956
********************************************************************************
Accuracy: 0.71710
F1 Score: 0.74095
Precision: 0.82181
Recall: 0.71710
@@ -0,0 +1,6 @@

*******************************************************************************
Accuracy: 0.81591
F1 Score: 0.82162
Precision: 0.85519
Recall: 0.81591
@@ -0,0 +1,6 @@

*******************************************************************************
Accuracy: 0.59943
F1 Score: 0.60266
Precision: 0.66956
Recall: 0.59943
@@ -0,0 +1,265 @@
# %%

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from tqdm import tqdm

torch.set_float32_matmul_precision('high')


BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx


# introduce pre-processing functions
def preprocess_text(text):
    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#'
    text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text




# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        desc = preprocess_text(desc)
        index = row['entity_id']
        element = {
            'text': desc,
            'label': label2id[index],  # ensure labels start from 0
        }
        output_list.append(element)

    return output_list


def create_dataset():
    # test
    # data_path = '../../../esAppMod_data_import/test.csv'
    data_path = '../../../esAppMod_data_import/parent_test.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)


    # combined_data = DatasetDict({
    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
    # })
    return Dataset.from_list(process_df_to_dict(test_df))



# %%

def test():

    test_dataset = create_dataset()

    # prepare tokenizer

    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint_part1-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%

    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )


    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []


    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])


        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]


    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_1.txt", "a") as f:

        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result
    label_list = [id2label[id] for id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # save the prediction output here
    df.to_csv("exports/result_1.csv", index=False)



# %%
# reset file before writing to it
with open("output_1.txt", "w") as f:
    print('', file=f)

test()
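An optional metrics sketch (assumption, not in this commit): sklearn's classification_report bundles the same weighted averages with a per-class breakdown, which pairs well with the per-class error analysis notebook above.

# %%
# sketch: per-class breakdown alongside the weighted averages
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, zero_division=0))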
@@ -0,0 +1,265 @@
# %%

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from tqdm import tqdm

torch.set_float32_matmul_precision('high')


BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx


# introduce pre-processing functions
def preprocess_text(text):
    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#'
    text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text




# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        desc = preprocess_text(desc)
        index = row['entity_id']
        element = {
            'text': desc,
            'label': label2id[index],  # ensure labels start from 0
        }
        output_list.append(element)

    return output_list


def create_dataset():
    # test
    # data_path = '../../../esAppMod_data_import/test.csv'
    data_path = '../../../esAppMod_data_import/test.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)


    # combined_data = DatasetDict({
    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
    # })
    return Dataset.from_list(process_df_to_dict(test_df))



# %%

def test():

    test_dataset = create_dataset()

    # prepare tokenizer

    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%

    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )


    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []


    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])


        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]


    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_2.txt", "a") as f:

        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result
    label_list = [id2label[id] for id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # save the prediction output here
    df.to_csv("exports/result_2.csv", index=False)



# %%
# reset file before writing to it
with open("output_2.txt", "w") as f:
    print('', file=f)

test()
@@ -45,7 +45,7 @@ def set_seed(seed):

set_seed(42)

SHUFFLES=2
SHUFFLES=5

# %%

@@ -56,37 +56,6 @@ train_df = pd.read_csv(data_path, skipinitialspace=True)
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))

def compute_normalized_class_weights(class_counts, max_resamples=SHUFFLES):
    """
    Compute normalized class weights inversely proportional to class counts.
    The weights are normalized so that they sum to 1.

    Args:
        class_counts (array-like): An array or list where each element represents the count of samples for a class.

    Returns:
        numpy.ndarray: A normalized array of weights for each class.
    """
    class_counts = np.array(class_counts)
    total_samples = np.sum(class_counts)
    class_weights = total_samples / class_counts
    # so that highest weight is 1
    normalized_weights = class_weights / np.max(class_weights)
    # Scale weights such that the highest weight corresponds to `max_resamples`
    resample_counts = normalized_weights * max_resamples
    # Round resamples to nearest integer
    resample_counts = np.round(resample_counts).astype(int)
    return resample_counts

# %%
id_counts = train_df['entity_id'].value_counts()
id_weights = compute_normalized_class_weights(id_counts, max_resamples=SHUFFLES)
id_index = id_counts.index
label2weight = {}
for idx, label in enumerate(id_index):
    label2weight[label] = id_weights[idx]


# %%
id2label = {}
label2id = {}
@ -101,20 +70,8 @@ def preprocess_text(text):
|
|||
# 1. Make all uppercase
|
||||
text = text.lower()
|
||||
|
||||
# Remove any non alphanumeric character
|
||||
# text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces
|
||||
# replace dashes
|
||||
text = re.sub(r"[-;:]", " ", text)
|
||||
|
||||
# Add space between digit followed by a letter
|
||||
text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
|
||||
|
||||
# Add space between letter followed by a digit
|
||||
text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
|
||||
|
||||
|
||||
# Substitute digits with 'x'
|
||||
text = re.sub(r'\d+', 'x', text)
|
||||
text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
@ -165,35 +122,143 @@ def shuffle_text(text, n_shuffles=SHUFFLES):
|
|||
|
||||
return all_processed
|
||||
|
||||
term_to_abbrev = {
|
||||
r'job entry system': 'jes',
|
||||
r'subversion': 'svn',
|
||||
r'borland database engine': 'bde',
|
||||
r'business intelligence and reporting tools': 'birt',
|
||||
r'lan management solution': 'lms',
|
||||
r'laboratory information management system': 'lims',
|
||||
r'ibm database 2': 'db/2',
|
||||
r'integrated development environment': 'ide',
|
||||
r'software development kit': 'sdk',
|
||||
r'hp operations orchestration': 'hpoo',
|
||||
r'hp server automation': 'hpsa',
|
||||
r'internet information server': 'iis',
|
||||
r'release 2': 'r2',
|
||||
r'red hat enterprise linux': 'rhel',
|
||||
r'oracle enterprise linux': 'oel',
|
||||
r'websphere application server': 'was',
|
||||
r'application development facility': 'adf',
|
||||
r'server analysis services': 'ssas'
|
||||
acronym_mapping = {
|
||||
'hpsa': 'hp server automation',
|
||||
'tam': 'tivoli access manager',
|
||||
'adf': 'application development facility',
|
||||
'html': 'hypertext markup language',
|
||||
'wff': 'microsoft web farm framework',
|
||||
'jsp': 'javaserver pages',
|
||||
'bw': 'business works',
|
||||
'ssrs': 'sql server reporting services',
|
||||
'cl': 'control language',
|
||||
'vba': 'visual basic for applications',
|
||||
'esapi': 'enterprise security api',
|
||||
'gwt': 'google web toolkit',
|
||||
'pki': 'perkin elmer informatics',
|
||||
'rtd': 'oracle realtime decisions',
|
||||
'jms': 'java message service',
|
||||
'db': 'database',
|
||||
'soa': 'service oriented architecture',
|
||||
'xsl': 'extensible stylesheet language',
|
||||
'com': 'compopent object model',
|
||||
'ldap': 'lightweight directory access protocol',
|
||||
'odm': 'ibm operational decision manager',
|
||||
'soql': 'salesforce object query language',
|
||||
'oms': 'order management system',
|
||||
'cfml': 'coldfusion markup language',
|
||||
'nas': 'netscape application server',
|
||||
'sql': 'structured query language',
|
||||
'bde': 'borland database engine',
|
||||
'imap': 'internet message access protocol',
|
||||
'uws': 'ultidev web server',
|
||||
'birt': 'business intelligence and reporting tools',
|
||||
'mdw': 'model driven workflow',
|
||||
'tws': 'tivoli workload scheduler',
|
||||
'jre': 'java runtime environment',
|
||||
'wcs': 'websphere commerce suite',
|
||||
'was': 'websphere application server',
|
||||
'ssis': 'sql server integration services',
|
||||
'xhtml': 'extensible hypertext markup language',
|
||||
'soap': 'simple object access protocol',
|
||||
'san': 'storage area network',
|
||||
'elk': 'elastic stack',
|
||||
'arr': 'application request routing',
|
||||
'xlst': 'extensible stylesheet language transformations',
|
||||
'sccm': 'microsoft endpoint configuration manager',
|
||||
'ejb': 'enterprise java beans',
|
||||
'css': 'cascading style sheets',
|
||||
'hpoo': 'hp operations orchestration',
|
||||
'xml': 'extensible markup language',
|
||||
'esb': 'enterprise service bus',
|
||||
'edi': 'electronic data interchange',
|
||||
'imsva': 'interscan messaging security virtual appliance',
|
||||
'wtx': 'ibm websphere transformation extender',
|
||||
'cgi': 'common gateway interface',
|
||||
'bal': 'ibm basic assembly language',
|
||||
'issow': 'integrated safe system of work',
|
||||
'dcl': 'data control language',
|
||||
'jdom': 'java document object model',
|
||||
'fim': 'microsoft forefront identity manager',
|
||||
'npl': 'niakwa programming language',
|
||||
'wf': 'windows workflow foundation',
|
||||
'lm': 'etap license manager',
|
||||
'wts': 'windows terminal server',
|
||||
'asp': 'active server pages',
|
||||
'jil': 'job information language',
|
||||
'mvc': 'model view controller',
|
||||
'rmi': 'remote method invocation',
|
||||
'ad': 'active directory',
|
||||
'owb': 'oracle warehouse builder',
|
||||
'rest': 'representational state transfer',
|
||||
'jdk': 'java development kit',
|
||||
'ids': 'integrated data store',
|
||||
'bms': 'batch management software',
|
||||
'vsx': 'vmware solution exchange',
|
||||
'ssas': 'sql server analysis services',
|
||||
'atl': 'atlas transformation language',
|
||||
'ice': 'infobright community edition',
|
||||
'esql': 'extended structured query language',
|
||||
'corba': 'common object request broker architecture',
|
||||
'dpe': 'device provisioning engines',
|
||||
'rac': 'oracle real application clusters',
|
||||
'iemt': 'iis easy migration tool',
|
||||
'mes': 'manufacturing execution system',
|
||||
'odbc': 'open database connectivity',
|
||||
'lms': 'lan management solution',
|
||||
'wcf': 'windows communication foundation',
|
||||
'nes': 'netscape enterprise server',
|
||||
'jsf': 'javaserver faces',
|
||||
'alm': 'application lifecycle management',
|
||||
'hlasm': 'high level assembler',
|
||||
'cmod': 'content manager ondemand'}
|
||||
|
||||
external_source = {
|
||||
'vb.net': 'visual basic dot net',
|
||||
'jes': 'job entry subsystem',
|
||||
'svn': 'subversion',
|
||||
'vcs': 'version control system',
|
||||
'lims': 'laboratory information management system',
|
||||
'ide': 'integrated development environment',
|
||||
'sdk': 'software development kit',
|
||||
'mq': 'message queue',
|
||||
'ims': 'information management system',
|
||||
'isa': 'internet security and acceleration',
|
||||
'vs': 'visual studio',
|
||||
'esr': 'extended support release',
|
||||
'ff': 'firefox',
|
||||
'vb': 'visual basic',
|
||||
'rhel': 'red hat enterprise linux',
|
||||
'iis': 'internet information server',
|
||||
'api': 'application programming interface',
|
||||
'se': 'standard edition',
|
||||
'\.net': 'dot net',
|
||||
'c#': 'c sharp',
|
||||
'ms': 'microsoft'
|
||||
}
|
||||
|
||||
abbrev_to_term = {rf'\b{value}\b': key for key, value in term_to_abbrev.items()}
|
||||
|
||||
# synonyms = {
|
||||
# 'windows server': 'windows nt',
|
||||
# 'windows 7': 'windows desktop',
|
||||
# 'windows 8': 'windows desktop',
|
||||
# 'windows 10': 'windows desktop'
|
||||
# }
|
||||
|
||||
|
||||
# add more information
|
||||
acronym_mapping.update(external_source)
|
||||
|
||||
|
||||
abbrev_to_term = {f'\b{key}\b': value for key, value in acronym_mapping.items()}
|
||||
term_to_abbrev = {f'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||
|
||||
def replace_terms_with_abbreviations(text):
|
||||
for input, replacement in term_to_abbrev.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
||||
def replace_abbreivations_with_terms(text):
|
||||
def replace_abbreviations_with_terms(text):
|
||||
for input, replacement in abbrev_to_term.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
@ -218,8 +283,19 @@ def process_df_to_dict(df):
|
|||
# no_of_shuffles = label2weight[index] + 1
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# remove all non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
|
@ -227,24 +303,21 @@ def process_df_to_dict(df):
|
|||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# perform abbrev_to_term
|
||||
desc = replace_terms_with_abbreviations(parent_desc)
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
desc = replace_terms_with_abbreviations(temp_desc)
|
||||
if (desc != temp_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# augmentation
|
||||
# perform term to abbrev
|
||||
desc = replace_abbreivations_with_terms(parent_desc)
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
desc = replace_abbreviations_with_terms(parent_desc)
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
|
@ -257,7 +330,7 @@ def process_df_to_dict(df):
|
|||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
data_path = '../../esAppMod_data_import/parent_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
|
@ -271,13 +344,13 @@ def create_dataset():
|
|||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
save_path = f'checkpoint_part1'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'google-bert/bert-base-uncased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
|
@ -348,7 +421,6 @@ def train():
|
|||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
|
@ -0,0 +1,469 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
import glob
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all uppercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with 'x'
|
||||
text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Preprocess a list of texts and add n random shuffles for each string.
|
||||
|
||||
Args:
|
||||
texts (list): An input strings.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
acronym_mapping = {
|
||||
'hpsa': 'hp server automation',
|
||||
'tam': 'tivoli access manager',
|
||||
'adf': 'application development facility',
|
||||
'html': 'hypertext markup language',
|
||||
'wff': 'microsoft web farm framework',
|
||||
'jsp': 'javaserver pages',
|
||||
'bw': 'business works',
|
||||
'ssrs': 'sql server reporting services',
|
||||
'cl': 'control language',
|
||||
'vba': 'visual basic for applications',
|
||||
'esapi': 'enterprise security api',
|
||||
'gwt': 'google web toolkit',
|
||||
'pki': 'perkin elmer informatics',
|
||||
'rtd': 'oracle realtime decisions',
|
||||
'jms': 'java message service',
|
||||
'db': 'database',
|
||||
'soa': 'service oriented architecture',
|
||||
'xsl': 'extensible stylesheet language',
|
||||
'com': 'compopent object model',
|
||||
'ldap': 'lightweight directory access protocol',
|
||||
'odm': 'ibm operational decision manager',
|
||||
'soql': 'salesforce object query language',
|
||||
'oms': 'order management system',
|
||||
'cfml': 'coldfusion markup language',
|
||||
'nas': 'netscape application server',
|
||||
'sql': 'structured query language',
|
||||
'bde': 'borland database engine',
|
||||
'imap': 'internet message access protocol',
|
||||
'uws': 'ultidev web server',
|
||||
'birt': 'business intelligence and reporting tools',
|
||||
'mdw': 'model driven workflow',
|
||||
'tws': 'tivoli workload scheduler',
|
||||
'jre': 'java runtime environment',
|
||||
'wcs': 'websphere commerce suite',
|
||||
'was': 'websphere application server',
|
||||
'ssis': 'sql server integration services',
|
||||
'xhtml': 'extensible hypertext markup language',
|
||||
'soap': 'simple object access protocol',
|
||||
'san': 'storage area network',
|
||||
'elk': 'elastic stack',
|
||||
'arr': 'application request routing',
|
||||
'xlst': 'extensible stylesheet language transformations',
|
||||
'sccm': 'microsoft endpoint configuration manager',
|
||||
'ejb': 'enterprise java beans',
|
||||
'css': 'cascading style sheets',
|
||||
'hpoo': 'hp operations orchestration',
|
||||
'xml': 'extensible markup language',
|
||||
'esb': 'enterprise service bus',
|
||||
'edi': 'electronic data interchange',
|
||||
'imsva': 'interscan messaging security virtual appliance',
|
||||
'wtx': 'ibm websphere transformation extender',
|
||||
'cgi': 'common gateway interface',
|
||||
'bal': 'ibm basic assembly language',
|
||||
'issow': 'integrated safe system of work',
|
||||
'dcl': 'data control language',
|
||||
'jdom': 'java document object model',
|
||||
'fim': 'microsoft forefront identity manager',
|
||||
'npl': 'niakwa programming language',
|
||||
'wf': 'windows workflow foundation',
|
||||
'lm': 'etap license manager',
|
||||
'wts': 'windows terminal server',
|
||||
'asp': 'active server pages',
|
||||
'jil': 'job information language',
|
||||
'mvc': 'model view controller',
|
||||
'rmi': 'remote method invocation',
|
||||
'ad': 'active directory',
|
||||
'owb': 'oracle warehouse builder',
|
||||
'rest': 'representational state transfer',
|
||||
'jdk': 'java development kit',
|
||||
'ids': 'integrated data store',
|
||||
'bms': 'batch management software',
|
||||
'vsx': 'vmware solution exchange',
|
||||
'ssas': 'sql server analysis services',
|
||||
'atl': 'atlas transformation language',
|
||||
'ice': 'infobright community edition',
|
||||
'esql': 'extended structured query language',
|
||||
'corba': 'common object request broker architecture',
|
||||
'dpe': 'device provisioning engines',
|
||||
'rac': 'oracle real application clusters',
|
||||
'iemt': 'iis easy migration tool',
|
||||
'mes': 'manufacturing execution system',
|
||||
'odbc': 'open database connectivity',
|
||||
'lms': 'lan management solution',
|
||||
'wcf': 'windows communication foundation',
|
||||
'nes': 'netscape enterprise server',
|
||||
'jsf': 'javaserver faces',
|
||||
'alm': 'application lifecycle management',
|
||||
'hlasm': 'high level assembler',
|
||||
'cmod': 'content manager ondemand'}
|
||||
|
||||
external_source = {
|
||||
'vb.net': 'visual basic dot net',
|
||||
'jes': 'job entry subsystem',
|
||||
'svn': 'subversion',
|
||||
'vcs': 'version control system',
|
||||
'lims': 'laboratory information management system',
|
||||
'ide': 'integrated development environment',
|
||||
'sdk': 'software development kit',
|
||||
'mq': 'message queue',
|
||||
'ims': 'information management system',
|
||||
'isa': 'internet security and acceleration',
|
||||
'vs': 'visual studio',
|
||||
'esr': 'extended support release',
|
||||
'ff': 'firefox',
|
||||
'vb': 'visual basic',
|
||||
'rhel': 'red hat enterprise linux',
|
||||
'iis': 'internet information server',
|
||||
'api': 'application programming interface',
|
||||
'se': 'standard edition',
|
||||
'\.net': 'dot net',
|
||||
'c#': 'c sharp',
|
||||
'ms': 'microsoft'
|
||||
}
|
||||
|
||||
|
||||
# synonyms = {
|
||||
# 'windows server': 'windows nt',
|
||||
# 'windows 7': 'windows desktop',
|
||||
# 'windows 8': 'windows desktop',
|
||||
# 'windows 10': 'windows desktop'
|
||||
# }
|
||||
|
||||
|
||||
# add more information
|
||||
acronym_mapping.update(external_source)
|
||||
|
||||
|
||||
abbrev_to_term = {f'\b{key}\b': value for key, value in acronym_mapping.items()}
|
||||
term_to_abbrev = {f'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||
|
||||
def replace_terms_with_abbreviations(text):
|
||||
for input, replacement in term_to_abbrev.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
||||
def replace_abbreviations_with_terms(text):
|
||||
for input, replacement in abbrev_to_term.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
# ensure at least 1 shuffle
|
||||
# no_of_shuffles = label2weight[index] + 1
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# remove all non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# perform abbrev_to_term
|
||||
temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
desc = replace_terms_with_abbreviations(temp_desc)
|
||||
if (desc != temp_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# augmentation
|
||||
# perform term to abbrev
|
||||
desc = replace_abbreviations_with_terms(parent_desc)
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
pattern = 'checkpoint_part1-*'
|
||||
checkpoint_directory = 'checkpoint'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-uncased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<DESC>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=300,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -1,2 +0,0 @@
|
|||
|
||||
Accuracy for fold: 0.5846658466584665
|
|
@ -57,10 +57,10 @@ class Inference():
|
|||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
label = row['entity_seq']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
'output': f'{label}'
|
||||
}
|
||||
|
||||
output_list.append(element)
|
||||
|
@ -101,7 +101,7 @@ class Inference():
|
|||
|
||||
|
||||
def generate(self):
|
||||
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
MAX_GENERATE_LENGTH = 128
|
||||
|
||||
pred_generations = []
|
|
@ -0,0 +1,2 @@
|
|||
|
||||
Accuracy for fold: 0.5022550225502255
|
|
@ -11,7 +11,7 @@ BATCH_SIZE = 512
|
|||
def infer():
|
||||
print(f"Inference for data")
|
||||
# import test data
|
||||
data_path = '../../../data_import/test.csv'
|
||||
data_path = '../../../esAppMod_data_import/test_seq.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
|
@ -35,18 +35,19 @@ def infer():
|
|||
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
|
||||
# Convert the list to a Pandas DataFrame
|
||||
df_out = pd.DataFrame({
|
||||
'predictions': prediction_list
|
||||
'class_prediction': prediction_list
|
||||
})
|
||||
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
|
||||
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
|
||||
df = pd.concat([df, df_out], axis=1)
|
||||
# df = pd.concat([df, df_out], axis=1)
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
df_out.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
# here we want to evaluate mapping accuracy within the valid in mdm data only
|
||||
condition_correct = df['predictions'] == df['entity_name']
|
||||
pred_correct_proportion = sum(condition_correct)/len(df)
|
||||
# predictions = pd.to_numeric(df_out['class_prediction'], errors="coerce")
|
||||
condition_correct = df_out['class_prediction'] == df['entity_seq']
|
||||
pred_correct_proportion = sum(condition_correct)/len(df_out)
|
||||
|
||||
# write output to file output.txt
|
||||
with open("output.txt", "a") as f:
|
|
@ -33,10 +33,10 @@ def process_df_to_dict(df):
|
|||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
label = row['entity_seq']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
'output': f'{label}'
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
@ -45,7 +45,7 @@ def process_df_to_dict(df):
|
|||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = f"../../data_import/train.csv"
|
||||
data_path = f"../../esAppMod_data_import/train_seq.csv"
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
combined_data = DatasetDict({
|
||||
|
|
|
@ -3,8 +3,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
|||
|
||||
# %%
|
||||
# Load model and tokenizer
|
||||
# model_name = "bigscience/bloom-7b1" # Replace with your model
|
||||
model_name = "bigscience/bloomz-1b1"
|
||||
model_name = "bigscience/bloom-7b1" # Replace with your model
|
||||
# model_name = "bigscience/bloomz-1b1"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
# Automatically map model layers to available GPUs
|
||||
|
@ -26,13 +26,12 @@ outputs = model.generate(inputs["input_ids"], max_length=50)
|
|||
# Decode and print result
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
# %%
|
||||
# %%
|
||||
# Prepare input
|
||||
|
||||
def generate(text):
|
||||
|
||||
# Define prompt
|
||||
prompt = f"Answer Concisely: Give me a mapping between the acronym and descriptor in the format '(acronym: description): '{text}'"
|
||||
prompt = f"Give me past product names relating to: '{text}'"
|
||||
|
||||
# Generate acronym
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
|
@ -45,7 +44,7 @@ def generate(text):
|
|||
|
||||
# Example usage
|
||||
# text = "Advanced Data Analytics Platform"
|
||||
text = 'ColdFusion Markup Language (CFML)'
|
||||
text = 'windows server'
|
||||
acronym = generate(text)
|
||||
print(f"Acronym: {acronym}")
|
||||
print(f"Generation: {acronym}")
|
||||
# %%
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
# %%
|
||||
import requests
|
||||
|
||||
def get_related_terms(term, language="en", limit=10):
|
||||
url = f"http://api.conceptnet.io/c/{language}/{term}"
|
||||
response = requests.get(url).json()
|
||||
|
||||
# Extract related terms
|
||||
related_terms = []
|
||||
for edge in response.get("edges", []):
|
||||
related = edge.get("end", {}).get("label", None)
|
||||
if related and related.lower() != term.lower():
|
||||
related_terms.append(related)
|
||||
if len(related_terms) >= limit:
|
||||
break
|
||||
return related_terms
|
||||
|
||||
# Example
|
||||
related_terms = get_related_terms("windows_server")
|
||||
print("Related Terms:", related_terms)
|
||||
# %%
|
|
@ -0,0 +1,38 @@
|
|||
# %%
|
||||
from SPARQLWrapper import SPARQLWrapper, JSON
|
||||
|
||||
# %%
|
||||
sparql = SPARQLWrapper("https://dbpedia.org/sparql")
|
||||
sparql.setQuery("""
|
||||
SELECT ?altLabel WHERE {
|
||||
?item rdfs:label "Windows Server"@en.
|
||||
?item skos:altLabel ?altLabel.
|
||||
FILTER (LANG(?altLabel) = "en")
|
||||
}
|
||||
LIMIT 10
|
||||
""")
|
||||
sparql.setReturnFormat(JSON)
|
||||
results = sparql.query().convert()
|
||||
|
||||
for result in results["results"]["bindings"]:
|
||||
print(result["label"]["value"])
|
||||
# %%
|
||||
from SPARQLWrapper import SPARQLWrapper, JSON
|
||||
|
||||
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
|
||||
sparql.setQuery("""
|
||||
SELECT ?itemLabel ?altLabel WHERE {
|
||||
?item ?label "Windows Server"@en.
|
||||
OPTIONAL { ?item skos:altLabel ?altLabel. FILTER (LANG(?altLabel) = "en") }
|
||||
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
||||
}
|
||||
LIMIT 10
|
||||
""")
|
||||
sparql.setReturnFormat(JSON)
|
||||
results = sparql.query().convert()
|
||||
|
||||
for result in results["results"]["bindings"]:
|
||||
print("Label:", result["itemLabel"]["value"])
|
||||
if "altLabel" in result:
|
||||
print("Alias:", result["altLabel"]["value"])
|
||||
# %%
|
|
@ -0,0 +1,626 @@
|
|||
,mention,entity_id,entity_name,class_prediction,predicted_name
|
||||
0,DOT NET,497,.NET Framework,579,Unix|BSD|*
|
||||
2,Dot net - FW 4,497,.NET Framework,368,VB.NET
|
||||
3,.Net 4.7.1 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||
11,.NET,497,.NET Framework,579,Unix|BSD|*
|
||||
13,.Net 4.5.2 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||
40,APACHE LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||
41,LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||
42,Magik,484,.NET Framework|Magick.NET,533,YAML
|
||||
43,WCF,485,.NET Framework|Windows Communication Foundation (WCF),486,.NET Framework|Windows Workflow Foundation (WF)
|
||||
45,WWF,486,.NET Framework|Windows Workflow Foundation (WF),443,OS/2
|
||||
47,Ejes,1,(E)JES,101,Microsoft Dynamics AX
|
||||
48,(UNIRITA) A-AUTO 7.2.2,2,A-Auto Job Scheduling Software,299,AutoIt
|
||||
50,Active Directoy,498,Active Directory (AD),40,Connect Direct
|
||||
54,APSX,592,Active Server Pages (ASP)|*,609,IIS|*
|
||||
69,Andriod,418,Android,586,PHP|*
|
||||
71,Apache Active Queue,6,Apache ActiveMQ,259,Apache HTTP Server
|
||||
72,MQ Apache Active Queue,6,Apache ActiveMQ,81,IBM Websphere MQ
|
||||
75,cordova-android,501,Apache Cordova,418,Android
|
||||
77,Hive,8,Apache Hive,177,SAP NetWeaver Business Warehouse
|
||||
99,solr,11,Apache Solr,375,Apache Lucene
|
||||
135,ADF,13,Application Development Facility (ADF),130,Oracle ADF
|
||||
144,WLS 10.2,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
149,BEA WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
152,Weblogic 12c,600,Oracle WebLogic Server|*,582,C#|*
|
||||
160,WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||
168,Web Logic,600,Oracle WebLogic Server|*,97,MarkLogic DB
|
||||
174,BEA WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||
175,WLS 10,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
176,WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
189,brain script,302,Brainscript,369,VBScript
|
||||
190,BRAINScript,302,Brainscript,367,TypeScript
|
||||
191,Business Intelligence and Reporting Tools,21,Business Intelligence and Reporting Tools (BIRT),133,Oracle Business Intelligence
|
||||
192,Actuate Report Server,21,Business Intelligence and Reporting Tools (BIRT),42,Crystal Reports
|
||||
194,CSHARP,582,C#|*,87,Informatica PowerCenter
|
||||
218,WinFrame,30,Citrix Virtual Apps and Desktops,443,OS/2
|
||||
221,METAFRAME,30,Citrix Virtual Apps and Desktops,406,JavaScript|AngularJS
|
||||
225,Presentation Server,30,Citrix Virtual Apps and Desktops,541,File Server
|
||||
226,NETSCALER-1.5,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||
227,NETSCALER-11.,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||
228,Citrix SD-WAN,563,Citrix ADC,30,Citrix Virtual Apps and Desktops
|
||||
229,NetScaler SD-WAN,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||
231,NetScaler ADC,563,Citrix ADC,272,Netscape Application Server (NAS)
|
||||
236,NetScaler SDX,291,Citrix ADC SDX,273,Netscape Enterprise Server (NES)
|
||||
240,Provisioning Services 7.15.8,32,Citrix Provisioning,538,Device Provisioning Engines (DPE)
|
||||
241,Citrix PVS,32,Citrix Provisioning,30,Citrix Virtual Apps and Desktops
|
||||
243,CLISTS,309,CLIST,329,IBM i Control Language (CL)
|
||||
253,CFML,311,ColdFusion Markup Language (CFML),316,eXtensible HyperText Markup Language (XHTML)
|
||||
254,ColdFusion Markup Language,311,ColdFusion Markup Language (CFML),37,Coldfusion
|
||||
255,Sterling Connect,40,Connect Direct,542,General Ledger
|
||||
264,Cormerstone,41,Cornerstone software,516,Compopent Object Model (COM)
|
||||
265,Cornerstone,41,Cornerstone software,370,Visual Basic
|
||||
279,DB2 UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||
282,DB2-UDB,43,DB2,365,TCL
|
||||
291,DB2/UDB,43,DB2,365,TCL
|
||||
292,IBM DB2 ENTERPRISE SERVER EDITION PVU OPTION 10.5,43,DB2,163,PVCS Version Manager
|
||||
300,IBM - IBM DB2 Advanced Enterprise Server Edition PVU Option 10.5,43,DB2,72,IBM Mobile Foundation
|
||||
301,UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||
302,IBM - IBM DB2 Enterprise Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||
306,IBM - IBM DB2 Workgroup Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||
313,EZTriev,314,Easytrieve,296,Intel Xeon Processor
|
||||
314,Eztrieve,314,Easytrieve,296,Intel Xeon Processor
|
||||
321,PrestoSoft - ExamDiff Application 1.6,49,ExamDiff,346,Pascal|Object Pascal
|
||||
322,PrestoSoft - ExamDiff Application,49,ExamDiff,346,Pascal|Object Pascal
|
||||
323,ExamDiff Application,49,ExamDiff,467,MS SQL Server|Log Reader Agent
|
||||
324,Expect Scripts,315,Expect,109,Microsoft MQ
|
||||
329,Microsoft - MSXML 4.0 SP2 4.2,318,Extensible Markup Language (XML)|MSXML,316,eXtensible HyperText Markup Language (XHTML)
|
||||
331,XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||
332,JAVA-XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||
335,ServerCA Access GatewayF5,50,F5 Secure Web Gateway Services,290,CA API Gateway
|
||||
347,HP C++,58,HP aC++ compiler,59,HP C/ANSI C compiler
|
||||
350,HP C++ 10.20,58,HP aC++ compiler,59,HP C/ANSI C compiler
|
||||
351,HPC 11.11,59,HP C/ANSI C compiler,58,HP aC++ compiler
|
||||
358,HFS,505,HTTP File Server,486,.NET Framework|Windows Workflow Foundation (WF)
|
||||
359,www.rejetto.com - HttpFileServer 2.3,505,HTTP File Server,55,Google Chrome
|
||||
360,HttpFileServer,505,HTTP File Server,522,Application Web Server
|
||||
367,IBM - IBM BigFix Platform Client Deploy Tool 9.5,457,IBM BigFix Platform|Client Deploy Tool,62,IBM BigFix Platform
|
||||
369,IBM BPM,64,IBM Business Process Manager,328,IBM High Level Assembler (HLASM)
|
||||
375,Data Power,294,IBM DataPower Gateway,295,IBM Power Systems
|
||||
376,IDG.7.5.2.19hp,294,IBM DataPower Gateway,449,Unix|HP-UX
|
||||
380,hlasm,328,IBM High Level Assembler (HLASM),438,macOS
|
||||
383,IHS,265,IBM HTTP Server,424,IBM i
|
||||
386,WebSphere and IHS,265,IBM HTTP Server,67,IBM InfoSphere DataStage
|
||||
387,WebSphere http,265,IBM HTTP Server,284,Websphere Application Server (WAS)
|
||||
391,IBM Websphere HTTP Server,265,IBM HTTP Server,285,WebSphere Liberty
|
||||
393,WebSphere IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
|
||||
394,WebSphere -IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
|
||||
397,OS400 V7R1,424,IBM i,443,OS/2
|
||||
398,OS400,424,IBM i,443,OS/2
|
||||
399,OS/400,424,IBM i,443,OS/2
|
||||
408,IIB,68,IBM Integration Bus,370,Visual Basic
|
||||
411,Extended Structured Query Language,458,IBM Integration Bus|Extended Structured Query Language (ESQL),572,Structured Query Language (SQL)
|
||||
415,IBM WorkLight,72,IBM Mobile Foundation,649,IBM Cloud
|
||||
417,ILOG JRules,73,IBM Operational Decision Manager (ODM),601,Websphere ILOG JRules BRMS
|
||||
420,Decision Center 8.0.1.0,73,IBM Operational Decision Manager (ODM),252,IBM ILOG Views
|
||||
423,AS400,295,IBM Power Systems,443,OS/2
|
||||
424,AS/400,295,IBM Power Systems,443,OS/2
|
||||
426,System i,295,IBM Power Systems,424,IBM i
|
||||
427,P-series,295,IBM Power Systems,81,IBM Websphere MQ
|
||||
428,IBM iSeries/AS400 system Model 520,295,IBM Power Systems,443,OS/2
|
||||
439,Tivoli Asset Discovery for Distributed,459,IBM Tivoli Asset Management|Asset Discovery for Distributed,606,IBM Tivoli Asset Management|*
|
||||
447,Database MS SQL Agent,77,IBM Tivoli Monitoring,469,MS SQL Server|SQL Server Database Engine
|
||||
448,Linux OS Agent,77,IBM Tivoli Monitoring,576,Linux|*
|
||||
449,Database DB2 Agent,77,IBM Tivoli Monitoring,520,Database (DB)
|
||||
452,Windows OS Agent,77,IBM Tivoli Monitoring,580,Windows|*
|
||||
454,IBM - IBM TSM FCM,604,IBM Tivoli Storage Manager|*,460,IBM Tivoli Storage Manager|TSM API
|
||||
459,Databases Data Protection for Microsoft SQL,604,IBM Tivoli Storage Manager|*,572,Structured Query Language (SQL)
|
||||
461,IBM - IBM Spectrum Protect Data Protection for Microsoft SQL Server 8.1,604,IBM Tivoli Storage Manager|*,469,MS SQL Server|SQL Server Database Engine
|
||||
462,IBM Spectrum Protect Data Protection,604,IBM Tivoli Storage Manager|*,312,Data Language Interface (DL/I)
|
||||
463,IBM - IBM Spectrum Protect API 7.1,460,IBM Tivoli Storage Manager|TSM API,294,IBM DataPower Gateway
|
||||
464,IBM - IBM Spectrum Protect Client,461,IBM Tivoli Storage Manager|TSM Client,294,IBM DataPower Gateway
|
||||
465,IBM - IBM Tivoli Storage Manager Client,461,IBM Tivoli Storage Manager|TSM Client,604,IBM Tivoli Storage Manager|*
|
||||
467,VSS Requestor configured 8.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
|
||||
468,VSS Requestor 7.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
|
||||
469,TWS-WS,79,IBM Tivoli Workload Scheduler (TWS),239,Windows Terminal Server (WTS)
|
||||
472,wbia 2.6,80,IBM WebSphere Business Integration Adaptor,627,XtraDB
|
||||
473,IBM WBIA 2.6.0.12,80,IBM WebSphere Business Integration Adaptor,424,IBM i
|
||||
475,MQ,81,IBM Websphere MQ,248,ZeroMQ
|
||||
476,MQ 9.1,81,IBM Websphere MQ,248,ZeroMQ
|
||||
479,MQ 7,81,IBM Websphere MQ,248,ZeroMQ
|
||||
480,MQ 6,81,IBM Websphere MQ,248,ZeroMQ
|
||||
481,MQ 9.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||
482,MQ 5.3,81,IBM Websphere MQ,248,ZeroMQ
|
||||
483,MQ 7.01,81,IBM Websphere MQ,248,ZeroMQ
|
||||
484,MQ 7.5,81,IBM Websphere MQ,248,ZeroMQ
|
||||
485,MQSeries 8.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||
488,WSMQ 8.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||
489,MQ 9.0.5,81,IBM Websphere MQ,248,ZeroMQ
|
||||
491,WTX,83,IBM WebSphere Transformation Extender (WTX),274,Nginx
|
||||
505,Microsoft Internet Inf,609,IIS|*,130,Oracle ADF
|
||||
508,Microsoft Internet Informat,609,IIS|*,330,IBM Informix-4GL
|
||||
550,Microsoft - IIS 6.0 Migration Tool 1,489,IIS|Easy Migration Tool (IEMT),609,IIS|*
|
||||
558,Infozip 6,85,Info-ZIP,677,Git
|
||||
559,Infozip,85,Info-ZIP,677,Git
|
||||
578,IMSVA 9.1,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
|
||||
580,IMSVA,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
|
||||
581,Java 1.8,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
582,Java 7,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
583,Java on Weblogic server,584,Java|*,600,Oracle WebLogic Server|*
|
||||
584,Java5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
585,Java 6,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
586,Java 6.0,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
587,Java 7 Update 25,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
589,Java (open source),584,Java|*,397,Java|Servlet
|
||||
590,Java 5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
591,Java 1.5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
593,Java 1.8.0_92,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
594,Java 1.6,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
595,J2EE 6,584,Java|*,333,Java|Java Enterprise Edition (Java EE)
|
||||
596,Java (J2EE),584,Java|*,333,Java|Java Enterprise Edition (Java EE)
|
||||
598,JRE,506,Java Runtime Environment (JRE),84,IMS DB
|
||||
629,JEE,333,Java|Java Enterprise Edition (Java EE),1,(E)JES
|
||||
639,JSF,391,Java|JavaServer Faces (JSF),334,Java|Java Standard Edition (Java SE)
|
||||
643,JSP Scriptlets,336,Java|JavaServer Pages (JSP)|Scriptlets,335,Java|JavaServer Pages (JSP)
|
||||
644,Java Scriplet,336,Java|JavaServer Pages (JSP)|Scriptlets,88,Ingres
|
||||
645,Core 9.2.0.0,393,Java|JRuby Core,583,C++|*
|
||||
647,Java RMI,396,Java|Remote Method Invocation (RMI),584,Java|*
|
||||
650,Java Servlets,397,Java|Servlet,453,Linux|Fedora
|
||||
651,Java 6 Servlets,397,Java|Servlet,453,Linux|Fedora
|
||||
652,J2EE Servlets,397,Java|Servlet,443,OS/2
|
||||
653,Servlets,397,Java|Servlet,420,Cisco IOS
|
||||
654,Servlets v2.3,397,Java|Servlet,370,Visual Basic
|
||||
656,Spring BOOT,399,Java|Spring|Spring Boot,398,Java|Spring
|
||||
657,Springboot,399,Java|Spring|Spring Boot,398,Java|Spring
|
||||
661,javasript,589,JavaScript|*,335,Java|JavaServer Pages (JSP)
|
||||
662,JS,589,JavaScript|*,507,Node.js
|
||||
664,Java Script,589,JavaScript|*,584,Java|*
|
||||
671,Sencha 4.2.0,409,JavaScript|Ext JS,589,JavaScript|*
|
||||
674,jqueryui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
|
||||
675,jquery-ui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
|
||||
679,Scriptaculous,414,JavaScript|script.aculo.us,582,C#|*
|
||||
684,EAP,268,JBoss|*,174,SAP ERP
|
||||
685,JBOSS-EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||
686,JBoss Application Server 4,268,JBoss|*,493,JBoss|Wildfly
|
||||
687,JBoss Application Server 7,268,JBoss|*,493,JBoss|Wildfly
|
||||
688,JBoss Application Server 5,268,JBoss|*,493,JBoss|Wildfly
|
||||
689,JBoss Application Server,268,JBoss|*,493,JBoss|Wildfly
|
||||
690,Enterprise Application Platform,268,JBoss|*,670,EAServer
|
||||
692,JBOSS 5.1.2 EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||
693,server: Jboss,268,JBoss|*,493,JBoss|Wildfly
|
||||
694,JBOSS 6.3.2 EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||
695,JBoss EAP 4.3,268,JBoss|*,493,JBoss|Wildfly
|
||||
700,Job Information Language,339,Job Information Language (JIL),338,JCL
|
||||
703,JoinIT by Acayosoft,91,joinIT,4,Adobe Acrobat Reader
|
||||
704,Acayosoft JoinIT,91,joinIT,4,Adobe Acrobat Reader
|
||||
705,JoinIT by Acayosoft v 9.0.8,91,joinIT,4,Adobe Acrobat Reader
|
||||
706,LifeFlow Tool,92,LifeFlow,486,.NET Framework|Windows Workflow Foundation (WF)
|
||||
707,Linux 2.6.32-696.28.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
709,Linux 2.6.32-696.30.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
710,Linux 2.6.9,576,Linux|*,437,Linux|zLinux
|
||||
711,Linux 2.6.32-642.3.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
712,Linux - 2.6.18-371.1.2.el5,576,Linux|*,437,Linux|zLinux
|
||||
713,Linux 2.6.32-696.23.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
749,Gaia Kernel version 2.7,428,Linux|Check Point,432,Linux|Oracle Linux
|
||||
752,Gaia Kernel version 2.6,428,Linux|Check Point,432,Linux|Oracle Linux
|
||||
766,OEL,432,Linux|Oracle Linux,449,Unix|HP-UX
|
||||
778,Oracle Enterprise Server 7.5,432,Linux|Oracle Linux,134,Oracle Database
|
||||
780,OEL6.7 - 3.8.13-68.3.4.el6uek.x86_64,432,Linux|Oracle Linux,449,Unix|HP-UX
|
||||
792,VMware Photon,433,Linux|Photon OS,569,VMware Server
|
||||
793,VMware Photon 1,433,Linux|Photon OS,569,VMware Server
|
||||
809,Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
818,Redhat - Redhat Linux 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
819,Linux RH6,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
865,Redhat - Redhat Linux 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
870,Redhat - RHEL 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
874,Red Hat Entreprise Linux 6.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
882,Redhat 6 64-Bit,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
893,RED HAT ADVANCED SERVER 5,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
910,redhat6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
912,Redhat - Redhat Linux 6.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
913,Linux RH,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
916,Redhat - Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
920,Linux RH7,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
926,Red Hat V6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
932,Linux RH5,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
934,rehl5.9,434,Linux|Red Hat Enterprise Linux,43,DB2
|
||||
964,Red Hat 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
979,red hat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
991,Redhat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
996,RedHat 7.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
998,LINUX RED HAT 5 EL,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
1003,SUSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1006,Linux SuSE12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1011,SUSE10,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1012,SUSE Linux 12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1017,SUSELinux Enterprise 11.x,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1023,SUSE Linux 11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1024,SUSE Linux 11 SP3,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1029,Linux SuSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1030,SUSE,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1038,SuseLinux,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1051,domino8.5,270,Lotus Domino,93,Lotus Notes
|
||||
1052,Domino 8.x,270,Lotus Domino,93,Lotus Notes
|
||||
1054,Lucee 5.2.6.60,271,Lucee,375,Apache Lucene
|
||||
1056,Darwin,438,macOS,117,Mozilla Firefox
|
||||
1061,Memcache,98,Memcached,18,BMC Control-M
|
||||
1062,ACCDB,99,Microsoft Access,525,Open Database Connectivity (ODBC)
|
||||
1070,ConfigMgr,102,Microsoft Endpoint Configuration Manager (SCCM),21,Business Intelligence and Reporting Tools (BIRT)
|
||||
1080,FIM SQL Development Server,105,Microsoft Forefront Identity Manager (FIM),572,Structured Query Language (SQL)
|
||||
1082,Microsoft - Internet Explor,107,Microsoft Internet Explorer,356,Rexx
|
||||
1084,Internet Explor,107,Microsoft Internet Explorer,356,Rexx
|
||||
1090,SCEP for Linux,110,Microsoft System Center Endpoint Protection,437,Linux|zLinux
|
||||
1094,SCEP for Mac,110,Microsoft System Center Endpoint Protection,438,macOS
|
||||
1101,msdeploy,112,Microsoft Web Deploy,56,Greenplum DB
|
||||
1106,WebPI,114,Microsoft Web Platform Installer,522,Application Web Server
|
||||
1109,Web PI,114,Microsoft Web Platform Installer,531,Simple Object Access Protocol (SOAP)
|
||||
1111,MDW Framework,115,Model Driven Workflow (MDW),406,JavaScript|AngularJS
|
||||
1115,Mango DB,116,MongoDB,43,DB2
|
||||
1117,MangoDB,116,MongoDB,43,DB2
|
||||
1125,O365,119,MS Office 365,424,IBM i
|
||||
1141,MICROSOFT SQL SERVER 2012 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1153,MICROSOFT SQL SERVER 2012 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1154,MS SQL Server 2008 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
|
||||
1156,MICROSOFT SQL SERVER 2008 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1167,MSSQL Database Server,581,MS SQL Server|*,122,MySQL
|
||||
1173,MSSQL2008,581,MS SQL Server|*,122,MySQL
|
||||
1192,Microsoft SQL Server Standard Edition,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1201,SQLServer,581,MS SQL Server|*,572,Structured Query Language (SQL)
|
||||
1226,MICROSOFT SQL SERVER 2012 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1233,MICROSOFT SQL SERVER 2005 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1234,SQLSVR2008,581,MS SQL Server|*,352,PL/SQL
|
||||
1235,MICROSOFT SQL SERVER 2008 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1239,MICROSOFT SQL SERVER 2008 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1244,MS SQL Server 2012 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
|
||||
1253,Microsoft - SQL Server Express LocalDB 2014,581,MS SQL Server|*,469,MS SQL Server|SQL Server Database Engine
|
||||
1256,MSSQL,581,MS SQL Server|*,122,MySQL
|
||||
1280,Microsoft - Microsoft SQL Server Analysis Services 2012 for Microsoft SQL Server 2012 Standard Edition 2012,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
|
||||
1281,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
|
||||
1283,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
1290,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
1293,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
1295,SQL Server Integration Services,470,MS SQL Server|SQL Server Integration Services (SSIS),473,MS SQL Server|SQL Server Reporting Services (SSRS)
1316,ZOS Base 1.12,441,MVS|z/OS,437,Linux|zLinux
1335,NAS,272,Netscape Application Server (NAS),443,OS/2
1337,NES,273,Netscape Enterprise Server (NES),443,OS/2
1349,Node.js 0.10 (Linux),507,Node.js,437,Linux|zLinux
1361,Node.js 4 (Linux),507,Node.js,437,Linux|zLinux
1371,Symas OpenLDAP,126,OpenLDAP,178,SAP SQL Anywhere
1377,OAM 12c,129,Oracle Access Management,303,C
1378,ADF 12c,130,Oracle ADF,343,Objective C
1381,OHS,610,Oracle Application Server|*,122,MySQL
1383,Oracle HTTP Server powered by Apache,610,Oracle Application Server|*,259,Apache HTTP Server
1384,j2eeoracleca,610,Oracle Application Server|*,453,Linux|Fedora
1385,Oracle HTTP,610,Oracle Application Server|*,134,Oracle Database
1389,9i AS server,610,Oracle Application Server|*,227,Virtual I/O Server
1391,Oracle Application R12.1.3,610,Oracle Application Server|*,134,Oracle Database
1394,Weblogic BI Publisher,132,Oracle BI Publisher,600,Oracle WebLogic Server|*
1396,OBI,133,Oracle Business Intelligence,343,Objective C
1397,OBIEE,133,Oracle Business Intelligence,343,Objective C
1398,OBI Reporting,133,Oracle Business Intelligence,343,Objective C
1401,Oracle 12.2 Client,134,Oracle Database,610,Oracle Application Server|*
1406,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0,134,Oracle Database,610,Oracle Application Server|*
1407,Oracle 11.2 (Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit) RAC,134,Oracle Database,610,Oracle Application Server|*
1415,Oracle 11 on AIX,134,Oracle Database,445,Unix|AIX
1416,Oracle Database 10g Enterprise Edition Release 10.1.0.4.0 - 64bit,134,Oracle Database,610,Oracle Application Server|*
1431,Oracle Database 10g Release 10.2.0.4.0 - 64bit Production,134,Oracle Database,298,Oracle Exadata
1432,Oarcle 11G,134,Oracle Database,218,TIBCO InConcert
1443,DB - Oracle inbuilt,134,Oracle Database,158,Powerbuilder
1460,Oracle Server,134,Oracle Database,610,Oracle Application Server|*
1475,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 - 64bit Production,134,Oracle Database,610,Oracle Application Server|*
1480,Oracle 12C on linux,134,Oracle Database,303,C
1484,Oracle9i Enterprise Edition Release 9.2.0.5.0,134,Oracle Database,610,Oracle Application Server|*
1486,Oracle 11g on linux,134,Oracle Database,432,Linux|Oracle Linux
1487,Oracle 11gEssbase,134,Oracle Database,298,Oracle Exadata
1490,JServer Release 9.2.0.5.0,474,Oracle Database|Jserver,335,Java|JavaServer Pages (JSP)
1492,Designer 6i,135,Oracle Designer,516,Compopent Object Model (COM)
1493,Enterprise Manager 12.2.1.1,136,Oracle Enterprise Manager,427,Linux|CentOS
1494,Enterprise Manager 12.2.1.2,136,Oracle Enterprise Manager,427,Linux|CentOS
1495,Enterprise Manager 11.1.1.7,136,Oracle Enterprise Manager,427,Linux|CentOS
1501,"Oracle, Nets",140,Oracle Net Services,273,Netscape Enterprise Server (NES)
1502,Oracle RAC,141,Oracle Real Application Clusters (RAC),134,Oracle Database
1504,ORPOS 13.3.3,142,Oracle Retail Point-of-Service,609,IIS|*
1505,ORPOS 13.3.5,142,Oracle Retail Point-of-Service,609,IIS|*
1506,ORPOS 13.3.4,142,Oracle Retail Point-of-Service,609,IIS|*
1509,OSB Servers,143,Oracle Service Bus,443,OS/2
1514,Oracle TT,147,Oracle TimesTen In-Memory Database,134,Oracle Database
1517,OWB 10g,148,Oracle Warehouse Builder (OWB),300,AWK
1524,Clascal,346,Pascal|Object Pascal,307,Cascading Style Sheets (CSS)
1526,Oracle-HR-9.2,151,PeopleSoft,134,Oracle Database
1529,ActiveState Tool Corp. - ActivePerl 5.12,348,Perl|ActivePerl,500,ActiveX
1530,ActiveState Tool Corp. - ActivePerl 5.8,348,Perl|ActivePerl,500,ActiveX
1531,ORAPERL,417,Perl|Oraperl,242,WinRAR
1532,REX,349,Perl|Rex,356,Rexx
1536,TCServer V6,277,Pivotal tc Server,365,TCL
1537,IBM PKWARE PKZip 2,155,PKZIP,387,Java|IBM SDK
1541,PLQSL,352,PL/SQL,351,PL/I
1542,Oracle - SQL,352,PL/SQL,581,MS SQL Server|*
1544,Oracle SQL,352,PL/SQL,134,Oracle Database
1545,PLSQL;,352,PL/SQL,351,PL/I
1547,Oracle PLSQL,352,PL/SQL,351,PL/I
1548,plsql,352,PL/SQL,351,PL/I
1551,Projectplace,156,Planview,21,Business Intelligence and Reporting Tools (BIRT)
1558,Power Builder,158,Powerbuilder,151,PeopleSoft
1560,Power Builder 6.5,158,Powerbuilder,27,Chef Automate
1565,ProjectWise Oracle Server,161,ProjectWise,162,ProjectWise Web Server
1576,RMQ,165,RabbitMQ,355,R
1579,Clearquest,167,Rational ClearQuest,455,Clarify|Clear Basic
1581,Remedy ARS,169,Remedy,322,Fortran
1584,RightFax client 10,171,RightFax,118,MQ Client
1585,SOQL,359,Salesforce Object Query Language (SOQL),621,ArangoDB
1587,SAP Business Objects,173,SAP BusinessObjects BI server,177,SAP NetWeaver Business Warehouse
1588,Business Objects 12,173,SAP BusinessObjects BI server,488,ActiveX|ADO
1590,SAP BI 4.2 Sp5,173,SAP BusinessObjects BI server,174,SAP ERP
1593,SAP HANA ON SUSEOracle 11g on Linux,175,SAP HANA DB,435,Linux|SUSE Linux Enterprise Server
1596,NetWeaver,279,SAP NetWeaver App Server,431,Linux|openSUSE
1605,SCSS,361,Sass,102,Microsoft Endpoint Configuration Manager (SCCM)
1606,Scalla,362,Scala,664,Forte
1609,Microsoft SPS 2010,603,SharePoint|*,577,MVS|*
1613,SQL Server SP2013 Database Server,603,SharePoint|*,581,MS SQL Server|*
1615,Siebel IP 2015,182,Siebel,583,C++|*
1616,Siebel 7.8.2.16,182,Siebel,43,DB2
1617,Siebel CRM,182,Siebel,583,C++|*
1619,Techsmith Corporation - SnagIt 8,184,SnagIt,183,SNA Manager
1620,Solid development server,185,solidDB,600,Oracle WebLogic Server|*
1622,Sixty-Five Software - SpaceMonger 1.4,187,SpaceMonger,296,Intel Xeon Processor
1623,SQLPlus,478,Oracle Database|SQL*Plus,572,Structured Query Language (SQL)
1625,SQLIO 1.0,189,SQLIO,178,SAP SQL Anywhere
1630,SunOne,281,Oracle iPlanet Web Server,448,Unix|BSD|SunOS
1637,SAP - Sybase Central 4.3,479,Sybase SQL Server|Sybase Central,190,Sybase SQL Server
1639,Sysncsort,191,Syncsort,178,SAP SQL Anywhere
1640,syncsort,191,Syncsort,98,Memcached
1641,Sysinternals LLC - AccessEnum 1 1,194,Sysinternal Tools|AccessEnum,124,Nexus Repository OSS
1642,Sysinternals LLC - ClockRes 2,195,Sysinternal Tools|ClockRes,374,Xbase++
1643,Sysinternals LLC - Coreinfo 3.21,196,Sysinternal Tools|Coreinfo,670,EAServer
1644,Sysinternals LLC - DiskExt 1.1,197,Sysinternal Tools|DiskExt,374,Xbase++
1645,Sysinternals LLC - DiskMon 2.01,198,Sysinternal Tools|DiskMon,670,EAServer
1647,Sysinternals LLC - Junction 1.6,200,Sysinternal Tools|Junction,374,Xbase++
1648,Sysinternals LLC - LDMDump 1.02,201,Sysinternal Tools|LDMDump,178,SAP SQL Anywhere
1649,Sysinternals LLC - LoadOrder 1,202,Sysinternal Tools|LoadOrder,374,Xbase++
1650,Sysinternals LLC - PipeList 1.01,203,Sysinternal Tools|PipeList,670,EAServer
1651,Sysinternals LLC - Process Explorer 16.5,204,Sysinternal Tools|Process Explorer,464,Microsoft Exchange Server|Veeam Explorer
1652,Sysinternals LLC - PsKill 1.15,205,Sysinternal Tools|PsKill,151,PeopleSoft
1653,Sysinternals LLC - PsPasswd 1.23,206,Sysinternal Tools|PsPasswd,231,VMware vCenter
1654,Sysinternals LLC - SDelete 1.61,207,Sysinternal Tools|SDelete,670,EAServer
1655,Sysinternals LLC - ShareEnum 1.6,208,Sysinternal Tools|ShareEnum,603,SharePoint|*
1656,Sysinternals LLC - Sync 2.2,209,Sysinternal Tools|Sync,374,Xbase++
1657,Sysinternals LLC - Sysinternals TCPView 3.5,210,Sysinternal Tools|TCPView,365,TCL
1658,Sysinternals LLC - VMMap 3.11,211,Sysinternal Tools|VMMap,176,SAP MaxDB
1659,Sysinternals LLC - Whois 1.11,212,Sysinternal Tools|Whois,178,SAP SQL Anywhere
1664,TERADATA QUERY SCHEDULER SERVER VERSION 15,216,Teradata QS Server,215,Teradata
1667,BusinessWorks,217,TIBCO Business Works (BW),111,Microsoft Visual Studio
1668,Tibco-IM,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
1669,Tibco Integration Manager,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
1674,TSQL,366,Transact-SQL,621,ArangoDB
1675,Trasact SQL,366,Transact-SQL,352,PL/SQL
1746,Solaris 11.2 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
1747,Solaris UNIX,448,Unix|BSD|SunOS,578,Unix|*
1748,Unix Servers (Solaris,448,Unix|BSD|SunOS,578,Unix|*
1749,Oracle Solaris 11.3 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
1753,Solaris 5.10 (Generic_150400-61),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1754,Solaris 5.10 (Generic_150400-62),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1756,Solaris 5.10 (Generic_150400-55),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1760,Oracle Solaris,448,Unix|BSD|SunOS,134,Oracle Database
1762,Solaris 1 (SPARC),448,Unix|BSD|SunOS,375,Apache Lucene
1765,SunSolaris 10.0,448,Unix|BSD|SunOS,430,Linux|Junos OS
1771,Oracle Solaris 10,448,Unix|BSD|SunOS,134,Oracle Database
1800,VIO 2.2.0.10,227,Virtual I/O Server,159,Primavera P6
1801,VIOS,227,Virtual I/O Server,443,OS/2
1802,visibroker,228,Visibroker,420,Cisco IOS
1803,VB6,370,Visual Basic,368,VB.NET
1804,VB 6.0,370,Visual Basic,368,VB.NET
1805,visualbasic,370,Visual Basic,306,C++|Visual C++
1808,Visual Basic 6.0,370,Visual Basic,368,VB.NET
1811,VBA,371,Visual Basic for Applications (VBA),370,Visual Basic
1812,Access VB,371,Visual Basic for Applications (VBA),99,Microsoft Access
1813,vfoxpro,372,Visual FoxPro,117,Mozilla Firefox
1827,VMware Appliance,569,VMware Server,559,Virtual Appliance
1828,VSX,229,VMware Solution Exchange Marketplace (VSX),111,Microsoft Visual Studio
1830,VMware - VMware Tools 10.2,230,VMware Tools,569,VMware Server
1832,VXML,373,VoiceXML,316,eXtensible HyperText Markup Language (XHTML)
1833,Web Focus,232,WebFOCUS,321,FOCUS
1834,FOCEXEC,232,WebFOCUS,495,Oracle WebCenter Content Server|Idoc Script
1836,WLI 8,233,WebLogic Integration,442,OpenVMS
1842,IBM WEBSPHERE APPLICATION SERVER VERSION 6.1.0,284,Websphere Application Server (WAS),285,WebSphere Liberty
1848,"IBM WebSphere Application Server Network Deployment, 8.0.0.5",284,Websphere Application Server (WAS),285,WebSphere Liberty
1850,IBM WebSphere Application Server Network Deployment 7,284,Websphere Application Server (WAS),285,WebSphere Liberty
1858,IBM WebSphere 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1861,IBM - WebSphere Application Server - Base 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1865,Websphere AS (JVM),284,Websphere Application Server (WAS),285,WebSphere Liberty
1872,IBM WebSphere,284,Websphere Application Server (WAS),285,WebSphere Liberty
1875,IBM WebSphere Application Server 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1877,IBM WebSphere Application,284,Websphere Application Server (WAS),285,WebSphere Liberty
1878,WAS 6.x,284,Websphere Application Server (WAS),521,Electronic Data Interchange (EDI)
1880,IBM OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
1882,Open Liberty,285,WebSphere Liberty,397,Java|Servlet
1883,IBM Open Liberty,285,WebSphere Liberty,62,IBM BigFix Platform
1887,WAS Liberty,285,WebSphere Liberty,397,Java|Servlet
1889,OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
1891,WMB 6.1,235,WebSphere Message Broker,486,.NET Framework|Windows Workflow Foundation (WF)
1892,WebSphere Message Broker v6.0,235,WebSphere Message Broker,285,WebSphere Liberty
1899,WebSphere Portal Extend Limited Use 6.1,286,WebSphere Portal Server,285,WebSphere Liberty
1901,Windchill 11.1,237,Windchill,17,Bluebeam|Bluebeam Q
1908,Window,580,Windows|*,637,Microsoft Azure
1914,Windows Terminal Server,239,Windows Terminal Server (WTS),452,Windows|Windows Server
1915,Windows 7 Standard,451,Windows|Windows Desktop,580,Windows|*
1916,WINDOWS 10 SERVER STANDARD EDITION X64,451,Windows|Windows Desktop,452,Windows|Windows Server
1917,Microsoft Windows 7 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1918,Microsoft Windows XP Professional (32-bit),451,Windows|Windows Desktop,580,Windows|*
1919,Windows 7 Professional x64,451,Windows|Windows Desktop,580,Windows|*
1920,Microsoft Microsoft Windows Entreprise,451,Windows|Windows Desktop,580,Windows|*
1921,Microsoft Windows 2000,451,Windows|Windows Desktop,580,Windows|*
1922,Microsoft Windows 10,451,Windows|Windows Desktop,580,Windows|*
1923,MS Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
1924,Microsoft Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
1925,Microsoft Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1926,Microsoft Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1927,Win Desktop,451,Windows|Windows Desktop,560,Webtop
1928,Windows 10 Pro,451,Windows|Windows Desktop,580,Windows|*
1929,Windows 10,451,Windows|Windows Desktop,580,Windows|*
1930,Windows 7 Ultimate,451,Windows|Windows Desktop,580,Windows|*
1931,Microsoft Windows 8 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1932,Microsoft Windows XP,451,Windows|Windows Desktop,580,Windows|*
1933,Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1934,Windows XP,451,Windows|Windows Desktop,580,Windows|*
1935,Windows 10 Professional,451,Windows|Windows Desktop,580,Windows|*
1936,Windows 7,451,Windows|Windows Desktop,580,Windows|*
1937,Microsoft Windows 10 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1938,Win 7,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
1939,windowsxp,451,Windows|Windows Desktop,580,Windows|*
1940,Microsoft Windows Unknown,451,Windows|Windows Desktop,580,Windows|*
1941,Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1942,Windows XP Professional,451,Windows|Windows Desktop,580,Windows|*
1943,Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
1944,Window XP,451,Windows|Windows Desktop,580,Windows|*
1945,Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1946,Microsoft Windows 7 - SOE,451,Windows|Windows Desktop,580,Windows|*
1947,Windows 7 Enterprise Edition,451,Windows|Windows Desktop,452,Windows|Windows Server
1948,Windows 8,451,Windows|Windows Desktop,580,Windows|*
1949,Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
1950,Microsoft Windows 7 (32-bit),451,Windows|Windows Desktop,580,Windows|*
1951,Windows Embedded Standard 7,451,Windows|Windows Desktop,580,Windows|*
1952,Win10,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
1953,Windows 2003,451,Windows|Windows Desktop,580,Windows|*
1955,Windows 2003 Standard,452,Windows|Windows Server,580,Windows|*
1956,Windows 2008 Enterprise R2 x64,452,Windows|Windows Server,580,Windows|*
1960,WINDOWS 2008R2,452,Windows|Windows Server,580,Windows|*
1961,Microsoft Windows Server 2008 Standard Editio,452,Windows|Windows Server,121,MS SQL Server Compact
1962,MICROSOFT WINDOWS NT 2003,452,Windows|Windows Server,580,Windows|*
1967,Microsoft Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
1979,Windows 2008 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
1982,Windows 2003 R2,452,Windows|Windows Server,580,Windows|*
1983,Windows 2008 R2 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
1988,Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
1989,Windows 2012 Standard,452,Windows|Windows Server,580,Windows|*
1992,Windows 2008 R2 Standard 6.1.7601 Service Pack 1,452,Windows|Windows Server,580,Windows|*
1994,Windows 2008 Standard x64,452,Windows|Windows Server,580,Windows|*
1998,Windows 2012 R2 Standard 64-Bit,452,Windows|Windows Server,580,Windows|*
2007,w2k12,452,Windows|Windows Server,582,C#|*
2008,WINDOWS 2013,452,Windows|Windows Server,580,Windows|*
2009,WINDOWS 2016 SE 64 BIT,452,Windows|Windows Server,580,Windows|*
2011,Microsoft - Windows 2012,452,Windows|Windows Server,580,Windows|*
2019,MICROSOFT WINDOWS 2008 TPM,452,Windows|Windows Server,580,Windows|*
2021,MICROSOFT WINDOWS STD 2008,452,Windows|Windows Server,580,Windows|*
2025,Windows 2008 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2028,MICROSOFT WINDOWS STD 2008 TPM,452,Windows|Windows Server,580,Windows|*
2030,Windows 2012 64 Bit,452,Windows|Windows Server,580,Windows|*
2031,MICROSOFT WINDOWS NT 2003 ENT,452,Windows|Windows Server,580,Windows|*
2034,MICROSOFT WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
2036,Windows 2003 Standard5.2.3790,452,Windows|Windows Server,580,Windows|*
2040,Windows 2012 R,452,Windows|Windows Server,580,Windows|*
2044,Windows 2008 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
2045,MICROSOFT WINDOWS 2008 ENT,452,Windows|Windows Server,580,Windows|*
2047,Windows 2012 R2 Standard 6.3.9600,452,Windows|Windows Server,580,Windows|*
2053,Windows 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
2055,Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
2061,Windows 2016 Datacenter10.0.14393,452,Windows|Windows Server,637,Microsoft Azure
2065,windows6.3.9600,452,Windows|Windows Server,580,Windows|*
2066,Windows 2012 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2069,Windows 2008 Enterprise,452,Windows|Windows Server,580,Windows|*
2080,Windows 2008 Standard without Hyper-V6.0.6003,452,Windows|Windows Server,580,Windows|*
2084,Windows 2012 R2 Datacenter,452,Windows|Windows Server,110,Microsoft System Center Endpoint Protection
2089,Windows 2008 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
2096,Windows 2000,452,Windows|Windows Server,580,Windows|*
2097,W2K8R2 Standard 64 BIT,452,Windows|Windows Server,303,C
2099,Windows 2008 Standard6.0.6003,452,Windows|Windows Server,580,Windows|*
2100,Windows2008 R2 Enterprise 64bit,452,Windows|Windows Server,580,Windows|*
2105,Win2008R2,452,Windows|Windows Server,355,R
2107,Windows 2008 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2109,Windows Server 2003 Appliance,452,Windows|Windows Server,559,Virtual Appliance
2111,Windows 2008 ENT R2 (64 bits),452,Windows|Windows Server,355,R
2114,WIN2008R2 6.1.7601,452,Windows|Windows Server,355,R
2116,microsoft windows std 2012 tpm,452,Windows|Windows Server,580,Windows|*
2118,microsoft windows 2008,452,Windows|Windows Server,580,Windows|*
2120,Windows 2008 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
2121,Microsoft Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
2126,Window2008 R2,452,Windows|Windows Server,355,R
2130,Windows 2008 Standard,452,Windows|Windows Server,580,Windows|*
2134,WS03,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
2136,Windows 2008 Enterprise x64,452,Windows|Windows Server,580,Windows|*
2141,Windows 2008 R2 Enterprise,452,Windows|Windows Server,580,Windows|*
2142,Windows Server 2003 Std 32-bit,452,Windows|Windows Server,580,Windows|*
2143,Windows 2008 R2 Standard 64bit,452,Windows|Windows Server,580,Windows|*
2146,Microsoft Windows 2003 R2 Standard,452,Windows|Windows Server,580,Windows|*
2148,MICROSOFT WINDOWS NT 2003 TPM,452,Windows|Windows Server,580,Windows|*
2149,Win Server 2008,452,Windows|Windows Server,569,VMware Server
2150,Windows 2003 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2152,WIN2014,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2156,Win 2012 R2,452,Windows|Windows Server,355,R
2160,Win Server,452,Windows|Windows Server,12,Apache Subversion
2161,Windows 2008 Standard R2 x64,452,Windows|Windows Server,580,Windows|*
2163,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 6196 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
2164,Windows2012,452,Windows|Windows Server,580,Windows|*
2165,Windows 2008 R2 Standard6.1.7601,452,Windows|Windows Server,580,Windows|*
2166,Windows 2016,452,Windows|Windows Server,580,Windows|*
2167,Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
2179,Windows Server 2003 Std 64-bit,452,Windows|Windows Server,580,Windows|*
2180,Windows 2012 R2,452,Windows|Windows Server,580,Windows|*
2181,Wintel,452,Windows|Windows Server,461,IBM Tivoli Storage Manager|TSM Client
2191,Windows 2003 Enterprise5.2.3790,452,Windows|Windows Server,580,Windows|*
2192,WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
2193,Windows 2008 R2 OS,452,Windows|Windows Server,580,Windows|*
2196,Windows 2003 Standard R2,452,Windows|Windows Server,580,Windows|*
2197,Windows 2008 R2 Enterprise6.1.7601,452,Windows|Windows Server,580,Windows|*
2198,Windows 2003 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
2199,WINDOWS SERVER 2003 APPLIANCE 5.2,452,Windows|Windows Server,559,Virtual Appliance
2201,WS08R2,452,Windows|Windows Server,355,R
2204,Windows 2008 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
2213,w2k8r2sp1,452,Windows|Windows Server,355,R
2217,Win 2003,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2222,Windows 2012 R2 Standard,452,Windows|Windows Server,580,Windows|*
2226,Windows 2008 R2 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
2228,Windows 2003 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
2230,Windows 2012 Storage R2,452,Windows|Windows Server,580,Windows|*
2231,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 4096 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
2235,MICROSOFT WINDOWS NT 2003 ENT TPM,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
2237,Windows 2016 Standard10.0.14393,452,Windows|Windows Server,580,Windows|*
2240,MICROSOFT WINDOWS 2003,452,Windows|Windows Server,580,Windows|*
2242,Windows 2012 Standard R2,452,Windows|Windows Server,580,Windows|*
2246,Win Server 2008 R2,452,Windows|Windows Server,355,R
2248,MICROSOFT WINDOWS STD 2012 TPM,452,Windows|Windows Server,580,Windows|*
2249,Windows 2003 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
2250,Windows 2008 Enterprise R2,452,Windows|Windows Server,580,Windows|*
2251,Windows 2008,452,Windows|Windows Server,580,Windows|*
2252,Microsoft Microsoft Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
2257,Win Server 2012,452,Windows|Windows Server,569,VMware Server
2258,Windows 2016 Standard,452,Windows|Windows Server,580,Windows|*
2264,Windows 2008 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
2267,Windows 2003 Standard 5.2.3790 Service Pack 2,452,Windows|Windows Server,580,Windows|*
2268,Windows 2012 Standard6.2.9200,452,Windows|Windows Server,580,Windows|*
2269,MICROSOFT WINDOWS 2016 TPM,452,Windows|Windows Server,580,Windows|*
2272,Windows 2003 Enterprise,452,Windows|Windows Server,580,Windows|*
2275,Windows 2008 R2 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
2277,Windows 2012 R2 Standard6.3.9600,452,Windows|Windows Server,580,Windows|*
2286,Windows 2008 Standard R2,452,Windows|Windows Server,580,Windows|*
2287,MicrosoftWindows Server 2008 R2 (64-bit),452,Windows|Windows Server,443,OS/2
2288,windows6.3,452,Windows|Windows Server,580,Windows|*
2290,Windows 2016 64 Bit,452,Windows|Windows Server,580,Windows|*
2296,Windows 2008 Enterprise6.0.6003,452,Windows|Windows Server,580,Windows|*
2301,Win 2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2302,Win2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2303,Win2012R2,452,Windows|Windows Server,355,R
2305,win2008,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2306,Windows 2003 Standard x64,452,Windows|Windows Server,580,Windows|*
2315,WINDOWS 2016 STANDARD EDITION,452,Windows|Windows Server,580,Windows|*
2325,WinSCP.net - WinSCP 5.11,243,WinSCP,178,SAP SQL Anywhere
2332,Zerto Vritual Appliance,249,Zerto Virtual Replication,559,Virtual Appliance
2333,Oracle RTD,289,Oracle Real-Time Decisions (RTD),134,Oracle Database
2338,OMNIbus,251,Tivoli Netcool/OMNIbus,582,C#|*
2347,ALM,511,Application Lifecycle Management (ALM),421,DART
2349,BMS,513,Batch Management Software (BMS),442,OpenVMS
2354,COM,516,Compopent Object Model (COM),661,COM+
2357,CORBA Interface Definition Language,518,CORBA Interface Definition Language (CORBA IDL),517,Common Object Request Broker Architecture (CORBA)
2359,Data Control Language,519,Data Control Language (DCL),329,IBM i Control Language (CL)
2361,Database,520,Database (DB),43,DB2
2362,DB,520,Database (DB),43,DB2
2365,Electronic Data Interchange,521,Electronic Data Interchange (EDI),104,Microsoft Exchange Server
2369,JDOM,523,Java-based Document Object Model for XML (JDOM),84,IMS DB
2381,Simple Object Access Protocol,531,Simple Object Access Protocol (SOAP),547,Internet Message Access Protocol (IMAP)
2383,SQL,572,Structured Query Language (SQL),581,MS SQL Server|*
2386,DPE,538,Device Provisioning Engines (DPE),661,COM+
2388,ESB,540,Enterprise Service Bus(ESB),370,Visual Basic
2395,MES,553,Manufacturing Execution System (MES),623,Amazon S3
2401,Z/Virtual System Environment,591,z/VSE,441,MVS|z/OS
2403,DOS/VSE,591,z/VSE,597,DOS/360
2404,Microsoft Disk Operating System,593,MS-DOS,443,OS/2
2407,VME/B,595,VME,368,VB.NET
2408,Virtual Machine Environment,595,VME,111,Microsoft Visual Studio
2409,VME 2900,595,VME,107,Microsoft Internet Explorer
2410,OpenVME,595,VME,442,OpenVMS
2411,Disk Operating System/360,597,DOS/360,443,OS/2
2413,Transaction Processing Facility,598,z/TPF,572,Structured Query Language (SQL)
2419,NPL,653,Natural Programming Language,342,Niakwa Programming Language (NPL)
2426,IDMS/DB Data Manipulation Language,668,IDMS DML,312,Data Language Interface (DL/I)
2433,Basic Mapping Supprt,689,BMS Map,21,Business Intelligence and Reporting Tools (BIRT)
2434,DB/400,690,DB400,43,DB2
2435,IBM ISAM,693,ISAM,73,IBM Operational Decision Manager (ODM)
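Each row above pairs a raw mention string with two class id/name pairs; by column position the first pair reads as the ground-truth entity and the second as the model's prediction, though the dump carries no header row, so those roles are inferred. A minimal pandas sketch for ranking the most frequent confusion pairs under that assumption (the filename classification_errors.csv and the column names are hypothetical):

# %%
import pandas as pd

# Column names are assumptions; the dump itself has no header row.
cols = ["mention_id", "mention", "true_id", "true_label", "pred_id", "pred_label"]
errors = pd.read_csv("classification_errors.csv", header=None, names=cols)

# Rank (true, predicted) label pairs by how often they are confused.
confusions = (
    errors.groupby(["true_label", "pred_label"])
    .size()
    .sort_values(ascending=False)
)
print(confusions.head(10))
# %%

One pattern is visible even without aggregating: a large block of mentions whose gold class is a specific subclass (e.g. Windows|Windows Server) is predicted as the wildcard parent (Windows|*), so a hierarchy-aware metric would count many of these as near-misses rather than hard errors.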
@@ -32,21 +32,25 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 def generate_acronym(text):
 
     # Define prompt
-    prompt = f"Answer concisely: make a possible acronym from the following: '{text}'"
+    # prompt = f"Imagine you are a diverse database. Given the following: '{text}', please suggest to me 5 possible variations. Give 5."
+    prompt = f"Give me a list of 10 historical product names related to: '{text}'. Format the output in a list, like this 1. Item, 2. Item, 3. ..."
 
     # Generate acronym
     inputs = tokenizer(prompt, return_tensors="pt")
     inputs = inputs.to("cuda")
     outputs = model.generate(
         inputs["input_ids"],
-        max_length=100,
-        no_repeat_ngram_size=3)
+        max_length=200,
+        do_sample=True,
+        top_k=50,
+        temperature=0.8)
+        # no_repeat_ngram_size=3)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # %%
 # Example usage
 # text = "Advanced Data Analytics Platform"
-text = "red hat enterprise linux"
+text = "windows desktop"
 acronym = generate_acronym(text)
-print(f"Acronym: {acronym}")
+print(f"Generation: {acronym}")
 # %%
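The hunk swaps greedy decoding plus an n-gram repetition penalty for stochastic decoding: do_sample=True draws each token from the model's distribution, top_k=50 truncates that distribution to the 50 most likely tokens, and temperature=0.8 makes it slightly peakier (values below 1 sharpen it), which yields more varied surface forms for augmentation. A self-contained sketch of the updated cell, assuming a seq2seq checkpoint such as google/flan-t5-base (the checkpoint actually used is loaded earlier in the file and is not shown in this hunk):

# %%
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/flan-t5-base"  # assumption; substitute the real checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def generate_acronym(text):
    # The name is historical; the prompt now asks for related product names.
    prompt = (
        f"Give me a list of 10 historical product names related to: '{text}'. "
        "Format the output in a list, like this 1. Item, 2. Item, 3. ..."
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=200,
        do_sample=True,   # sample instead of taking the argmax at each step
        top_k=50,         # restrict sampling to the 50 most likely tokens
        temperature=0.8,  # temperature < 1 makes the distribution peakier
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Generation: {generate_acronym('windows desktop')}")
# %%

Because sampling is stochastic, each call returns a different candidate list, which is the point of using it for augmentation; calling torch.manual_seed first restores reproducibility when needed.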