From 5312cfa06f0c542582041186d549f7955df3ad97 Mon Sep 17 00:00:00 2001
From: Richard Wong
Date: Wed, 15 Jan 2025 20:09:15 +0900
Subject: [PATCH] added more augmentations to finally beat sota

- class_bert_augmentation is now the reference training code
---
 analysis/corrupt_text.py                      |  41 ++
 analysis/entity_hierarchy.py                  |  95 ---
 analysis/error_analysis.py                    |  33 +-
 analysis/label_acronym.py                     |  62 ++
 esAppMod_data_import/.gitignore               |   5 +
 esAppMod_data_import/entity_hierarchy.py      | 124 ++++
 .../entity_hierarchy_for_seq2seq.py           | 129 ++++
 .../.gitignore                                |   0
 .../prediction/.gitignore                     |   0
 .../prediction/output.txt                     |   8 +-
 .../prediction/predict.py                     |  15 +-
 train/class_bert_augmentation/train.py        | 562 ++++++++++++++++
 train/class_bert_hierarchical/.gitignore      |   2 +
 .../prediction/.gitignore                     |   1 +
 .../prediction/output.txt                     |  11 +
 .../prediction/output_1.txt                   |   6 +
 .../prediction/output_2.txt                   |   6 +
 .../prediction/predict_1.py                   | 265 ++++
 .../prediction/predict_2.py                   | 265 ++++
 .../train_1.py}                               | 234 ++++---
 train/class_bert_hierarchical/train_2.py      | 469 +++++++++++++
 .../mapping_prediction/output.txt             |   2 -
 .../.gitignore                                |   0
 .../inference.py                              |   6 +-
 train/seq2seq_t5_simple/prediction/output.txt |   2 +
 .../predict.py                                |  13 +-
 train/seq2seq_t5_simple/train.py              |   6 +-
 zero_shot/bloom.py                            |  11 +-
 zero_shot/conceptnet.py                       |  21 +
 zero_shot/dbpedia.py                          |  38 ++
 zero_shot/error.csv                           | 626 ++++++++++++++++++
 zero_shot/{t5.py => flan-t5.py}               |  14 +-
 32 files changed, 2837 insertions(+), 235 deletions(-)
 create mode 100644 analysis/corrupt_text.py
 delete mode 100644 analysis/entity_hierarchy.py
 create mode 100644 analysis/label_acronym.py
 create mode 100644 esAppMod_data_import/entity_hierarchy.py
 create mode 100644 esAppMod_data_import/entity_hierarchy_for_seq2seq.py
 rename train/{class_bert_process => class_bert_augmentation}/.gitignore (100%)
 rename train/{class_bert_process => class_bert_augmentation}/prediction/.gitignore (100%)
 rename train/{class_bert_process => class_bert_augmentation}/prediction/output.txt (53%)
 rename train/{class_bert_process => class_bert_augmentation}/prediction/predict.py (94%)
 create mode 100644 train/class_bert_augmentation/train.py
 create mode 100644 train/class_bert_hierarchical/.gitignore
 create mode 100644 train/class_bert_hierarchical/prediction/.gitignore
 create mode 100644 train/class_bert_hierarchical/prediction/output.txt
 create mode 100644 train/class_bert_hierarchical/prediction/output_1.txt
 create mode 100644 train/class_bert_hierarchical/prediction/output_2.txt
 create mode 100644 train/class_bert_hierarchical/prediction/predict_1.py
 create mode 100644 train/class_bert_hierarchical/prediction/predict_2.py
 rename train/{class_bert_process/train.py => class_bert_hierarchical/train_1.py} (59%)
 create mode 100644 train/class_bert_hierarchical/train_2.py
 delete mode 100644 train/seq2seq_t5_simple/mapping_prediction/output.txt
 rename train/seq2seq_t5_simple/{mapping_prediction => prediction}/.gitignore (100%)
 rename train/seq2seq_t5_simple/{mapping_prediction => prediction}/inference.py (96%)
 create mode 100644 train/seq2seq_t5_simple/prediction/output.txt
 rename train/seq2seq_t5_simple/{mapping_prediction => prediction}/predict.py (79%)
 create mode 100644 zero_shot/conceptnet.py
 create mode 100644 zero_shot/dbpedia.py
 create mode 100644 zero_shot/error.csv
 rename zero_shot/{t5.py => flan-t5.py} (72%)

diff --git a/analysis/corrupt_text.py b/analysis/corrupt_text.py
new file mode 100644
index 0000000..859b681
--- /dev/null
+++ b/analysis/corrupt_text.py
@@ -0,0 +1,41 @@
+# %%
+import random
+import string
+
+def corrupt_word(word):
+    """Corrupt a single word using random corruption techniques."""
+    if len(word) <= 1:  # Skip corruption for single-character words
+        return word
+
+    corruption_type = random.choice(["delete", "swap"])
+
+    if corruption_type == "delete":
+        # Randomly delete a character
+        idx = random.randint(0, len(word) - 1)
+        word = word[:idx] + word[idx + 1:]
+
+    elif corruption_type == "swap":
+        # Swap two adjacent characters
+        if len(word) > 1:
+            idx = random.randint(0, len(word) - 2)
+            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
+
+
+    return word
+
+def corrupt_string(sentence, corruption_probability=0.01):
+    """Corrupt each word in the string with a given probability."""
+    words = sentence.split()
+    corrupted_words = [
+        corrupt_word(word) if random.random() < corruption_probability else word
+        for word in words
+    ]
+    return " ".join(corrupted_words)
+
+# Example usage
+sentence = "This is a simple string for testing"
+corrupted_sentence = corrupt_string(sentence, corruption_probability=0.1)
+print("Original:", sentence)
+print("Corrupted:", corrupted_sentence)
+
+# %%
diff --git a/analysis/entity_hierarchy.py b/analysis/entity_hierarchy.py
deleted file mode 100644
index 937aa59..0000000
--- a/analysis/entity_hierarchy.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# %%
-import json
-import pandas as pd
-
-##########################################
-# %%
-
-# Load the JSON file
-data_path = '../esAppMod/tca_entities.json'
-with open(data_path, 'r') as file:
-    data = json.load(file)
-
-# Initialize an empty list to store the rows
-rows = []
-
-# %%
-# Loop through all entities in the JSON
-for entity in data["data"].items():
-    entity_data = entity[1]
-    entity_id = entity_data['entity_id']
-    entity_name = entity_data['entity_name']
-    entity_type_id = entity_data['entity_type_id']
-    entity_type_name = entity_data['entity_type_name']
-
-    # Add each mention and its entity_id to the rows list
-    rows.append(
-        {
-            'id': entity_id,
-            'name': entity_name,
-            'type_id': entity_type_id,
-            'type_name': entity_type_name
-        })
-
-# Create a DataFrame from the rows
-df = pd.DataFrame(rows)
-
-# %%
-# df.to_csv('entity.csv', index=False)
-df
-
-# %%
-df['type_name'].value_counts()
-# %%
-df['type_id'].value_counts()
-
-# %%
-name_list = df['name'].to_list()
-# %%
-name_list
-
-# %%
-from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
-import numpy as np
-
-# %%
-# Define labels
-labels = name_list
-
-# Create a prefix-based distance matrix
-def prefix_distance(label1, label2):
-    prefix1 = label1.split()
-    prefix2 = label2.split()
-    # Find common prefix length
-    common_prefix_length = len([w1 for w1, w2 in zip(prefix1, prefix2) if w1 == w2])
-    # Distance is inversely proportional to common prefix length
-    return 1.0 / (common_prefix_length + 1)
-
-# Create a pairwise distance matrix
-n = len(labels)
-distance_matrix = np.zeros((n, n))
-for i in range(n):
-    for j in range(n):
-        distance_matrix[i, j] = prefix_distance(labels[i], labels[j])
-
-# Perform hierarchical clustering
-linkage_matrix = linkage(distance_matrix, method='average')
-
-# Visualize as a dendrogram
-import matplotlib.pyplot as plt
-dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
-plt.title("Prefix-Based Clustering")
-plt.show()
-
-# %%
-linkage_matrix
-# %%
-# Extract flat clusters with a distance threshold
-threshold = 0.5
-clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')
-
-# Display clusters
-for i, cluster_id in enumerate(clusters):
-    print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")
-
-# %%
diff --git a/analysis/error_analysis.py b/analysis/error_analysis.py
index 40e1310..0df5bb4 100644
--- a/analysis/error_analysis.py
+++ b/analysis/error_analysis.py
@@ -3,53 +3,55 @@ import pandas as pd
 # %%
 # import training file
-data_path = '../data_import/train.csv'
+data_path = '../esAppMod_data_import/train.csv'
+# data_path = '../esAppMod_data_import/parent_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 
 # import test file
-data_path = '../data_import/test.csv'
+data_path = '../esAppMod_data_import/test.csv'
+# data_path = '../esAppMod_data_import/parent_test.csv'
 test_df = pd.read_csv(data_path, skipinitialspace=True)
 
 # import entity file
-data_path = '../data_import/entity.csv'
+data_path = '../esAppMod_data_import/entity.csv'
 entity_df = pd.read_csv(data_path, skipinitialspace=True)
 id2label = {}
 for _, row in entity_df.iterrows():
     id2label[row['id']] = row['name']
 
-# %%
 train_df.sort_values(by=['entity_id']).to_markdown('out.md')
 
 # %%
-data_path = '../train/class_bert_process/prediction/exports/result.csv'
+data_path = '../train/class_bert_augmentation/prediction/exports/result.csv'
 prediction_df = pd.read_csv(data_path)
 
-# %%
 predicted_entity_list = []
 for element in prediction_df['class_prediction']:
     predicted_entity_list.append(id2label[element])
 
 prediction_df['predicted_name'] = predicted_entity_list
-# %%
 new_df = pd.concat((test_df, prediction_df ), axis=1)
-
-# %%
 mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
 mismatch_df = new_df[mismatch_mask]
-
-# %%
 len(mismatch_df)
 
 # %%
 # print the top 10 offending classes
+# mask1 = mismatch_df['entity_id'] != 434
+# mask2 = mismatch_df['entity_id'] != 451
+# mask3 = mismatch_df['entity_id'] != 452
+# mask= mask1 & mask2 & mask3
+# masked_df = mismatch_df[mask]
+# print(masked_df['entity_id'].value_counts()[:10])
 print(mismatch_df['entity_id'].value_counts()[:10])
+masked_df = mismatch_df
 
 # %%
 # Convert the whole dataframe as a string and display
 # print the mismatch_df
-print(mismatch_df.sort_values(by=['entity_id']).to_markdown())
+print(masked_df.sort_values(by=['entity_id']).to_markdown())
 
 # %%
 mismatch_df.to_csv('error.csv')
@@ -62,14 +64,9 @@ mismatch_df[select_mask]
 
 # %%
 # let us see the train mentions
-select_value = 452
+select_value = 130
 select_mask = train_df['entity_id'] == select_value
 train_df[select_mask]
-
-# %%
-mismatch_df[select_mask]['class_prediction'].to_list()
-
-# %%
-# %%
diff --git a/analysis/label_acronym.py b/analysis/label_acronym.py
new file mode 100644
index 0000000..be1d2ad
--- /dev/null
+++ b/analysis/label_acronym.py
@@ -0,0 +1,62 @@
+# %%
+import pandas as pd
+import re
+
+# %%
+# import training file
+data_path = '../esAppMod_data_import/train.csv'
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+
+# import test file
+data_path = '../esAppMod_data_import/test.csv'
+test_df = pd.read_csv(data_path, skipinitialspace=True)
+
+# import entity file
+data_path = '../esAppMod_data_import/entity.csv'
+entity_df = pd.read_csv(data_path, skipinitialspace=True)
+id2label = {}
+for _, row in entity_df.iterrows():
+    id2label[row['id']] = row['name']
+
+
+# %%
+train_df
+# %%
+
+def extract_acronym_mapping(names):
+    mapping = {}
+    for name in names:
+        # Find acronym in parentheses
+        match = re.search(r"\((\w+)\)", name)
+        if match:
+            acronym = match.group(1)
+
+            # Remove unrelated prepended terms
+            core_term = re.sub(r"^([\w\s]+)\s*\(\w+\)$", r"\1", name).strip()
+
+            # Add to dictionary
+            mapping[acronym] = core_term
+    return mapping
+
+names = set(train_df['entity_name'].to_list())
+
+# Extract mappings
+acronym_mapping = extract_acronym_mapping(names)
+print(acronym_mapping)
+# %%
+del acronym_mapping['E'] # too many false matches
+acronym_mapping = {key.lower():value.lower() for key, value in acronym_mapping.items()}
+
+abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
+term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
+
+
+# %%
+abbrev_to_term
+# %%
+term_to_abbrev
+
+# %%
+acronym_mapping
+# %%
diff --git a/esAppMod_data_import/.gitignore b/esAppMod_data_import/.gitignore
index e69de29..ab11419 100644
--- a/esAppMod_data_import/.gitignore
+++ b/esAppMod_data_import/.gitignore
@@ -0,0 +1,5 @@
+out.md
+parent_test.csv
+parent_train.csv
+test_seq.csv
+train_seq.csv
diff --git a/esAppMod_data_import/entity_hierarchy.py b/esAppMod_data_import/entity_hierarchy.py
new file mode 100644
index 0000000..0878d46
--- /dev/null
+++ b/esAppMod_data_import/entity_hierarchy.py
@@ -0,0 +1,124 @@
+# %%
+import json
+import pandas as pd
+
+##########################################
+# %%
+# import training file
+data_path = '../esAppMod_data_import/train.csv'
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+# import entity file
+# Keep only one row per unique value in 'column1'
+unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
+id2label = {}
+for _, row in unique_df.iterrows():
+    id2label[row['entity_id']] = row['entity_name']
+
+inverse_dict = {value:key for key,value in id2label.items()}
+# %%
+# Create a new dictionary with sorted keys
+# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
+sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}
+
+# %%
+sorted_dict
+
+# %%
+rule_set ={
+    '.NET': [497,482,484,487,485,486,483],
+    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
+    'C++': [583,306],
+    'CA': [290,22,23,24,25],
+    'CSS': [307,377],
+    'Cisco': [28,420,29],
+    'Citrix': [563,565,31,292,291,564,32,30],
+    'coldfusion': [311,37],
+    'eclipse': [46,622,641,456],
+    'xml': [596, 318],
+    'xsl': [319,320],
+    'HP': [59,293,60,61,58],
+    'http': [505,543],
+    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
+    'IBM BigFix': [62,457],
+    'IBM ILOG': [253,255,254,256,252],
+    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
+    'IBM WebSphere': [80,82,83,81],
+    'IBM i': [424,329],
+    'IDMS': [667,668],
+    'IIS': [609,490,489,491],
+    'JBoss': [268,492,493],
+    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
+    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
+    'KVS': [549,550,551],
+    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
+    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
+    'MVS': [577,440,441],
+    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
+    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
+    'Oracle WebLogic': [600,233],
+    'Oracle Application Server': [610,494],
+    'Oracle Database': [134,474,475,478],
+    'Oracle Hyperion': [607,138,139],
+    'Oracle WebCenter': [276,495],
+    'Pascal': [599,346],
+    'Perl': [585,348,417,349],
+    'ProjectWise': [161,162],
+    'Rational': [166,167],
+    'SAP': [173,175,695,176,676,178,179],
+    'SAP ERP': [174,476,477],
+    'SAP NetWeaver': [279,496,177],
+    'Sybase SQL Server': [190,479,480],
+    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
+    'TIBCO': [218,219],
+    'TIBCO Business Works': [217,481],
+    'Tivoli': [220,251],
+    'Tortoise': [221,222],
+    'Unix': [578,445,579,447,602,590,448,449],
+    'VB': [368,369],
+    'VMware': [568,569,229,230,231],
+    'Visual Basic': [370,371,372],
+    'WebSphere': [234,285,235,286,284,601,287],
+    'Windows': [580,238,239,451,452],
+    'z': [598,608,591]
+
+}
+
+# %%
+# iterate through the whole training set
+new_df = train_df.copy()
+for idx, row in train_df.iterrows():
+    # we iterate through each rule set, replacing any matching values in the
+    # list with the first element of the list
+    for key in rule_set.keys():
+        id = row['entity_id']
+        if (id in rule_set[key]):
+            new_df.loc[idx,('entity_id')] = rule_set[key][0]
+# %%
+len(set(new_df['entity_id'].to_list()))
+
+# %%
+new_df.to_csv('parent_train.csv')
+
+# %%
+# now do the same for the test data
+# import training file
+data_path = '../esAppMod_data_import/test.csv'
+test_df = pd.read_csv(data_path, skipinitialspace=True)
+
+new_df = test_df.copy()
+for idx, row in test_df.iterrows():
+    # we iterate through each rule set, replacing any matching values in the
+    # list with the first element of the list
+    for key in rule_set.keys():
+        id = row['entity_id']
+        if (id in rule_set[key]):
+            new_df.loc[idx,('entity_id')] = rule_set[key][0]
+
+# %%
+new_df
+
+# %%
+new_df.to_csv('parent_test.csv')
+# %%
diff --git a/esAppMod_data_import/entity_hierarchy_for_seq2seq.py b/esAppMod_data_import/entity_hierarchy_for_seq2seq.py
new file mode 100644
index 0000000..ab5533c
--- /dev/null
+++ b/esAppMod_data_import/entity_hierarchy_for_seq2seq.py
@@ -0,0 +1,129 @@
+# %%
+import json
+import pandas as pd
+
+##########################################
+# %%
+# import training file
+data_path = '../esAppMod_data_import/train.csv'
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+# import entity file
+# Keep only one row per unique value in 'column1'
+unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
+id2label = {}
+for _, row in unique_df.iterrows():
+    id2label[row['entity_id']] = row['entity_name']
+
+inverse_dict = {value:key for key,value in id2label.items()}
+# %%
+# Create a new dictionary with sorted keys
+# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
+sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}
+
+# %%
+sorted_dict
+
+# %%
+rule_set ={
+    '.NET': [497,482,484,487,485,486,483],
+    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
+    'C++': [583,306],
+    'CA': [290,22,23,24,25],
+    'CSS': [307,377],
+    'Cisco': [28,420,29],
+    'Citrix': [563,565,31,292,291,564,32,30],
+    'coldfusion': [311,37],
+    'eclipse': [46,622,641,456],
+    'xml': [596, 318],
+    'xsl': [319,320],
+    'HP': [59,293,60,61,58],
+    'http': [505,543],
+    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
+    'IBM BigFix': [62,457],
+    'IBM ILOG': [253,255,254,256,252],
+    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
+    'IBM WebSphere': [80,82,83,81],
+    'IBM i': [424,329],
+    'IDMS': [667,668],
+    'IIS': [609,490,489,491],
+    'JBoss': [268,492,493],
+    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
+    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
+    'KVS': [549,550,551],
+    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
+    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
+    'MVS': [577,440,441],
+    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
+    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
+    'Oracle WebLogic': [600,233],
+    'Oracle Application Server': [610,494],
+    'Oracle Database': [134,474,475,478],
+    'Oracle Hyperion': [607,138,139],
+    'Oracle WebCenter': [276,495],
+    'Pascal': [599,346],
+    'Perl': [585,348,417,349],
+    'ProjectWise': [161,162],
+    'Rational': [166,167],
+    'SAP': [173,175,695,176,676,178,179],
+    'SAP ERP': [174,476,477],
+    'SAP NetWeaver': [279,496,177],
+    'Sybase SQL Server': [190,479,480],
+    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
+    'TIBCO': [218,219],
+    'TIBCO Business Works': [217,481],
+    'Tivoli': [220,251],
+    'Tortoise': [221,222],
+    'Unix': [578,445,579,447,602,590,448,449],
+    'VB': [368,369],
+    'VMware': [568,569,229,230,231],
+    'Visual Basic': [370,371,372],
+    'WebSphere': [234,285,235,286,284,601,287],
+    'Windows': [580,238,239,451,452],
+    'z': [598,608,591]
+
+}
+
+# %%
+# iterate through the whole training set
+new_df = train_df.copy()
+for idx, row in train_df.iterrows():
+    # we iterate through each rule set, replacing any matching values in the
+    # list with the first element of the list
+    for key in rule_set.keys():
+        id = row['entity_id']
+        if (id in rule_set[key]):
+            stem = rule_set[key][0]
+            leaf = rule_set[key].index(id)
+            new_df.loc[idx,('entity_seq')] = f"{stem}_{leaf}"
+# %%
+len(set(new_df['entity_seq'].to_list()))
+
+# %%
+new_df.to_csv('train_seq.csv')
+
+# %%
+# now do the same for the test data
+# import training file
+data_path = '../esAppMod_data_import/test.csv'
+test_df = pd.read_csv(data_path, skipinitialspace=True)
+
+new_df = test_df.copy()
+for idx, row in test_df.iterrows():
+    # we iterate through each rule set, replacing any matching values in the
+    # list with the first element of the list
+    for key in rule_set.keys():
+        id = row['entity_id']
+        if (id in rule_set[key]):
+            stem = rule_set[key][0]
+            leaf = rule_set[key].index(id)
+            new_df.loc[idx,('entity_seq')] = f"{stem}_{leaf}"
+
+
+# %%
+new_df
+
+# %%
+new_df.to_csv('test_seq.csv')
+# %%
diff --git a/train/class_bert_process/.gitignore b/train/class_bert_augmentation/.gitignore
similarity index 100%
rename from train/class_bert_process/.gitignore
rename to train/class_bert_augmentation/.gitignore
diff --git a/train/class_bert_process/prediction/.gitignore b/train/class_bert_augmentation/prediction/.gitignore
similarity index 100%
rename from train/class_bert_process/prediction/.gitignore
rename to train/class_bert_augmentation/prediction/.gitignore
diff --git a/train/class_bert_process/prediction/output.txt b/train/class_bert_augmentation/prediction/output.txt
similarity index 53%
rename from train/class_bert_process/prediction/output.txt
rename to train/class_bert_augmentation/prediction/output.txt
index 6d2e300..8a8215b 100644
--- a/train/class_bert_process/prediction/output.txt
+++ b/train/class_bert_augmentation/prediction/output.txt
@@ -1,6 +1,6 @@
 
 *******************************************************************************
-Accuracy: 0.77655
-F1 Score: 0.79605
-Precision: 0.85637
-Recall: 0.77655
+Accuracy: 0.80197
+F1 Score: 0.81948
+Precision: 0.88067
+Recall: 0.80197
diff --git a/train/class_bert_process/prediction/predict.py b/train/class_bert_augmentation/prediction/predict.py
similarity index 94%
rename from train/class_bert_process/prediction/predict.py
rename to train/class_bert_augmentation/prediction/predict.py
index 96c7496..12b1954 100644
--- a/train/class_bert_process/prediction/predict.py
+++ b/train/class_bert_augmentation/prediction/predict.py
@@ -32,6 +32,8 @@ torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 
 # %%
+# construct the target id list
+# data_path = '../../../esAppMod_data_import/train.csv'
 data_path = '../../../esAppMod_data_import/train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # rather than use pattern, we use the real thing and property
@@ -51,20 +53,9 @@ for idx, val in enumerate(target_id_list):
 def preprocess_text(text):
     # 1. Make all uppercase
     text = text.lower()
-
-    # Remove any non alphanumeric character
-    # text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces
-    text = re.sub(r"[-;:]", " ", text)
-
-    # Add space between digit followed by a letter
-    text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
-
-    # Add space between letter followed by a digit
-    text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
-
     # Substitute digits with '#'
-    text = re.sub(r'\d+', 'x', text)
+    # text = re.sub(r'\d+', '#', text)
 
     # standardize spacing
     text = re.sub(r'\s+', ' ', text).strip()
 
diff --git a/train/class_bert_augmentation/train.py b/train/class_bert_augmentation/train.py
new file mode 100644
index 0000000..7a90289
--- /dev/null
+++ b/train/class_bert_augmentation/train.py
@@ -0,0 +1,562 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import re
+import random
+
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+    Trainer,
+    EarlyStoppingCallback,
+    TrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+# %%
+def set_seed(seed):
+    """
+    Set the random seed for reproducibility.
+    """
+    random.seed(seed)  # Python random module
+    np.random.seed(seed)  # NumPy random
+    torch.manual_seed(seed)  # PyTorch CPU
+    torch.cuda.manual_seed(seed)  # PyTorch GPU
+    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs
+    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
+    torch.backends.cudnn.benchmark = False  # Disable optimization for reproducibility
+
+set_seed(42)
+
+SHUFFLES=10
+
+# %%
+
+# import training file
+data_path = '../../esAppMod_data_import/train.csv'
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+# rather than use pattern, we use the real thing and property
+entity_ids = train_df['entity_id'].to_list()
+target_id_list = sorted(list(set(entity_ids)))
+
+
+# %%
+id2label = {}
+label2id = {}
+for idx, val in enumerate(target_id_list):
+    id2label[idx] = val
+    label2id[val] = idx
+
+# %%
+# introduce pre-processing functions
+def preprocess_text(text):
+
+    # 1. Make all lowercase
+    text = text.lower()
+
+    # Substitute digits with '#'
+    # text = re.sub(r'\d+', '#', text)
+
+    # standardize spacing
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    return text
+
+
+def generate_random_shuffles(text, n):
+    """
+    Generate n strings with randomly shuffled words from the input text.
+
+    Args:
+        text (str): The input text.
+        n (int): The number of random variations to generate.
+
+    Returns:
+        list: A list of strings with shuffled words.
+    """
+    words = text.split()  # Split the input into words
+    shuffled_variations = []
+
+    for _ in range(n):
+        shuffled = words[:]  # Copy the word list to avoid in-place modification
+        random.shuffle(shuffled)  # Randomly shuffle the words
+        shuffled_variations.append(" ".join(shuffled))  # Join the words back into a string
+
+    return shuffled_variations
+
+
+# generate n more shuffled examples
+def shuffle_text(text, n_shuffles=SHUFFLES):
+    """
+    Preprocess the input text and add n random shuffles of it.
+
+    Args:
+        text (str): The input string.
+        n_shuffles (int): Number of random shuffles to generate for each string.
+
+    Returns:
+        list: A list of preprocessed and shuffled strings.
+    """
+    all_processed = []
+    # add the original text
+    all_processed.append(text)
+
+    # Generate random shuffles
+    shuffled_variations = generate_random_shuffles(text, n_shuffles)
+    all_processed.extend(shuffled_variations)
+
+    return all_processed
+
+acronym_mapping = {
+    'hpsa': 'hp server automation',
+    'tam': 'tivoli access manager',
+    'adf': 'application development facility',
+    'html': 'hypertext markup language',
+    'wff': 'microsoft web farm framework',
+    'jsp': 'javaserver pages',
+    'bw': 'business works',
+    'ssrs': 'sql server reporting services',
+    'cl': 'control language',
+    'vba': 'visual basic for applications',
+    'esapi': 'enterprise security api',
+    'gwt': 'google web toolkit',
+    'pki': 'perkin elmer informatics',
+    'rtd': 'oracle realtime decisions',
+    'jms': 'java message service',
+    'db': 'database',
+    'soa': 'service oriented architecture',
+    'xsl': 'extensible stylesheet language',
+    'com': 'component object model',
+    'ldap': 'lightweight directory access protocol',
+    'odm': 'ibm operational decision manager',
+    'soql': 'salesforce object query language',
+    'oms': 'order management system',
+    'cfml': 'coldfusion markup language',
+    'nas': 'netscape application server',
+    'sql': 'structured query language',
+    'bde': 'borland database engine',
+    'imap': 'internet message access protocol',
+    'uws': 'ultidev web server',
+    'birt': 'business intelligence and reporting tools',
+    'mdw': 'model driven workflow',
+    'tws': 'tivoli workload scheduler',
+    'jre': 'java runtime environment',
+    'wcs': 'websphere commerce suite',
+    'was': 'websphere application server',
+    'ssis': 'sql server integration services',
+    'xhtml': 'extensible hypertext markup language',
+    'soap': 'simple object access protocol',
+    'san': 'storage area network',
+    'elk': 'elastic stack',
+    'arr': 'application request routing',
+    'xlst': 'extensible stylesheet language transformations',
+    'sccm': 'microsoft endpoint configuration manager',
+    'ejb': 'enterprise java beans',
+    'css': 'cascading style sheets',
+    'hpoo': 'hp operations orchestration',
+    'xml': 'extensible markup language',
+    'esb': 'enterprise service bus',
+    'edi': 'electronic data interchange',
+    'imsva': 'interscan messaging security virtual appliance',
+    'wtx': 'ibm websphere transformation extender',
+    'cgi': 'common gateway interface',
+    'bal': 'ibm basic assembly language',
+    'issow': 'integrated safe system of work',
+    'dcl': 'data control language',
+    'jdom': 'java document object model',
+    'fim': 'microsoft forefront identity manager',
+    'npl': 'niakwa programming language',
+    'wf': 'windows workflow foundation',
+    'lm': 'etap license manager',
+    'wts': 'windows terminal server',
+    'asp': 'active server pages',
+    'jil': 'job information language',
+    'mvc': 'model view controller',
+    'rmi': 'remote method invocation',
+    'ad': 'active directory',
+    'owb': 'oracle warehouse builder',
+    'rest': 'representational state transfer',
+    'jdk': 'java development kit',
+    'ids': 'integrated data store',
+    'bms': 'batch management software',
+    'vsx': 'vmware solution exchange',
+    'ssas': 'sql server analysis services',
+    'atl': 'atlas transformation language',
+    'ice': 'infobright community edition',
+    'esql': 'extended structured query language',
+    'corba': 'common object request broker architecture',
+    'dpe': 'device provisioning engines',
+    'rac': 'oracle real application clusters',
+    'iemt': 'iis easy migration tool',
+    'mes': 'manufacturing execution system',
+    'odbc': 'open database connectivity',
+    'lms': 'lan management solution',
+    'wcf': 'windows communication foundation',
+    'nes': 'netscape enterprise server',
+    'jsf': 'javaserver faces',
+    'alm': 'application lifecycle management',
+    'hlasm': 'high level assembler',
+    'cmod': 'content manager ondemand'}
+
+external_source = {
+    'vb.net': 'visual basic dot net',
+    'jes': 'job entry subsystem',
+    'svn': 'subversion',
+    'vcs': 'version control system',
+    'lims': 'laboratory information management system',
+    'ide': 'integrated development environment',
+    'sdk': 'software development kit',
+    'mq': 'message queue',
+    'ims': 'information management system',
+    'isa': 'internet security and acceleration',
+    'vs': 'visual studio',
+    'esr': 'extended support release',
+    'ff': 'firefox',
+    'vb': 'visual basic',
+    'rhel': 'red hat enterprise linux',
+    'iis': 'internet information server',
+    'api': 'application programming interface',
+    'se': 'standard edition',
+    '\.net': 'dot net',
+    'c#': 'c sharp'
+}
+
+
+# synonyms = {
+#     'windows server': 'windows nt',
+#     'windows 7': 'windows desktop',
+#     'windows 8': 'windows desktop',
+#     'windows 10': 'windows desktop'
+# }
+
+
+# add more information
+acronym_mapping.update(external_source)
+
+
+abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
+term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
+
+def replace_terms_with_abbreviations(text):
+    for input, replacement in term_to_abbrev.items():
+        text = re.sub(input, replacement, text)
+    return text
+
+def replace_abbreviations_with_terms(text):
+    for input, replacement in abbrev_to_term.items():
+        text = re.sub(input, replacement, text)
+    return text
+
+######################################
+
+# augmentation by text corruption
+
+def corrupt_word(word):
+    """Corrupt a single word using random corruption techniques."""
+    if len(word) <= 1:  # Skip corruption for single-character words
+        return word
+
+    corruption_type = random.choice(["delete", "swap"])
+
+    if corruption_type == "delete":
+        # Randomly delete a character
+        idx = random.randint(0, len(word) - 1)
+        word = word[:idx] + word[idx + 1:]
+
+    elif corruption_type == "swap":
+        # Swap two adjacent characters
+        if len(word) > 1:
+            idx = random.randint(0, len(word) - 2)
+            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
+
+
+    return word
+
+def corrupt_string(sentence, corruption_probability=0.01):
+    """Corrupt each word in the string with a given probability."""
+    words = sentence.split()
+    corrupted_words = [
+        corrupt_word(word) if random.random() < corruption_probability else word
+        for word in words
+    ]
+    return " ".join(corrupted_words)
+
+
+
+
+# outputs a list of dictionaries
+# processes dataframe into lists of dictionaries
+# each element maps input to output
+# input: tag_description
+# output: class label
+label_flag_list = []
+
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        # produce shuffling
+        index = row['entity_id']
+        parent_desc = row['mention']
+        parent_desc = preprocess_text(parent_desc)
+
+        # Split the string into words
+        words = parent_desc.split()
+
+        # Count the number of words
+        word_count = len(words)
+
+        # short sequences are rare, and we must compensate by including more examples
+        # mutation of other longer sequences might drown out rare short sequences
+        if word_count < 3:
+            for _ in range(10):
+                element = {
+                    'text': parent_desc,
+                    'label': label2id[index],
+                }
+                output_list.append(element)
+
+
+        # check if label is in label_flag_list
+        if index not in label_flag_list:
+
+            entity_name = row['entity_name']
+            # add the "entity_name" label as a mention
+            element = {
+                'text': entity_name,
+                'label': label2id[index],
+            }
+            output_list.append(element)
+
+            # remove all non-alphanumerics
+            desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
+            if (desc != parent_desc):
+                element = {
+                    'text' : desc,
+                    'label': label2id[index], # ensure labels starts from 0
+                }
+                output_list.append(element)
+
+
+            # add shuffles of the original entity name
+            no_of_shuffles = SHUFFLES
+            processed_descs = shuffle_text(entity_name, n_shuffles=no_of_shuffles)
+            for desc in processed_descs:
+                if (desc != parent_desc):
+                    element = {
+                        'text' : desc,
+                        'label': label2id[index], # ensure labels starts from 0
+                    }
+                    output_list.append(element)
+
+            label_flag_list.append(index)
+
+
+
+        # add shuffled strings
+        processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
+        for desc in processed_descs:
+            if (desc != parent_desc):
+                element = {
+                    'text' : desc,
+                    'label': label2id[index], # ensure labels starts from 0
+                }
+                output_list.append(element)
+
+        # corrupt string
+        desc = corrupt_string(parent_desc, corruption_probability=0.1)
+        if (desc != parent_desc):
+            element = {
+                'text' : desc,
+                'label': label2id[index], # ensure labels starts from 0
+            }
+            output_list.append(element)
+
+
+        # augmentation
+        # remove all non-alphanumerics
+        desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
+        if (desc != parent_desc):
+            element = {
+                'text' : desc,
+                'label': label2id[index], # ensure labels starts from 0
+            }
+            output_list.append(element)
+
+
+        # # augmentation
+        # # perform abbrev_to_term
+        # temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
+        # desc = replace_terms_with_abbreviations(temp_desc)
+        # if (desc != temp_desc):
+        #     element = {
+        #         'text' : desc,
+        #         'label': label2id[index], # ensure labels starts from 0
+        #     }
+        #     output_list.append(element)
+
+        # augmentation
+        # perform term to abbrev
+        desc = replace_abbreviations_with_terms(parent_desc)
+        if (desc != parent_desc):
+            element = {
+                'text' : desc,
+                'label': label2id[index], # ensure labels starts from 0
+            }
+            output_list.append(element)
+
+
+    return output_list
+
+
+def create_dataset():
+    # train
+    data_path = '../../esAppMod_data_import/train.csv'
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+    })
+    return combined_data
+
+
+# %%
+
+def train():
+
+    save_path = f'checkpoint'
+    split_datasets = create_dataset()
+
+    # prepare tokenizer
+
+    model_checkpoint = "distilbert/distilbert-base-uncased"
+    # model_checkpoint = 'google-bert/bert-base-cased'
+    # model_checkpoint = 'prajjwal1/bert-small'
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    # additional_special_tokens = [""]
+    # Add the additional special tokens to the tokenizer
+    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['text']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            max_length=max_length,
+            truncation=True,
+            padding=True
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns="text",
+    )
+
+    # %% temp
+    # tokenized_datasets['train'].rename_columns()
+
+    # %%
+    # create data collator
+
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    # %%
+    # compute metrics
+    metric = evaluate.load("accuracy")
+
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        preds = np.argmax(preds, axis=1)
+        return metric.compute(predictions=preds, references=labels)
+
+    # %%
+    # create id2label and label2id
+
+
+    # %%
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=len(target_id_list),
+        id2label=id2label,
+        label2id=label2id)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+
+    # %%
+    # Trainer
+
+    training_args = TrainingArguments(
+        output_dir=f"{save_path}",
+        # eval_strategy="epoch",
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        # save_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=5e-5,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        warmup_steps=400,
+        bf16=True,
+        push_to_hub=False,
+        remove_unused_columns=False,
+    )
+
+
+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=tokenized_datasets["train"],
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+# execute training
+train()
+
+
+# %%
diff --git a/train/class_bert_hierarchical/.gitignore b/train/class_bert_hierarchical/.gitignore
new file mode 100644
index 0000000..2c8f0d6
--- /dev/null
+++ b/train/class_bert_hierarchical/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
diff --git a/train/class_bert_hierarchical/prediction/.gitignore b/train/class_bert_hierarchical/prediction/.gitignore
new file mode 100644
index 0000000..dbe1a9b
--- /dev/null
+++ b/train/class_bert_hierarchical/prediction/.gitignore
@@ -0,0 +1 @@
+exports
\ No newline at end of file
diff --git a/train/class_bert_hierarchical/prediction/output.txt b/train/class_bert_hierarchical/prediction/output.txt
new file mode 100644
index 0000000..da7b7bf
--- /dev/null
+++ b/train/class_bert_hierarchical/prediction/output.txt
@@ -0,0 +1,11 @@
+
+*******************************************************************************
+Accuracy: 0.71956
+F1 Score: 0.74142
+Precision: 0.81529
+Recall: 0.71956
+********************************************************************************
+Accuracy: 0.71710
+F1 Score: 0.74095
+Precision: 0.82181
+Recall: 0.71710
diff --git a/train/class_bert_hierarchical/prediction/output_1.txt b/train/class_bert_hierarchical/prediction/output_1.txt
new file mode 100644
index 0000000..bad67a7
--- /dev/null
+++ b/train/class_bert_hierarchical/prediction/output_1.txt
@@ -0,0 +1,6 @@
+
+*******************************************************************************
+Accuracy: 0.81591
+F1 Score: 0.82162
+Precision: 0.85519
+Recall: 0.81591
diff --git a/train/class_bert_hierarchical/prediction/output_2.txt b/train/class_bert_hierarchical/prediction/output_2.txt
new file mode 100644
index 0000000..b3f462b
--- /dev/null
+++ b/train/class_bert_hierarchical/prediction/output_2.txt
@@ -0,0 +1,6 @@
+
+*******************************************************************************
+Accuracy: 0.59943
+F1 Score: 0.60266
+Precision: 0.66956
+Recall: 0.59943
diff --git a/train/class_bert_hierarchical/prediction/predict_1.py b/train/class_bert_hierarchical/prediction/predict_1.py
new file mode 100644
index 0000000..d33eef5
--- /dev/null
+++ b/train/class_bert_hierarchical/prediction/predict_1.py
@@ -0,0 +1,265 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import re
+import torch
+from torch.utils.data import DataLoader
+
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+from tqdm import tqdm
+
+torch.set_float32_matmul_precision('high')
+
+
+BATCH_SIZE = 256
+
+# %%
+# construct the target id list
+# data_path = '../../../esAppMod_data_import/train.csv'
+data_path = '../../../esAppMod_data_import/train.csv'
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+# rather than use pattern, we use the real thing and property
+entity_ids = train_df['entity_id'].to_list()
+target_id_list = sorted(list(set(entity_ids)))
+
+
+# %%
+id2label = {}
+label2id = {}
+for idx, val in enumerate(target_id_list):
+    id2label[idx] = val
+    label2id[val] = idx
+
+
+# introduce pre-processing functions
+def preprocess_text(text):
+    # 1. Make all lowercase
+    text = text.lower()
+
+    # Substitute digits with '#'
+    text = re.sub(r'\d+', '#', text)
+
+    # standardize spacing
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    return text
+
+
+
+
+# outputs a list of dictionaries
+# processes dataframe into lists of dictionaries
+# each element maps input to output
+# input: tag_description
+# output: class label
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = row['mention']
+        desc = preprocess_text(desc)
+        index = row['entity_id']
+        element = {
+            'text' : desc,
+            'label': label2id[index], # ensure labels starts from 0
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_dataset():
+    # train
+    # data_path = '../../../esAppMod_data_import/test.csv'
+    data_path = '../../../esAppMod_data_import/parent_test.csv'
+    test_df = pd.read_csv(data_path, skipinitialspace=True)
+
+
+    # combined_data = DatasetDict({
+    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
+    # })
+    return Dataset.from_list(process_df_to_dict(test_df))
+
+
+
+# %%
+
+def test():
+
+    test_dataset = create_dataset()
+
+    # prepare tokenizer
+
+    checkpoint_directory = f'../checkpoint'
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint_part1-*'
+    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    # additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
+    # Add the additional special tokens to the tokenizer
+    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    # %%
+    # compute max token length
+    max_length = 0
+    for sample in test_dataset['text']:
+        # Tokenize the sample and get the length
+        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
+        length = len(input_ids)
+
+        # Update max_length if this sample is longer
+        if length > max_length:
+            max_length = length
+
+    print(max_length)
+
+    # %%
+
+    max_length = 128
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['text']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            max_length=max_length,
+            # truncation=True,
+            padding='max_length'
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    datasets = test_dataset.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns="text",
+    )
+
+
+    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+    # %% temp
+    # tokenized_datasets['train'].rename_columns()
+
+    # %%
+    # create data collator
+
+    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
+
+    # %%
+    # compute metrics
+    # metric = evaluate.load("accuracy")
+    #
+    #
+    # def compute_metrics(eval_preds):
+    #     preds, labels = eval_preds
+    #     preds = np.argmax(preds, axis=1)
+    #     return metric.compute(predictions=preds, references=labels)
+
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=len(target_id_list),
+        id2label=id2label,
+        label2id=label2id)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    model = model.eval()
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model.to(device)
+
+    pred_labels = []
+    actual_labels = []
+
+
+    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
+    for batch in tqdm(dataloader):
+        # Inference in batches
+        input_ids = batch['input_ids']
+        attention_mask = batch['attention_mask']
+        # save labels too
+        actual_labels.extend(batch['label'])
+
+
+        # Move to GPU if available
+        input_ids = input_ids.to(device)
+        attention_mask = attention_mask.to(device)
+
+        # Perform inference
+        with torch.no_grad():
+            logits = model(
+                input_ids,
+                attention_mask).logits
+            predicted_class_ids = logits.argmax(dim=1).to("cpu")
+            pred_labels.extend(predicted_class_ids)
+
+    pred_labels = [tensor.item() for tensor in pred_labels]
+
+
+    # %%
+    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
+    y_true = actual_labels
+    y_pred = pred_labels
+
+    # Compute metrics
+    accuracy = accuracy_score(y_true, y_pred)
+    average_parameter = 'weighted'
+    zero_division_parameter = 0
+    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+
+    with open("output_1.txt", "a") as f:
+
+        print('*' * 80, file=f)
+        # Print the results
+        print(f'Accuracy: {accuracy:.5f}', file=f)
+        print(f'F1 Score: {f1:.5f}', file=f)
+        print(f'Precision: {precision:.5f}', file=f)
+        print(f'Recall: {recall:.5f}', file=f)
+
+    # export result
+    label_list = [id2label[id] for id in pred_labels]
+    df = pd.DataFrame({
+        'class_prediction': pd.Series(label_list)
+    })
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_1.csv", index=False)
+
+
+
+
+
+
+# %%
+# reset file before writing to it
+with open("output_1.txt", "w") as f:
+    print('', file=f)
+    test()
diff --git a/train/class_bert_hierarchical/prediction/predict_2.py b/train/class_bert_hierarchical/prediction/predict_2.py
new file mode 100644
index 0000000..d1801ec
--- /dev/null
+++ b/train/class_bert_hierarchical/prediction/predict_2.py
@@ -0,0 +1,265 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import re
+import torch
+from torch.utils.data import DataLoader
+
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+from tqdm import tqdm
+
+torch.set_float32_matmul_precision('high')
+
+
+BATCH_SIZE = 256
+
+# %%
+# construct the target id list
+# data_path = '../../../esAppMod_data_import/train.csv'
+data_path = '../../../esAppMod_data_import/train.csv'
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+# rather than use pattern, we use the real thing and property
+entity_ids = train_df['entity_id'].to_list()
+target_id_list = sorted(list(set(entity_ids)))
+
+
+# %%
+id2label = {}
+label2id = {}
+for idx, val in enumerate(target_id_list):
+    id2label[idx] = val
+    label2id[val] = idx
+
+
+# introduce pre-processing functions
+def preprocess_text(text):
+    # 1. Make all lowercase
+    text = text.lower()
+
+    # Substitute digits with '#'
+    text = re.sub(r'\d+', '#', text)
+
+    # standardize spacing
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    return text
+
+
+
+
+# outputs a list of dictionaries
+# processes dataframe into lists of dictionaries
+# each element maps input to output
+# input: tag_description
+# output: class label
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = row['mention']
+        desc = preprocess_text(desc)
+        index = row['entity_id']
+        element = {
+            'text' : desc,
+            'label': label2id[index], # ensure labels starts from 0
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_dataset():
+    # train
+    # data_path = '../../../esAppMod_data_import/test.csv'
+    data_path = '../../../esAppMod_data_import/test.csv'
+    test_df = pd.read_csv(data_path, skipinitialspace=True)
+
+
+    # combined_data = DatasetDict({
+    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
+    # })
+    return Dataset.from_list(process_df_to_dict(test_df))
+
+
+
+# %%
+
+def test():
+
+    test_dataset = create_dataset()
+
+    # prepare tokenizer
+
+    checkpoint_directory = f'../checkpoint'
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    # additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
+    # Add the additional special tokens to the tokenizer
+    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    # %%
+    # compute max token length
+    max_length = 0
+    for sample in test_dataset['text']:
+        # Tokenize the sample and get the length
+        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
+        length = len(input_ids)
+
+        # Update max_length if this sample is longer
+        if length > max_length:
+            max_length = length
+
+    print(max_length)
+
+    # %%
+
+    max_length = 128
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['text']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            max_length=max_length,
+            # truncation=True,
+            padding='max_length'
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    datasets = test_dataset.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns="text",
+    )
+
+
+    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+    # %% temp
+    # tokenized_datasets['train'].rename_columns()
+
+    # %%
+    # create data collator
+
+    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
+
+    # %%
+    # compute metrics
+    # metric = evaluate.load("accuracy")
+    #
+    #
+    # def compute_metrics(eval_preds):
+    #     preds, labels = eval_preds
+    #     preds = np.argmax(preds, axis=1)
+    #     return metric.compute(predictions=preds, references=labels)
+
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=len(target_id_list),
+        id2label=id2label,
+        label2id=label2id)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    model = model.eval()
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model.to(device)
+
+    pred_labels = []
+    actual_labels = []
+
+
+    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
+    for batch in tqdm(dataloader):
+        # Inference in batches
+        input_ids = batch['input_ids']
+        attention_mask = batch['attention_mask']
+        # save labels too
+        actual_labels.extend(batch['label'])
+
+
+        # Move to GPU if available
+        input_ids = input_ids.to(device)
+        attention_mask = attention_mask.to(device)
+
+        # Perform inference
+        with torch.no_grad():
+            logits = model(
+                input_ids,
+                attention_mask).logits
+            predicted_class_ids = logits.argmax(dim=1).to("cpu")
+            pred_labels.extend(predicted_class_ids)
+
+    pred_labels = [tensor.item() for tensor in pred_labels]
+
+
+    # %%
+    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
+    y_true = actual_labels
+    y_pred = pred_labels
+
+    # Compute metrics
+    accuracy = accuracy_score(y_true, y_pred)
+    average_parameter = 'weighted'
+    zero_division_parameter = 0
+    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+
+    with open("output_2.txt", "a") as f:
+
+        print('*' * 80, file=f)
+        # Print the results
+        print(f'Accuracy: {accuracy:.5f}', file=f)
+        print(f'F1 Score: {f1:.5f}', file=f)
+        print(f'Precision: {precision:.5f}', file=f)
+        print(f'Recall: {recall:.5f}', file=f)
+
+    # export result
+    label_list = [id2label[id] for id in pred_labels]
+    df = pd.DataFrame({
+        'class_prediction': pd.Series(label_list)
+    })
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_2.csv", index=False)
+
+
+
+
+
+
+# %%
+# reset file before writing to it
+with open("output_2.txt", "w") as f:
+    print('', file=f)
+    test()
diff --git a/train/class_bert_process/train.py b/train/class_bert_hierarchical/train_1.py
similarity index 59%
rename from train/class_bert_process/train.py
rename to train/class_bert_hierarchical/train_1.py
index bcad65d..4d37a0c 100644
--- a/train/class_bert_process/train.py
+++ b/train/class_bert_hierarchical/train_1.py
@@ -45,7 +45,7 @@ def set_seed(seed):
 
 set_seed(42)
 
-SHUFFLES=2
+SHUFFLES=5
 
 # %%
 
@@ -56,37 +56,6 @@ train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(list(set(entity_ids)))
 
-def compute_normalized_class_weights(class_counts, max_resamples=SHUFFLES):
-    """
-    Compute normalized class weights inversely proportional to class counts.
-    The weights are normalized so that they sum to 1.
-
-    Args:
-        class_counts (array-like): An array or list where each element represents the count of samples for a class.
-
-    Returns:
-        numpy.ndarray: A normalized array of weights for each class.
- """ - class_counts = np.array(class_counts) - total_samples = np.sum(class_counts) - class_weights = total_samples / class_counts - # so that highest weight is 1 - normalized_weights = class_weights / np.max(class_weights) - # Scale weights such that the highest weight corresponds to `max_resamples` - resample_counts = normalized_weights * max_resamples - # Round resamples to nearest integer - resample_counts = np.round(resample_counts).astype(int) - return resample_counts - -# %% -id_counts = train_df['entity_id'].value_counts() -id_weights = compute_normalized_class_weights(id_counts, max_resamples=SHUFFLES) -id_index = id_counts.index -label2weight = {} -for idx, label in enumerate(id_index): - label2weight[label] = id_weights[idx] - - # %% id2label = {} label2id = {} @@ -100,21 +69,9 @@ def preprocess_text(text): # 1. Make all uppercase text = text.lower() - - # Remove any non alphanumeric character - # text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces - # replace dashes - text = re.sub(r"[-;:]", " ", text) - - # Add space between digit followed by a letter - text = re.sub(r"(\d)([A-Z])", r"\1 \2", text) - - # Add space between letter followed by a digit - text = re.sub(r"([A-Z])(\d)", r"\1 \2", text) - # Substitute digits with 'x' - text = re.sub(r'\d+', 'x', text) + text = re.sub(r'\d+', '#', text) # standardize spacing text = re.sub(r'\s+', ' ', text).strip() @@ -165,35 +122,143 @@ def shuffle_text(text, n_shuffles=SHUFFLES): return all_processed -term_to_abbrev = { - r'job entry system': 'jes', - r'subversion': 'svn', - r'borland database engine': 'bde', - r'business intelligence and reporting tools': 'birt', - r'lan management solution': 'lms', - r'laboratory information management system': 'lims', - r'ibm database 2': 'db/2', - r'integrated development environment': 'ide', - r'software development kit': 'sdk', - r'hp operations orchestration': 'hpoo', - r'hp server automation': 'hpsa', - r'internet information server': 'iis', - r'release 2': 'r2', - r'red hat enterprise linux': 'rhel', - r'oracle enterprise linux': 'oel', - r'websphere application server': 'was', - r'application development facility': 'adf', - r'server analysis services': 'ssas' +acronym_mapping = { + 'hpsa': 'hp server automation', + 'tam': 'tivoli access manager', + 'adf': 'application development facility', + 'html': 'hypertext markup language', + 'wff': 'microsoft web farm framework', + 'jsp': 'javaserver pages', + 'bw': 'business works', + 'ssrs': 'sql server reporting services', + 'cl': 'control language', + 'vba': 'visual basic for applications', + 'esapi': 'enterprise security api', + 'gwt': 'google web toolkit', + 'pki': 'perkin elmer informatics', + 'rtd': 'oracle realtime decisions', + 'jms': 'java message service', + 'db': 'database', + 'soa': 'service oriented architecture', + 'xsl': 'extensible stylesheet language', + 'com': 'compopent object model', + 'ldap': 'lightweight directory access protocol', + 'odm': 'ibm operational decision manager', + 'soql': 'salesforce object query language', + 'oms': 'order management system', + 'cfml': 'coldfusion markup language', + 'nas': 'netscape application server', + 'sql': 'structured query language', + 'bde': 'borland database engine', + 'imap': 'internet message access protocol', + 'uws': 'ultidev web server', + 'birt': 'business intelligence and reporting tools', + 'mdw': 'model driven workflow', + 'tws': 'tivoli workload scheduler', + 'jre': 'java runtime environment', + 'wcs': 'websphere commerce suite', + 'was': 'websphere 
application server', + 'ssis': 'sql server integration services', + 'xhtml': 'extensible hypertext markup language', + 'soap': 'simple object access protocol', + 'san': 'storage area network', + 'elk': 'elastic stack', + 'arr': 'application request routing', + 'xlst': 'extensible stylesheet language transformations', + 'sccm': 'microsoft endpoint configuration manager', + 'ejb': 'enterprise java beans', + 'css': 'cascading style sheets', + 'hpoo': 'hp operations orchestration', + 'xml': 'extensible markup language', + 'esb': 'enterprise service bus', + 'edi': 'electronic data interchange', + 'imsva': 'interscan messaging security virtual appliance', + 'wtx': 'ibm websphere transformation extender', + 'cgi': 'common gateway interface', + 'bal': 'ibm basic assembly language', + 'issow': 'integrated safe system of work', + 'dcl': 'data control language', + 'jdom': 'java document object model', + 'fim': 'microsoft forefront identity manager', + 'npl': 'niakwa programming language', + 'wf': 'windows workflow foundation', + 'lm': 'etap license manager', + 'wts': 'windows terminal server', + 'asp': 'active server pages', + 'jil': 'job information language', + 'mvc': 'model view controller', + 'rmi': 'remote method invocation', + 'ad': 'active directory', + 'owb': 'oracle warehouse builder', + 'rest': 'representational state transfer', + 'jdk': 'java development kit', + 'ids': 'integrated data store', + 'bms': 'batch management software', + 'vsx': 'vmware solution exchange', + 'ssas': 'sql server analysis services', + 'atl': 'atlas transformation language', + 'ice': 'infobright community edition', + 'esql': 'extended structured query language', + 'corba': 'common object request broker architecture', + 'dpe': 'device provisioning engines', + 'rac': 'oracle real application clusters', + 'iemt': 'iis easy migration tool', + 'mes': 'manufacturing execution system', + 'odbc': 'open database connectivity', + 'lms': 'lan management solution', + 'wcf': 'windows communication foundation', + 'nes': 'netscape enterprise server', + 'jsf': 'javaserver faces', + 'alm': 'application lifecycle management', + 'hlasm': 'high level assembler', + 'cmod': 'content manager ondemand'} + +external_source = { + 'vb.net': 'visual basic dot net', + 'jes': 'job entry subsystem', + 'svn': 'subversion', + 'vcs': 'version control system', + 'lims': 'laboratory information management system', + 'ide': 'integrated development environment', + 'sdk': 'software development kit', + 'mq': 'message queue', + 'ims': 'information management system', + 'isa': 'internet security and acceleration', + 'vs': 'visual studio', + 'esr': 'extended support release', + 'ff': 'firefox', + 'vb': 'visual basic', + 'rhel': 'red hat enterprise linux', + 'iis': 'internet information server', + 'api': 'application programming interface', + 'se': 'standard edition', + '\.net': 'dot net', + 'c#': 'c sharp', + 'ms': 'microsoft' } -abbrev_to_term = {rf'\b{value}\b': key for key, value in term_to_abbrev.items()} + +# synonyms = { +# 'windows server': 'windows nt', +# 'windows 7': 'windows desktop', +# 'windows 8': 'windows desktop', +# 'windows 10': 'windows desktop' +# } + + +# add more information +acronym_mapping.update(external_source) + + +abbrev_to_term = {f'\b{key}\b': value for key, value in acronym_mapping.items()} +term_to_abbrev = {f'\b{value}\b': key for key, value in acronym_mapping.items()} def replace_terms_with_abbreviations(text): for input, replacement in term_to_abbrev.items(): text = re.sub(input, replacement, text) return text -def 
replace_abbreivations_with_terms(text): +def replace_abbreviations_with_terms(text): for input, replacement in abbrev_to_term.items(): text = re.sub(input, replacement, text) return text @@ -218,8 +283,19 @@ def process_df_to_dict(df): # no_of_shuffles = label2weight[index] + 1 no_of_shuffles = SHUFFLES processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles) - for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # augmentation + # remove all non-alphanumerics + desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + if (desc != parent_desc): element = { 'text' : desc, 'label': label2id[index], # ensure labels starts from 0 @@ -227,24 +303,21 @@ def process_df_to_dict(df): output_list.append(element) + # augmentation # perform abbrev_to_term - desc = replace_terms_with_abbreviations(parent_desc) - no_of_shuffles = SHUFFLES - processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles) - - for desc in processed_descs: + temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + desc = replace_terms_with_abbreviations(temp_desc) + if (desc != temp_desc): element = { 'text' : desc, 'label': label2id[index], # ensure labels starts from 0 } output_list.append(element) + # augmentation # perform term to abbrev - desc = replace_abbreivations_with_terms(parent_desc) - no_of_shuffles = SHUFFLES - processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles) - - for desc in processed_descs: + desc = replace_abbreviations_with_terms(parent_desc) + if (desc != parent_desc): element = { 'text' : desc, 'label': label2id[index], # ensure labels starts from 0 @@ -257,7 +330,7 @@ def process_df_to_dict(df): def create_dataset(): # train - data_path = '../../esAppMod_data_import/train.csv' + data_path = '../../esAppMod_data_import/parent_train.csv' train_df = pd.read_csv(data_path, skipinitialspace=True) @@ -271,13 +344,13 @@ def create_dataset(): def train(): - save_path = f'checkpoint' + save_path = f'checkpoint_part1' split_datasets = create_dataset() # prepare tokenizer model_checkpoint = "distilbert/distilbert-base-uncased" - # model_checkpoint = 'google-bert/bert-base-cased' + # model_checkpoint = 'google-bert/bert-base-uncased' # model_checkpoint = 'prajjwal1/bert-small' tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) # Define additional special tokens @@ -348,7 +421,6 @@ def train(): training_args = TrainingArguments( output_dir=f"{save_path}", - # eval_strategy="epoch", eval_strategy="no", logging_dir="tensorboard-log", logging_strategy="epoch", diff --git a/train/class_bert_hierarchical/train_2.py b/train/class_bert_hierarchical/train_2.py new file mode 100644 index 0000000..67aa0c9 --- /dev/null +++ b/train/class_bert_hierarchical/train_2.py @@ -0,0 +1,469 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import re +import random +import glob + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import 
Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +def set_seed(seed): + """ + Set the random seed for reproducibility. + """ + random.seed(seed) # Python random module + np.random.seed(seed) # NumPy random + torch.manual_seed(seed) # PyTorch CPU + torch.cuda.manual_seed(seed) # PyTorch GPU + torch.cuda.manual_seed_all(seed) # If using multiple GPUs + torch.backends.cudnn.deterministic = True # Ensure deterministic behavior + torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility + +set_seed(42) + +SHUFFLES=0 + +# %% + +# import training file +data_path = '../../esAppMod_data_import/train.csv' +train_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +entity_ids = train_df['entity_id'].to_list() +target_id_list = sorted(list(set(entity_ids))) + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(target_id_list): + id2label[idx] = val + label2id[val] = idx + +# %% +# introduce pre-processing functions +def preprocess_text(text): + + # 1. Make all uppercase + text = text.lower() + + # Substitute digits with 'x' + text = re.sub(r'\d+', '#', text) + + # standardize spacing + text = re.sub(r'\s+', ' ', text).strip() + + return text + + +def generate_random_shuffles(text, n): + """ + Generate n strings with randomly shuffled words from the input text. + + Args: + text (str): The input text. + n (int): The number of random variations to generate. + + Returns: + list: A list of strings with shuffled words. + """ + words = text.split() # Split the input into words + shuffled_variations = [] + + for _ in range(n): + shuffled = words[:] # Copy the word list to avoid in-place modification + random.shuffle(shuffled) # Randomly shuffle the words + shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string + + return shuffled_variations + + +# generate n more shuffled examples +def shuffle_text(text, n_shuffles=SHUFFLES): + """ + Preprocess a list of texts and add n random shuffles for each string. + + Args: + texts (list): An input strings. + n_shuffles (int): Number of random shuffles to generate for each string. + + Returns: + list: A list of preprocessed and shuffled strings. 
+ """ + all_processed = [] + all_processed.append(text) + + # Generate random shuffles + shuffled_variations = generate_random_shuffles(text, n_shuffles) + all_processed.extend(shuffled_variations) + + return all_processed + +acronym_mapping = { + 'hpsa': 'hp server automation', + 'tam': 'tivoli access manager', + 'adf': 'application development facility', + 'html': 'hypertext markup language', + 'wff': 'microsoft web farm framework', + 'jsp': 'javaserver pages', + 'bw': 'business works', + 'ssrs': 'sql server reporting services', + 'cl': 'control language', + 'vba': 'visual basic for applications', + 'esapi': 'enterprise security api', + 'gwt': 'google web toolkit', + 'pki': 'perkin elmer informatics', + 'rtd': 'oracle realtime decisions', + 'jms': 'java message service', + 'db': 'database', + 'soa': 'service oriented architecture', + 'xsl': 'extensible stylesheet language', + 'com': 'compopent object model', + 'ldap': 'lightweight directory access protocol', + 'odm': 'ibm operational decision manager', + 'soql': 'salesforce object query language', + 'oms': 'order management system', + 'cfml': 'coldfusion markup language', + 'nas': 'netscape application server', + 'sql': 'structured query language', + 'bde': 'borland database engine', + 'imap': 'internet message access protocol', + 'uws': 'ultidev web server', + 'birt': 'business intelligence and reporting tools', + 'mdw': 'model driven workflow', + 'tws': 'tivoli workload scheduler', + 'jre': 'java runtime environment', + 'wcs': 'websphere commerce suite', + 'was': 'websphere application server', + 'ssis': 'sql server integration services', + 'xhtml': 'extensible hypertext markup language', + 'soap': 'simple object access protocol', + 'san': 'storage area network', + 'elk': 'elastic stack', + 'arr': 'application request routing', + 'xlst': 'extensible stylesheet language transformations', + 'sccm': 'microsoft endpoint configuration manager', + 'ejb': 'enterprise java beans', + 'css': 'cascading style sheets', + 'hpoo': 'hp operations orchestration', + 'xml': 'extensible markup language', + 'esb': 'enterprise service bus', + 'edi': 'electronic data interchange', + 'imsva': 'interscan messaging security virtual appliance', + 'wtx': 'ibm websphere transformation extender', + 'cgi': 'common gateway interface', + 'bal': 'ibm basic assembly language', + 'issow': 'integrated safe system of work', + 'dcl': 'data control language', + 'jdom': 'java document object model', + 'fim': 'microsoft forefront identity manager', + 'npl': 'niakwa programming language', + 'wf': 'windows workflow foundation', + 'lm': 'etap license manager', + 'wts': 'windows terminal server', + 'asp': 'active server pages', + 'jil': 'job information language', + 'mvc': 'model view controller', + 'rmi': 'remote method invocation', + 'ad': 'active directory', + 'owb': 'oracle warehouse builder', + 'rest': 'representational state transfer', + 'jdk': 'java development kit', + 'ids': 'integrated data store', + 'bms': 'batch management software', + 'vsx': 'vmware solution exchange', + 'ssas': 'sql server analysis services', + 'atl': 'atlas transformation language', + 'ice': 'infobright community edition', + 'esql': 'extended structured query language', + 'corba': 'common object request broker architecture', + 'dpe': 'device provisioning engines', + 'rac': 'oracle real application clusters', + 'iemt': 'iis easy migration tool', + 'mes': 'manufacturing execution system', + 'odbc': 'open database connectivity', + 'lms': 'lan management solution', + 'wcf': 'windows communication 
foundation', + 'nes': 'netscape enterprise server', + 'jsf': 'javaserver faces', + 'alm': 'application lifecycle management', + 'hlasm': 'high level assembler', + 'cmod': 'content manager ondemand'} + +external_source = { + 'vb.net': 'visual basic dot net', + 'jes': 'job entry subsystem', + 'svn': 'subversion', + 'vcs': 'version control system', + 'lims': 'laboratory information management system', + 'ide': 'integrated development environment', + 'sdk': 'software development kit', + 'mq': 'message queue', + 'ims': 'information management system', + 'isa': 'internet security and acceleration', + 'vs': 'visual studio', + 'esr': 'extended support release', + 'ff': 'firefox', + 'vb': 'visual basic', + 'rhel': 'red hat enterprise linux', + 'iis': 'internet information server', + 'api': 'application programming interface', + 'se': 'standard edition', + '\.net': 'dot net', + 'c#': 'c sharp', + 'ms': 'microsoft' +} + + +# synonyms = { +# 'windows server': 'windows nt', +# 'windows 7': 'windows desktop', +# 'windows 8': 'windows desktop', +# 'windows 10': 'windows desktop' +# } + + +# add more information +acronym_mapping.update(external_source) + + +abbrev_to_term = {f'\b{key}\b': value for key, value in acronym_mapping.items()} +term_to_abbrev = {f'\b{value}\b': key for key, value in acronym_mapping.items()} + +def replace_terms_with_abbreviations(text): + for input, replacement in term_to_abbrev.items(): + text = re.sub(input, replacement, text) + return text + +def replace_abbreviations_with_terms(text): + for input, replacement in abbrev_to_term.items(): + text = re.sub(input, replacement, text) + return text + + + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + # produce shuffling + index = row['entity_id'] + parent_desc = row['mention'] + parent_desc = preprocess_text(parent_desc) + # ensure at least 1 shuffle + # no_of_shuffles = label2weight[index] + 1 + no_of_shuffles = SHUFFLES + processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles) + for desc in processed_descs: + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # augmentation + # remove all non-alphanumerics + desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + # augmentation + # perform abbrev_to_term + temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces + desc = replace_terms_with_abbreviations(temp_desc) + if (desc != temp_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + # augmentation + # perform term to abbrev + desc = replace_abbreviations_with_terms(parent_desc) + if (desc != parent_desc): + element = { + 'text' : desc, + 'label': label2id[index], # ensure labels starts from 0 + } + output_list.append(element) + + + return output_list + + +def create_dataset(): + # train + data_path = '../../esAppMod_data_import/train.csv' + train_df = pd.read_csv(data_path, skipinitialspace=True) + + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + }) + return combined_data 
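+
+# %%
+# sanity check (illustrative sketch, not part of the training flow): feed one
+# made-up mention through process_df_to_dict to eyeball the augmented variants
+# it emits before launching a full run; uncomment to use
+# sample_df = pd.DataFrame([
+#     {'entity_id': target_id_list[0], 'mention': 'WebSphere Application Server (WAS) 8.5'}
+# ])
+# for element in process_df_to_dict(sample_df):
+#     print(element['label'], repr(element['text']))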
+
+
+# %%
+
+def train():
+
+    save_path = f'checkpoint'
+    split_datasets = create_dataset()
+
+    # prepare tokenizer
+
+    # stage 1 (train_1.py) wrote its Trainer output under 'checkpoint_part1',
+    # and Trainer saves snapshots as <output_dir>/checkpoint-<step>
+    pattern = 'checkpoint-*'
+    checkpoint_directory = 'checkpoint_part1'
+    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
+
+    # model_checkpoint = "distilbert/distilbert-base-uncased"
+    # model_checkpoint = 'google-bert/bert-base-uncased'
+    # model_checkpoint = 'prajjwal1/bert-small'
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    # additional_special_tokens = [""]
+    # Add the additional special tokens to the tokenizer
+    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['text']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            max_length=max_length,
+            truncation=True,
+            padding=True
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns="text",
+    )
+
+    # %% temp
+    # tokenized_datasets['train'].rename_columns()
+
+    # %%
+    # create data collator
+
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    # %%
+    # compute metrics
+    metric = evaluate.load("accuracy")
+
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        preds = np.argmax(preds, axis=1)
+        return metric.compute(predictions=preds, references=labels)
+
+    # %%
+    # create id2label and label2id
+
+
+    # %%
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=len(target_id_list),
+        id2label=id2label,
+        label2id=label2id)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+
+    # %%
+    # Trainer
+
+    training_args = TrainingArguments(
+        output_dir=f"{save_path}",
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        # save_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=5e-5,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=300,
+        warmup_steps=400,
+        bf16=True,
+        push_to_hub=False,
+        remove_unused_columns=False,
+    )
+
+
+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=tokenized_datasets["train"],
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+# execute training
+train()
+
+
+# %%
diff --git a/train/seq2seq_t5_simple/mapping_prediction/output.txt b/train/seq2seq_t5_simple/mapping_prediction/output.txt
deleted file mode 100644
index dd5d228..0000000
--- a/train/seq2seq_t5_simple/mapping_prediction/output.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-
-Accuracy for fold: 0.5846658466584665
diff --git a/train/seq2seq_t5_simple/mapping_prediction/.gitignore b/train/seq2seq_t5_simple/prediction/.gitignore
similarity index 100%
rename from train/seq2seq_t5_simple/mapping_prediction/.gitignore
rename to train/seq2seq_t5_simple/prediction/.gitignore
diff --git a/train/seq2seq_t5_simple/mapping_prediction/inference.py b/train/seq2seq_t5_simple/prediction/inference.py
similarity index 96%
rename from train/seq2seq_t5_simple/mapping_prediction/inference.py
rename to train/seq2seq_t5_simple/prediction/inference.py
index 618b3f8..0035c87 100644
--- a/train/seq2seq_t5_simple/mapping_prediction/inference.py
+++ b/train/seq2seq_t5_simple/prediction/inference.py
@@ -57,10 +57,10 @@ class Inference():
         output_list = []
         for _, row in df.iterrows():
             desc = row['mention']
-            label = row['entity_name']
+            label = row['entity_seq']
             element = {
                 'input' : desc,
-                'output': label
+                'output': f'{label}'
             }
             output_list.append(element)
 
@@ -101,7 +101,7 @@ class Inference():
 
     def generate(self):
-        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         MAX_GENERATE_LENGTH = 128
 
         pred_generations = []
diff --git a/train/seq2seq_t5_simple/prediction/output.txt b/train/seq2seq_t5_simple/prediction/output.txt
new file mode 100644
index 0000000..4cc5a4e
--- /dev/null
+++ b/train/seq2seq_t5_simple/prediction/output.txt
@@ -0,0 +1,2 @@
+
+Accuracy for fold: 0.5022550225502255
diff --git a/train/seq2seq_t5_simple/mapping_prediction/predict.py b/train/seq2seq_t5_simple/prediction/predict.py
similarity index 79%
rename from train/seq2seq_t5_simple/mapping_prediction/predict.py
rename to train/seq2seq_t5_simple/prediction/predict.py
index 12b931e..2e995d5 100644
--- a/train/seq2seq_t5_simple/mapping_prediction/predict.py
+++ b/train/seq2seq_t5_simple/prediction/predict.py
@@ -11,7 +11,7 @@ BATCH_SIZE = 512
 def infer():
     print(f"Inference for data")
     # import test data
-    data_path = '../../../data_import/test.csv'
+    data_path = '../../../esAppMod_data_import/test_seq.csv'
     df = pd.read_csv(data_path, skipinitialspace=True)
@@ -35,18 +35,19 @@ def infer():
     # thing_actual_list, property_actual_list = decode_preds(pred_labels)
     # Convert the list to a Pandas DataFrame
     df_out = pd.DataFrame({
-        'predictions': prediction_list
+        'class_prediction': prediction_list
     })
     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
-    df = pd.concat([df, df_out], axis=1)
+    # df = pd.concat([df, df_out], axis=1)
 
     # we can save the t5 generation output here
-    df.to_csv(f"exports/result.csv", index=False)
+    df_out.to_csv(f"exports/result.csv", index=False)
 
     # here we want to evaluate mapping accuracy within the valid in mdm data only
-    condition_correct = df['predictions'] == df['entity_name']
-    pred_correct_proportion = sum(condition_correct)/len(df)
+    # predictions = pd.to_numeric(df_out['class_prediction'], errors="coerce")
+    condition_correct = df_out['class_prediction'] == df['entity_seq']
+    pred_correct_proportion = sum(condition_correct)/len(df_out)
 
     # write output to file output.txt
     with open("output.txt", "a") as f:
diff --git a/train/seq2seq_t5_simple/train.py b/train/seq2seq_t5_simple/train.py
index 8edb0fe..9eff774 100644
--- a/train/seq2seq_t5_simple/train.py
+++ b/train/seq2seq_t5_simple/train.py
@@ -33,10 +33,10 @@ def process_df_to_dict(df):
     output_list = []
     for _, row in df.iterrows():
         desc = row['mention']
-        label = row['entity_name']
+        label = row['entity_seq']
         element = {
             'input' : desc,
-            'output': label
+            'output': f'{label}'
         }
         output_list.append(element)
 
@@ -45,7 +45,7 @@ def process_df_to_dict(df):
 
 def create_dataset():
     # train
-    data_path = f"../../data_import/train.csv"
+    data_path = f"../../esAppMod_data_import/train_seq.csv"
     train_df = pd.read_csv(data_path, skipinitialspace=True)
 
     combined_data = DatasetDict({
diff --git a/zero_shot/bloom.py b/zero_shot/bloom.py
index aaf7e74..358b6fe 100644
--- a/zero_shot/bloom.py
+++ b/zero_shot/bloom.py
@@ -3,8 +3,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # %%
 # Load model and tokenizer
-# model_name = "bigscience/bloom-7b1" # Replace with your model
-model_name = "bigscience/bloomz-1b1"
+model_name = "bigscience/bloom-7b1" # Replace with your model
+# model_name = "bigscience/bloomz-1b1"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Automatically map model layers to available GPUs
@@ -26,13 +26,12 @@ outputs = model.generate(inputs["input_ids"], max_length=50)
 # Decode and print result
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 # %%
-# %%
 # Prepare input
 
 def generate(text):
 
     # Define prompt
-    prompt = f"Answer Concisely: Give me a mapping between the acronym and descriptor in the format '(acronym: description): '{text}'"
+    prompt = f"Give me past product names relating to: '{text}'"
 
     # Generate acronym
     inputs = tokenizer(prompt, return_tensors="pt")
@@ -45,7 +44,7 @@ def generate(text):
 
 # Example usage
 # text = "Advanced Data Analytics Platform"
-text = 'ColdFusion Markup Language (CFML)'
+text = 'windows server'
 acronym = generate(text)
-print(f"Acronym: {acronym}")
+print(f"Generation: {acronym}")
 # %%
diff --git a/zero_shot/conceptnet.py b/zero_shot/conceptnet.py
new file mode 100644
index 0000000..2499be3
--- /dev/null
+++ b/zero_shot/conceptnet.py
@@ -0,0 +1,21 @@
+# %%
+import requests
+
+def get_related_terms(term, language="en", limit=10):
+    url = f"http://api.conceptnet.io/c/{language}/{term}"
+    response = requests.get(url).json()
+
+    # Extract related terms
+    related_terms = []
+    for edge in response.get("edges", []):
+        related = edge.get("end", {}).get("label", None)
+        if related and related.lower() != term.lower():
+            related_terms.append(related)
+        if len(related_terms) >= limit:
+            break
+    return related_terms
+
+# Example
+related_terms = get_related_terms("windows_server")
+print("Related Terms:", related_terms)
+# %%
diff --git a/zero_shot/dbpedia.py b/zero_shot/dbpedia.py
new file mode 100644
index 0000000..d370b6d
--- /dev/null
+++ b/zero_shot/dbpedia.py
@@ -0,0 +1,38 @@
+# %%
+from SPARQLWrapper import SPARQLWrapper, JSON
+
+# %%
+sparql = SPARQLWrapper("https://dbpedia.org/sparql")
+sparql.setQuery("""
+    SELECT ?altLabel WHERE {
+        ?item rdfs:label "Windows Server"@en.
+        ?item skos:altLabel ?altLabel.
+        FILTER (LANG(?altLabel) = "en")
+    }
+    LIMIT 10
+""")
+sparql.setReturnFormat(JSON)
+results = sparql.query().convert()
+
+for result in results["results"]["bindings"]:
+    # the query binds ?altLabel, so read that variable (not "label")
+    print(result["altLabel"]["value"])
+# %%
+from SPARQLWrapper import SPARQLWrapper, JSON
+
+sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
+sparql.setQuery("""
+    SELECT ?itemLabel ?altLabel WHERE {
+        ?item ?label "Windows Server"@en.
+        OPTIONAL { ?item skos:altLabel ?altLabel. FILTER (LANG(?altLabel) = "en") }
+        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+    }
+    LIMIT 10
+""")
+sparql.setReturnFormat(JSON)
+results = sparql.query().convert()
+
+for result in results["results"]["bindings"]:
+    print("Label:", result["itemLabel"]["value"])
+    if "altLabel" in result:
+        print("Alias:", result["altLabel"]["value"])
+# %%
diff --git a/zero_shot/error.csv b/zero_shot/error.csv
new file mode 100644
index 0000000..b48cb18
--- /dev/null
+++ b/zero_shot/error.csv
@@ -0,0 +1,626 @@
+,mention,entity_id,entity_name,class_prediction,predicted_name
+0,DOT NET,497,.NET Framework,579,Unix|BSD|*
+2,Dot net - FW 4,497,.NET Framework,368,VB.NET
+3,.Net 4.7.1 Enterprise Lib,497,.NET Framework,368,VB.NET
+11,.NET,497,.NET Framework,579,Unix|BSD|*
+13,.Net 4.5.2 Enterprise Lib,497,.NET Framework,368,VB.NET
+40,APACHE LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
+41,LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
+42,Magik,484,.NET Framework|Magick.NET,533,YAML
+43,WCF,485,.NET Framework|Windows Communication Foundation (WCF),486,.NET Framework|Windows Workflow Foundation (WF)
+45,WWF,486,.NET Framework|Windows Workflow Foundation (WF),443,OS/2
+47,Ejes,1,(E)JES,101,Microsoft Dynamics AX
+48,(UNIRITA) A-AUTO 7.2.2,2,A-Auto Job Scheduling Software,299,AutoIt
+50,Active Directoy,498,Active Directory (AD),40,Connect Direct
+54,APSX,592,Active Server Pages (ASP)|*,609,IIS|*
+69,Andriod,418,Android,586,PHP|*
+71,Apache Active Queue,6,Apache ActiveMQ,259,Apache HTTP Server
+72,MQ Apache Active Queue,6,Apache ActiveMQ,81,IBM Websphere MQ
+75,cordova-android,501,Apache Cordova,418,Android
+77,Hive,8,Apache Hive,177,SAP NetWeaver Business Warehouse
+99,solr,11,Apache Solr,375,Apache Lucene
+135,ADF,13,Application Development Facility (ADF),130,Oracle ADF
+144,WLS 10.2,600,Oracle WebLogic Server|*,442,OpenVMS
+149,BEA WLS,600,Oracle WebLogic Server|*,442,OpenVMS
+152,Weblogic 12c,600,Oracle WebLogic Server|*,582,C#|*
+160,WLE,600,Oracle WebLogic Server|*,443,OS/2
+168,Web Logic,600,Oracle WebLogic Server|*,97,MarkLogic DB
+174,BEA WLE,600,Oracle WebLogic Server|*,443,OS/2
+175,WLS 10,600,Oracle WebLogic Server|*,442,OpenVMS
+176,WLS,600,Oracle WebLogic Server|*,442,OpenVMS
+189,brain script,302,Brainscript,369,VBScript
+190,BRAINScript,302,Brainscript,367,TypeScript
+191,Business
Intelligence and Reporting Tools,21,Business Intelligence and Reporting Tools (BIRT),133,Oracle Business Intelligence +192,Actuate Report Server,21,Business Intelligence and Reporting Tools (BIRT),42,Crystal Reports +194,CSHARP,582,C#|*,87,Informatica PowerCenter +218,WinFrame,30,Citrix Virtual Apps and Desktops,443,OS/2 +221,METAFRAME,30,Citrix Virtual Apps and Desktops,406,JavaScript|AngularJS +225,Presentation Server,30,Citrix Virtual Apps and Desktops,541,File Server +226,NETSCALER-1.5,563,Citrix ADC,273,Netscape Enterprise Server (NES) +227,NETSCALER-11.,563,Citrix ADC,273,Netscape Enterprise Server (NES) +228,Citrix SD-WAN,563,Citrix ADC,30,Citrix Virtual Apps and Desktops +229,NetScaler SD-WAN,563,Citrix ADC,273,Netscape Enterprise Server (NES) +231,NetScaler ADC,563,Citrix ADC,272,Netscape Application Server (NAS) +236,NetScaler SDX,291,Citrix ADC SDX,273,Netscape Enterprise Server (NES) +240,Provisioning Services 7.15.8,32,Citrix Provisioning,538,Device Provisioning Engines (DPE) +241,Citrix PVS,32,Citrix Provisioning,30,Citrix Virtual Apps and Desktops +243,CLISTS,309,CLIST,329,IBM i Control Language (CL) +253,CFML,311,ColdFusion Markup Language (CFML),316,eXtensible HyperText Markup Language (XHTML) +254,ColdFusion Markup Language,311,ColdFusion Markup Language (CFML),37,Coldfusion +255,Sterling Connect,40,Connect Direct,542,General Ledger +264,Cormerstone,41,Cornerstone software,516,Compopent Object Model (COM) +265,Cornerstone,41,Cornerstone software,370,Visual Basic +279,DB2 UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA) +282,DB2-UDB,43,DB2,365,TCL +291,DB2/UDB,43,DB2,365,TCL +292,IBM DB2 ENTERPRISE SERVER EDITION PVU OPTION 10.5,43,DB2,163,PVCS Version Manager +300,IBM - IBM DB2 Advanced Enterprise Server Edition PVU Option 10.5,43,DB2,72,IBM Mobile Foundation +301,UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA) +302,IBM - IBM DB2 Enterprise Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|* +306,IBM - IBM DB2 Workgroup Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|* +313,EZTriev,314,Easytrieve,296,Intel Xeon Processor +314,Eztrieve,314,Easytrieve,296,Intel Xeon Processor +321,PrestoSoft - ExamDiff Application 1.6,49,ExamDiff,346,Pascal|Object Pascal +322,PrestoSoft - ExamDiff Application,49,ExamDiff,346,Pascal|Object Pascal +323,ExamDiff Application,49,ExamDiff,467,MS SQL Server|Log Reader Agent +324,Expect Scripts,315,Expect,109,Microsoft MQ +329,Microsoft - MSXML 4.0 SP2 4.2,318,Extensible Markup Language (XML)|MSXML,316,eXtensible HyperText Markup Language (XHTML) +331,XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST) +332,JAVA-XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST) +335,ServerCA Access GatewayF5,50,F5 Secure Web Gateway Services,290,CA API Gateway +347,HP C++,58,HP aC++ compiler,59,HP C/ANSI C compiler +350,HP C++ 10.20,58,HP aC++ compiler,59,HP C/ANSI C compiler +351,HPC 11.11,59,HP C/ANSI C compiler,58,HP aC++ compiler +358,HFS,505,HTTP File Server,486,.NET Framework|Windows Workflow Foundation (WF) +359,www.rejetto.com - HttpFileServer 2.3,505,HTTP File Server,55,Google Chrome +360,HttpFileServer,505,HTTP File Server,522,Application Web Server +367,IBM - IBM BigFix Platform Client Deploy Tool 9.5,457,IBM BigFix Platform|Client Deploy Tool,62,IBM BigFix Platform +369,IBM BPM,64,IBM Business Process Manager,328,IBM High Level Assembler (HLASM) +375,Data Power,294,IBM 
DataPower Gateway,295,IBM Power Systems +376,IDG.7.5.2.19hp,294,IBM DataPower Gateway,449,Unix|HP-UX +380,hlasm,328,IBM High Level Assembler (HLASM),438,macOS +383,IHS,265,IBM HTTP Server,424,IBM i +386,WebSphere and IHS,265,IBM HTTP Server,67,IBM InfoSphere DataStage +387,WebSphere http,265,IBM HTTP Server,284,Websphere Application Server (WAS) +391,IBM Websphere HTTP Server,265,IBM HTTP Server,285,WebSphere Liberty +393,WebSphere IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS +394,WebSphere -IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS +397,OS400 V7R1,424,IBM i,443,OS/2 +398,OS400,424,IBM i,443,OS/2 +399,OS/400,424,IBM i,443,OS/2 +408,IIB,68,IBM Integration Bus,370,Visual Basic +411,Extended Structured Query Language,458,IBM Integration Bus|Extended Structured Query Language (ESQL),572,Structured Query Language (SQL) +415,IBM WorkLight,72,IBM Mobile Foundation,649,IBM Cloud +417,ILOG JRules,73,IBM Operational Decision Manager (ODM),601,Websphere ILOG JRules BRMS +420,Decision Center 8.0.1.0,73,IBM Operational Decision Manager (ODM),252,IBM ILOG Views +423,AS400,295,IBM Power Systems,443,OS/2 +424,AS/400,295,IBM Power Systems,443,OS/2 +426,System i,295,IBM Power Systems,424,IBM i +427,P-series,295,IBM Power Systems,81,IBM Websphere MQ +428,IBM iSeries/AS400 system Model 520,295,IBM Power Systems,443,OS/2 +439,Tivoli Asset Discovery for Distributed,459,IBM Tivoli Asset Management|Asset Discovery for Distributed,606,IBM Tivoli Asset Management|* +447,Database MS SQL Agent,77,IBM Tivoli Monitoring,469,MS SQL Server|SQL Server Database Engine +448,Linux OS Agent,77,IBM Tivoli Monitoring,576,Linux|* +449,Database DB2 Agent,77,IBM Tivoli Monitoring,520,Database (DB) +452,Windows OS Agent,77,IBM Tivoli Monitoring,580,Windows|* +454,IBM - IBM TSM FCM,604,IBM Tivoli Storage Manager|*,460,IBM Tivoli Storage Manager|TSM API +459,Databases Data Protection for Microsoft SQL,604,IBM Tivoli Storage Manager|*,572,Structured Query Language (SQL) +461,IBM - IBM Spectrum Protect Data Protection for Microsoft SQL Server 8.1,604,IBM Tivoli Storage Manager|*,469,MS SQL Server|SQL Server Database Engine +462,IBM Spectrum Protect Data Protection,604,IBM Tivoli Storage Manager|*,312,Data Language Interface (DL/I) +463,IBM - IBM Spectrum Protect API 7.1,460,IBM Tivoli Storage Manager|TSM API,294,IBM DataPower Gateway +464,IBM - IBM Spectrum Protect Client,461,IBM Tivoli Storage Manager|TSM Client,294,IBM DataPower Gateway +465,IBM - IBM Tivoli Storage Manager Client,461,IBM Tivoli Storage Manager|TSM Client,604,IBM Tivoli Storage Manager|* +467,VSS Requestor configured 8.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|* +468,VSS Requestor 7.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|* +469,TWS-WS,79,IBM Tivoli Workload Scheduler (TWS),239,Windows Terminal Server (WTS) +472,wbia 2.6,80,IBM WebSphere Business Integration Adaptor,627,XtraDB +473,IBM WBIA 2.6.0.12,80,IBM WebSphere Business Integration Adaptor,424,IBM i +475,MQ,81,IBM Websphere MQ,248,ZeroMQ +476,MQ 9.1,81,IBM Websphere MQ,248,ZeroMQ +479,MQ 7,81,IBM Websphere MQ,248,ZeroMQ +480,MQ 6,81,IBM Websphere MQ,248,ZeroMQ +481,MQ 9.0,81,IBM Websphere MQ,248,ZeroMQ +482,MQ 5.3,81,IBM Websphere MQ,248,ZeroMQ +483,MQ 7.01,81,IBM Websphere MQ,248,ZeroMQ +484,MQ 7.5,81,IBM Websphere MQ,248,ZeroMQ +485,MQSeries 8.0,81,IBM Websphere MQ,248,ZeroMQ +488,WSMQ 8.0,81,IBM Websphere MQ,248,ZeroMQ +489,MQ 9.0.5,81,IBM Websphere MQ,248,ZeroMQ +491,WTX,83,IBM WebSphere Transformation Extender (WTX),274,Nginx +505,Microsoft Internet 
Inf,609,IIS|*,130,Oracle ADF +508,Microsoft Internet Informat,609,IIS|*,330,IBM Informix-4GL +550,Microsoft - IIS 6.0 Migration Tool 1,489,IIS|Easy Migration Tool (IEMT),609,IIS|* +558,Infozip 6,85,Info-ZIP,677,Git +559,Infozip,85,Info-ZIP,677,Git +578,IMSVA 9.1,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB +580,IMSVA,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB +581,Java 1.8,584,Java|*,334,Java|Java Standard Edition (Java SE) +582,Java 7,584,Java|*,334,Java|Java Standard Edition (Java SE) +583,Java on Weblogic server,584,Java|*,600,Oracle WebLogic Server|* +584,Java5,584,Java|*,334,Java|Java Standard Edition (Java SE) +585,Java 6,584,Java|*,334,Java|Java Standard Edition (Java SE) +586,Java 6.0,584,Java|*,334,Java|Java Standard Edition (Java SE) +587,Java 7 Update 25,584,Java|*,334,Java|Java Standard Edition (Java SE) +589,Java (open source),584,Java|*,397,Java|Servlet +590,Java 5,584,Java|*,334,Java|Java Standard Edition (Java SE) +591,Java 1.5,584,Java|*,334,Java|Java Standard Edition (Java SE) +593,Java 1.8.0_92,584,Java|*,334,Java|Java Standard Edition (Java SE) +594,Java 1.6,584,Java|*,334,Java|Java Standard Edition (Java SE) +595,J2EE 6,584,Java|*,333,Java|Java Enterprise Edition (Java EE) +596,Java (J2EE),584,Java|*,333,Java|Java Enterprise Edition (Java EE) +598,JRE,506,Java Runtime Environment (JRE),84,IMS DB +629,JEE,333,Java|Java Enterprise Edition (Java EE),1,(E)JES +639,JSF,391,Java|JavaServer Faces (JSF),334,Java|Java Standard Edition (Java SE) +643,JSP Scriptlets,336,Java|JavaServer Pages (JSP)|Scriptlets,335,Java|JavaServer Pages (JSP) +644,Java Scriplet,336,Java|JavaServer Pages (JSP)|Scriptlets,88,Ingres +645,Core 9.2.0.0,393,Java|JRuby Core,583,C++|* +647,Java RMI,396,Java|Remote Method Invocation (RMI),584,Java|* +650,Java Servlets,397,Java|Servlet,453,Linux|Fedora +651,Java 6 Servlets,397,Java|Servlet,453,Linux|Fedora +652,J2EE Servlets,397,Java|Servlet,443,OS/2 +653,Servlets,397,Java|Servlet,420,Cisco IOS +654,Servlets v2.3,397,Java|Servlet,370,Visual Basic +656,Spring BOOT,399,Java|Spring|Spring Boot,398,Java|Spring +657,Springboot,399,Java|Spring|Spring Boot,398,Java|Spring +661,javasript,589,JavaScript|*,335,Java|JavaServer Pages (JSP) +662,JS,589,JavaScript|*,507,Node.js +664,Java Script,589,JavaScript|*,584,Java|* +671,Sencha 4.2.0,409,JavaScript|Ext JS,589,JavaScript|* +674,jqueryui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery +675,jquery-ui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery +679,Scriptaculous,414,JavaScript|script.aculo.us,582,C#|* +684,EAP,268,JBoss|*,174,SAP ERP +685,JBOSS-EAP,268,JBoss|*,493,JBoss|Wildfly +686,JBoss Application Server 4,268,JBoss|*,493,JBoss|Wildfly +687,JBoss Application Server 7,268,JBoss|*,493,JBoss|Wildfly +688,JBoss Application Server 5,268,JBoss|*,493,JBoss|Wildfly +689,JBoss Application Server,268,JBoss|*,493,JBoss|Wildfly +690,Enterprise Application Platform,268,JBoss|*,670,EAServer +692,JBOSS 5.1.2 EAP,268,JBoss|*,493,JBoss|Wildfly +693,server: Jboss,268,JBoss|*,493,JBoss|Wildfly +694,JBOSS 6.3.2 EAP,268,JBoss|*,493,JBoss|Wildfly +695,JBoss EAP 4.3,268,JBoss|*,493,JBoss|Wildfly +700,Job Information Language,339,Job Information Language (JIL),338,JCL +703,JoinIT by Acayosoft,91,joinIT,4,Adobe Acrobat Reader +704,Acayosoft JoinIT,91,joinIT,4,Adobe Acrobat Reader +705,JoinIT by Acayosoft v 9.0.8,91,joinIT,4,Adobe Acrobat Reader +706,LifeFlow Tool,92,LifeFlow,486,.NET Framework|Windows Workflow Foundation (WF) +707,Linux 
2.6.32-696.28.1.el6.x86_64,576,Linux|*,437,Linux|zLinux +709,Linux 2.6.32-696.30.1.el6.x86_64,576,Linux|*,437,Linux|zLinux +710,Linux 2.6.9,576,Linux|*,437,Linux|zLinux +711,Linux 2.6.32-642.3.1.el6.x86_64,576,Linux|*,437,Linux|zLinux +712,Linux - 2.6.18-371.1.2.el5,576,Linux|*,437,Linux|zLinux +713,Linux 2.6.32-696.23.1.el6.x86_64,576,Linux|*,437,Linux|zLinux +749,Gaia Kernel version 2.7,428,Linux|Check Point,432,Linux|Oracle Linux +752,Gaia Kernel version 2.6,428,Linux|Check Point,432,Linux|Oracle Linux +766,OEL,432,Linux|Oracle Linux,449,Unix|HP-UX +778,Oracle Enterprise Server 7.5,432,Linux|Oracle Linux,134,Oracle Database +780,OEL6.7 - 3.8.13-68.3.4.el6uek.x86_64,432,Linux|Oracle Linux,449,Unix|HP-UX +792,VMware Photon,433,Linux|Photon OS,569,VMware Server +793,VMware Photon 1,433,Linux|Photon OS,569,VMware Server +809,Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|* +818,Redhat - Redhat Linux 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +819,Linux RH6,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux +865,Redhat - Redhat Linux 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +870,Redhat - RHEL 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +874,Red Hat Entreprise Linux 6.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +882,Redhat 6 64-Bit,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +893,RED HAT ADVANCED SERVER 5,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +910,redhat6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +912,Redhat - Redhat Linux 6.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +913,Linux RH,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux +916,Redhat - Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|* +920,Linux RH7,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux +926,Red Hat V6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +932,Linux RH5,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux +934,rehl5.9,434,Linux|Red Hat Enterprise Linux,43,DB2 +964,Red Hat 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +979,red hat,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +991,Redhat,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +996,RedHat 7.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +998,LINUX RED HAT 5 EL,434,Linux|Red Hat Enterprise Linux,268,JBoss|* +1003,SUSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1006,Linux SuSE12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1011,SUSE10,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1012,SUSE Linux 12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1017,SUSELinux Enterprise 11.x,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1023,SUSE Linux 11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1024,SUSE Linux 11 SP3,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1029,Linux SuSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1030,SUSE,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1038,SuseLinux,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE +1051,domino8.5,270,Lotus Domino,93,Lotus Notes +1052,Domino 8.x,270,Lotus Domino,93,Lotus Notes +1054,Lucee 5.2.6.60,271,Lucee,375,Apache Lucene +1056,Darwin,438,macOS,117,Mozilla Firefox +1061,Memcache,98,Memcached,18,BMC Control-M +1062,ACCDB,99,Microsoft Access,525,Open Database Connectivity (ODBC) +1070,ConfigMgr,102,Microsoft Endpoint Configuration Manager (SCCM),21,Business Intelligence and Reporting Tools (BIRT) +1080,FIM SQL Development Server,105,Microsoft Forefront Identity Manager (FIM),572,Structured 
Query Language (SQL) +1082,Microsoft - Internet Explor,107,Microsoft Internet Explorer,356,Rexx +1084,Internet Explor,107,Microsoft Internet Explorer,356,Rexx +1090,SCEP for Linux,110,Microsoft System Center Endpoint Protection,437,Linux|zLinux +1094,SCEP for Mac,110,Microsoft System Center Endpoint Protection,438,macOS +1101,msdeploy,112,Microsoft Web Deploy,56,Greenplum DB +1106,WebPI,114,Microsoft Web Platform Installer,522,Application Web Server +1109,Web PI,114,Microsoft Web Platform Installer,531,Simple Object Access Protocol (SOAP) +1111,MDW Framework,115,Model Driven Workflow (MDW),406,JavaScript|AngularJS +1115,Mango DB,116,MongoDB,43,DB2 +1117,MangoDB,116,MongoDB,43,DB2 +1125,O365,119,MS Office 365,424,IBM i +1141,MICROSOFT SQL SERVER 2012 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact +1153,MICROSOFT SQL SERVER 2012 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact +1154,MS SQL Server 2008 Developer,581,MS SQL Server|*,146,Oracle SQL Developer +1156,MICROSOFT SQL SERVER 2008 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact +1167,MSSQL Database Server,581,MS SQL Server|*,122,MySQL +1173,MSSQL2008,581,MS SQL Server|*,122,MySQL +1192,Microsoft SQL Server Standard Edition,581,MS SQL Server|*,121,MS SQL Server Compact +1201,SQLServer,581,MS SQL Server|*,572,Structured Query Language (SQL) +1226,MICROSOFT SQL SERVER 2012 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact +1233,MICROSOFT SQL SERVER 2005 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact +1234,SQLSVR2008,581,MS SQL Server|*,352,PL/SQL +1235,MICROSOFT SQL SERVER 2008 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact +1239,MICROSOFT SQL SERVER 2008 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact +1244,MS SQL Server 2012 Developer,581,MS SQL Server|*,146,Oracle SQL Developer +1253,Microsoft - SQL Server Express LocalDB 2014,581,MS SQL Server|*,469,MS SQL Server|SQL Server Database Engine +1256,MSSQL,581,MS SQL Server|*,122,MySQL +1280,Microsoft - Microsoft SQL Server Analysis Services 2012 for Microsoft SQL Server 2012 Standard Edition 2012,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|* +1281,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|* +1283,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|* +1290,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|* +1293,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|* +1295,SQL Server Integration Services,470,MS SQL Server|SQL Server Integration Services (SSIS),473,MS SQL Server|SQL Server Reporting Services (SSRS) +1316,ZOS Base 1.12,441,MVS|z/OS,437,Linux|zLinux +1335,NAS,272,Netscape Application Server (NAS),443,OS/2 +1337,NES,273,Netscape Enterprise Server (NES),443,OS/2 +1349,Node.js 0.10 (Linux),507,Node.js,437,Linux|zLinux +1361,Node.js 4 (Linux),507,Node.js,437,Linux|zLinux +1371,Symas OpenLDAP,126,OpenLDAP,178,SAP SQL Anywhere +1377,OAM 12c,129,Oracle Access Management,303,C +1378,ADF 12c,130,Oracle 
ADF,343,Objective C +1381,OHS,610,Oracle Application Server|*,122,MySQL +1383,Oracle HTTP Server powered by Apache,610,Oracle Application Server|*,259,Apache HTTP Server +1384,j2eeoracleca,610,Oracle Application Server|*,453,Linux|Fedora +1385,Oracle HTTP,610,Oracle Application Server|*,134,Oracle Database +1389,9i AS server,610,Oracle Application Server|*,227,Virtual I/O Server +1391,Oracle Application R12.1.3,610,Oracle Application Server|*,134,Oracle Database +1394,Weblogic BI Publisher,132,Oracle BI Publisher,600,Oracle WebLogic Server|* +1396,OBI,133,Oracle Business Intelligence,343,Objective C +1397,OBIEE,133,Oracle Business Intelligence,343,Objective C +1398,OBI Reporting,133,Oracle Business Intelligence,343,Objective C +1401,Oracle 12.2 Client,134,Oracle Database,610,Oracle Application Server|* +1406,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0,134,Oracle Database,610,Oracle Application Server|* +1407,Oracle 11.2 (Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit) RAC,134,Oracle Database,610,Oracle Application Server|* +1415,Oracle 11 on AIX,134,Oracle Database,445,Unix|AIX +1416,Oracle Database 10g Enterprise Edition Release 10.1.0.4.0 - 64bit,134,Oracle Database,610,Oracle Application Server|* +1431,Oracle Database 10g Release 10.2.0.4.0 - 64bit Production,134,Oracle Database,298,Oracle Exadata +1432,Oarcle 11G,134,Oracle Database,218,TIBCO InConcert +1443,DB - Oracle inbuilt,134,Oracle Database,158,Powerbuilder +1460,Oracle Server,134,Oracle Database,610,Oracle Application Server|* +1475,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 - 64bit Production,134,Oracle Database,610,Oracle Application Server|* +1480,Oracle 12C on linux,134,Oracle Database,303,C +1484,Oracle9i Enterprise Edition Release 9.2.0.5.0,134,Oracle Database,610,Oracle Application Server|* +1486,Oracle 11g on linux,134,Oracle Database,432,Linux|Oracle Linux +1487,Oracle 11gEssbase,134,Oracle Database,298,Oracle Exadata +1490,JServer Release 9.2.0.5.0,474,Oracle Database|Jserver,335,Java|JavaServer Pages (JSP) +1492,Designer 6i,135,Oracle Designer,516,Compopent Object Model (COM) +1493,Enterprise Manager 12.2.1.1,136,Oracle Enterprise Manager,427,Linux|CentOS +1494,Enterprise Manager 12.2.1.2,136,Oracle Enterprise Manager,427,Linux|CentOS +1495,Enterprise Manager 11.1.1.7,136,Oracle Enterprise Manager,427,Linux|CentOS +1501,"Oracle, Nets",140,Oracle Net Services,273,Netscape Enterprise Server (NES) +1502,Oracle RAC,141,Oracle Real Application Clusters (RAC),134,Oracle Database +1504,ORPOS 13.3.3,142,Oracle Retail Point-of-Service,609,IIS|* +1505,ORPOS 13.3.5,142,Oracle Retail Point-of-Service,609,IIS|* +1506,ORPOS 13.3.4,142,Oracle Retail Point-of-Service,609,IIS|* +1509,OSB Servers,143,Oracle Service Bus,443,OS/2 +1514,Oracle TT,147,Oracle TimesTen In-Memory Database,134,Oracle Database +1517,OWB 10g,148,Oracle Warehouse Builder (OWB),300,AWK +1524,Clascal,346,Pascal|Object Pascal,307,Cascading Style Sheets (CSS) +1526,Oracle-HR-9.2,151,PeopleSoft,134,Oracle Database +1529,ActiveState Tool Corp. - ActivePerl 5.12,348,Perl|ActivePerl,500,ActiveX +1530,ActiveState Tool Corp. 
- ActivePerl 5.8,348,Perl|ActivePerl,500,ActiveX +1531,ORAPERL,417,Perl|Oraperl,242,WinRAR +1532,REX,349,Perl|Rex,356,Rexx +1536,TCServer V6,277,Pivotal tc Server,365,TCL +1537,IBM PKWARE PKZip 2,155,PKZIP,387,Java|IBM SDK +1541,PLQSL,352,PL/SQL,351,PL/I +1542,Oracle - SQL,352,PL/SQL,581,MS SQL Server|* +1544,Oracle SQL,352,PL/SQL,134,Oracle Database +1545,PLSQL;,352,PL/SQL,351,PL/I +1547,Oracle PLSQL,352,PL/SQL,351,PL/I +1548,plsql,352,PL/SQL,351,PL/I +1551,Projectplace,156,Planview,21,Business Intelligence and Reporting Tools (BIRT) +1558,Power Builder,158,Powerbuilder,151,PeopleSoft +1560,Power Builder 6.5,158,Powerbuilder,27,Chef Automate +1565,ProjectWise Oracle Server,161,ProjectWise,162,ProjectWise Web Server +1576,RMQ,165,RabbitMQ,355,R +1579,Clearquest,167,Rational ClearQuest,455,Clarify|Clear Basic +1581,Remedy ARS,169,Remedy,322,Fortran +1584,RightFax client 10,171,RightFax,118,MQ Client +1585,SOQL,359,Salesforce Object Query Language (SOQL),621,ArangoDB +1587,SAP Business Objects,173,SAP BusinessObjects BI server,177,SAP NetWeaver Business Warehouse +1588,Business Objects 12,173,SAP BusinessObjects BI server,488,ActiveX|ADO +1590,SAP BI 4.2 Sp5,173,SAP BusinessObjects BI server,174,SAP ERP +1593,SAP HANA ON SUSEOracle 11g on Linux,175,SAP HANA DB,435,Linux|SUSE Linux Enterprise Server +1596,NetWeaver,279,SAP NetWeaver App Server,431,Linux|openSUSE +1605,SCSS,361,Sass,102,Microsoft Endpoint Configuration Manager (SCCM) +1606,Scalla,362,Scala,664,Forte +1609,Microsoft SPS 2010,603,SharePoint|*,577,MVS|* +1613,SQL Server SP2013 Database Server,603,SharePoint|*,581,MS SQL Server|* +1615,Siebel IP 2015,182,Siebel,583,C++|* +1616,Siebel 7.8.2.16,182,Siebel,43,DB2 +1617,Siebel CRM,182,Siebel,583,C++|* +1619,Techsmith Corporation - SnagIt 8,184,SnagIt,183,SNA Manager +1620,Solid development server,185,solidDB,600,Oracle WebLogic Server|* +1622,Sixty-Five Software - SpaceMonger 1.4,187,SpaceMonger,296,Intel Xeon Processor +1623,SQLPlus,478,Oracle Database|SQL*Plus,572,Structured Query Language (SQL) +1625,SQLIO 1.0,189,SQLIO,178,SAP SQL Anywhere +1630,SunOne,281,Oracle iPlanet Web Server,448,Unix|BSD|SunOS +1637,SAP - Sybase Central 4.3,479,Sybase SQL Server|Sybase Central,190,Sybase SQL Server +1639,Sysncsort,191,Syncsort,178,SAP SQL Anywhere +1640,syncsort,191,Syncsort,98,Memcached +1641,Sysinternals LLC - AccessEnum 1 1,194,Sysinternal Tools|AccessEnum,124,Nexus Repository OSS +1642,Sysinternals LLC - ClockRes 2,195,Sysinternal Tools|ClockRes,374,Xbase++ +1643,Sysinternals LLC - Coreinfo 3.21,196,Sysinternal Tools|Coreinfo,670,EAServer +1644,Sysinternals LLC - DiskExt 1.1,197,Sysinternal Tools|DiskExt,374,Xbase++ +1645,Sysinternals LLC - DiskMon 2.01,198,Sysinternal Tools|DiskMon,670,EAServer +1647,Sysinternals LLC - Junction 1.6,200,Sysinternal Tools|Junction,374,Xbase++ +1648,Sysinternals LLC - LDMDump 1.02,201,Sysinternal Tools|LDMDump,178,SAP SQL Anywhere +1649,Sysinternals LLC - LoadOrder 1,202,Sysinternal Tools|LoadOrder,374,Xbase++ +1650,Sysinternals LLC - PipeList 1.01,203,Sysinternal Tools|PipeList,670,EAServer +1651,Sysinternals LLC - Process Explorer 16.5,204,Sysinternal Tools|Process Explorer,464,Microsoft Exchange Server|Veeam Explorer +1652,Sysinternals LLC - PsKill 1.15,205,Sysinternal Tools|PsKill,151,PeopleSoft +1653,Sysinternals LLC - PsPasswd 1.23,206,Sysinternal Tools|PsPasswd,231,VMware vCenter +1654,Sysinternals LLC - SDelete 1.61,207,Sysinternal Tools|SDelete,670,EAServer +1655,Sysinternals LLC - ShareEnum 1.6,208,Sysinternal Tools|ShareEnum,603,SharePoint|* 
+1656,Sysinternals LLC - Sync 2.2,209,Sysinternal Tools|Sync,374,Xbase++
+1657,Sysinternals LLC - Sysinternals TCPView 3.5,210,Sysinternal Tools|TCPView,365,TCL
+1658,Sysinternals LLC - VMMap 3.11,211,Sysinternal Tools|VMMap,176,SAP MaxDB
+1659,Sysinternals LLC - Whois 1.11,212,Sysinternal Tools|Whois,178,SAP SQL Anywhere
+1664,TERADATA QUERY SCHEDULER SERVER VERSION 15,216,Teradata QS Server,215,Teradata
+1667,BusinessWorks,217,TIBCO Business Works (BW),111,Microsoft Visual Studio
+1668,Tibco-IM,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
+1669,Tibco Integration Manager,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
+1674,TSQL,366,Transact-SQL,621,ArangoDB
+1675,Trasact SQL,366,Transact-SQL,352,PL/SQL
+1746,Solaris 11.2 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
+1747,Solaris UNIX,448,Unix|BSD|SunOS,578,Unix|*
+1748,Unix Servers (Solaris,448,Unix|BSD|SunOS,578,Unix|*
+1749,Oracle Solaris 11.3 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
+1753,Solaris 5.10 (Generic_150400-61),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
+1754,Solaris 5.10 (Generic_150400-62),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
+1756,Solaris 5.10 (Generic_150400-55),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
+1760,Oracle Solaris,448,Unix|BSD|SunOS,134,Oracle Database
+1762,Solaris 1 (SPARC),448,Unix|BSD|SunOS,375,Apache Lucene
+1765,SunSolaris 10.0,448,Unix|BSD|SunOS,430,Linux|Junos OS
+1771,Oracle Solaris 10,448,Unix|BSD|SunOS,134,Oracle Database
+1800,VIO 2.2.0.10,227,Virtual I/O Server,159,Primavera P6
+1801,VIOS,227,Virtual I/O Server,443,OS/2
+1802,visibroker,228,Visibroker,420,Cisco IOS
+1803,VB6,370,Visual Basic,368,VB.NET
+1804,VB 6.0,370,Visual Basic,368,VB.NET
+1805,visualbasic,370,Visual Basic,306,C++|Visual C++
+1808,Visual Basic 6.0,370,Visual Basic,368,VB.NET
+1811,VBA,371,Visual Basic for Applications (VBA),370,Visual Basic
+1812,Access VB,371,Visual Basic for Applications (VBA),99,Microsoft Access
+1813,vfoxpro,372,Visual FoxPro,117,Mozilla Firefox
+1827,VMware Appliance,569,VMware Server,559,Virtual Appliance
+1828,VSX,229,VMware Solution Exchange Marketplace (VSX),111,Microsoft Visual Studio
+1830,VMware - VMware Tools 10.2,230,VMware Tools,569,VMware Server
+1832,VXML,373,VoiceXML,316,eXtensible HyperText Markup Language (XHTML)
+1833,Web Focus,232,WebFOCUS,321,FOCUS
+1834,FOCEXEC,232,WebFOCUS,495,Oracle WebCenter Content Server|Idoc Script
+1836,WLI 8,233,WebLogic Integration,442,OpenVMS
+1842,IBM WEBSPHERE APPLICATION SERVER VERSION 6.1.0,284,Websphere Application Server (WAS),285,WebSphere Liberty
+1848,"IBM WebSphere Application Server Network Deployment, 8.0.0.5",284,Websphere Application Server (WAS),285,WebSphere Liberty
+1850,IBM WebSphere Application Server Network Deployment 7,284,Websphere Application Server (WAS),285,WebSphere Liberty
+1858,IBM WebSphere 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
+1861,IBM - WebSphere Application Server - Base 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
+1865,Websphere AS (JVM),284,Websphere Application Server (WAS),285,WebSphere Liberty
+1872,IBM WebSphere,284,Websphere Application Server (WAS),285,WebSphere Liberty
+1875,IBM WebSphere Application Server 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
+1877,IBM WebSphere Application,284,Websphere Application Server (WAS),285,WebSphere Liberty
+1878,WAS 6.x,284,Websphere Application Server (WAS),521,Electronic Data Interchange (EDI)
+1880,IBM OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
+1882,Open Liberty,285,WebSphere Liberty,397,Java|Servlet
+1883,IBM Open Liberty,285,WebSphere Liberty,62,IBM BigFix Platform
+1887,WAS Liberty,285,WebSphere Liberty,397,Java|Servlet
+1889,OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
+1891,WMB 6.1,235,WebSphere Message Broker,486,.NET Framework|Windows Workflow Foundation (WF)
+1892,WebSphere Message Broker v6.0,235,WebSphere Message Broker,285,WebSphere Liberty
+1899,WebSphere Portal Extend Limited Use 6.1,286,WebSphere Portal Server,285,WebSphere Liberty
+1901,Windchill 11.1,237,Windchill,17,Bluebeam|Bluebeam Q
+1908,Window,580,Windows|*,637,Microsoft Azure
+1914,Windows Terminal Server,239,Windows Terminal Server (WTS),452,Windows|Windows Server
+1915,Windows 7 Standard,451,Windows|Windows Desktop,580,Windows|*
+1916,WINDOWS 10 SERVER STANDARD EDITION X64,451,Windows|Windows Desktop,452,Windows|Windows Server
+1917,Microsoft Windows 7 (64-bit),451,Windows|Windows Desktop,580,Windows|*
+1918,Microsoft Windows XP Professional (32-bit),451,Windows|Windows Desktop,580,Windows|*
+1919,Windows 7 Professional x64,451,Windows|Windows Desktop,580,Windows|*
+1920,Microsoft Microsoft Windows Entreprise,451,Windows|Windows Desktop,580,Windows|*
+1921,Microsoft Windows 2000,451,Windows|Windows Desktop,580,Windows|*
+1922,Microsoft Windows 10,451,Windows|Windows Desktop,580,Windows|*
+1923,MS Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
+1924,Microsoft Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
+1925,Microsoft Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
+1926,Microsoft Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
+1927,Win Desktop,451,Windows|Windows Desktop,560,Webtop
+1928,Windows 10 Pro,451,Windows|Windows Desktop,580,Windows|*
+1929,Windows 10,451,Windows|Windows Desktop,580,Windows|*
+1930,Windows 7 Ultimate,451,Windows|Windows Desktop,580,Windows|*
+1931,Microsoft Windows 8 (64-bit),451,Windows|Windows Desktop,580,Windows|*
+1932,Microsoft Windows XP,451,Windows|Windows Desktop,580,Windows|*
+1933,Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
+1934,Windows XP,451,Windows|Windows Desktop,580,Windows|*
+1935,Windows 10 Professional,451,Windows|Windows Desktop,580,Windows|*
+1936,Windows 7,451,Windows|Windows Desktop,580,Windows|*
+1937,Microsoft Windows 10 (64-bit),451,Windows|Windows Desktop,580,Windows|*
+1938,Win 7,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
+1939,windowsxp,451,Windows|Windows Desktop,580,Windows|*
+1940,Microsoft Windows Unknown,451,Windows|Windows Desktop,580,Windows|*
+1941,Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
+1942,Windows XP Professional,451,Windows|Windows Desktop,580,Windows|*
+1943,Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
+1944,Window XP,451,Windows|Windows Desktop,580,Windows|*
+1945,Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
+1946,Microsoft Windows 7 - SOE,451,Windows|Windows Desktop,580,Windows|*
+1947,Windows 7 Enterprise Edition,451,Windows|Windows Desktop,452,Windows|Windows Server
+1948,Windows 8,451,Windows|Windows Desktop,580,Windows|*
+1949,Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
+1950,Microsoft Windows 7 (32-bit),451,Windows|Windows Desktop,580,Windows|*
+1951,Windows Embedded Standard 7,451,Windows|Windows Desktop,580,Windows|*
+1952,Win10,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
+1953,Windows 2003,451,Windows|Windows Desktop,580,Windows|*
+1955,Windows 2003 Standard,452,Windows|Windows Server,580,Windows|*
+1956,Windows 2008 Enterprise R2 x64,452,Windows|Windows Server,580,Windows|*
+1960,WINDOWS 2008R2,452,Windows|Windows Server,580,Windows|*
+1961,Microsoft Windows Server 2008 Standard Editio,452,Windows|Windows Server,121,MS SQL Server Compact
+1962,MICROSOFT WINDOWS NT 2003,452,Windows|Windows Server,580,Windows|*
+1967,Microsoft Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
+1979,Windows 2008 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
+1982,Windows 2003 R2,452,Windows|Windows Server,580,Windows|*
+1983,Windows 2008 R2 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
+1988,Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
+1989,Windows 2012 Standard,452,Windows|Windows Server,580,Windows|*
+1992,Windows 2008 R2 Standard 6.1.7601 Service Pack 1,452,Windows|Windows Server,580,Windows|*
+1994,Windows 2008 Standard x64,452,Windows|Windows Server,580,Windows|*
+1998,Windows 2012 R2 Standard 64-Bit,452,Windows|Windows Server,580,Windows|*
+2007,w2k12,452,Windows|Windows Server,582,C#|*
+2008,WINDOWS 2013,452,Windows|Windows Server,580,Windows|*
+2009,WINDOWS 2016 SE 64 BIT,452,Windows|Windows Server,580,Windows|*
+2011,Microsoft - Windows 2012,452,Windows|Windows Server,580,Windows|*
+2019,MICROSOFT WINDOWS 2008 TPM,452,Windows|Windows Server,580,Windows|*
+2021,MICROSOFT WINDOWS STD 2008,452,Windows|Windows Server,580,Windows|*
+2025,Windows 2008 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
+2028,MICROSOFT WINDOWS STD 2008 TPM,452,Windows|Windows Server,580,Windows|*
+2030,Windows 2012 64 Bit,452,Windows|Windows Server,580,Windows|*
+2031,MICROSOFT WINDOWS NT 2003 ENT,452,Windows|Windows Server,580,Windows|*
+2034,MICROSOFT WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
+2036,Windows 2003 Standard5.2.3790,452,Windows|Windows Server,580,Windows|*
+2040,Windows 2012 R,452,Windows|Windows Server,580,Windows|*
+2044,Windows 2008 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
+2045,MICROSOFT WINDOWS 2008 ENT,452,Windows|Windows Server,580,Windows|*
+2047,Windows 2012 R2 Standard 6.3.9600,452,Windows|Windows Server,580,Windows|*
+2053,Windows 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
+2055,Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
+2061,Windows 2016 Datacenter10.0.14393,452,Windows|Windows Server,637,Microsoft Azure
+2065,windows6.3.9600,452,Windows|Windows Server,580,Windows|*
+2066,Windows 2012 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
+2069,Windows 2008 Enterprise,452,Windows|Windows Server,580,Windows|*
+2080,Windows 2008 Standard without Hyper-V6.0.6003,452,Windows|Windows Server,580,Windows|*
+2084,Windows 2012 R2 Datacenter,452,Windows|Windows Server,110,Microsoft System Center Endpoint Protection
+2089,Windows 2008 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
+2096,Windows 2000,452,Windows|Windows Server,580,Windows|*
+2097,W2K8R2 Standard 64 BIT,452,Windows|Windows Server,303,C
+2099,Windows 2008 Standard6.0.6003,452,Windows|Windows Server,580,Windows|*
+2100,Windows2008 R2 Enterprise 64bit,452,Windows|Windows Server,580,Windows|*
+2105,Win2008R2,452,Windows|Windows Server,355,R
+2107,Windows 2008 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
+2109,Windows Server 2003 Appliance,452,Windows|Windows Server,559,Virtual Appliance
+2111,Windows 2008 ENT R2 (64 bits),452,Windows|Windows Server,355,R
+2114,WIN2008R2 6.1.7601,452,Windows|Windows Server,355,R
+2116,microsoft windows std 2012 tpm,452,Windows|Windows Server,580,Windows|*
+2118,microsoft windows 2008,452,Windows|Windows Server,580,Windows|*
+2120,Windows 2008 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
+2121,Microsoft Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
+2126,Window2008 R2,452,Windows|Windows Server,355,R
+2130,Windows 2008 Standard,452,Windows|Windows Server,580,Windows|*
+2134,WS03,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
+2136,Windows 2008 Enterprise x64,452,Windows|Windows Server,580,Windows|*
+2141,Windows 2008 R2 Enterprise,452,Windows|Windows Server,580,Windows|*
+2142,Windows Server 2003 Std 32-bit,452,Windows|Windows Server,580,Windows|*
+2143,Windows 2008 R2 Standard 64bit,452,Windows|Windows Server,580,Windows|*
+2146,Microsoft Windows 2003 R2 Standard,452,Windows|Windows Server,580,Windows|*
+2148,MICROSOFT WINDOWS NT 2003 TPM,452,Windows|Windows Server,580,Windows|*
+2149,Win Server 2008,452,Windows|Windows Server,569,VMware Server
+2150,Windows 2003 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
+2152,WIN2014,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
+2156,Win 2012 R2,452,Windows|Windows Server,355,R
+2160,Win Server,452,Windows|Windows Server,12,Apache Subversion
+2161,Windows 2008 Standard R2 x64,452,Windows|Windows Server,580,Windows|*
+2163,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 6196 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
+2164,Windows2012,452,Windows|Windows Server,580,Windows|*
+2165,Windows 2008 R2 Standard6.1.7601,452,Windows|Windows Server,580,Windows|*
+2166,Windows 2016,452,Windows|Windows Server,580,Windows|*
+2167,Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
+2179,Windows Server 2003 Std 64-bit,452,Windows|Windows Server,580,Windows|*
+2180,Windows 2012 R2,452,Windows|Windows Server,580,Windows|*
+2181,Wintel,452,Windows|Windows Server,461,IBM Tivoli Storage Manager|TSM Client
+2191,Windows 2003 Enterprise5.2.3790,452,Windows|Windows Server,580,Windows|*
+2192,WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
+2193,Windows 2008 R2 OS,452,Windows|Windows Server,580,Windows|*
+2196,Windows 2003 Standard R2,452,Windows|Windows Server,580,Windows|*
+2197,Windows 2008 R2 Enterprise6.1.7601,452,Windows|Windows Server,580,Windows|*
+2198,Windows 2003 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
+2199,WINDOWS SERVER 2003 APPLIANCE 5.2,452,Windows|Windows Server,559,Virtual Appliance
+2201,WS08R2,452,Windows|Windows Server,355,R
+2204,Windows 2008 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
+2213,w2k8r2sp1,452,Windows|Windows Server,355,R
+2217,Win 2003,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
+2222,Windows 2012 R2 Standard,452,Windows|Windows Server,580,Windows|*
+2226,Windows 2008 R2 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
+2228,Windows 2003 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
+2230,Windows 2012 Storage R2,452,Windows|Windows Server,580,Windows|*
+2231,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 4096 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
+2235,MICROSOFT WINDOWS NT 2003 ENT TPM,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
+2237,Windows 2016 Standard10.0.14393,452,Windows|Windows Server,580,Windows|*
+2240,MICROSOFT WINDOWS 2003,452,Windows|Windows Server,580,Windows|*
+2242,Windows 2012 Standard R2,452,Windows|Windows Server,580,Windows|*
+2246,Win Server 2008 R2,452,Windows|Windows Server,355,R
+2248,MICROSOFT WINDOWS STD 2012 TPM,452,Windows|Windows Server,580,Windows|*
+2249,Windows 2003 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
+2250,Windows 2008 Enterprise R2,452,Windows|Windows Server,580,Windows|*
+2251,Windows 2008,452,Windows|Windows Server,580,Windows|*
+2252,Microsoft Microsoft Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
+2257,Win Server 2012,452,Windows|Windows Server,569,VMware Server
+2258,Windows 2016 Standard,452,Windows|Windows Server,580,Windows|*
+2264,Windows 2008 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
+2267,Windows 2003 Standard 5.2.3790 Service Pack 2,452,Windows|Windows Server,580,Windows|*
+2268,Windows 2012 Standard6.2.9200,452,Windows|Windows Server,580,Windows|*
+2269,MICROSOFT WINDOWS 2016 TPM,452,Windows|Windows Server,580,Windows|*
+2272,Windows 2003 Enterprise,452,Windows|Windows Server,580,Windows|*
+2275,Windows 2008 R2 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
+2277,Windows 2012 R2 Standard6.3.9600,452,Windows|Windows Server,580,Windows|*
+2286,Windows 2008 Standard R2,452,Windows|Windows Server,580,Windows|*
+2287,MicrosoftWindows Server 2008 R2 (64-bit),452,Windows|Windows Server,443,OS/2
+2288,windows6.3,452,Windows|Windows Server,580,Windows|*
+2290,Windows 2016 64 Bit,452,Windows|Windows Server,580,Windows|*
+2296,Windows 2008 Enterprise6.0.6003,452,Windows|Windows Server,580,Windows|*
+2301,Win 2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
+2302,Win2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
+2303,Win2012R2,452,Windows|Windows Server,355,R
+2305,win2008,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
+2306,Windows 2003 Standard x64,452,Windows|Windows Server,580,Windows|*
+2315,WINDOWS 2016 STANDARD EDITION,452,Windows|Windows Server,580,Windows|*
+2325,WinSCP.net - WinSCP 5.11,243,WinSCP,178,SAP SQL Anywhere
+2332,Zerto Vritual Appliance,249,Zerto Virtual Replication,559,Virtual Appliance
+2333,Oracle RTD,289,Oracle Real-Time Decisions (RTD),134,Oracle Database
+2338,OMNIbus,251,Tivoli Netcool/OMNIbus,582,C#|*
+2347,ALM,511,Application Lifecycle Management (ALM),421,DART
+2349,BMS,513,Batch Management Software (BMS),442,OpenVMS
+2354,COM,516,Compopent Object Model (COM),661,COM+
+2357,CORBA Interface Definition Language,518,CORBA Interface Definition Language (CORBA IDL),517,Common Object Request Broker Architecture (CORBA)
+2359,Data Control Language,519,Data Control Language (DCL),329,IBM i Control Language (CL)
+2361,Database,520,Database (DB),43,DB2
+2362,DB,520,Database (DB),43,DB2
+2365,Electronic Data Interchange,521,Electronic Data Interchange (EDI),104,Microsoft Exchange Server
+2369,JDOM,523,Java-based Document Object Model for XML (JDOM),84,IMS DB
+2381,Simple Object Access Protocol,531,Simple Object Access Protocol (SOAP),547,Internet Message Access Protocol (IMAP)
+2383,SQL,572,Structured Query Language (SQL),581,MS SQL Server|*
+2386,DPE,538,Device Provisioning Engines (DPE),661,COM+
+2388,ESB,540,Enterprise Service Bus(ESB),370,Visual Basic
+2395,MES,553,Manufacturing Execution System (MES),623,Amazon S3
+2401,Z/Virtual System Environment,591,z/VSE,441,MVS|z/OS
+2403,DOS/VSE,591,z/VSE,597,DOS/360
+2404,Microsoft Disk Operating System,593,MS-DOS,443,OS/2
+2407,VME/B,595,VME,368,VB.NET
+2408,Virtual Machine Environment,595,VME,111,Microsoft Visual Studio
+2409,VME 2900,595,VME,107,Microsoft Internet Explorer
+2410,OpenVME,595,VME,442,OpenVMS
+2411,Disk Operating System/360,597,DOS/360,443,OS/2
+2413,Transaction Processing Facility,598,z/TPF,572,Structured Query Language (SQL)
+2419,NPL,653,Natural Programming Language,342,Niakwa Programming Language (NPL)
+2426,IDMS/DB Data Manipulation Language,668,IDMS DML,312,Data Language Interface (DL/I)
+2433,Basic Mapping Supprt,689,BMS Map,21,Business Intelligence and Reporting Tools (BIRT)
+2434,DB/400,690,DB400,43,DB2
+2435,IBM ISAM,693,ISAM,73,IBM Operational Decision Manager (ODM)
diff --git a/zero_shot/t5.py b/zero_shot/flan-t5.py
similarity index 72%
rename from zero_shot/t5.py
rename to zero_shot/flan-t5.py
index 89af7cb..bda2c64 100644
--- a/zero_shot/t5.py
+++ b/zero_shot/flan-t5.py
@@ -32,21 +32,25 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 
 def generate_acronym(text):
     # Define prompt
-    prompt = f"Answer concisely: make a possible acronym from the following: '{text}'"
+    # prompt = f"Imagine you are a diverse database. Given the following: '{text}', please suggest to me 5 possible variations. Give 5."
+    prompt = f"Give me a list of 10 historical product names related to: '{text}'. Format the output in a list, like this 1. Item, 2. Item, 3. ..."
 
     # Generate acronym
     inputs = tokenizer(prompt, return_tensors="pt")
     inputs = inputs.to("cuda")
     outputs = model.generate(
         inputs["input_ids"],
-        max_length=100,
-        no_repeat_ngram_size=3)
+        max_length=200,
+        do_sample=True,
+        top_k=50,
+        temperature=0.8)
+        # no_repeat_ngram_size=3)
 
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # %%
 # Example usage
 # text = "Advanced Data Analytics Platform"
-text = "red hat enterprise linux"
+text = "windows desktop"
 acronym = generate_acronym(text)
-print(f"Acronym: {acronym}")
+print(f"Generation: {acronym}")
 
 # %%
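
---

Note on reading zero_shot/error.csv: the rows appear to follow the column
order mention_id, mention, true_id, true_label, pred_id, pred_label (inferred
from the rows above; the patch itself does not document the layout). A minimal
sketch for tallying the most frequent confusion pairs under that assumption:

    # Sketch: count (true_label, pred_label) confusion pairs in error.csv.
    # Column order is an assumption inferred from the rows in this patch.
    import csv
    from collections import Counter

    pairs = Counter()
    with open("zero_shot/error.csv", newline="") as f:
        for row in csv.reader(f):
            if len(row) < 6:
                continue  # skip any malformed rows
            _, _, _, true_label, _, pred_label = row[:6]
            pairs[(true_label, pred_label)] += 1

    # Print the ten most common gold -> predicted confusions
    for (true_label, pred_label), n in pairs.most_common(10):
        print(f"{n:4d}  {true_label} -> {pred_label}")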