added more augmentations to finally beat SOTA
- class_bert_augmentation is now the reference training code
This commit is contained in:
parent
e90bc69ea9
commit
5312cfa06f
@@ -0,0 +1,41 @@
# %%
import random
import string


def corrupt_word(word):
    """Corrupt a single word using random corruption techniques."""
    if len(word) <= 1:  # Skip corruption for single-character words
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "delete":
        # Randomly delete a character
        idx = random.randint(0, len(word) - 1)
        word = word[:idx] + word[idx + 1:]

    elif corruption_type == "swap":
        # Swap two adjacent characters
        if len(word) > 1:
            idx = random.randint(0, len(word) - 2)
            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])

    return word


def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word in the string with a given probability."""
    words = sentence.split()
    corrupted_words = [
        corrupt_word(word) if random.random() < corruption_probability else word
        for word in words
    ]
    return " ".join(corrupted_words)


# Example usage
sentence = "This is a simple string for testing"
corrupted_sentence = corrupt_string(sentence, corruption_probability=0.1)
print("Original:", sentence)
print("Corrupted:", corrupted_sentence)

# %%
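A minimal reproducibility sketch (not part of this commit): because corrupt_word and corrupt_string draw from the global random module, seeding it makes an augmentation run repeatable.

# %%
# sketch: seed the global RNG so repeated corruption runs are identical
random.seed(42)
first = corrupt_string("websphere application server", corruption_probability=0.5)
random.seed(42)
second = corrupt_string("websphere application server", corruption_probability=0.5)
assert first == second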
@@ -1,95 +0,0 @@
# %%
import json
import pandas as pd

##########################################
# %%

# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Initialize an empty list to store the rows
rows = []

# %%
# Loop through all entities in the JSON
for entity in data["data"].items():
    entity_data = entity[1]
    entity_id = entity_data['entity_id']
    entity_name = entity_data['entity_name']
    entity_type_id = entity_data['entity_type_id']
    entity_type_name = entity_data['entity_type_name']

    # Add each mention and its entity_id to the rows list
    rows.append(
        {
            'id': entity_id,
            'name': entity_name,
            'type_id': entity_type_id,
            'type_name': entity_type_name
        })

# Create a DataFrame from the rows
df = pd.DataFrame(rows)

# %%
# df.to_csv('entity.csv', index=False)
df

# %%
df['type_name'].value_counts()
# %%
df['type_id'].value_counts()

# %%
name_list = df['name'].to_list()
# %%
name_list

# %%
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import numpy as np

# %%
# Define labels
labels = name_list

# Create a prefix-based distance matrix
def prefix_distance(label1, label2):
    prefix1 = label1.split()
    prefix2 = label2.split()
    # Find common prefix length
    common_prefix_length = len([w1 for w1, w2 in zip(prefix1, prefix2) if w1 == w2])
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)

# Create a pairwise distance matrix
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        distance_matrix[i, j] = prefix_distance(labels[i], labels[j])

# Perform hierarchical clustering
linkage_matrix = linkage(distance_matrix, method='average')

# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()

# %%
linkage_matrix
# %%
# Extract flat clusters with a distance threshold
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# Display clusters
for i, cluster_id in enumerate(clusters):
    print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")

# %%
@@ -3,53 +3,55 @@ import pandas as pd

# %%
# import training file
data_path = '../data_import/train.csv'
data_path = '../esAppMod_data_import/train.csv'
# data_path = '../esAppMod_data_import/parent_train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)


# import test file
data_path = '../data_import/test.csv'
data_path = '../esAppMod_data_import/test.csv'
# data_path = '../esAppMod_data_import/parent_test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

# import entity file
data_path = '../data_import/entity.csv'
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {}
for _, row in entity_df.iterrows():
    id2label[row['id']] = row['name']

# %%
train_df.sort_values(by=['entity_id']).to_markdown('out.md')

# %%
data_path = '../train/class_bert_process/prediction/exports/result.csv'
data_path = '../train/class_bert_augmentation/prediction/exports/result.csv'
prediction_df = pd.read_csv(data_path)

# %%
predicted_entity_list = []
for element in prediction_df['class_prediction']:
    predicted_entity_list.append(id2label[element])

prediction_df['predicted_name'] = predicted_entity_list
# %%
new_df = pd.concat((test_df, prediction_df), axis=1)

# %%
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
mismatch_df = new_df[mismatch_mask]

# %%
len(mismatch_df)

# %%
# print the top 10 offending classes
# mask1 = mismatch_df['entity_id'] != 434
# mask2 = mismatch_df['entity_id'] != 451
# mask3 = mismatch_df['entity_id'] != 452
# mask = mask1 & mask2 & mask3
# masked_df = mismatch_df[mask]
# print(masked_df['entity_id'].value_counts()[:10])
print(mismatch_df['entity_id'].value_counts()[:10])
masked_df = mismatch_df


# %%
# Convert the whole dataframe to a string and display
# print the mismatch_df
print(mismatch_df.sort_values(by=['entity_id']).to_markdown())
print(masked_df.sort_values(by=['entity_id']).to_markdown())

# %%
mismatch_df.to_csv('error.csv')
@@ -62,14 +64,9 @@ mismatch_df[select_mask]

# %%
# let us see the train mentions
select_value = 452
select_value = 130
select_mask = train_df['entity_id'] == select_value
train_df[select_mask]



# %%
mismatch_df[select_mask]['class_prediction'].to_list()

# %%
# %%
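A quick sanity-check sketch (not part of this commit): the mismatch count above implies an accuracy figure that should agree with the numbers in the result logs.

# %%
# sketch: accuracy implied by the mismatch count
accuracy = 1 - len(mismatch_df) / len(new_df)
print(f"implied accuracy: {accuracy:.5f}")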
@@ -0,0 +1,62 @@
# %%
import pandas as pd
import re

# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)


# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

# import entity file
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {}
for _, row in entity_df.iterrows():
    id2label[row['id']] = row['name']


# %%
train_df
# %%

def extract_acronym_mapping(names):
    mapping = {}
    for name in names:
        # Find acronym in parentheses
        match = re.search(r"\((\w+)\)", name)
        if match:
            acronym = match.group(1)

            # Strip the parenthesized acronym to recover the core term
            core_term = re.sub(r"^([\w\s]+)\s*\(\w+\)$", r"\1", name).strip()

            # Add to dictionary
            mapping[acronym] = core_term
    return mapping

names = set(train_df['entity_name'].to_list())

# Extract mappings
acronym_mapping = extract_acronym_mapping(names)
print(acronym_mapping)
# %%
del acronym_mapping['E']  # too many false matches
acronym_mapping = {key.lower(): value.lower() for key, value in acronym_mapping.items()}

abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}


# %%
abbrev_to_term
# %%
term_to_abbrev

# %%
acronym_mapping
# %%
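A hypothetical usage sketch (not in this commit) of the regex maps built above. Note that short acronyms such as 'db' match common substrings of English text, the same false-positive problem that forced the `del acronym_mapping['E']` line.

# %%
# sketch: expand abbreviations in a mention using the abbrev_to_term map
def expand_abbreviations(text):
    for pattern, replacement in abbrev_to_term.items():
        text = re.sub(pattern, replacement, text)
    return text

print(expand_abbreviations('html and sql'))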
@ -0,0 +1,5 @@
|
|||
out.md
|
||||
parent_test.csv
|
||||
parent_train.csv
|
||||
test_seq.csv
|
||||
train_seq.csv
|
|
@@ -0,0 +1,124 @@
# %%
import json
import pandas as pd

##########################################
# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# import entity file
# Keep only one row per unique value in 'entity_name'
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
id2label = {}
for _, row in unique_df.iterrows():
    id2label[row['entity_id']] = row['entity_name']

inverse_dict = {value: key for key, value in id2label.items()}
# %%
# Create a new dictionary with sorted keys
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}

# %%
sorted_dict

# %%
rule_set = {
    '.NET': [497,482,484,487,485,486,483],
    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
    'C++': [583,306],
    'CA': [290,22,23,24,25],
    'CSS': [307,377],
    'Cisco': [28,420,29],
    'Citrix': [563,565,31,292,291,564,32,30],
    'coldfusion': [311,37],
    'eclipse': [46,622,641,456],
    'xml': [596, 318],
    'xsl': [319,320],
    'HP': [59,293,60,61,58],
    'http': [505,543],
    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
    'IBM BigFix': [62,457],
    'IBM ILOG': [253,255,254,256,252],
    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
    'IBM WebSphere': [80,82,83,81],
    'IBM i': [424,329],
    'IDMS': [667,668],
    'IIS': [609,490,489,491],
    'JBoss': [268,492,493],
    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
    'KVS': [549,550,551],
    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
    'MVS': [577,440,441],
    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
    'Oracle WebLogic': [600,233],
    'Oracle Application Server': [610,494],
    'Oracle Database': [134,474,475,478],
    'Oracle Hyperion': [607,138,139],
    'Oracle WebCenter': [276,495],
    'Pascal': [599,346],
    'Perl': [585,348,417,349],
    'ProjectWise': [161,162],
    'Rational': [166,167],
    'SAP': [173,175,695,176,676,178,179],
    'SAP ERP': [174,476,477],
    'SAP NetWeaver': [279,496,177],
    'Sybase SQL Server': [190,479,480],
    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
    'TIBCO': [218,219],
    'TIBCO Business Works': [217,481],
    'Tivoli': [220,251],
    'Tortoise': [221,222],
    'Unix': [578,445,579,447,602,590,448,449],
    'VB': [368,369],
    'VMware': [568,569,229,230,231],
    'Visual Basic': [370,371,372],
    'WebSphere': [234,285,235,286,284,601,287],
    'Windows': [580,238,239,451,452],
    'z': [598,608,591]
}

# %%
# iterate through the whole training set
new_df = train_df.copy()
for idx, row in train_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            new_df.loc[idx, 'entity_id'] = rule_set[key][0]
# %%
len(set(new_df['entity_id'].to_list()))

# %%
new_df.to_csv('parent_train.csv')

# %%
# now do the same for the test data
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

new_df = test_df.copy()
for idx, row in test_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            new_df.loc[idx, 'entity_id'] = rule_set[key][0]

# %%
new_df

# %%
new_df.to_csv('parent_test.csv')
# %%
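An optional refactor sketch (assumption, not part of this commit): inverting the rule set once gives a constant-time lookup per row instead of scanning every rule list for every row.

# %%
# sketch: invert rule_set into a child-id -> parent-id map, then vectorize
id_to_parent = {child: ids[0] for ids in rule_set.values() for child in ids}
new_df = train_df.copy()
new_df['entity_id'] = train_df['entity_id'].map(lambda i: id_to_parent.get(i, i))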
@@ -0,0 +1,129 @@
# %%
import json
import pandas as pd

##########################################
# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# import entity file
# Keep only one row per unique value in 'entity_name'
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
id2label = {}
for _, row in unique_df.iterrows():
    id2label[row['entity_id']] = row['entity_name']

inverse_dict = {value: key for key, value in id2label.items()}
# %%
# Create a new dictionary with sorted keys
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}

# %%
sorted_dict

# %%
rule_set = {
    '.NET': [497,482,484,487,485,486,483],
    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
    'C++': [583,306],
    'CA': [290,22,23,24,25],
    'CSS': [307,377],
    'Cisco': [28,420,29],
    'Citrix': [563,565,31,292,291,564,32,30],
    'coldfusion': [311,37],
    'eclipse': [46,622,641,456],
    'xml': [596, 318],
    'xsl': [319,320],
    'HP': [59,293,60,61,58],
    'http': [505,543],
    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
    'IBM BigFix': [62,457],
    'IBM ILOG': [253,255,254,256,252],
    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
    'IBM WebSphere': [80,82,83,81],
    'IBM i': [424,329],
    'IDMS': [667,668],
    'IIS': [609,490,489,491],
    'JBoss': [268,492,493],
    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
    'KVS': [549,550,551],
    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
    'MVS': [577,440,441],
    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
    'Oracle WebLogic': [600,233],
    'Oracle Application Server': [610,494],
    'Oracle Database': [134,474,475,478],
    'Oracle Hyperion': [607,138,139],
    'Oracle WebCenter': [276,495],
    'Pascal': [599,346],
    'Perl': [585,348,417,349],
    'ProjectWise': [161,162],
    'Rational': [166,167],
    'SAP': [173,175,695,176,676,178,179],
    'SAP ERP': [174,476,477],
    'SAP NetWeaver': [279,496,177],
    'Sybase SQL Server': [190,479,480],
    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
    'TIBCO': [218,219],
    'TIBCO Business Works': [217,481],
    'Tivoli': [220,251],
    'Tortoise': [221,222],
    'Unix': [578,445,579,447,602,590,448,449],
    'VB': [368,369],
    'VMware': [568,569,229,230,231],
    'Visual Basic': [370,371,372],
    'WebSphere': [234,285,235,286,284,601,287],
    'Windows': [580,238,239,451,452],
    'z': [598,608,591]
}

# %%
# iterate through the whole training set
new_df = train_df.copy()
for idx, row in train_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            stem = rule_set[key][0]
            leaf = rule_set[key].index(id)
            new_df.loc[idx, 'entity_seq'] = f"{stem}_{leaf}"
# %%
len(set(new_df['entity_seq'].to_list()))

# %%
new_df.to_csv('train_seq.csv')

# %%
# now do the same for the test data
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

new_df = test_df.copy()
for idx, row in test_df.iterrows():
    # we iterate through each rule set, replacing any matching values in the
    # list with the first element of the list
    for key in rule_set.keys():
        id = row['entity_id']
        if (id in rule_set[key]):
            stem = rule_set[key][0]
            leaf = rule_set[key].index(id)
            new_df.loc[idx, 'entity_seq'] = f"{stem}_{leaf}"


# %%
new_df

# %%
new_df.to_csv('test_seq.csv')
# %%
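The same inversion idea applies to the sequence labels (sketch, not in the commit): precompute a child-id -> "stem_leaf" map once instead of the nested loop.

# %%
# sketch: child-id -> "stem_leaf" map equivalent to the loop above
id_to_seq = {
    child: f"{ids[0]}_{pos}"
    for ids in rule_set.values()
    for pos, child in enumerate(ids)
}
new_df['entity_seq'] = test_df['entity_id'].map(id_to_seq)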
@@ -1,6 +1,6 @@

*******************************************************************************
Accuracy: 0.77655
F1 Score: 0.79605
Precision: 0.85637
Recall: 0.77655
Accuracy: 0.80197
F1 Score: 0.81948
Precision: 0.88067
Recall: 0.80197
@@ -32,6 +32,8 @@ torch.set_float32_matmul_precision('high')
BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
@@ -52,19 +54,8 @@ def preprocess_text(text):
    # 1. Make all lowercase
    text = text.lower()

    # Remove any non alphanumeric character
    # text = re.sub(r'[^\w\s]', ' ', text)  # Retains only alphanumeric and spaces
    text = re.sub(r"[-;:]", " ", text)

    # Add space between digit followed by a letter
    text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)

    # Add space between letter followed by a digit
    text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)


    # Substitute digits with 'x'
    text = re.sub(r'\d+', 'x', text)
    # text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()
@@ -0,0 +1,562 @@
# %%

# from datasets import load_from_disk
import os

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import random

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict


torch.set_float32_matmul_precision('high')

# %%
def set_seed(seed):
    """
    Set the random seed for reproducibility.
    """
    random.seed(seed)  # Python random module
    np.random.seed(seed)  # NumPy random
    torch.manual_seed(seed)  # PyTorch CPU
    torch.cuda.manual_seed(seed)  # PyTorch GPU
    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Disable optimization for reproducibility

set_seed(42)

SHUFFLES=10

# %%

# import training file
data_path = '../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx

# %%
# introduce pre-processing functions
def preprocess_text(text):

    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#' (disabled)
    # text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def generate_random_shuffles(text, n):
    """
    Generate n strings with randomly shuffled words from the input text.

    Args:
        text (str): The input text.
        n (int): The number of random variations to generate.

    Returns:
        list: A list of strings with shuffled words.
    """
    words = text.split()  # Split the input into words
    shuffled_variations = []

    for _ in range(n):
        shuffled = words[:]  # Copy the word list to avoid in-place modification
        random.shuffle(shuffled)  # Randomly shuffle the words
        shuffled_variations.append(" ".join(shuffled))  # Join the words back into a string

    return shuffled_variations


# generate n more shuffled examples
def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Preprocess a string and add n random shuffles of it.

    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate for the string.

    Returns:
        list: A list containing the original string and its shuffled variants.
    """
    all_processed = []
    # add the original text
    all_processed.append(text)

    # Generate random shuffles
    shuffled_variations = generate_random_shuffles(text, n_shuffles)
    all_processed.extend(shuffled_variations)

    return all_processed

acronym_mapping = {
    'hpsa': 'hp server automation',
    'tam': 'tivoli access manager',
    'adf': 'application development facility',
    'html': 'hypertext markup language',
    'wff': 'microsoft web farm framework',
    'jsp': 'javaserver pages',
    'bw': 'business works',
    'ssrs': 'sql server reporting services',
    'cl': 'control language',
    'vba': 'visual basic for applications',
    'esapi': 'enterprise security api',
    'gwt': 'google web toolkit',
    'pki': 'perkin elmer informatics',
    'rtd': 'oracle realtime decisions',
    'jms': 'java message service',
    'db': 'database',
    'soa': 'service oriented architecture',
    'xsl': 'extensible stylesheet language',
    'com': 'component object model',
    'ldap': 'lightweight directory access protocol',
    'odm': 'ibm operational decision manager',
    'soql': 'salesforce object query language',
    'oms': 'order management system',
    'cfml': 'coldfusion markup language',
    'nas': 'netscape application server',
    'sql': 'structured query language',
    'bde': 'borland database engine',
    'imap': 'internet message access protocol',
    'uws': 'ultidev web server',
    'birt': 'business intelligence and reporting tools',
    'mdw': 'model driven workflow',
    'tws': 'tivoli workload scheduler',
    'jre': 'java runtime environment',
    'wcs': 'websphere commerce suite',
    'was': 'websphere application server',
    'ssis': 'sql server integration services',
    'xhtml': 'extensible hypertext markup language',
    'soap': 'simple object access protocol',
    'san': 'storage area network',
    'elk': 'elastic stack',
    'arr': 'application request routing',
    'xlst': 'extensible stylesheet language transformations',
    'sccm': 'microsoft endpoint configuration manager',
    'ejb': 'enterprise java beans',
    'css': 'cascading style sheets',
    'hpoo': 'hp operations orchestration',
    'xml': 'extensible markup language',
    'esb': 'enterprise service bus',
    'edi': 'electronic data interchange',
    'imsva': 'interscan messaging security virtual appliance',
    'wtx': 'ibm websphere transformation extender',
    'cgi': 'common gateway interface',
    'bal': 'ibm basic assembly language',
    'issow': 'integrated safe system of work',
    'dcl': 'data control language',
    'jdom': 'java document object model',
    'fim': 'microsoft forefront identity manager',
    'npl': 'niakwa programming language',
    'wf': 'windows workflow foundation',
    'lm': 'etap license manager',
    'wts': 'windows terminal server',
    'asp': 'active server pages',
    'jil': 'job information language',
    'mvc': 'model view controller',
    'rmi': 'remote method invocation',
    'ad': 'active directory',
    'owb': 'oracle warehouse builder',
    'rest': 'representational state transfer',
    'jdk': 'java development kit',
    'ids': 'integrated data store',
    'bms': 'batch management software',
    'vsx': 'vmware solution exchange',
    'ssas': 'sql server analysis services',
    'atl': 'atlas transformation language',
    'ice': 'infobright community edition',
    'esql': 'extended structured query language',
    'corba': 'common object request broker architecture',
    'dpe': 'device provisioning engines',
    'rac': 'oracle real application clusters',
    'iemt': 'iis easy migration tool',
    'mes': 'manufacturing execution system',
    'odbc': 'open database connectivity',
    'lms': 'lan management solution',
    'wcf': 'windows communication foundation',
    'nes': 'netscape enterprise server',
    'jsf': 'javaserver faces',
    'alm': 'application lifecycle management',
    'hlasm': 'high level assembler',
    'cmod': 'content manager ondemand'}

external_source = {
    'vb.net': 'visual basic dot net',
    'jes': 'job entry subsystem',
    'svn': 'subversion',
    'vcs': 'version control system',
    'lims': 'laboratory information management system',
    'ide': 'integrated development environment',
    'sdk': 'software development kit',
    'mq': 'message queue',
    'ims': 'information management system',
    'isa': 'internet security and acceleration',
    'vs': 'visual studio',
    'esr': 'extended support release',
    'ff': 'firefox',
    'vb': 'visual basic',
    'rhel': 'red hat enterprise linux',
    'iis': 'internet information server',
    'api': 'application programming interface',
    'se': 'standard edition',
    r'\.net': 'dot net',
    'c#': 'c sharp'
}


# synonyms = {
#     'windows server': 'windows nt',
#     'windows 7': 'windows desktop',
#     'windows 8': 'windows desktop',
#     'windows 10': 'windows desktop'
# }


# add more information
acronym_mapping.update(external_source)


# use raw strings so \b is a regex word boundary, not a backspace character
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}

def replace_terms_with_abbreviations(text):
    for input, replacement in term_to_abbrev.items():
        text = re.sub(input, replacement, text)
    return text

def replace_abbreviations_with_terms(text):
    for input, replacement in abbrev_to_term.items():
        text = re.sub(input, replacement, text)
    return text

######################################

# augmentation by text corruption

def corrupt_word(word):
    """Corrupt a single word using random corruption techniques."""
    if len(word) <= 1:  # Skip corruption for single-character words
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "delete":
        # Randomly delete a character
        idx = random.randint(0, len(word) - 1)
        word = word[:idx] + word[idx + 1:]

    elif corruption_type == "swap":
        # Swap two adjacent characters
        if len(word) > 1:
            idx = random.randint(0, len(word) - 2)
            word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])

    return word

def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word in the string with a given probability."""
    words = sentence.split()
    corrupted_words = [
        corrupt_word(word) if random.random() < corruption_probability else word
        for word in words
    ]
    return " ".join(corrupted_words)




# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
label_flag_list = []

def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        # produce shuffling
        index = row['entity_id']
        parent_desc = row['mention']
        parent_desc = preprocess_text(parent_desc)

        # Split the string into words
        words = parent_desc.split()

        # Count the number of words
        word_count = len(words)

        # short sequences are rare, and we must compensate by including more examples
        # mutation of other longer sequences might drown out rare short sequences
        if word_count < 3:
            for _ in range(10):
                element = {
                    'text': parent_desc,
                    'label': label2id[index],
                }
                output_list.append(element)


        # check if label is in label_flag_list
        if index not in label_flag_list:

            entity_name = row['entity_name']
            # add the "entity_name" label as a mention
            element = {
                'text': entity_name,
                'label': label2id[index],
            }
            output_list.append(element)

            # remove all non-alphanumerics
            desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces
            if (desc != parent_desc):
                element = {
                    'text': desc,
                    'label': label2id[index],  # ensure labels start from 0
                }
                output_list.append(element)


            # add shuffles of the original entity name
            no_of_shuffles = SHUFFLES
            processed_descs = shuffle_text(entity_name, n_shuffles=no_of_shuffles)
            for desc in processed_descs:
                if (desc != parent_desc):
                    element = {
                        'text': desc,
                        'label': label2id[index],  # ensure labels start from 0
                    }
                    output_list.append(element)

            label_flag_list.append(index)



        # add shuffled strings
        processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
        for desc in processed_descs:
            if (desc != parent_desc):
                element = {
                    'text': desc,
                    'label': label2id[index],  # ensure labels start from 0
                }
                output_list.append(element)

        # corrupt string
        desc = corrupt_string(parent_desc, corruption_probability=0.1)
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)


        # augmentation
        # remove all non-alphanumerics
        desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)


        # # augmentation
        # # perform abbrev_to_term
        # temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces
        # desc = replace_terms_with_abbreviations(temp_desc)
        # if (desc != temp_desc):
        #     element = {
        #         'text': desc,
        #         'label': label2id[index],  # ensure labels start from 0
        #     }
        #     output_list.append(element)

        # augmentation
        # expand abbreviations into their full terms
        desc = replace_abbreviations_with_terms(parent_desc)
        if (desc != parent_desc):
            element = {
                'text': desc,
                'label': label2id[index],  # ensure labels start from 0
            }
            output_list.append(element)


    return output_list


def create_dataset():
    # train
    data_path = '../../esAppMod_data_import/train.csv'
    train_df = pd.read_csv(data_path, skipinitialspace=True)


    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
    })
    return combined_data


# %%

def train():

    save_path = 'checkpoint'
    split_datasets = create_dataset()

    # prepare tokenizer

    model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<DESC>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")


    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # create id2label and label2id


    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)


    # %%
    # Trainer

    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )


    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
train()


# %%
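A rough volume check (sketch, not part of the commit): process_df_to_dict multiplies each mention into shuffles, corruptions, and abbreviation expansions, so the blow-up factor is worth knowing before training. The function mutates the module-level label_flag_list, so reset that list before measuring.

# %%
# sketch: measure augmentation blow-up (reset the flag list first)
label_flag_list.clear()
examples = process_df_to_dict(train_df)
print(f"{len(train_df)} mentions -> {len(examples)} training examples")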
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,11 @@

*******************************************************************************
Accuracy: 0.71956
F1 Score: 0.74142
Precision: 0.81529
Recall: 0.71956
********************************************************************************
Accuracy: 0.71710
F1 Score: 0.74095
Precision: 0.82181
Recall: 0.71710
@@ -0,0 +1,6 @@

*******************************************************************************
Accuracy: 0.81591
F1 Score: 0.82162
Precision: 0.85519
Recall: 0.81591
@@ -0,0 +1,6 @@

*******************************************************************************
Accuracy: 0.59943
F1 Score: 0.60266
Precision: 0.66956
Recall: 0.59943
@@ -0,0 +1,265 @@
# %%

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from tqdm import tqdm

torch.set_float32_matmul_precision('high')


BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx


# introduce pre-processing functions
def preprocess_text(text):
    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#'
    text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text




# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        desc = preprocess_text(desc)
        index = row['entity_id']
        element = {
            'text': desc,
            'label': label2id[index],  # ensure labels start from 0
        }
        output_list.append(element)

    return output_list


def create_dataset():
    # test
    # data_path = '../../../esAppMod_data_import/test.csv'
    data_path = '../../../esAppMod_data_import/parent_test.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)


    # combined_data = DatasetDict({
    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
    # })
    return Dataset.from_list(process_df_to_dict(test_df))



# %%

def test():

    test_dataset = create_dataset()

    # prepare tokenizer

    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint_part1-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%

    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )


    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []


    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])


        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]


    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_1.txt", "a") as f:

        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result
    label_list = [id2label[id] for id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # save the prediction output here
    df.to_csv("exports/result_1.csv", index=False)



# %%
# reset file before writing to it
with open("output_1.txt", "w") as f:
    print('', file=f)

test()
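An optional metrics sketch (assumption, not in this commit): sklearn's classification_report bundles the same weighted averages with a per-class breakdown, which pairs well with the per-class error analysis notebook above.

# %%
# sketch: per-class breakdown alongside the weighted averages
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, zero_division=0))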
@@ -0,0 +1,265 @@
# %%

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import re
import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from tqdm import tqdm

torch.set_float32_matmul_precision('high')


BATCH_SIZE = 256

# %%
# construct the target id list
# data_path = '../../../esAppMod_data_import/train.csv'
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx


# introduce pre-processing functions
def preprocess_text(text):
    # 1. Make all lowercase
    text = text.lower()

    # Substitute digits with '#'
    text = re.sub(r'\d+', '#', text)

    # standardize spacing
    text = re.sub(r'\s+', ' ', text).strip()

    return text




# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        desc = preprocess_text(desc)
        index = row['entity_id']
        element = {
            'text': desc,
            'label': label2id[index],  # ensure labels start from 0
        }
        output_list.append(element)

    return output_list


def create_dataset():
    # test
    # data_path = '../../../esAppMod_data_import/test.csv'
    data_path = '../../../esAppMod_data_import/test.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)


    # combined_data = DatasetDict({
    #     'train': Dataset.from_list(process_df_to_dict(train_df)),
    # })
    return Dataset.from_list(process_df_to_dict(test_df))



# %%

def test():

    test_dataset = create_dataset()

    # prepare tokenizer

    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%

    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )


    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []


    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])


        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]


    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_2.txt", "a") as f:

        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result
    label_list = [id2label[id] for id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # save the prediction output here
    df.to_csv("exports/result_2.csv", index=False)



# %%
# reset file before writing to it
with open("output_2.txt", "w") as f:
    print('', file=f)

test()
@@ -45,7 +45,7 @@ def set_seed(seed):

set_seed(42)

SHUFFLES=2
SHUFFLES=5

# %%

@@ -56,37 +56,6 @@ train_df = pd.read_csv(data_path, skipinitialspace=True)
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))

def compute_normalized_class_weights(class_counts, max_resamples=SHUFFLES):
    """
    Compute normalized class weights inversely proportional to class counts.
    The weights are normalized so that they sum to 1.

    Args:
        class_counts (array-like): An array or list where each element represents the count of samples for a class.

    Returns:
        numpy.ndarray: A normalized array of weights for each class.
    """
    class_counts = np.array(class_counts)
    total_samples = np.sum(class_counts)
    class_weights = total_samples / class_counts
    # so that highest weight is 1
    normalized_weights = class_weights / np.max(class_weights)
    # Scale weights such that the highest weight corresponds to `max_resamples`
    resample_counts = normalized_weights * max_resamples
    # Round resamples to nearest integer
    resample_counts = np.round(resample_counts).astype(int)
    return resample_counts

# %%
id_counts = train_df['entity_id'].value_counts()
id_weights = compute_normalized_class_weights(id_counts, max_resamples=SHUFFLES)
id_index = id_counts.index
label2weight = {}
for idx, label in enumerate(id_index):
    label2weight[label] = id_weights[idx]


# %%
id2label = {}
label2id = {}
@ -101,20 +70,8 @@ def preprocess_text(text):
|
|||
# 1. Make all uppercase
|
||||
text = text.lower()
|
||||
|
||||
# Remove any non alphanumeric character
|
||||
# text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces
|
||||
# replace dashes
|
||||
text = re.sub(r"[-;:]", " ", text)
|
||||
|
||||
# Add space between digit followed by a letter
|
||||
text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
|
||||
|
||||
# Add space between letter followed by a digit
|
||||
text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
|
||||
|
||||
|
||||
# Substitute digits with 'x'
|
||||
text = re.sub(r'\d+', 'x', text)
|
||||
text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
@ -165,35 +122,143 @@ def shuffle_text(text, n_shuffles=SHUFFLES):
|
|||
|
||||
return all_processed
|
||||
|
||||
term_to_abbrev = {
|
||||
r'job entry system': 'jes',
|
||||
r'subversion': 'svn',
|
||||
r'borland database engine': 'bde',
|
||||
r'business intelligence and reporting tools': 'birt',
|
||||
r'lan management solution': 'lms',
|
||||
r'laboratory information management system': 'lims',
|
||||
r'ibm database 2': 'db/2',
|
||||
r'integrated development environment': 'ide',
|
||||
r'software development kit': 'sdk',
|
||||
r'hp operations orchestration': 'hpoo',
|
||||
r'hp server automation': 'hpsa',
|
||||
r'internet information server': 'iis',
|
||||
r'release 2': 'r2',
|
||||
r'red hat enterprise linux': 'rhel',
|
||||
r'oracle enterprise linux': 'oel',
|
||||
r'websphere application server': 'was',
|
||||
r'application development facility': 'adf',
|
||||
r'server analysis services': 'ssas'
|
||||
acronym_mapping = {
|
||||
'hpsa': 'hp server automation',
|
||||
'tam': 'tivoli access manager',
|
||||
'adf': 'application development facility',
|
||||
'html': 'hypertext markup language',
|
||||
'wff': 'microsoft web farm framework',
|
||||
'jsp': 'javaserver pages',
|
||||
'bw': 'business works',
|
||||
'ssrs': 'sql server reporting services',
|
||||
'cl': 'control language',
|
||||
'vba': 'visual basic for applications',
|
||||
'esapi': 'enterprise security api',
|
||||
'gwt': 'google web toolkit',
|
||||
'pki': 'perkin elmer informatics',
|
||||
'rtd': 'oracle realtime decisions',
|
||||
'jms': 'java message service',
|
||||
'db': 'database',
|
||||
'soa': 'service oriented architecture',
|
||||
'xsl': 'extensible stylesheet language',
|
||||
'com': 'compopent object model',
|
||||
'ldap': 'lightweight directory access protocol',
|
||||
'odm': 'ibm operational decision manager',
|
||||
'soql': 'salesforce object query language',
|
||||
'oms': 'order management system',
|
||||
'cfml': 'coldfusion markup language',
|
||||
'nas': 'netscape application server',
|
||||
'sql': 'structured query language',
|
||||
'bde': 'borland database engine',
|
||||
'imap': 'internet message access protocol',
|
||||
'uws': 'ultidev web server',
|
||||
'birt': 'business intelligence and reporting tools',
|
||||
'mdw': 'model driven workflow',
|
||||
'tws': 'tivoli workload scheduler',
|
||||
'jre': 'java runtime environment',
|
||||
'wcs': 'websphere commerce suite',
|
||||
'was': 'websphere application server',
|
||||
'ssis': 'sql server integration services',
|
||||
'xhtml': 'extensible hypertext markup language',
|
||||
'soap': 'simple object access protocol',
|
||||
'san': 'storage area network',
|
||||
'elk': 'elastic stack',
|
||||
'arr': 'application request routing',
|
||||
'xlst': 'extensible stylesheet language transformations',
|
||||
'sccm': 'microsoft endpoint configuration manager',
|
||||
'ejb': 'enterprise java beans',
|
||||
'css': 'cascading style sheets',
|
||||
'hpoo': 'hp operations orchestration',
|
||||
'xml': 'extensible markup language',
|
||||
'esb': 'enterprise service bus',
|
||||
'edi': 'electronic data interchange',
|
||||
'imsva': 'interscan messaging security virtual appliance',
|
||||
'wtx': 'ibm websphere transformation extender',
|
||||
'cgi': 'common gateway interface',
|
||||
'bal': 'ibm basic assembly language',
|
||||
'issow': 'integrated safe system of work',
|
||||
'dcl': 'data control language',
|
||||
'jdom': 'java document object model',
|
||||
'fim': 'microsoft forefront identity manager',
|
||||
'npl': 'niakwa programming language',
|
||||
'wf': 'windows workflow foundation',
|
||||
'lm': 'etap license manager',
|
||||
'wts': 'windows terminal server',
|
||||
'asp': 'active server pages',
|
||||
'jil': 'job information language',
|
||||
'mvc': 'model view controller',
|
||||
'rmi': 'remote method invocation',
|
||||
'ad': 'active directory',
|
||||
'owb': 'oracle warehouse builder',
|
||||
'rest': 'representational state transfer',
|
||||
'jdk': 'java development kit',
|
||||
'ids': 'integrated data store',
|
||||
'bms': 'batch management software',
|
||||
'vsx': 'vmware solution exchange',
|
||||
'ssas': 'sql server analysis services',
|
||||
'atl': 'atlas transformation language',
|
||||
'ice': 'infobright community edition',
|
||||
'esql': 'extended structured query language',
|
||||
'corba': 'common object request broker architecture',
|
||||
'dpe': 'device provisioning engines',
|
||||
'rac': 'oracle real application clusters',
|
||||
'iemt': 'iis easy migration tool',
|
||||
'mes': 'manufacturing execution system',
|
||||
'odbc': 'open database connectivity',
|
||||
'lms': 'lan management solution',
|
||||
'wcf': 'windows communication foundation',
|
||||
'nes': 'netscape enterprise server',
|
||||
'jsf': 'javaserver faces',
|
||||
'alm': 'application lifecycle management',
|
||||
'hlasm': 'high level assembler',
|
||||
'cmod': 'content manager ondemand'}
|
||||
|
||||
external_source = {
|
||||
'vb.net': 'visual basic dot net',
|
||||
'jes': 'job entry subsystem',
|
||||
'svn': 'subversion',
|
||||
'vcs': 'version control system',
|
||||
'lims': 'laboratory information management system',
|
||||
'ide': 'integrated development environment',
|
||||
'sdk': 'software development kit',
|
||||
'mq': 'message queue',
|
||||
'ims': 'information management system',
|
||||
'isa': 'internet security and acceleration',
|
||||
'vs': 'visual studio',
|
||||
'esr': 'extended support release',
|
||||
'ff': 'firefox',
|
||||
'vb': 'visual basic',
|
||||
'rhel': 'red hat enterprise linux',
|
||||
'iis': 'internet information server',
|
||||
'api': 'application programming interface',
|
||||
'se': 'standard edition',
|
||||
'\.net': 'dot net',
|
||||
'c#': 'c sharp',
|
||||
'ms': 'microsoft'
|
||||
}
|
||||
|
||||
abbrev_to_term = {rf'\b{value}\b': key for key, value in term_to_abbrev.items()}
|
||||
|
||||
# synonyms = {
|
||||
# 'windows server': 'windows nt',
|
||||
# 'windows 7': 'windows desktop',
|
||||
# 'windows 8': 'windows desktop',
|
||||
# 'windows 10': 'windows desktop'
|
||||
# }
|
||||
|
||||
|
||||
# add more information
|
||||
acronym_mapping.update(external_source)
|
||||
|
||||
|
||||
abbrev_to_term = {f'\b{key}\b': value for key, value in acronym_mapping.items()}
|
||||
term_to_abbrev = {f'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||
|
||||
def replace_terms_with_abbreviations(text):
|
||||
for input, replacement in term_to_abbrev.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
||||
def replace_abbreivations_with_terms(text):
|
||||
def replace_abbreviations_with_terms(text):
|
||||
for input, replacement in abbrev_to_term.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
@ -218,8 +283,19 @@ def process_df_to_dict(df):
|
|||
# no_of_shuffles = label2weight[index] + 1
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# remove all non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
|
@ -227,24 +303,21 @@ def process_df_to_dict(df):
|
|||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# perform abbrev_to_term
|
||||
desc = replace_terms_with_abbreviations(parent_desc)
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
desc = replace_terms_with_abbreviations(temp_desc)
|
||||
if (desc != temp_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# augmentation
|
||||
# perform term to abbrev
|
||||
desc = replace_abbreivations_with_terms(parent_desc)
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
desc = replace_abbreviations_with_terms(parent_desc)
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
|
@ -257,7 +330,7 @@ def process_df_to_dict(df):
|
|||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
data_path = '../../esAppMod_data_import/parent_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
|
@ -271,13 +344,13 @@ def create_dataset():
|
|||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
save_path = f'checkpoint_part1'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'google-bert/bert-base-uncased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
|
@ -348,7 +421,6 @@ def train():
|
|||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
|
@ -0,0 +1,469 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
import glob
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all uppercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with 'x'
|
||||
text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Preprocess a list of texts and add n random shuffles for each string.
|
||||
|
||||
Args:
|
||||
texts (list): An input strings.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
acronym_mapping = {
|
||||
'hpsa': 'hp server automation',
|
||||
'tam': 'tivoli access manager',
|
||||
'adf': 'application development facility',
|
||||
'html': 'hypertext markup language',
|
||||
'wff': 'microsoft web farm framework',
|
||||
'jsp': 'javaserver pages',
|
||||
'bw': 'business works',
|
||||
'ssrs': 'sql server reporting services',
|
||||
'cl': 'control language',
|
||||
'vba': 'visual basic for applications',
|
||||
'esapi': 'enterprise security api',
|
||||
'gwt': 'google web toolkit',
|
||||
'pki': 'perkin elmer informatics',
|
||||
'rtd': 'oracle realtime decisions',
|
||||
'jms': 'java message service',
|
||||
'db': 'database',
|
||||
'soa': 'service oriented architecture',
|
||||
'xsl': 'extensible stylesheet language',
|
||||
'com': 'compopent object model',
|
||||
'ldap': 'lightweight directory access protocol',
|
||||
'odm': 'ibm operational decision manager',
|
||||
'soql': 'salesforce object query language',
|
||||
'oms': 'order management system',
|
||||
'cfml': 'coldfusion markup language',
|
||||
'nas': 'netscape application server',
|
||||
'sql': 'structured query language',
|
||||
'bde': 'borland database engine',
|
||||
'imap': 'internet message access protocol',
|
||||
'uws': 'ultidev web server',
|
||||
'birt': 'business intelligence and reporting tools',
|
||||
'mdw': 'model driven workflow',
|
||||
'tws': 'tivoli workload scheduler',
|
||||
'jre': 'java runtime environment',
|
||||
'wcs': 'websphere commerce suite',
|
||||
'was': 'websphere application server',
|
||||
'ssis': 'sql server integration services',
|
||||
'xhtml': 'extensible hypertext markup language',
|
||||
'soap': 'simple object access protocol',
|
||||
'san': 'storage area network',
|
||||
'elk': 'elastic stack',
|
||||
'arr': 'application request routing',
|
||||
'xlst': 'extensible stylesheet language transformations',
|
||||
'sccm': 'microsoft endpoint configuration manager',
|
||||
'ejb': 'enterprise java beans',
|
||||
'css': 'cascading style sheets',
|
||||
'hpoo': 'hp operations orchestration',
|
||||
'xml': 'extensible markup language',
|
||||
'esb': 'enterprise service bus',
|
||||
'edi': 'electronic data interchange',
|
||||
'imsva': 'interscan messaging security virtual appliance',
|
||||
'wtx': 'ibm websphere transformation extender',
|
||||
'cgi': 'common gateway interface',
|
||||
'bal': 'ibm basic assembly language',
|
||||
'issow': 'integrated safe system of work',
|
||||
'dcl': 'data control language',
|
||||
'jdom': 'java document object model',
|
||||
'fim': 'microsoft forefront identity manager',
|
||||
'npl': 'niakwa programming language',
|
||||
'wf': 'windows workflow foundation',
|
||||
'lm': 'etap license manager',
|
||||
'wts': 'windows terminal server',
|
||||
'asp': 'active server pages',
|
||||
'jil': 'job information language',
|
||||
'mvc': 'model view controller',
|
||||
'rmi': 'remote method invocation',
|
||||
'ad': 'active directory',
|
||||
'owb': 'oracle warehouse builder',
|
||||
'rest': 'representational state transfer',
|
||||
'jdk': 'java development kit',
|
||||
'ids': 'integrated data store',
|
||||
'bms': 'batch management software',
|
||||
'vsx': 'vmware solution exchange',
|
||||
'ssas': 'sql server analysis services',
|
||||
'atl': 'atlas transformation language',
|
||||
'ice': 'infobright community edition',
|
||||
'esql': 'extended structured query language',
|
||||
'corba': 'common object request broker architecture',
|
||||
'dpe': 'device provisioning engines',
|
||||
'rac': 'oracle real application clusters',
|
||||
'iemt': 'iis easy migration tool',
|
||||
'mes': 'manufacturing execution system',
|
||||
'odbc': 'open database connectivity',
|
||||
'lms': 'lan management solution',
|
||||
'wcf': 'windows communication foundation',
|
||||
'nes': 'netscape enterprise server',
|
||||
'jsf': 'javaserver faces',
|
||||
'alm': 'application lifecycle management',
|
||||
'hlasm': 'high level assembler',
|
||||
'cmod': 'content manager ondemand'}
|
||||
|
||||
external_source = {
|
||||
'vb.net': 'visual basic dot net',
|
||||
'jes': 'job entry subsystem',
|
||||
'svn': 'subversion',
|
||||
'vcs': 'version control system',
|
||||
'lims': 'laboratory information management system',
|
||||
'ide': 'integrated development environment',
|
||||
'sdk': 'software development kit',
|
||||
'mq': 'message queue',
|
||||
'ims': 'information management system',
|
||||
'isa': 'internet security and acceleration',
|
||||
'vs': 'visual studio',
|
||||
'esr': 'extended support release',
|
||||
'ff': 'firefox',
|
||||
'vb': 'visual basic',
|
||||
'rhel': 'red hat enterprise linux',
|
||||
'iis': 'internet information server',
|
||||
'api': 'application programming interface',
|
||||
'se': 'standard edition',
|
||||
'\.net': 'dot net',
|
||||
'c#': 'c sharp',
|
||||
'ms': 'microsoft'
|
||||
}
|
||||
|
||||
|
||||
# synonyms = {
|
||||
# 'windows server': 'windows nt',
|
||||
# 'windows 7': 'windows desktop',
|
||||
# 'windows 8': 'windows desktop',
|
||||
# 'windows 10': 'windows desktop'
|
||||
# }
|
||||
|
||||
|
||||
# add more information
|
||||
acronym_mapping.update(external_source)
|
||||
|
||||
|
||||
abbrev_to_term = {f'\b{key}\b': value for key, value in acronym_mapping.items()}
|
||||
term_to_abbrev = {f'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||
|
||||
def replace_terms_with_abbreviations(text):
|
||||
for input, replacement in term_to_abbrev.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
||||
def replace_abbreviations_with_terms(text):
|
||||
for input, replacement in abbrev_to_term.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
# ensure at least 1 shuffle
|
||||
# no_of_shuffles = label2weight[index] + 1
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# remove all non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# perform abbrev_to_term
|
||||
temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
desc = replace_terms_with_abbreviations(temp_desc)
|
||||
if (desc != temp_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# augmentation
|
||||
# perform term to abbrev
|
||||
desc = replace_abbreviations_with_terms(parent_desc)
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
pattern = 'checkpoint_part1-*'
|
||||
checkpoint_directory = 'checkpoint'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-uncased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<DESC>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=300,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -1,2 +0,0 @@
|
|||
|
||||
Accuracy for fold: 0.5846658466584665
|
|
@ -57,10 +57,10 @@ class Inference():
|
|||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
label = row['entity_seq']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
'output': f'{label}'
|
||||
}
|
||||
|
||||
output_list.append(element)
|
||||
|
@ -101,7 +101,7 @@ class Inference():
|
|||
|
||||
|
||||
def generate(self):
|
||||
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
MAX_GENERATE_LENGTH = 128
|
||||
|
||||
pred_generations = []
|
|
@ -0,0 +1,2 @@
|
|||
|
||||
Accuracy for fold: 0.5022550225502255
|
|
@ -11,7 +11,7 @@ BATCH_SIZE = 512
|
|||
def infer():
|
||||
print(f"Inference for data")
|
||||
# import test data
|
||||
data_path = '../../../data_import/test.csv'
|
||||
data_path = '../../../esAppMod_data_import/test_seq.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
|
@ -35,18 +35,19 @@ def infer():
|
|||
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
|
||||
# Convert the list to a Pandas DataFrame
|
||||
df_out = pd.DataFrame({
|
||||
'predictions': prediction_list
|
||||
'class_prediction': prediction_list
|
||||
})
|
||||
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
|
||||
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
|
||||
df = pd.concat([df, df_out], axis=1)
|
||||
# df = pd.concat([df, df_out], axis=1)
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
df_out.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
# here we want to evaluate mapping accuracy within the valid in mdm data only
|
||||
condition_correct = df['predictions'] == df['entity_name']
|
||||
pred_correct_proportion = sum(condition_correct)/len(df)
|
||||
# predictions = pd.to_numeric(df_out['class_prediction'], errors="coerce")
|
||||
condition_correct = df_out['class_prediction'] == df['entity_seq']
|
||||
pred_correct_proportion = sum(condition_correct)/len(df_out)
|
||||
|
||||
# write output to file output.txt
|
||||
with open("output.txt", "a") as f:
|
|
@ -33,10 +33,10 @@ def process_df_to_dict(df):
|
|||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
label = row['entity_seq']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
'output': f'{label}'
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
@ -45,7 +45,7 @@ def process_df_to_dict(df):
|
|||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = f"../../data_import/train.csv"
|
||||
data_path = f"../../esAppMod_data_import/train_seq.csv"
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
combined_data = DatasetDict({
|
||||
|
|
|
@ -3,8 +3,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
|||
|
||||
# %%
|
||||
# Load model and tokenizer
|
||||
# model_name = "bigscience/bloom-7b1" # Replace with your model
|
||||
model_name = "bigscience/bloomz-1b1"
|
||||
model_name = "bigscience/bloom-7b1" # Replace with your model
|
||||
# model_name = "bigscience/bloomz-1b1"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
# Automatically map model layers to available GPUs
|
||||
|
@ -26,13 +26,12 @@ outputs = model.generate(inputs["input_ids"], max_length=50)
|
|||
# Decode and print result
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
# %%
|
||||
# %%
|
||||
# Prepare input
|
||||
|
||||
def generate(text):
|
||||
|
||||
# Define prompt
|
||||
prompt = f"Answer Concisely: Give me a mapping between the acronym and descriptor in the format '(acronym: description): '{text}'"
|
||||
prompt = f"Give me past product names relating to: '{text}'"
|
||||
|
||||
# Generate acronym
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
|
@ -45,7 +44,7 @@ def generate(text):
|
|||
|
||||
# Example usage
|
||||
# text = "Advanced Data Analytics Platform"
|
||||
text = 'ColdFusion Markup Language (CFML)'
|
||||
text = 'windows server'
|
||||
acronym = generate(text)
|
||||
print(f"Acronym: {acronym}")
|
||||
print(f"Generation: {acronym}")
|
||||
# %%
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
# %%
|
||||
import requests
|
||||
|
||||
def get_related_terms(term, language="en", limit=10):
|
||||
url = f"http://api.conceptnet.io/c/{language}/{term}"
|
||||
response = requests.get(url).json()
|
||||
|
||||
# Extract related terms
|
||||
related_terms = []
|
||||
for edge in response.get("edges", []):
|
||||
related = edge.get("end", {}).get("label", None)
|
||||
if related and related.lower() != term.lower():
|
||||
related_terms.append(related)
|
||||
if len(related_terms) >= limit:
|
||||
break
|
||||
return related_terms
|
||||
|
||||
# Example
|
||||
related_terms = get_related_terms("windows_server")
|
||||
print("Related Terms:", related_terms)
|
||||
# %%
|
|
@ -0,0 +1,38 @@
|
|||
# %%
|
||||
from SPARQLWrapper import SPARQLWrapper, JSON
|
||||
|
||||
# %%
|
||||
sparql = SPARQLWrapper("https://dbpedia.org/sparql")
|
||||
sparql.setQuery("""
|
||||
SELECT ?altLabel WHERE {
|
||||
?item rdfs:label "Windows Server"@en.
|
||||
?item skos:altLabel ?altLabel.
|
||||
FILTER (LANG(?altLabel) = "en")
|
||||
}
|
||||
LIMIT 10
|
||||
""")
|
||||
sparql.setReturnFormat(JSON)
|
||||
results = sparql.query().convert()
|
||||
|
||||
for result in results["results"]["bindings"]:
|
||||
print(result["label"]["value"])
|
||||
# %%
|
||||
from SPARQLWrapper import SPARQLWrapper, JSON
|
||||
|
||||
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
|
||||
sparql.setQuery("""
|
||||
SELECT ?itemLabel ?altLabel WHERE {
|
||||
?item ?label "Windows Server"@en.
|
||||
OPTIONAL { ?item skos:altLabel ?altLabel. FILTER (LANG(?altLabel) = "en") }
|
||||
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
||||
}
|
||||
LIMIT 10
|
||||
""")
|
||||
sparql.setReturnFormat(JSON)
|
||||
results = sparql.query().convert()
|
||||
|
||||
for result in results["results"]["bindings"]:
|
||||
print("Label:", result["itemLabel"]["value"])
|
||||
if "altLabel" in result:
|
||||
print("Alias:", result["altLabel"]["value"])
|
||||
# %%
|
|
@ -0,0 +1,626 @@
|
|||
,mention,entity_id,entity_name,class_prediction,predicted_name
|
||||
0,DOT NET,497,.NET Framework,579,Unix|BSD|*
|
||||
2,Dot net - FW 4,497,.NET Framework,368,VB.NET
|
||||
3,.Net 4.7.1 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||
11,.NET,497,.NET Framework,579,Unix|BSD|*
|
||||
13,.Net 4.5.2 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||
40,APACHE LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||
41,LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||
42,Magik,484,.NET Framework|Magick.NET,533,YAML
|
||||
43,WCF,485,.NET Framework|Windows Communication Foundation (WCF),486,.NET Framework|Windows Workflow Foundation (WF)
|
||||
45,WWF,486,.NET Framework|Windows Workflow Foundation (WF),443,OS/2
|
||||
47,Ejes,1,(E)JES,101,Microsoft Dynamics AX
|
||||
48,(UNIRITA) A-AUTO 7.2.2,2,A-Auto Job Scheduling Software,299,AutoIt
|
||||
50,Active Directoy,498,Active Directory (AD),40,Connect Direct
|
||||
54,APSX,592,Active Server Pages (ASP)|*,609,IIS|*
|
||||
69,Andriod,418,Android,586,PHP|*
|
||||
71,Apache Active Queue,6,Apache ActiveMQ,259,Apache HTTP Server
|
||||
72,MQ Apache Active Queue,6,Apache ActiveMQ,81,IBM Websphere MQ
|
||||
75,cordova-android,501,Apache Cordova,418,Android
|
||||
77,Hive,8,Apache Hive,177,SAP NetWeaver Business Warehouse
|
||||
99,solr,11,Apache Solr,375,Apache Lucene
|
||||
135,ADF,13,Application Development Facility (ADF),130,Oracle ADF
|
||||
144,WLS 10.2,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
149,BEA WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
152,Weblogic 12c,600,Oracle WebLogic Server|*,582,C#|*
|
||||
160,WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||
168,Web Logic,600,Oracle WebLogic Server|*,97,MarkLogic DB
|
||||
174,BEA WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||
175,WLS 10,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
176,WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||
189,brain script,302,Brainscript,369,VBScript
|
||||
190,BRAINScript,302,Brainscript,367,TypeScript
|
||||
191,Business Intelligence and Reporting Tools,21,Business Intelligence and Reporting Tools (BIRT),133,Oracle Business Intelligence
|
||||
192,Actuate Report Server,21,Business Intelligence and Reporting Tools (BIRT),42,Crystal Reports
|
||||
194,CSHARP,582,C#|*,87,Informatica PowerCenter
|
||||
218,WinFrame,30,Citrix Virtual Apps and Desktops,443,OS/2
|
||||
221,METAFRAME,30,Citrix Virtual Apps and Desktops,406,JavaScript|AngularJS
|
||||
225,Presentation Server,30,Citrix Virtual Apps and Desktops,541,File Server
|
||||
226,NETSCALER-1.5,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||
227,NETSCALER-11.,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||
228,Citrix SD-WAN,563,Citrix ADC,30,Citrix Virtual Apps and Desktops
|
||||
229,NetScaler SD-WAN,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||
231,NetScaler ADC,563,Citrix ADC,272,Netscape Application Server (NAS)
|
||||
236,NetScaler SDX,291,Citrix ADC SDX,273,Netscape Enterprise Server (NES)
|
||||
240,Provisioning Services 7.15.8,32,Citrix Provisioning,538,Device Provisioning Engines (DPE)
|
||||
241,Citrix PVS,32,Citrix Provisioning,30,Citrix Virtual Apps and Desktops
|
||||
243,CLISTS,309,CLIST,329,IBM i Control Language (CL)
|
||||
253,CFML,311,ColdFusion Markup Language (CFML),316,eXtensible HyperText Markup Language (XHTML)
|
||||
254,ColdFusion Markup Language,311,ColdFusion Markup Language (CFML),37,Coldfusion
|
||||
255,Sterling Connect,40,Connect Direct,542,General Ledger
|
||||
264,Cormerstone,41,Cornerstone software,516,Compopent Object Model (COM)
|
||||
265,Cornerstone,41,Cornerstone software,370,Visual Basic
|
||||
279,DB2 UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||
282,DB2-UDB,43,DB2,365,TCL
|
||||
291,DB2/UDB,43,DB2,365,TCL
|
||||
292,IBM DB2 ENTERPRISE SERVER EDITION PVU OPTION 10.5,43,DB2,163,PVCS Version Manager
|
||||
300,IBM - IBM DB2 Advanced Enterprise Server Edition PVU Option 10.5,43,DB2,72,IBM Mobile Foundation
|
||||
301,UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||
302,IBM - IBM DB2 Enterprise Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||
306,IBM - IBM DB2 Workgroup Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||
313,EZTriev,314,Easytrieve,296,Intel Xeon Processor
|
||||
314,Eztrieve,314,Easytrieve,296,Intel Xeon Processor
|
||||
321,PrestoSoft - ExamDiff Application 1.6,49,ExamDiff,346,Pascal|Object Pascal
|
||||
322,PrestoSoft - ExamDiff Application,49,ExamDiff,346,Pascal|Object Pascal
|
||||
323,ExamDiff Application,49,ExamDiff,467,MS SQL Server|Log Reader Agent
|
||||
324,Expect Scripts,315,Expect,109,Microsoft MQ
|
||||
329,Microsoft - MSXML 4.0 SP2 4.2,318,Extensible Markup Language (XML)|MSXML,316,eXtensible HyperText Markup Language (XHTML)
|
||||
331,XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||
332,JAVA-XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||
335,ServerCA Access GatewayF5,50,F5 Secure Web Gateway Services,290,CA API Gateway
|
||||
347,HP C++,58,HP aC++ compiler,59,HP C/ANSI C compiler
|
||||
350,HP C++ 10.20,58,HP aC++ compiler,59,HP C/ANSI C compiler
|
||||
351,HPC 11.11,59,HP C/ANSI C compiler,58,HP aC++ compiler
|
||||
358,HFS,505,HTTP File Server,486,.NET Framework|Windows Workflow Foundation (WF)
|
||||
359,www.rejetto.com - HttpFileServer 2.3,505,HTTP File Server,55,Google Chrome
|
||||
360,HttpFileServer,505,HTTP File Server,522,Application Web Server
|
||||
367,IBM - IBM BigFix Platform Client Deploy Tool 9.5,457,IBM BigFix Platform|Client Deploy Tool,62,IBM BigFix Platform
|
||||
369,IBM BPM,64,IBM Business Process Manager,328,IBM High Level Assembler (HLASM)
|
||||
375,Data Power,294,IBM DataPower Gateway,295,IBM Power Systems
|
||||
376,IDG.7.5.2.19hp,294,IBM DataPower Gateway,449,Unix|HP-UX
|
||||
380,hlasm,328,IBM High Level Assembler (HLASM),438,macOS
|
||||
383,IHS,265,IBM HTTP Server,424,IBM i
|
||||
386,WebSphere and IHS,265,IBM HTTP Server,67,IBM InfoSphere DataStage
|
||||
387,WebSphere http,265,IBM HTTP Server,284,Websphere Application Server (WAS)
|
||||
391,IBM Websphere HTTP Server,265,IBM HTTP Server,285,WebSphere Liberty
|
||||
393,WebSphere IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
|
||||
394,WebSphere -IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
|
||||
397,OS400 V7R1,424,IBM i,443,OS/2
|
||||
398,OS400,424,IBM i,443,OS/2
|
||||
399,OS/400,424,IBM i,443,OS/2
|
||||
408,IIB,68,IBM Integration Bus,370,Visual Basic
|
||||
411,Extended Structured Query Language,458,IBM Integration Bus|Extended Structured Query Language (ESQL),572,Structured Query Language (SQL)
|
||||
415,IBM WorkLight,72,IBM Mobile Foundation,649,IBM Cloud
|
||||
417,ILOG JRules,73,IBM Operational Decision Manager (ODM),601,Websphere ILOG JRules BRMS
|
||||
420,Decision Center 8.0.1.0,73,IBM Operational Decision Manager (ODM),252,IBM ILOG Views
|
||||
423,AS400,295,IBM Power Systems,443,OS/2
|
||||
424,AS/400,295,IBM Power Systems,443,OS/2
|
||||
426,System i,295,IBM Power Systems,424,IBM i
|
||||
427,P-series,295,IBM Power Systems,81,IBM Websphere MQ
|
||||
428,IBM iSeries/AS400 system Model 520,295,IBM Power Systems,443,OS/2
|
||||
439,Tivoli Asset Discovery for Distributed,459,IBM Tivoli Asset Management|Asset Discovery for Distributed,606,IBM Tivoli Asset Management|*
|
||||
447,Database MS SQL Agent,77,IBM Tivoli Monitoring,469,MS SQL Server|SQL Server Database Engine
|
||||
448,Linux OS Agent,77,IBM Tivoli Monitoring,576,Linux|*
|
||||
449,Database DB2 Agent,77,IBM Tivoli Monitoring,520,Database (DB)
|
||||
452,Windows OS Agent,77,IBM Tivoli Monitoring,580,Windows|*
|
||||
454,IBM - IBM TSM FCM,604,IBM Tivoli Storage Manager|*,460,IBM Tivoli Storage Manager|TSM API
|
||||
459,Databases Data Protection for Microsoft SQL,604,IBM Tivoli Storage Manager|*,572,Structured Query Language (SQL)
|
||||
461,IBM - IBM Spectrum Protect Data Protection for Microsoft SQL Server 8.1,604,IBM Tivoli Storage Manager|*,469,MS SQL Server|SQL Server Database Engine
|
||||
462,IBM Spectrum Protect Data Protection,604,IBM Tivoli Storage Manager|*,312,Data Language Interface (DL/I)
|
||||
463,IBM - IBM Spectrum Protect API 7.1,460,IBM Tivoli Storage Manager|TSM API,294,IBM DataPower Gateway
|
||||
464,IBM - IBM Spectrum Protect Client,461,IBM Tivoli Storage Manager|TSM Client,294,IBM DataPower Gateway
|
||||
465,IBM - IBM Tivoli Storage Manager Client,461,IBM Tivoli Storage Manager|TSM Client,604,IBM Tivoli Storage Manager|*
|
||||
467,VSS Requestor configured 8.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
|
||||
468,VSS Requestor 7.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
|
||||
469,TWS-WS,79,IBM Tivoli Workload Scheduler (TWS),239,Windows Terminal Server (WTS)
|
||||
472,wbia 2.6,80,IBM WebSphere Business Integration Adaptor,627,XtraDB
|
||||
473,IBM WBIA 2.6.0.12,80,IBM WebSphere Business Integration Adaptor,424,IBM i
|
||||
475,MQ,81,IBM Websphere MQ,248,ZeroMQ
|
||||
476,MQ 9.1,81,IBM Websphere MQ,248,ZeroMQ
|
||||
479,MQ 7,81,IBM Websphere MQ,248,ZeroMQ
|
||||
480,MQ 6,81,IBM Websphere MQ,248,ZeroMQ
|
||||
481,MQ 9.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||
482,MQ 5.3,81,IBM Websphere MQ,248,ZeroMQ
|
||||
483,MQ 7.01,81,IBM Websphere MQ,248,ZeroMQ
|
||||
484,MQ 7.5,81,IBM Websphere MQ,248,ZeroMQ
|
||||
485,MQSeries 8.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||
488,WSMQ 8.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||
489,MQ 9.0.5,81,IBM Websphere MQ,248,ZeroMQ
|
||||
491,WTX,83,IBM WebSphere Transformation Extender (WTX),274,Nginx
|
||||
505,Microsoft Internet Inf,609,IIS|*,130,Oracle ADF
|
||||
508,Microsoft Internet Informat,609,IIS|*,330,IBM Informix-4GL
|
||||
550,Microsoft - IIS 6.0 Migration Tool 1,489,IIS|Easy Migration Tool (IEMT),609,IIS|*
|
||||
558,Infozip 6,85,Info-ZIP,677,Git
|
||||
559,Infozip,85,Info-ZIP,677,Git
|
||||
578,IMSVA 9.1,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
|
||||
580,IMSVA,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
|
||||
581,Java 1.8,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
582,Java 7,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
583,Java on Weblogic server,584,Java|*,600,Oracle WebLogic Server|*
|
||||
584,Java5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
585,Java 6,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
586,Java 6.0,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
587,Java 7 Update 25,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
589,Java (open source),584,Java|*,397,Java|Servlet
|
||||
590,Java 5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
591,Java 1.5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
593,Java 1.8.0_92,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
594,Java 1.6,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||
595,J2EE 6,584,Java|*,333,Java|Java Enterprise Edition (Java EE)
|
||||
596,Java (J2EE),584,Java|*,333,Java|Java Enterprise Edition (Java EE)
|
||||
598,JRE,506,Java Runtime Environment (JRE),84,IMS DB
|
||||
629,JEE,333,Java|Java Enterprise Edition (Java EE),1,(E)JES
|
||||
639,JSF,391,Java|JavaServer Faces (JSF),334,Java|Java Standard Edition (Java SE)
|
||||
643,JSP Scriptlets,336,Java|JavaServer Pages (JSP)|Scriptlets,335,Java|JavaServer Pages (JSP)
|
||||
644,Java Scriplet,336,Java|JavaServer Pages (JSP)|Scriptlets,88,Ingres
|
||||
645,Core 9.2.0.0,393,Java|JRuby Core,583,C++|*
|
||||
647,Java RMI,396,Java|Remote Method Invocation (RMI),584,Java|*
|
||||
650,Java Servlets,397,Java|Servlet,453,Linux|Fedora
|
||||
651,Java 6 Servlets,397,Java|Servlet,453,Linux|Fedora
|
||||
652,J2EE Servlets,397,Java|Servlet,443,OS/2
|
||||
653,Servlets,397,Java|Servlet,420,Cisco IOS
|
||||
654,Servlets v2.3,397,Java|Servlet,370,Visual Basic
|
||||
656,Spring BOOT,399,Java|Spring|Spring Boot,398,Java|Spring
|
||||
657,Springboot,399,Java|Spring|Spring Boot,398,Java|Spring
|
||||
661,javasript,589,JavaScript|*,335,Java|JavaServer Pages (JSP)
|
||||
662,JS,589,JavaScript|*,507,Node.js
|
||||
664,Java Script,589,JavaScript|*,584,Java|*
|
||||
671,Sencha 4.2.0,409,JavaScript|Ext JS,589,JavaScript|*
|
||||
674,jqueryui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
|
||||
675,jquery-ui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
|
||||
679,Scriptaculous,414,JavaScript|script.aculo.us,582,C#|*
|
||||
684,EAP,268,JBoss|*,174,SAP ERP
|
||||
685,JBOSS-EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||
686,JBoss Application Server 4,268,JBoss|*,493,JBoss|Wildfly
|
||||
687,JBoss Application Server 7,268,JBoss|*,493,JBoss|Wildfly
|
||||
688,JBoss Application Server 5,268,JBoss|*,493,JBoss|Wildfly
|
||||
689,JBoss Application Server,268,JBoss|*,493,JBoss|Wildfly
|
||||
690,Enterprise Application Platform,268,JBoss|*,670,EAServer
|
||||
692,JBOSS 5.1.2 EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||
693,server: Jboss,268,JBoss|*,493,JBoss|Wildfly
|
||||
694,JBOSS 6.3.2 EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||
695,JBoss EAP 4.3,268,JBoss|*,493,JBoss|Wildfly
|
||||
700,Job Information Language,339,Job Information Language (JIL),338,JCL
|
||||
703,JoinIT by Acayosoft,91,joinIT,4,Adobe Acrobat Reader
|
||||
704,Acayosoft JoinIT,91,joinIT,4,Adobe Acrobat Reader
|
||||
705,JoinIT by Acayosoft v 9.0.8,91,joinIT,4,Adobe Acrobat Reader
|
||||
706,LifeFlow Tool,92,LifeFlow,486,.NET Framework|Windows Workflow Foundation (WF)
|
||||
707,Linux 2.6.32-696.28.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
709,Linux 2.6.32-696.30.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
710,Linux 2.6.9,576,Linux|*,437,Linux|zLinux
|
||||
711,Linux 2.6.32-642.3.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
712,Linux - 2.6.18-371.1.2.el5,576,Linux|*,437,Linux|zLinux
|
||||
713,Linux 2.6.32-696.23.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||
749,Gaia Kernel version 2.7,428,Linux|Check Point,432,Linux|Oracle Linux
|
||||
752,Gaia Kernel version 2.6,428,Linux|Check Point,432,Linux|Oracle Linux
|
||||
766,OEL,432,Linux|Oracle Linux,449,Unix|HP-UX
|
||||
778,Oracle Enterprise Server 7.5,432,Linux|Oracle Linux,134,Oracle Database
|
||||
780,OEL6.7 - 3.8.13-68.3.4.el6uek.x86_64,432,Linux|Oracle Linux,449,Unix|HP-UX
|
||||
792,VMware Photon,433,Linux|Photon OS,569,VMware Server
|
||||
793,VMware Photon 1,433,Linux|Photon OS,569,VMware Server
|
||||
809,Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
818,Redhat - Redhat Linux 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
819,Linux RH6,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
865,Redhat - Redhat Linux 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
870,Redhat - RHEL 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
874,Red Hat Entreprise Linux 6.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
882,Redhat 6 64-Bit,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
893,RED HAT ADVANCED SERVER 5,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
910,redhat6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
912,Redhat - Redhat Linux 6.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
913,Linux RH,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
916,Redhat - Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
920,Linux RH7,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
926,Red Hat V6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
932,Linux RH5,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||
934,rehl5.9,434,Linux|Red Hat Enterprise Linux,43,DB2
|
||||
964,Red Hat 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
979,red hat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
991,Redhat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
996,RedHat 7.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
998,LINUX RED HAT 5 EL,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||
1003,SUSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1006,Linux SuSE12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1011,SUSE10,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1012,SUSE Linux 12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1017,SUSELinux Enterprise 11.x,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1023,SUSE Linux 11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1024,SUSE Linux 11 SP3,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1029,Linux SuSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1030,SUSE,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1038,SuseLinux,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||
1051,domino8.5,270,Lotus Domino,93,Lotus Notes
|
||||
1052,Domino 8.x,270,Lotus Domino,93,Lotus Notes
|
||||
1054,Lucee 5.2.6.60,271,Lucee,375,Apache Lucene
|
||||
1056,Darwin,438,macOS,117,Mozilla Firefox
|
||||
1061,Memcache,98,Memcached,18,BMC Control-M
|
||||
1062,ACCDB,99,Microsoft Access,525,Open Database Connectivity (ODBC)
|
||||
1070,ConfigMgr,102,Microsoft Endpoint Configuration Manager (SCCM),21,Business Intelligence and Reporting Tools (BIRT)
|
||||
1080,FIM SQL Development Server,105,Microsoft Forefront Identity Manager (FIM),572,Structured Query Language (SQL)
|
||||
1082,Microsoft - Internet Explor,107,Microsoft Internet Explorer,356,Rexx
|
||||
1084,Internet Explor,107,Microsoft Internet Explorer,356,Rexx
|
||||
1090,SCEP for Linux,110,Microsoft System Center Endpoint Protection,437,Linux|zLinux
|
||||
1094,SCEP for Mac,110,Microsoft System Center Endpoint Protection,438,macOS
|
||||
1101,msdeploy,112,Microsoft Web Deploy,56,Greenplum DB
|
||||
1106,WebPI,114,Microsoft Web Platform Installer,522,Application Web Server
|
||||
1109,Web PI,114,Microsoft Web Platform Installer,531,Simple Object Access Protocol (SOAP)
|
||||
1111,MDW Framework,115,Model Driven Workflow (MDW),406,JavaScript|AngularJS
|
||||
1115,Mango DB,116,MongoDB,43,DB2
|
||||
1117,MangoDB,116,MongoDB,43,DB2
|
||||
1125,O365,119,MS Office 365,424,IBM i
|
||||
1141,MICROSOFT SQL SERVER 2012 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1153,MICROSOFT SQL SERVER 2012 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1154,MS SQL Server 2008 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
|
||||
1156,MICROSOFT SQL SERVER 2008 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1167,MSSQL Database Server,581,MS SQL Server|*,122,MySQL
|
||||
1173,MSSQL2008,581,MS SQL Server|*,122,MySQL
|
||||
1192,Microsoft SQL Server Standard Edition,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1201,SQLServer,581,MS SQL Server|*,572,Structured Query Language (SQL)
|
||||
1226,MICROSOFT SQL SERVER 2012 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1233,MICROSOFT SQL SERVER 2005 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1234,SQLSVR2008,581,MS SQL Server|*,352,PL/SQL
|
||||
1235,MICROSOFT SQL SERVER 2008 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1239,MICROSOFT SQL SERVER 2008 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||
1244,MS SQL Server 2012 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
|
||||
1253,Microsoft - SQL Server Express LocalDB 2014,581,MS SQL Server|*,469,MS SQL Server|SQL Server Database Engine
|
||||
1256,MSSQL,581,MS SQL Server|*,122,MySQL
|
||||
1280,Microsoft - Microsoft SQL Server Analysis Services 2012 for Microsoft SQL Server 2012 Standard Edition 2012,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
|
||||
1281,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
|
||||
1283,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
1290,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
1293,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
1295,SQL Server Integration Services,470,MS SQL Server|SQL Server Integration Services (SSIS),473,MS SQL Server|SQL Server Reporting Services (SSRS)
1316,ZOS Base 1.12,441,MVS|z/OS,437,Linux|zLinux
1335,NAS,272,Netscape Application Server (NAS),443,OS/2
1337,NES,273,Netscape Enterprise Server (NES),443,OS/2
1349,Node.js 0.10 (Linux),507,Node.js,437,Linux|zLinux
1361,Node.js 4 (Linux),507,Node.js,437,Linux|zLinux
1371,Symas OpenLDAP,126,OpenLDAP,178,SAP SQL Anywhere
1377,OAM 12c,129,Oracle Access Management,303,C
1378,ADF 12c,130,Oracle ADF,343,Objective C
1381,OHS,610,Oracle Application Server|*,122,MySQL
1383,Oracle HTTP Server powered by Apache,610,Oracle Application Server|*,259,Apache HTTP Server
1384,j2eeoracleca,610,Oracle Application Server|*,453,Linux|Fedora
1385,Oracle HTTP,610,Oracle Application Server|*,134,Oracle Database
1389,9i AS server,610,Oracle Application Server|*,227,Virtual I/O Server
1391,Oracle Application R12.1.3,610,Oracle Application Server|*,134,Oracle Database
1394,Weblogic BI Publisher,132,Oracle BI Publisher,600,Oracle WebLogic Server|*
1396,OBI,133,Oracle Business Intelligence,343,Objective C
1397,OBIEE,133,Oracle Business Intelligence,343,Objective C
1398,OBI Reporting,133,Oracle Business Intelligence,343,Objective C
1401,Oracle 12.2 Client,134,Oracle Database,610,Oracle Application Server|*
1406,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0,134,Oracle Database,610,Oracle Application Server|*
1407,Oracle 11.2 (Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit) RAC,134,Oracle Database,610,Oracle Application Server|*
1415,Oracle 11 on AIX,134,Oracle Database,445,Unix|AIX
1416,Oracle Database 10g Enterprise Edition Release 10.1.0.4.0 - 64bit,134,Oracle Database,610,Oracle Application Server|*
1431,Oracle Database 10g Release 10.2.0.4.0 - 64bit Production,134,Oracle Database,298,Oracle Exadata
1432,Oarcle 11G,134,Oracle Database,218,TIBCO InConcert
1443,DB - Oracle inbuilt,134,Oracle Database,158,Powerbuilder
1460,Oracle Server,134,Oracle Database,610,Oracle Application Server|*
1475,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 - 64bit Production,134,Oracle Database,610,Oracle Application Server|*
1480,Oracle 12C on linux,134,Oracle Database,303,C
1484,Oracle9i Enterprise Edition Release 9.2.0.5.0,134,Oracle Database,610,Oracle Application Server|*
1486,Oracle 11g on linux,134,Oracle Database,432,Linux|Oracle Linux
1487,Oracle 11gEssbase,134,Oracle Database,298,Oracle Exadata
1490,JServer Release 9.2.0.5.0,474,Oracle Database|Jserver,335,Java|JavaServer Pages (JSP)
1492,Designer 6i,135,Oracle Designer,516,Compopent Object Model (COM)
1493,Enterprise Manager 12.2.1.1,136,Oracle Enterprise Manager,427,Linux|CentOS
1494,Enterprise Manager 12.2.1.2,136,Oracle Enterprise Manager,427,Linux|CentOS
1495,Enterprise Manager 11.1.1.7,136,Oracle Enterprise Manager,427,Linux|CentOS
1501,"Oracle, Nets",140,Oracle Net Services,273,Netscape Enterprise Server (NES)
1502,Oracle RAC,141,Oracle Real Application Clusters (RAC),134,Oracle Database
1504,ORPOS 13.3.3,142,Oracle Retail Point-of-Service,609,IIS|*
1505,ORPOS 13.3.5,142,Oracle Retail Point-of-Service,609,IIS|*
1506,ORPOS 13.3.4,142,Oracle Retail Point-of-Service,609,IIS|*
1509,OSB Servers,143,Oracle Service Bus,443,OS/2
1514,Oracle TT,147,Oracle TimesTen In-Memory Database,134,Oracle Database
1517,OWB 10g,148,Oracle Warehouse Builder (OWB),300,AWK
1524,Clascal,346,Pascal|Object Pascal,307,Cascading Style Sheets (CSS)
1526,Oracle-HR-9.2,151,PeopleSoft,134,Oracle Database
1529,ActiveState Tool Corp. - ActivePerl 5.12,348,Perl|ActivePerl,500,ActiveX
1530,ActiveState Tool Corp. - ActivePerl 5.8,348,Perl|ActivePerl,500,ActiveX
1531,ORAPERL,417,Perl|Oraperl,242,WinRAR
1532,REX,349,Perl|Rex,356,Rexx
1536,TCServer V6,277,Pivotal tc Server,365,TCL
1537,IBM PKWARE PKZip 2,155,PKZIP,387,Java|IBM SDK
1541,PLQSL,352,PL/SQL,351,PL/I
1542,Oracle - SQL,352,PL/SQL,581,MS SQL Server|*
1544,Oracle SQL,352,PL/SQL,134,Oracle Database
1545,PLSQL;,352,PL/SQL,351,PL/I
1547,Oracle PLSQL,352,PL/SQL,351,PL/I
1548,plsql,352,PL/SQL,351,PL/I
1551,Projectplace,156,Planview,21,Business Intelligence and Reporting Tools (BIRT)
1558,Power Builder,158,Powerbuilder,151,PeopleSoft
1560,Power Builder 6.5,158,Powerbuilder,27,Chef Automate
1565,ProjectWise Oracle Server,161,ProjectWise,162,ProjectWise Web Server
1576,RMQ,165,RabbitMQ,355,R
1579,Clearquest,167,Rational ClearQuest,455,Clarify|Clear Basic
1581,Remedy ARS,169,Remedy,322,Fortran
1584,RightFax client 10,171,RightFax,118,MQ Client
1585,SOQL,359,Salesforce Object Query Language (SOQL),621,ArangoDB
1587,SAP Business Objects,173,SAP BusinessObjects BI server,177,SAP NetWeaver Business Warehouse
1588,Business Objects 12,173,SAP BusinessObjects BI server,488,ActiveX|ADO
1590,SAP BI 4.2 Sp5,173,SAP BusinessObjects BI server,174,SAP ERP
1593,SAP HANA ON SUSEOracle 11g on Linux,175,SAP HANA DB,435,Linux|SUSE Linux Enterprise Server
1596,NetWeaver,279,SAP NetWeaver App Server,431,Linux|openSUSE
1605,SCSS,361,Sass,102,Microsoft Endpoint Configuration Manager (SCCM)
1606,Scalla,362,Scala,664,Forte
1609,Microsoft SPS 2010,603,SharePoint|*,577,MVS|*
1613,SQL Server SP2013 Database Server,603,SharePoint|*,581,MS SQL Server|*
1615,Siebel IP 2015,182,Siebel,583,C++|*
1616,Siebel 7.8.2.16,182,Siebel,43,DB2
1617,Siebel CRM,182,Siebel,583,C++|*
1619,Techsmith Corporation - SnagIt 8,184,SnagIt,183,SNA Manager
1620,Solid development server,185,solidDB,600,Oracle WebLogic Server|*
1622,Sixty-Five Software - SpaceMonger 1.4,187,SpaceMonger,296,Intel Xeon Processor
1623,SQLPlus,478,Oracle Database|SQL*Plus,572,Structured Query Language (SQL)
1625,SQLIO 1.0,189,SQLIO,178,SAP SQL Anywhere
1630,SunOne,281,Oracle iPlanet Web Server,448,Unix|BSD|SunOS
1637,SAP - Sybase Central 4.3,479,Sybase SQL Server|Sybase Central,190,Sybase SQL Server
1639,Sysncsort,191,Syncsort,178,SAP SQL Anywhere
1640,syncsort,191,Syncsort,98,Memcached
1641,Sysinternals LLC - AccessEnum 1 1,194,Sysinternal Tools|AccessEnum,124,Nexus Repository OSS
1642,Sysinternals LLC - ClockRes 2,195,Sysinternal Tools|ClockRes,374,Xbase++
1643,Sysinternals LLC - Coreinfo 3.21,196,Sysinternal Tools|Coreinfo,670,EAServer
1644,Sysinternals LLC - DiskExt 1.1,197,Sysinternal Tools|DiskExt,374,Xbase++
1645,Sysinternals LLC - DiskMon 2.01,198,Sysinternal Tools|DiskMon,670,EAServer
1647,Sysinternals LLC - Junction 1.6,200,Sysinternal Tools|Junction,374,Xbase++
1648,Sysinternals LLC - LDMDump 1.02,201,Sysinternal Tools|LDMDump,178,SAP SQL Anywhere
1649,Sysinternals LLC - LoadOrder 1,202,Sysinternal Tools|LoadOrder,374,Xbase++
1650,Sysinternals LLC - PipeList 1.01,203,Sysinternal Tools|PipeList,670,EAServer
1651,Sysinternals LLC - Process Explorer 16.5,204,Sysinternal Tools|Process Explorer,464,Microsoft Exchange Server|Veeam Explorer
1652,Sysinternals LLC - PsKill 1.15,205,Sysinternal Tools|PsKill,151,PeopleSoft
1653,Sysinternals LLC - PsPasswd 1.23,206,Sysinternal Tools|PsPasswd,231,VMware vCenter
1654,Sysinternals LLC - SDelete 1.61,207,Sysinternal Tools|SDelete,670,EAServer
1655,Sysinternals LLC - ShareEnum 1.6,208,Sysinternal Tools|ShareEnum,603,SharePoint|*
1656,Sysinternals LLC - Sync 2.2,209,Sysinternal Tools|Sync,374,Xbase++
1657,Sysinternals LLC - Sysinternals TCPView 3.5,210,Sysinternal Tools|TCPView,365,TCL
1658,Sysinternals LLC - VMMap 3.11,211,Sysinternal Tools|VMMap,176,SAP MaxDB
1659,Sysinternals LLC - Whois 1.11,212,Sysinternal Tools|Whois,178,SAP SQL Anywhere
1664,TERADATA QUERY SCHEDULER SERVER VERSION 15,216,Teradata QS Server,215,Teradata
1667,BusinessWorks,217,TIBCO Business Works (BW),111,Microsoft Visual Studio
1668,Tibco-IM,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
1669,Tibco Integration Manager,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
1674,TSQL,366,Transact-SQL,621,ArangoDB
1675,Trasact SQL,366,Transact-SQL,352,PL/SQL
1746,Solaris 11.2 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
1747,Solaris UNIX,448,Unix|BSD|SunOS,578,Unix|*
1748,Unix Servers (Solaris,448,Unix|BSD|SunOS,578,Unix|*
1749,Oracle Solaris 11.3 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
1753,Solaris 5.10 (Generic_150400-61),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1754,Solaris 5.10 (Generic_150400-62),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1756,Solaris 5.10 (Generic_150400-55),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
1760,Oracle Solaris,448,Unix|BSD|SunOS,134,Oracle Database
1762,Solaris 1 (SPARC),448,Unix|BSD|SunOS,375,Apache Lucene
1765,SunSolaris 10.0,448,Unix|BSD|SunOS,430,Linux|Junos OS
1771,Oracle Solaris 10,448,Unix|BSD|SunOS,134,Oracle Database
1800,VIO 2.2.0.10,227,Virtual I/O Server,159,Primavera P6
1801,VIOS,227,Virtual I/O Server,443,OS/2
1802,visibroker,228,Visibroker,420,Cisco IOS
1803,VB6,370,Visual Basic,368,VB.NET
1804,VB 6.0,370,Visual Basic,368,VB.NET
1805,visualbasic,370,Visual Basic,306,C++|Visual C++
1808,Visual Basic 6.0,370,Visual Basic,368,VB.NET
1811,VBA,371,Visual Basic for Applications (VBA),370,Visual Basic
1812,Access VB,371,Visual Basic for Applications (VBA),99,Microsoft Access
1813,vfoxpro,372,Visual FoxPro,117,Mozilla Firefox
1827,VMware Appliance,569,VMware Server,559,Virtual Appliance
1828,VSX,229,VMware Solution Exchange Marketplace (VSX),111,Microsoft Visual Studio
1830,VMware - VMware Tools 10.2,230,VMware Tools,569,VMware Server
1832,VXML,373,VoiceXML,316,eXtensible HyperText Markup Language (XHTML)
1833,Web Focus,232,WebFOCUS,321,FOCUS
1834,FOCEXEC,232,WebFOCUS,495,Oracle WebCenter Content Server|Idoc Script
1836,WLI 8,233,WebLogic Integration,442,OpenVMS
1842,IBM WEBSPHERE APPLICATION SERVER VERSION 6.1.0,284,Websphere Application Server (WAS),285,WebSphere Liberty
1848,"IBM WebSphere Application Server Network Deployment, 8.0.0.5",284,Websphere Application Server (WAS),285,WebSphere Liberty
1850,IBM WebSphere Application Server Network Deployment 7,284,Websphere Application Server (WAS),285,WebSphere Liberty
1858,IBM WebSphere 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1861,IBM - WebSphere Application Server - Base 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1865,Websphere AS (JVM),284,Websphere Application Server (WAS),285,WebSphere Liberty
1872,IBM WebSphere,284,Websphere Application Server (WAS),285,WebSphere Liberty
1875,IBM WebSphere Application Server 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
1877,IBM WebSphere Application,284,Websphere Application Server (WAS),285,WebSphere Liberty
1878,WAS 6.x,284,Websphere Application Server (WAS),521,Electronic Data Interchange (EDI)
1880,IBM OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
1882,Open Liberty,285,WebSphere Liberty,397,Java|Servlet
1883,IBM Open Liberty,285,WebSphere Liberty,62,IBM BigFix Platform
1887,WAS Liberty,285,WebSphere Liberty,397,Java|Servlet
1889,OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
1891,WMB 6.1,235,WebSphere Message Broker,486,.NET Framework|Windows Workflow Foundation (WF)
1892,WebSphere Message Broker v6.0,235,WebSphere Message Broker,285,WebSphere Liberty
1899,WebSphere Portal Extend Limited Use 6.1,286,WebSphere Portal Server,285,WebSphere Liberty
1901,Windchill 11.1,237,Windchill,17,Bluebeam|Bluebeam Q
1908,Window,580,Windows|*,637,Microsoft Azure
1914,Windows Terminal Server,239,Windows Terminal Server (WTS),452,Windows|Windows Server
1915,Windows 7 Standard,451,Windows|Windows Desktop,580,Windows|*
1916,WINDOWS 10 SERVER STANDARD EDITION X64,451,Windows|Windows Desktop,452,Windows|Windows Server
1917,Microsoft Windows 7 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1918,Microsoft Windows XP Professional (32-bit),451,Windows|Windows Desktop,580,Windows|*
1919,Windows 7 Professional x64,451,Windows|Windows Desktop,580,Windows|*
1920,Microsoft Microsoft Windows Entreprise,451,Windows|Windows Desktop,580,Windows|*
1921,Microsoft Windows 2000,451,Windows|Windows Desktop,580,Windows|*
1922,Microsoft Windows 10,451,Windows|Windows Desktop,580,Windows|*
1923,MS Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
1924,Microsoft Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
1925,Microsoft Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1926,Microsoft Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1927,Win Desktop,451,Windows|Windows Desktop,560,Webtop
1928,Windows 10 Pro,451,Windows|Windows Desktop,580,Windows|*
1929,Windows 10,451,Windows|Windows Desktop,580,Windows|*
1930,Windows 7 Ultimate,451,Windows|Windows Desktop,580,Windows|*
1931,Microsoft Windows 8 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1932,Microsoft Windows XP,451,Windows|Windows Desktop,580,Windows|*
1933,Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1934,Windows XP,451,Windows|Windows Desktop,580,Windows|*
1935,Windows 10 Professional,451,Windows|Windows Desktop,580,Windows|*
1936,Windows 7,451,Windows|Windows Desktop,580,Windows|*
1937,Microsoft Windows 10 (64-bit),451,Windows|Windows Desktop,580,Windows|*
1938,Win 7,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
1939,windowsxp,451,Windows|Windows Desktop,580,Windows|*
1940,Microsoft Windows Unknown,451,Windows|Windows Desktop,580,Windows|*
1941,Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1942,Windows XP Professional,451,Windows|Windows Desktop,580,Windows|*
1943,Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
1944,Window XP,451,Windows|Windows Desktop,580,Windows|*
1945,Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
1946,Microsoft Windows 7 - SOE,451,Windows|Windows Desktop,580,Windows|*
1947,Windows 7 Enterprise Edition,451,Windows|Windows Desktop,452,Windows|Windows Server
1948,Windows 8,451,Windows|Windows Desktop,580,Windows|*
1949,Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
1950,Microsoft Windows 7 (32-bit),451,Windows|Windows Desktop,580,Windows|*
1951,Windows Embedded Standard 7,451,Windows|Windows Desktop,580,Windows|*
1952,Win10,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
1953,Windows 2003,451,Windows|Windows Desktop,580,Windows|*
1955,Windows 2003 Standard,452,Windows|Windows Server,580,Windows|*
1956,Windows 2008 Enterprise R2 x64,452,Windows|Windows Server,580,Windows|*
1960,WINDOWS 2008R2,452,Windows|Windows Server,580,Windows|*
1961,Microsoft Windows Server 2008 Standard Editio,452,Windows|Windows Server,121,MS SQL Server Compact
1962,MICROSOFT WINDOWS NT 2003,452,Windows|Windows Server,580,Windows|*
1967,Microsoft Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
1979,Windows 2008 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
1982,Windows 2003 R2,452,Windows|Windows Server,580,Windows|*
1983,Windows 2008 R2 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
1988,Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
1989,Windows 2012 Standard,452,Windows|Windows Server,580,Windows|*
1992,Windows 2008 R2 Standard 6.1.7601 Service Pack 1,452,Windows|Windows Server,580,Windows|*
1994,Windows 2008 Standard x64,452,Windows|Windows Server,580,Windows|*
1998,Windows 2012 R2 Standard 64-Bit,452,Windows|Windows Server,580,Windows|*
2007,w2k12,452,Windows|Windows Server,582,C#|*
2008,WINDOWS 2013,452,Windows|Windows Server,580,Windows|*
2009,WINDOWS 2016 SE 64 BIT,452,Windows|Windows Server,580,Windows|*
2011,Microsoft - Windows 2012,452,Windows|Windows Server,580,Windows|*
2019,MICROSOFT WINDOWS 2008 TPM,452,Windows|Windows Server,580,Windows|*
2021,MICROSOFT WINDOWS STD 2008,452,Windows|Windows Server,580,Windows|*
2025,Windows 2008 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2028,MICROSOFT WINDOWS STD 2008 TPM,452,Windows|Windows Server,580,Windows|*
2030,Windows 2012 64 Bit,452,Windows|Windows Server,580,Windows|*
2031,MICROSOFT WINDOWS NT 2003 ENT,452,Windows|Windows Server,580,Windows|*
2034,MICROSOFT WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
2036,Windows 2003 Standard5.2.3790,452,Windows|Windows Server,580,Windows|*
2040,Windows 2012 R,452,Windows|Windows Server,580,Windows|*
2044,Windows 2008 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
2045,MICROSOFT WINDOWS 2008 ENT,452,Windows|Windows Server,580,Windows|*
2047,Windows 2012 R2 Standard 6.3.9600,452,Windows|Windows Server,580,Windows|*
2053,Windows 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
2055,Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
2061,Windows 2016 Datacenter10.0.14393,452,Windows|Windows Server,637,Microsoft Azure
2065,windows6.3.9600,452,Windows|Windows Server,580,Windows|*
2066,Windows 2012 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2069,Windows 2008 Enterprise,452,Windows|Windows Server,580,Windows|*
2080,Windows 2008 Standard without Hyper-V6.0.6003,452,Windows|Windows Server,580,Windows|*
2084,Windows 2012 R2 Datacenter,452,Windows|Windows Server,110,Microsoft System Center Endpoint Protection
2089,Windows 2008 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
2096,Windows 2000,452,Windows|Windows Server,580,Windows|*
2097,W2K8R2 Standard 64 BIT,452,Windows|Windows Server,303,C
2099,Windows 2008 Standard6.0.6003,452,Windows|Windows Server,580,Windows|*
2100,Windows2008 R2 Enterprise 64bit,452,Windows|Windows Server,580,Windows|*
2105,Win2008R2,452,Windows|Windows Server,355,R
2107,Windows 2008 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2109,Windows Server 2003 Appliance,452,Windows|Windows Server,559,Virtual Appliance
2111,Windows 2008 ENT R2 (64 bits),452,Windows|Windows Server,355,R
2114,WIN2008R2 6.1.7601,452,Windows|Windows Server,355,R
2116,microsoft windows std 2012 tpm,452,Windows|Windows Server,580,Windows|*
2118,microsoft windows 2008,452,Windows|Windows Server,580,Windows|*
2120,Windows 2008 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
2121,Microsoft Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
2126,Window2008 R2,452,Windows|Windows Server,355,R
2130,Windows 2008 Standard,452,Windows|Windows Server,580,Windows|*
2134,WS03,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
2136,Windows 2008 Enterprise x64,452,Windows|Windows Server,580,Windows|*
2141,Windows 2008 R2 Enterprise,452,Windows|Windows Server,580,Windows|*
2142,Windows Server 2003 Std 32-bit,452,Windows|Windows Server,580,Windows|*
2143,Windows 2008 R2 Standard 64bit,452,Windows|Windows Server,580,Windows|*
2146,Microsoft Windows 2003 R2 Standard,452,Windows|Windows Server,580,Windows|*
2148,MICROSOFT WINDOWS NT 2003 TPM,452,Windows|Windows Server,580,Windows|*
2149,Win Server 2008,452,Windows|Windows Server,569,VMware Server
2150,Windows 2003 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
2152,WIN2014,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2156,Win 2012 R2,452,Windows|Windows Server,355,R
2160,Win Server,452,Windows|Windows Server,12,Apache Subversion
2161,Windows 2008 Standard R2 x64,452,Windows|Windows Server,580,Windows|*
2163,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 6196 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
2164,Windows2012,452,Windows|Windows Server,580,Windows|*
2165,Windows 2008 R2 Standard6.1.7601,452,Windows|Windows Server,580,Windows|*
2166,Windows 2016,452,Windows|Windows Server,580,Windows|*
2167,Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
2179,Windows Server 2003 Std 64-bit,452,Windows|Windows Server,580,Windows|*
2180,Windows 2012 R2,452,Windows|Windows Server,580,Windows|*
2181,Wintel,452,Windows|Windows Server,461,IBM Tivoli Storage Manager|TSM Client
2191,Windows 2003 Enterprise5.2.3790,452,Windows|Windows Server,580,Windows|*
2192,WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
2193,Windows 2008 R2 OS,452,Windows|Windows Server,580,Windows|*
2196,Windows 2003 Standard R2,452,Windows|Windows Server,580,Windows|*
2197,Windows 2008 R2 Enterprise6.1.7601,452,Windows|Windows Server,580,Windows|*
2198,Windows 2003 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
2199,WINDOWS SERVER 2003 APPLIANCE 5.2,452,Windows|Windows Server,559,Virtual Appliance
2201,WS08R2,452,Windows|Windows Server,355,R
2204,Windows 2008 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
2213,w2k8r2sp1,452,Windows|Windows Server,355,R
2217,Win 2003,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2222,Windows 2012 R2 Standard,452,Windows|Windows Server,580,Windows|*
2226,Windows 2008 R2 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
2228,Windows 2003 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
2230,Windows 2012 Storage R2,452,Windows|Windows Server,580,Windows|*
2231,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 4096 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
2235,MICROSOFT WINDOWS NT 2003 ENT TPM,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
2237,Windows 2016 Standard10.0.14393,452,Windows|Windows Server,580,Windows|*
2240,MICROSOFT WINDOWS 2003,452,Windows|Windows Server,580,Windows|*
2242,Windows 2012 Standard R2,452,Windows|Windows Server,580,Windows|*
2246,Win Server 2008 R2,452,Windows|Windows Server,355,R
2248,MICROSOFT WINDOWS STD 2012 TPM,452,Windows|Windows Server,580,Windows|*
2249,Windows 2003 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
2250,Windows 2008 Enterprise R2,452,Windows|Windows Server,580,Windows|*
2251,Windows 2008,452,Windows|Windows Server,580,Windows|*
2252,Microsoft Microsoft Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
2257,Win Server 2012,452,Windows|Windows Server,569,VMware Server
2258,Windows 2016 Standard,452,Windows|Windows Server,580,Windows|*
2264,Windows 2008 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
2267,Windows 2003 Standard 5.2.3790 Service Pack 2,452,Windows|Windows Server,580,Windows|*
2268,Windows 2012 Standard6.2.9200,452,Windows|Windows Server,580,Windows|*
2269,MICROSOFT WINDOWS 2016 TPM,452,Windows|Windows Server,580,Windows|*
2272,Windows 2003 Enterprise,452,Windows|Windows Server,580,Windows|*
2275,Windows 2008 R2 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
2277,Windows 2012 R2 Standard6.3.9600,452,Windows|Windows Server,580,Windows|*
2286,Windows 2008 Standard R2,452,Windows|Windows Server,580,Windows|*
2287,MicrosoftWindows Server 2008 R2 (64-bit),452,Windows|Windows Server,443,OS/2
2288,windows6.3,452,Windows|Windows Server,580,Windows|*
2290,Windows 2016 64 Bit,452,Windows|Windows Server,580,Windows|*
2296,Windows 2008 Enterprise6.0.6003,452,Windows|Windows Server,580,Windows|*
2301,Win 2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2302,Win2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2303,Win2012R2,452,Windows|Windows Server,355,R
2305,win2008,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
2306,Windows 2003 Standard x64,452,Windows|Windows Server,580,Windows|*
2315,WINDOWS 2016 STANDARD EDITION,452,Windows|Windows Server,580,Windows|*
2325,WinSCP.net - WinSCP 5.11,243,WinSCP,178,SAP SQL Anywhere
2332,Zerto Vritual Appliance,249,Zerto Virtual Replication,559,Virtual Appliance
2333,Oracle RTD,289,Oracle Real-Time Decisions (RTD),134,Oracle Database
2338,OMNIbus,251,Tivoli Netcool/OMNIbus,582,C#|*
2347,ALM,511,Application Lifecycle Management (ALM),421,DART
2349,BMS,513,Batch Management Software (BMS),442,OpenVMS
2354,COM,516,Compopent Object Model (COM),661,COM+
2357,CORBA Interface Definition Language,518,CORBA Interface Definition Language (CORBA IDL),517,Common Object Request Broker Architecture (CORBA)
2359,Data Control Language,519,Data Control Language (DCL),329,IBM i Control Language (CL)
2361,Database,520,Database (DB),43,DB2
2362,DB,520,Database (DB),43,DB2
2365,Electronic Data Interchange,521,Electronic Data Interchange (EDI),104,Microsoft Exchange Server
2369,JDOM,523,Java-based Document Object Model for XML (JDOM),84,IMS DB
2381,Simple Object Access Protocol,531,Simple Object Access Protocol (SOAP),547,Internet Message Access Protocol (IMAP)
2383,SQL,572,Structured Query Language (SQL),581,MS SQL Server|*
2386,DPE,538,Device Provisioning Engines (DPE),661,COM+
2388,ESB,540,Enterprise Service Bus(ESB),370,Visual Basic
2395,MES,553,Manufacturing Execution System (MES),623,Amazon S3
2401,Z/Virtual System Environment,591,z/VSE,441,MVS|z/OS
2403,DOS/VSE,591,z/VSE,597,DOS/360
2404,Microsoft Disk Operating System,593,MS-DOS,443,OS/2
2407,VME/B,595,VME,368,VB.NET
2408,Virtual Machine Environment,595,VME,111,Microsoft Visual Studio
2409,VME 2900,595,VME,107,Microsoft Internet Explorer
2410,OpenVME,595,VME,442,OpenVMS
2411,Disk Operating System/360,597,DOS/360,443,OS/2
2413,Transaction Processing Facility,598,z/TPF,572,Structured Query Language (SQL)
2419,NPL,653,Natural Programming Language,342,Niakwa Programming Language (NPL)
2426,IDMS/DB Data Manipulation Language,668,IDMS DML,312,Data Language Interface (DL/I)
2433,Basic Mapping Supprt,689,BMS Map,21,Business Intelligence and Reporting Tools (BIRT)
2434,DB/400,690,DB400,43,DB2
2435,IBM ISAM,693,ISAM,73,IBM Operational Decision Manager (ODM)
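Each row above pairs a raw mention string with two class id/name pairs; by column position the first pair reads as the ground-truth entity and the second as the model's prediction, though the dump carries no header row, so those roles are inferred. A minimal pandas sketch for ranking the most frequent confusion pairs under that assumption (the filename classification_errors.csv and the column names are hypothetical):

# %%
import pandas as pd

# Column names are assumptions; the dump itself has no header row.
cols = ["mention_id", "mention", "true_id", "true_label", "pred_id", "pred_label"]
errors = pd.read_csv("classification_errors.csv", header=None, names=cols)

# Rank (true, predicted) label pairs by how often they are confused.
confusions = (
    errors.groupby(["true_label", "pred_label"])
    .size()
    .sort_values(ascending=False)
)
print(confusions.head(10))
# %%

One pattern is visible even without aggregating: a large block of mentions whose gold class is a specific subclass (e.g. Windows|Windows Server) is predicted as the wildcard parent (Windows|*), so a hierarchy-aware metric would count many of these as near-misses rather than hard errors.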
@@ -32,21 +32,25 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 def generate_acronym(text):
 
     # Define prompt
-    prompt = f"Answer concisely: make a possible acronym from the following: '{text}'"
+    # prompt = f"Imagine you are a diverse database. Given the following: '{text}', please suggest to me 5 possible variations. Give 5."
+    prompt = f"Give me a list of 10 historical product names related to: '{text}'. Format the output in a list, like this 1. Item, 2. Item, 3. ..."
 
     # Generate acronym
     inputs = tokenizer(prompt, return_tensors="pt")
     inputs = inputs.to("cuda")
     outputs = model.generate(
         inputs["input_ids"],
-        max_length=100,
-        no_repeat_ngram_size=3)
+        max_length=200,
+        do_sample=True,
+        top_k=50,
+        temperature=0.8)
+        # no_repeat_ngram_size=3)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # %%
 # Example usage
 # text = "Advanced Data Analytics Platform"
-text = "red hat enterprise linux"
+text = "windows desktop"
 acronym = generate_acronym(text)
-print(f"Acronym: {acronym}")
+print(f"Generation: {acronym}")
 # %%
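The hunk swaps greedy decoding plus an n-gram repetition penalty for stochastic decoding: do_sample=True draws each token from the model's distribution, top_k=50 truncates that distribution to the 50 most likely tokens, and temperature=0.8 makes it slightly peakier (values below 1 sharpen it), which yields more varied surface forms for augmentation. A self-contained sketch of the updated cell, assuming a seq2seq checkpoint such as google/flan-t5-base (the checkpoint actually used is loaded earlier in the file and is not shown in this hunk):

# %%
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/flan-t5-base"  # assumption; substitute the real checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def generate_acronym(text):
    # The name is historical; the prompt now asks for related product names.
    prompt = (
        f"Give me a list of 10 historical product names related to: '{text}'. "
        "Format the output in a list, like this 1. Item, 2. Item, 3. ..."
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=200,
        do_sample=True,   # sample instead of taking the argmax at each step
        top_k=50,         # restrict sampling to the 50 most likely tokens
        temperature=0.8,  # temperature < 1 makes the distribution peakier
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Generation: {generate_acronym('windows desktop')}")
# %%

Because sampling is stochastic, each call returns a different candidate list, which is the point of using it for augmentation; calling torch.manual_seed first restores reproducibility when needed.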