added more augmentations to finally beat sota
- class_bert_augmentation is now the reference training code
This commit is contained in:
parent
e90bc69ea9
commit
5312cfa06f
|
@ -0,0 +1,41 @@
|
||||||
|
# %%
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
|
||||||
|
def corrupt_word(word):
    """Return ``word`` with a single random typo applied.

    One of two corruptions is picked uniformly at random: deleting one
    character, or swapping two adjacent characters.

    Args:
        word (str): The word to corrupt.

    Returns:
        str: The corrupted word. Words shorter than two characters are
        returned unchanged.
    """
    if len(word) < 2:
        return word

    mode = random.choice(["delete", "swap"])

    if mode == "delete":
        # Drop the character at a random position.
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos + 1:]

    # mode == "swap": exchange the characters at pos and pos + 1.
    pos = random.randint(0, len(word) - 2)
    chars = list(word)
    chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
    return "".join(chars)
|
||||||
|
|
||||||
|
def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each whitespace-separated word of ``sentence`` independently.

    Args:
        sentence (str): The input text; it is split on whitespace.
        corruption_probability (float): Chance, per word, that the word is
            passed through ``corrupt_word``.

    Returns:
        str: The words re-joined with single spaces.
    """
    result = []
    for token in sentence.split():
        # One random draw per word decides whether it is corrupted.
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        result.append(token)
    return " ".join(result)
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
sentence = "This is a simple string for testing"
|
||||||
|
corrupted_sentence = corrupt_string(sentence, corruption_probability=0.1)
|
||||||
|
print("Original:", sentence)
|
||||||
|
print("Corrupted:", corrupted_sentence)
|
||||||
|
|
||||||
|
# %%
|
|
@ -1,95 +0,0 @@
|
||||||
# %%
|
|
||||||
import json
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
##########################################
|
|
||||||
# %%
|
|
||||||
|
|
||||||
# Load the JSON file
|
|
||||||
data_path = '../esAppMod/tca_entities.json'
|
|
||||||
with open(data_path, 'r') as file:
|
|
||||||
data = json.load(file)
|
|
||||||
|
|
||||||
# Initialize an empty list to store the rows
|
|
||||||
rows = []
|
|
||||||
|
|
||||||
# %%
|
|
||||||
# Loop through all entities in the JSON
|
|
||||||
for entity in data["data"].items():
|
|
||||||
entity_data = entity[1]
|
|
||||||
entity_id = entity_data['entity_id']
|
|
||||||
entity_name = entity_data['entity_name']
|
|
||||||
entity_type_id = entity_data['entity_type_id']
|
|
||||||
entity_type_name = entity_data['entity_type_name']
|
|
||||||
|
|
||||||
# Add each mention and its entity_id to the rows list
|
|
||||||
rows.append(
|
|
||||||
{
|
|
||||||
'id': entity_id,
|
|
||||||
'name': entity_name,
|
|
||||||
'type_id': entity_type_id,
|
|
||||||
'type_name': entity_type_name
|
|
||||||
})
|
|
||||||
|
|
||||||
# Create a DataFrame from the rows
|
|
||||||
df = pd.DataFrame(rows)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
# df.to_csv('entity.csv', index=False)
|
|
||||||
df
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df['type_name'].value_counts()
|
|
||||||
# %%
|
|
||||||
df['type_id'].value_counts()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
name_list = df['name'].to_list()
|
|
||||||
# %%
|
|
||||||
name_list
|
|
||||||
|
|
||||||
# %%
|
|
||||||
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
# %%
|
|
||||||
# Define labels
|
|
||||||
labels = name_list
|
|
||||||
|
|
||||||
# Create a prefix-based distance matrix
|
|
||||||
def prefix_distance(label1, label2):
    """Return a distance that shrinks as two labels share a longer word prefix.

    Both labels are split on whitespace and compared word by word from the
    start; counting stops at the first position where the words differ, so
    words that happen to match later in the labels do not count.

    Args:
        label1 (str): First label.
        label2 (str): Second label.

    Returns:
        float: ``1 / (k + 1)`` where ``k`` is the number of leading words the
        labels have in common (1.0 when they share none).
    """
    shared = 0
    for word1, word2 in zip(label1.split(), label2.split()):
        if word1 != word2:
            # A mismatch ends the common prefix.
            break
        shared += 1
    # Distance is inversely proportional to common prefix length
    return 1.0 / (shared + 1)
|
|
||||||
|
|
||||||
# Create a pairwise distance matrix
from scipy.spatial.distance import squareform

n = len(labels)
distance_matrix = np.zeros((n, n))
# Fill only the off-diagonal entries: the distance of a label to itself must
# be 0 for the matrix to be a valid distance matrix, and the measure is
# symmetric, so each pair is computed once.
for i in range(n):
    for j in range(i + 1, n):
        pair_distance = prefix_distance(labels[i], labels[j])
        distance_matrix[i, j] = pair_distance
        distance_matrix[j, i] = pair_distance

# Perform hierarchical clustering
# linkage() treats a 2-D input as observation vectors, not as distances, so
# the square matrix is converted to condensed (1-D) form first.
linkage_matrix = linkage(squareform(distance_matrix), method='average')
|
|
||||||
|
|
||||||
# Visualize as a dendrogram
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
|
|
||||||
plt.title("Prefix-Based Clustering")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
linkage_matrix
|
|
||||||
# %%
|
|
||||||
# Extract flat clusters with a distance threshold
|
|
||||||
threshold = 0.5
|
|
||||||
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')
|
|
||||||
|
|
||||||
# Display clusters
|
|
||||||
for i, cluster_id in enumerate(clusters):
|
|
||||||
print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")
|
|
||||||
|
|
||||||
# %%
|
|
|
@ -3,53 +3,55 @@ import pandas as pd
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# import training file
|
# import training file
|
||||||
data_path = '../data_import/train.csv'
|
data_path = '../esAppMod_data_import/train.csv'
|
||||||
|
# data_path = '../esAppMod_data_import/parent_train.csv'
|
||||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
|
||||||
# import test file
|
# import test file
|
||||||
data_path = '../data_import/test.csv'
|
data_path = '../esAppMod_data_import/test.csv'
|
||||||
|
# data_path = '../esAppMod_data_import/parent_test.csv'
|
||||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
# import entity file
|
# import entity file
|
||||||
data_path = '../data_import/entity.csv'
|
data_path = '../esAppMod_data_import/entity.csv'
|
||||||
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
id2label = {}
|
id2label = {}
|
||||||
for _, row in entity_df.iterrows():
|
for _, row in entity_df.iterrows():
|
||||||
id2label[row['id']] = row['name']
|
id2label[row['id']] = row['name']
|
||||||
|
|
||||||
# %%
|
|
||||||
train_df.sort_values(by=['entity_id']).to_markdown('out.md')
|
train_df.sort_values(by=['entity_id']).to_markdown('out.md')
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
data_path = '../train/class_bert_process/prediction/exports/result.csv'
|
data_path = '../train/class_bert_augmentation/prediction/exports/result.csv'
|
||||||
prediction_df = pd.read_csv(data_path)
|
prediction_df = pd.read_csv(data_path)
|
||||||
|
|
||||||
# %%
|
|
||||||
predicted_entity_list = []
|
predicted_entity_list = []
|
||||||
for element in prediction_df['class_prediction']:
|
for element in prediction_df['class_prediction']:
|
||||||
predicted_entity_list.append(id2label[element])
|
predicted_entity_list.append(id2label[element])
|
||||||
|
|
||||||
prediction_df['predicted_name'] = predicted_entity_list
|
prediction_df['predicted_name'] = predicted_entity_list
|
||||||
# %%
|
|
||||||
new_df = pd.concat((test_df, prediction_df ), axis=1)
|
new_df = pd.concat((test_df, prediction_df ), axis=1)
|
||||||
|
|
||||||
# %%
|
|
||||||
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
|
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
|
||||||
mismatch_df = new_df[mismatch_mask]
|
mismatch_df = new_df[mismatch_mask]
|
||||||
|
|
||||||
# %%
|
|
||||||
len(mismatch_df)
|
len(mismatch_df)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# print the top 10 offending classes
|
# print the top 10 offending classes
|
||||||
|
# mask1 = mismatch_df['entity_id'] != 434
|
||||||
|
# mask2 = mismatch_df['entity_id'] != 451
|
||||||
|
# mask3 = mismatch_df['entity_id'] != 452
|
||||||
|
# mask= mask1 & mask2 & mask3
|
||||||
|
# masked_df = mismatch_df[mask]
|
||||||
|
# print(masked_df['entity_id'].value_counts()[:10])
|
||||||
print(mismatch_df['entity_id'].value_counts()[:10])
|
print(mismatch_df['entity_id'].value_counts()[:10])
|
||||||
|
masked_df = mismatch_df
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# Convert the whole dataframe as a string and display
|
# Convert the whole dataframe as a string and display
|
||||||
# print the mismatch_df
|
# print the mismatch_df
|
||||||
print(mismatch_df.sort_values(by=['entity_id']).to_markdown())
|
print(masked_df.sort_values(by=['entity_id']).to_markdown())
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
mismatch_df.to_csv('error.csv')
|
mismatch_df.to_csv('error.csv')
|
||||||
|
@ -62,14 +64,9 @@ mismatch_df[select_mask]
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# let us see the train mentions
|
# let us see the train mentions
|
||||||
select_value = 452
|
select_value = 130
|
||||||
select_mask = train_df['entity_id'] == select_value
|
select_mask = train_df['entity_id'] == select_value
|
||||||
train_df[select_mask]
|
train_df[select_mask]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# %%
|
|
||||||
mismatch_df[select_mask]['class_prediction'].to_list()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
# %%
|
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
# %%
|
||||||
|
import pandas as pd
|
||||||
|
import re
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# import training file
|
||||||
|
data_path = '../esAppMod_data_import/train.csv'
|
||||||
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
|
||||||
|
# import test file
|
||||||
|
data_path = '../esAppMod_data_import/test.csv'
|
||||||
|
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
# import entity file
|
||||||
|
data_path = '../esAppMod_data_import/entity.csv'
|
||||||
|
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
id2label = {}
|
||||||
|
for _, row in entity_df.iterrows():
|
||||||
|
id2label[row['id']] = row['name']
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
train_df
|
||||||
|
# %%
|
||||||
|
|
||||||
|
def extract_acronym_mapping(names):
    """Build an ``acronym -> term`` dictionary from names such as ``"Term (ACR)"``.

    Args:
        names (Iterable[str]): Entity names to scan.

    Returns:
        dict: Maps the first parenthesised word-character group found in each
        name to the text preceding it. The text is only stripped of the
        acronym when the entire name has the shape ``<words> (<ACR>)``;
        otherwise the full name is stored unchanged. Names without a
        parenthesised group are skipped, and a later name overwrites an
        earlier one that has the same acronym.
    """
    acronym_pattern = re.compile(r"\((\w+)\)")
    whole_name_pattern = re.compile(r"^([\w\s]+)\s*\(\w+\)$")

    mapping = {}
    for name in names:
        found = acronym_pattern.search(name)
        if found is None:
            continue
        # Keep only the leading words when the name ends with the acronym.
        mapping[found.group(1)] = whole_name_pattern.sub(r"\1", name).strip()
    return mapping
|
||||||
|
|
||||||
|
names = set(train_df['entity_name'].to_list())
|
||||||
|
|
||||||
|
# Extract mappings
|
||||||
|
acronym_mapping = extract_acronym_mapping(names)
|
||||||
|
print(acronym_mapping)
|
||||||
|
# %%
|
||||||
|
del acronym_mapping['E'] # too many false matches
|
||||||
|
acronym_mapping = {key.lower():value.lower() for key, value in acronym_mapping.items()}
|
||||||
|
|
||||||
|
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
|
||||||
|
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
abbrev_to_term
|
||||||
|
# %%
|
||||||
|
term_to_abbrev
|
||||||
|
|
||||||
|
# %%
|
||||||
|
acronym_mapping
|
||||||
|
# %%
|
|
@ -0,0 +1,5 @@
|
||||||
|
out.md
|
||||||
|
parent_test.csv
|
||||||
|
parent_train.csv
|
||||||
|
test_seq.csv
|
||||||
|
train_seq.csv
|
|
@ -0,0 +1,124 @@
|
||||||
|
# %%
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
# %%
|
||||||
|
# import training file
|
||||||
|
data_path = '../esAppMod_data_import/train.csv'
|
||||||
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# import entity file
|
||||||
|
# Keep only one row per unique value in 'entity_name'
|
||||||
|
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
|
||||||
|
id2label = {}
|
||||||
|
for _, row in unique_df.iterrows():
|
||||||
|
id2label[row['entity_id']] = row['entity_name']
|
||||||
|
|
||||||
|
inverse_dict = {value:key for key,value in id2label.items()}
|
||||||
|
# %%
|
||||||
|
# Create a new dictionary with sorted keys
|
||||||
|
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
|
||||||
|
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}
|
||||||
|
|
||||||
|
# %%
|
||||||
|
sorted_dict
|
||||||
|
|
||||||
|
# %%
|
||||||
|
rule_set ={
|
||||||
|
'.NET': [497,482,484,487,485,486,483],
|
||||||
|
'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
|
||||||
|
'C++': [583,306],
|
||||||
|
'CA': [290,22,23,24,25],
|
||||||
|
'CSS': [307,377],
|
||||||
|
'Cisco': [28,420,29],
|
||||||
|
'Citrix': [563,565,31,292,291,564,32,30],
|
||||||
|
'coldfusion': [311,37],
|
||||||
|
'eclipse': [46,622,641,456],
|
||||||
|
'xml': [596, 318],
|
||||||
|
'xsl': [319,320],
|
||||||
|
'HP': [59,293,60,61,58],
|
||||||
|
'http': [505,543],
|
||||||
|
'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
|
||||||
|
'IBM BigFix': [62,457],
|
||||||
|
'IBM ILOG': [253,255,254,256,252],
|
||||||
|
'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
|
||||||
|
'IBM WebSphere': [80,82,83,81],
|
||||||
|
'IBM i': [424,329],
|
||||||
|
'IDMS': [667,668],
|
||||||
|
'IIS': [609,490,489,491],
|
||||||
|
'JBoss': [268,492,493],
|
||||||
|
'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
|
||||||
|
'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
|
||||||
|
'KVS': [549,550,551],
|
||||||
|
'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
|
||||||
|
'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
|
||||||
|
'MVS': [577,440,441],
|
||||||
|
'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
|
||||||
|
'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
|
||||||
|
'Oracle WebLogic': [600,233],
|
||||||
|
'Oracle Application Server': [610,494],
|
||||||
|
'Oracle Database': [134,474,475,478],
|
||||||
|
'Oracle Hyperion': [607,138,139],
|
||||||
|
'Oracle WebCenter': [276,495],
|
||||||
|
'Pascal': [599,346],
|
||||||
|
'Perl': [585,348,417,349],
|
||||||
|
'ProjectWise': [161,162],
|
||||||
|
'Rational': [166,167],
|
||||||
|
'SAP': [173,175,695,176,676,178,179],
|
||||||
|
'SAP ERP': [174,476,477],
|
||||||
|
'SAP NetWeaver': [279,496,177],
|
||||||
|
'Sybase SQL Server': [190,479,480],
|
||||||
|
'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
|
||||||
|
'TIBCO': [218,219],
|
||||||
|
'TIBCO Business Works': [217,481],
|
||||||
|
'Tivoli': [220,251],
|
||||||
|
'Tortoise': [221,222],
|
||||||
|
'Unix': [578,445,579,447,602,590,448,449],
|
||||||
|
'VB': [368,369],
|
||||||
|
'VMware': [568,569,229,230,231],
|
||||||
|
'Visual Basic': [370,371,372],
|
||||||
|
'WebSphere': [234,285,235,286,284,601,287],
|
||||||
|
'Windows': [580,238,239,451,452],
|
||||||
|
'z': [598,608,591]
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Collapse every entity_id that belongs to a rule-set family onto that
# family's parent id, i.e. the first element of the family's list.
# The member -> parent lookup is built once instead of scanning every rule for
# every row; a later rule overwrites an earlier one for the same member, which
# is the precedence the previous row-by-row loop had.
parent_of = {
    member: members[0]
    for members in rule_set.values()
    for member in members
}

new_df = train_df.copy()
# Ids that appear in no family map to themselves and stay unchanged.
new_df['entity_id'] = train_df['entity_id'].map(
    lambda entity_id: parent_of.get(entity_id, entity_id)
)
|
||||||
|
# %%
|
||||||
|
len(set(new_df['entity_id'].to_list()))
|
||||||
|
|
||||||
|
# %%
|
||||||
|
new_df.to_csv('parent_train.csv')
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# now do the same for the test data
|
||||||
|
# import test file
|
||||||
|
data_path = '../esAppMod_data_import/test.csv'
|
||||||
|
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
# Collapse every entity_id that belongs to a rule-set family onto that
# family's parent id, i.e. the first element of the family's list.
# The member -> parent lookup is built once instead of scanning every rule for
# every row; a later rule overwrites an earlier one for the same member, which
# is the precedence the previous row-by-row loop had.
parent_of = {
    member: members[0]
    for members in rule_set.values()
    for member in members
}

new_df = test_df.copy()
# Ids that appear in no family map to themselves and stay unchanged.
new_df['entity_id'] = test_df['entity_id'].map(
    lambda entity_id: parent_of.get(entity_id, entity_id)
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
new_df
|
||||||
|
|
||||||
|
# %%
|
||||||
|
new_df.to_csv('parent_test.csv')
|
||||||
|
# %%
|
|
@ -0,0 +1,129 @@
|
||||||
|
# %%
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
# %%
|
||||||
|
# import training file
|
||||||
|
data_path = '../esAppMod_data_import/train.csv'
|
||||||
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# import entity file
|
||||||
|
# Keep only one row per unique value in 'entity_name'
|
||||||
|
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
|
||||||
|
id2label = {}
|
||||||
|
for _, row in unique_df.iterrows():
|
||||||
|
id2label[row['entity_id']] = row['entity_name']
|
||||||
|
|
||||||
|
inverse_dict = {value:key for key,value in id2label.items()}
|
||||||
|
# %%
|
||||||
|
# Create a new dictionary with sorted keys
|
||||||
|
# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
|
||||||
|
sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}
|
||||||
|
|
||||||
|
# %%
|
||||||
|
sorted_dict
|
||||||
|
|
||||||
|
# %%
|
||||||
|
rule_set ={
|
||||||
|
'.NET': [497,482,484,487,485,486,483],
|
||||||
|
'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
|
||||||
|
'C++': [583,306],
|
||||||
|
'CA': [290,22,23,24,25],
|
||||||
|
'CSS': [307,377],
|
||||||
|
'Cisco': [28,420,29],
|
||||||
|
'Citrix': [563,565,31,292,291,564,32,30],
|
||||||
|
'coldfusion': [311,37],
|
||||||
|
'eclipse': [46,622,641,456],
|
||||||
|
'xml': [596, 318],
|
||||||
|
'xsl': [319,320],
|
||||||
|
'HP': [59,293,60,61,58],
|
||||||
|
'http': [505,543],
|
||||||
|
'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
|
||||||
|
'IBM BigFix': [62,457],
|
||||||
|
'IBM ILOG': [253,255,254,256,252],
|
||||||
|
'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
|
||||||
|
'IBM WebSphere': [80,82,83,81],
|
||||||
|
'IBM i': [424,329],
|
||||||
|
'IDMS': [667,668],
|
||||||
|
'IIS': [609,490,489,491],
|
||||||
|
'JBoss': [268,492,493],
|
||||||
|
'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
|
||||||
|
'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
|
||||||
|
'KVS': [549,550,551],
|
||||||
|
'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
|
||||||
|
'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
|
||||||
|
'MVS': [577,440,441],
|
||||||
|
'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
|
||||||
|
'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
|
||||||
|
'Oracle WebLogic': [600,233],
|
||||||
|
'Oracle Application Server': [610,494],
|
||||||
|
'Oracle Database': [134,474,475,478],
|
||||||
|
'Oracle Hyperion': [607,138,139],
|
||||||
|
'Oracle WebCenter': [276,495],
|
||||||
|
'Pascal': [599,346],
|
||||||
|
'Perl': [585,348,417,349],
|
||||||
|
'ProjectWise': [161,162],
|
||||||
|
'Rational': [166,167],
|
||||||
|
'SAP': [173,175,695,176,676,178,179],
|
||||||
|
'SAP ERP': [174,476,477],
|
||||||
|
'SAP NetWeaver': [279,496,177],
|
||||||
|
'Sybase SQL Server': [190,479,480],
|
||||||
|
'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
|
||||||
|
'TIBCO': [218,219],
|
||||||
|
'TIBCO Business Works': [217,481],
|
||||||
|
'Tivoli': [220,251],
|
||||||
|
'Tortoise': [221,222],
|
||||||
|
'Unix': [578,445,579,447,602,590,448,449],
|
||||||
|
'VB': [368,369],
|
||||||
|
'VMware': [568,569,229,230,231],
|
||||||
|
'Visual Basic': [370,371,372],
|
||||||
|
'WebSphere': [234,285,235,286,284,601,287],
|
||||||
|
'Windows': [580,238,239,451,452],
|
||||||
|
'z': [598,608,591]
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Tag every training row whose entity_id belongs to a rule-set family with
# "<parent id>_<position in the family list>"; rows outside every family get
# no entity_seq value (NaN).
# The lookup is built once instead of scanning every rule for every row.
seq_of = {}
for members in rule_set.values():
    family = {}
    for leaf, member in enumerate(members):
        # setdefault keeps the first position of a repeated member, which is
        # what list.index() returned in the row-by-row version.
        family.setdefault(member, f"{members[0]}_{leaf}")
    # A later family overwrites an earlier one for the same member, matching
    # the precedence of the previous loop over rule_set.
    seq_of.update(family)

new_df = train_df.copy()
new_df['entity_seq'] = train_df['entity_id'].map(seq_of)
|
||||||
|
# %%
|
||||||
|
len(set(new_df['entity_seq'].to_list()))
|
||||||
|
|
||||||
|
# %%
|
||||||
|
new_df.to_csv('train_seq.csv')
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# now do the same for the test data
|
||||||
|
# import test file
|
||||||
|
data_path = '../esAppMod_data_import/test.csv'
|
||||||
|
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
# Tag every test row whose entity_id belongs to a rule-set family with
# "<parent id>_<position in the family list>"; rows outside every family get
# no entity_seq value (NaN).
# The lookup is built once instead of scanning every rule for every row.
seq_of = {}
for members in rule_set.values():
    family = {}
    for leaf, member in enumerate(members):
        # setdefault keeps the first position of a repeated member, which is
        # what list.index() returned in the row-by-row version.
        family.setdefault(member, f"{members[0]}_{leaf}")
    # A later family overwrites an earlier one for the same member, matching
    # the precedence of the previous loop over rule_set.
    seq_of.update(family)

new_df = test_df.copy()
new_df['entity_seq'] = test_df['entity_id'].map(seq_of)
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
new_df
|
||||||
|
|
||||||
|
# %%
|
||||||
|
new_df.to_csv('test_seq.csv')
|
||||||
|
# %%
|
|
@ -1,6 +1,6 @@
|
||||||
|
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
Accuracy: 0.77655
|
Accuracy: 0.80197
|
||||||
F1 Score: 0.79605
|
F1 Score: 0.81948
|
||||||
Precision: 0.85637
|
Precision: 0.88067
|
||||||
Recall: 0.77655
|
Recall: 0.80197
|
|
@ -32,6 +32,8 @@ torch.set_float32_matmul_precision('high')
|
||||||
BATCH_SIZE = 256
|
BATCH_SIZE = 256
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
# construct the target id list
|
||||||
|
# data_path = '../../../esAppMod_data_import/train.csv'
|
||||||
data_path = '../../../esAppMod_data_import/train.csv'
|
data_path = '../../../esAppMod_data_import/train.csv'
|
||||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
# rather than use pattern, we use the real thing and property
|
# rather than use pattern, we use the real thing and property
|
||||||
|
@ -51,20 +53,9 @@ for idx, val in enumerate(target_id_list):
|
||||||
def preprocess_text(text):
|
def preprocess_text(text):
|
||||||
# 1. Make all uppercase
|
# 1. Make all uppercase
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
|
||||||
# Remove any non alphanumeric character
|
|
||||||
# text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces
|
|
||||||
text = re.sub(r"[-;:]", " ", text)
|
|
||||||
|
|
||||||
# Add space between digit followed by a letter
|
|
||||||
text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
|
|
||||||
|
|
||||||
# Add space between letter followed by a digit
|
|
||||||
text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
|
|
||||||
|
|
||||||
|
|
||||||
# Substitute digits with '#'
|
# Substitute digits with '#'
|
||||||
text = re.sub(r'\d+', 'x', text)
|
# text = re.sub(r'\d+', '#', text)
|
||||||
|
|
||||||
# standardize spacing
|
# standardize spacing
|
||||||
text = re.sub(r'\s+', ' ', text).strip()
|
text = re.sub(r'\s+', ' ', text).strip()
|
|
@ -0,0 +1,562 @@
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# from datasets import load_from_disk
|
||||||
|
import os
|
||||||
|
|
||||||
|
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||||
|
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||||
|
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||||
|
|
||||||
|
import re
|
||||||
|
import random
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
DataCollatorWithPadding,
|
||||||
|
Trainer,
|
||||||
|
EarlyStoppingCallback,
|
||||||
|
TrainingArguments
|
||||||
|
)
|
||||||
|
import evaluate
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
# import matplotlib.pyplot as plt
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
|
# %%
|
||||||
|
def set_seed(seed):
    """Seed every random number generator this script uses.

    Args:
        seed (int): Value passed to Python's ``random``, NumPy, and the
            PyTorch CPU and CUDA generators.
    """
    # Python, NumPy, PyTorch CPU, current CUDA device, all CUDA devices.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
|
||||||
|
|
||||||
|
set_seed(42)
|
||||||
|
|
||||||
|
SHUFFLES=10
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# import training file
|
||||||
|
data_path = '../../esAppMod_data_import/train.csv'
|
||||||
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
# rather than use pattern, we use the real thing and property
|
||||||
|
entity_ids = train_df['entity_id'].to_list()
|
||||||
|
target_id_list = sorted(list(set(entity_ids)))
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
id2label = {}
|
||||||
|
label2id = {}
|
||||||
|
for idx, val in enumerate(target_id_list):
|
||||||
|
id2label[idx] = val
|
||||||
|
label2id[val] = idx
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# introduce pre-processing functions
|
||||||
|
def preprocess_text(text):
    """Normalise a text string: lower-case it and tidy its whitespace.

    Args:
        text (str): The raw input text.

    Returns:
        str: ``text`` in lower case, with every run of whitespace collapsed
        to a single space and leading/trailing whitespace removed.
    """
    lowered = text.lower()

    # Digits are left as they are; the digit-masking step is disabled.

    # standardize spacing
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_random_shuffles(text, n):
    """Produce ``n`` copies of ``text`` with its words in random order.

    Args:
        text (str): The input text; it is split on whitespace.
        n (int): How many shuffled variations to generate.

    Returns:
        list: ``n`` strings, each holding the words of ``text`` joined by
        single spaces in a randomly shuffled order.
    """
    tokens = text.split()

    def _one_variation():
        # Shuffle a copy so every variation starts from the original order.
        permuted = list(tokens)
        random.shuffle(permuted)
        return " ".join(permuted)

    return [_one_variation() for _ in range(n)]
|
||||||
|
|
||||||
|
|
||||||
|
# generate n more shuffled examples
|
||||||
|
def shuffle_text(text, n_shuffles=SHUFFLES):
    """Return ``text`` followed by ``n_shuffles`` word-shuffled copies of it.

    Args:
        text (str): A single input string.
        n_shuffles (int): Number of random shuffles to generate.

    Returns:
        list: The original string first, then the shuffled variations
        produced by ``generate_random_shuffles``.
    """
    return [text, *generate_random_shuffles(text, n_shuffles)]
|
||||||
|
|
||||||
|
acronym_mapping = {
|
||||||
|
'hpsa': 'hp server automation',
|
||||||
|
'tam': 'tivoli access manager',
|
||||||
|
'adf': 'application development facility',
|
||||||
|
'html': 'hypertext markup language',
|
||||||
|
'wff': 'microsoft web farm framework',
|
||||||
|
'jsp': 'javaserver pages',
|
||||||
|
'bw': 'business works',
|
||||||
|
'ssrs': 'sql server reporting services',
|
||||||
|
'cl': 'control language',
|
||||||
|
'vba': 'visual basic for applications',
|
||||||
|
'esapi': 'enterprise security api',
|
||||||
|
'gwt': 'google web toolkit',
|
||||||
|
'pki': 'perkin elmer informatics',
|
||||||
|
'rtd': 'oracle realtime decisions',
|
||||||
|
'jms': 'java message service',
|
||||||
|
'db': 'database',
|
||||||
|
'soa': 'service oriented architecture',
|
||||||
|
'xsl': 'extensible stylesheet language',
|
||||||
|
'com': 'compopent object model',
|
||||||
|
'ldap': 'lightweight directory access protocol',
|
||||||
|
'odm': 'ibm operational decision manager',
|
||||||
|
'soql': 'salesforce object query language',
|
||||||
|
'oms': 'order management system',
|
||||||
|
'cfml': 'coldfusion markup language',
|
||||||
|
'nas': 'netscape application server',
|
||||||
|
'sql': 'structured query language',
|
||||||
|
'bde': 'borland database engine',
|
||||||
|
'imap': 'internet message access protocol',
|
||||||
|
'uws': 'ultidev web server',
|
||||||
|
'birt': 'business intelligence and reporting tools',
|
||||||
|
'mdw': 'model driven workflow',
|
||||||
|
'tws': 'tivoli workload scheduler',
|
||||||
|
'jre': 'java runtime environment',
|
||||||
|
'wcs': 'websphere commerce suite',
|
||||||
|
'was': 'websphere application server',
|
||||||
|
'ssis': 'sql server integration services',
|
||||||
|
'xhtml': 'extensible hypertext markup language',
|
||||||
|
'soap': 'simple object access protocol',
|
||||||
|
'san': 'storage area network',
|
||||||
|
'elk': 'elastic stack',
|
||||||
|
'arr': 'application request routing',
|
||||||
|
'xlst': 'extensible stylesheet language transformations',
|
||||||
|
'sccm': 'microsoft endpoint configuration manager',
|
||||||
|
'ejb': 'enterprise java beans',
|
||||||
|
'css': 'cascading style sheets',
|
||||||
|
'hpoo': 'hp operations orchestration',
|
||||||
|
'xml': 'extensible markup language',
|
||||||
|
'esb': 'enterprise service bus',
|
||||||
|
'edi': 'electronic data interchange',
|
||||||
|
'imsva': 'interscan messaging security virtual appliance',
|
||||||
|
'wtx': 'ibm websphere transformation extender',
|
||||||
|
'cgi': 'common gateway interface',
|
||||||
|
'bal': 'ibm basic assembly language',
|
||||||
|
'issow': 'integrated safe system of work',
|
||||||
|
'dcl': 'data control language',
|
||||||
|
'jdom': 'java document object model',
|
||||||
|
'fim': 'microsoft forefront identity manager',
|
||||||
|
'npl': 'niakwa programming language',
|
||||||
|
'wf': 'windows workflow foundation',
|
||||||
|
'lm': 'etap license manager',
|
||||||
|
'wts': 'windows terminal server',
|
||||||
|
'asp': 'active server pages',
|
||||||
|
'jil': 'job information language',
|
||||||
|
'mvc': 'model view controller',
|
||||||
|
'rmi': 'remote method invocation',
|
||||||
|
'ad': 'active directory',
|
||||||
|
'owb': 'oracle warehouse builder',
|
||||||
|
'rest': 'representational state transfer',
|
||||||
|
'jdk': 'java development kit',
|
||||||
|
'ids': 'integrated data store',
|
||||||
|
'bms': 'batch management software',
|
||||||
|
'vsx': 'vmware solution exchange',
|
||||||
|
'ssas': 'sql server analysis services',
|
||||||
|
'atl': 'atlas transformation language',
|
||||||
|
'ice': 'infobright community edition',
|
||||||
|
'esql': 'extended structured query language',
|
||||||
|
'corba': 'common object request broker architecture',
|
||||||
|
'dpe': 'device provisioning engines',
|
||||||
|
'rac': 'oracle real application clusters',
|
||||||
|
'iemt': 'iis easy migration tool',
|
||||||
|
'mes': 'manufacturing execution system',
|
||||||
|
'odbc': 'open database connectivity',
|
||||||
|
'lms': 'lan management solution',
|
||||||
|
'wcf': 'windows communication foundation',
|
||||||
|
'nes': 'netscape enterprise server',
|
||||||
|
'jsf': 'javaserver faces',
|
||||||
|
'alm': 'application lifecycle management',
|
||||||
|
'hlasm': 'high level assembler',
|
||||||
|
'cmod': 'content manager ondemand'}
|
||||||
|
|
||||||
|
external_source = {
|
||||||
|
'vb.net': 'visual basic dot net',
|
||||||
|
'jes': 'job entry subsystem',
|
||||||
|
'svn': 'subversion',
|
||||||
|
'vcs': 'version control system',
|
||||||
|
'lims': 'laboratory information management system',
|
||||||
|
'ide': 'integrated development environment',
|
||||||
|
'sdk': 'software development kit',
|
||||||
|
'mq': 'message queue',
|
||||||
|
'ims': 'information management system',
|
||||||
|
'isa': 'internet security and acceleration',
|
||||||
|
'vs': 'visual studio',
|
||||||
|
'esr': 'extended support release',
|
||||||
|
'ff': 'firefox',
|
||||||
|
'vb': 'visual basic',
|
||||||
|
'rhel': 'red hat enterprise linux',
|
||||||
|
'iis': 'internet information server',
|
||||||
|
'api': 'application programming interface',
|
||||||
|
'se': 'standard edition',
|
||||||
|
'\.net': 'dot net',
|
||||||
|
'c#': 'c sharp'
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# synonyms = {
|
||||||
|
# 'windows server': 'windows nt',
|
||||||
|
# 'windows 7': 'windows desktop',
|
||||||
|
# 'windows 8': 'windows desktop',
|
||||||
|
# 'windows 10': 'windows desktop'
|
||||||
|
# }
|
||||||
|
|
||||||
|
|
||||||
|
# add more information
|
||||||
|
acronym_mapping.update(external_source)
|
||||||
|
|
||||||
|
|
||||||
|
# Regex pattern -> replacement tables used by the two replace_* helpers.
# The patterns must be raw f-strings: in a plain f-string '\b' is the
# backspace character (0x08), not the regex word-boundary assertion, so the
# patterns could never match ordinary text.
# NOTE(review): keys and values are inserted into the pattern as regex text
# without escaping (e.g. 'vb.net', 'c#'); confirm that is intended.
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||||
|
|
||||||
|
def replace_terms_with_abbreviations(text):
    """Apply every ``term_to_abbrev`` substitution to ``text`` and return it.

    Each key of the module-level ``term_to_abbrev`` table is used as a regex
    pattern and its value as the replacement. Rules run one after another in
    the table's iteration order, so later rules see earlier rules' output.
    """
    rewritten = text
    for pattern, abbreviation in term_to_abbrev.items():
        rewritten = re.sub(pattern, abbreviation, rewritten)
    return rewritten
|
||||||
|
|
||||||
|
def replace_abbreviations_with_terms(text):
    """Apply every ``abbrev_to_term`` substitution to ``text`` and return it.

    Each key of the module-level ``abbrev_to_term`` table is used as a regex
    pattern and its value as the replacement. Rules run one after another in
    the table's iteration order, so later rules see earlier rules' output.
    """
    rewritten = text
    for pattern, expansion in abbrev_to_term.items():
        rewritten = re.sub(pattern, expansion, rewritten)
    return rewritten
|
||||||
|
|
||||||
|
######################################
|
||||||
|
|
||||||
|
# augmentation by text corruption
|
||||||
|
|
||||||
|
def corrupt_word(word):
    """Return ``word`` with one random character-level typo applied.

    Words of length 0 or 1 are returned unchanged. Otherwise one of two
    corruptions is chosen at random:

    * ``"delete"`` - drop the character at a random position;
    * ``"swap"``   - exchange a random pair of adjacent characters.
    """
    size = len(word)
    if size <= 1:
        # nothing meaningful to corrupt
        return word

    mode = random.choice(["delete", "swap"])

    if mode == "delete":
        pos = random.randint(0, size - 1)
        return word[:pos] + word[pos + 1:]

    if mode == "swap":
        # size >= 2 here, so pos + 1 is always a valid index
        pos = random.randint(0, size - 2)
        return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]

    return word
|
||||||
|
|
||||||
|
def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each whitespace-separated word with the given probability.

    The sentence is split on whitespace, each word is independently passed
    through ``corrupt_word`` with probability ``corruption_probability``, and
    the words are re-joined with single spaces (so runs of whitespace in the
    input are collapsed).
    """
    pieces = []
    for token in sentence.split():
        # one random draw per word, taken before any corruption happens
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        pieces.append(token)
    return " ".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# outputs a list of dictionaries
|
||||||
|
# processes dataframe into lists of dictionaries
|
||||||
|
# each element maps input to output
|
||||||
|
# input: mention text
|
||||||
|
# output: class label
|
||||||
|
label_flag_list = []
|
||||||
|
|
||||||
|
def process_df_to_dict(df):
    """Expand every row of ``df`` into augmented ``{'text', 'label'}`` samples.

    For each row the ``mention`` is normalised with ``preprocess_text`` and the
    following samples are emitted, in this order, all labelled with
    ``label2id[row['entity_id']]``:

    1. ten copies of the mention when it has fewer than three words;
    2. only the first time an entity id is encountered (tracked in the
       module-level ``label_flag_list``, which persists across calls): the raw
       ``entity_name``, the punctuation-stripped mention, and shuffles of the
       entity name;
    3. shuffles of the mention;
    4. a randomly corrupted copy of the mention;
    5. the punctuation-stripped mention;
    6. the mention with abbreviations expanded.

    Variants produced in steps 2-6 (except the raw entity name) are dropped
    when they are identical to the normalised mention.

    Returns:
        list[dict]: the generated samples.
    """
    samples = []

    def emit(text, entity_id):
        # label lookup stays lazy: it only happens when a sample is produced
        samples.append({
            'text': text,
            'label': label2id[entity_id],  # ensure labels starts from 0
        })

    for _, row in df.iterrows():
        entity_id = row['entity_id']
        mention = preprocess_text(row['mention'])

        # short sequences are rare, and we must compensate by including more
        # examples; mutations of longer sequences might otherwise drown them out
        if len(mention.split()) < 3:
            for _ in range(10):
                emit(mention, entity_id)

        # first occurrence of this entity id: also learn from its canonical name
        if entity_id not in label_flag_list:
            canonical_name = row['entity_name']
            emit(canonical_name, entity_id)

            # punctuation replaced by spaces (keeps word characters and whitespace)
            stripped = re.sub(r'[^\w\s]', ' ', mention)
            if stripped != mention:
                emit(stripped, entity_id)

            # shuffles of the original entity name
            for variant in shuffle_text(canonical_name, n_shuffles=SHUFFLES):
                if variant != mention:
                    emit(variant, entity_id)

            label_flag_list.append(entity_id)

        # shuffled versions of the mention itself
        for variant in shuffle_text(mention, n_shuffles=SHUFFLES):
            if variant != mention:
                emit(variant, entity_id)

        # character-level corruption
        corrupted = corrupt_string(mention, corruption_probability=0.1)
        if corrupted != mention:
            emit(corrupted, entity_id)

        # punctuation replaced by spaces
        stripped = re.sub(r'[^\w\s]', ' ', mention)
        if stripped != mention:
            emit(stripped, entity_id)

        # NOTE: the term -> abbreviation augmentation
        # (replace_terms_with_abbreviations) was commented out in the original
        # and remains disabled here.

        # abbreviation -> full term expansion
        expanded = replace_abbreviations_with_terms(mention)
        if expanded != mention:
            emit(expanded, entity_id)

    return samples
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset():
    """Load the training CSV and wrap the augmented samples in a ``DatasetDict``.

    Returns:
        DatasetDict: a single ``'train'`` split built by running
        ``process_df_to_dict`` over ``../../esAppMod_data_import/train.csv``.
    """
    train_csv = '../../esAppMod_data_import/train.csv'
    frame = pd.read_csv(train_csv, skipinitialspace=True)
    train_split = Dataset.from_list(process_df_to_dict(frame))
    return DatasetDict({'train': train_split})
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
def train():
    """Fine-tune a sequence classifier on the augmented training set.

    Builds the dataset with ``create_dataset``, tokenizes it, and runs a
    Hugging Face ``Trainer`` with evaluation disabled. Checkpoints are written
    under ``checkpoint`` and logs under ``tensorboard-log``.
    """
    save_path = 'checkpoint'
    split_datasets = create_dataset()

    # prepare tokenizer
    # (other checkpoints tried: 'google-bert/bert-base-cased', 'prajjwal1/bert-small')
    model_checkpoint = "distilbert/distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )

    max_length = 120

    def preprocess_function(example):
        """Run a batch of dataset rows through the tokenizer."""
        return tokenizer(
            example['text'],
            max_length=max_length,
            truncation=True,
            padding=True,
        )

    # map applies the function to each batch of rows in every split
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # compute metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        """Accuracy of the arg-max class over the evaluation predictions."""
        logits, references = eval_preds
        predicted = np.argmax(logits, axis=1)
        return metric.compute(predictions=predicted, references=references)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    # Trainer
    training_args = TrainingArguments(
        output_dir=save_path,
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # to resume instead, pass resume_from_checkpoint=<checkpoint path> to train()
    trainer.train()
|
||||||
|
|
||||||
|
# execute training
# Guarded so that worker processes which re-import this module (e.g. the
# ``num_proc=8`` dataset map on spawn-based platforms) do not start a second
# training run of their own.
if __name__ == "__main__":
    train()
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
|
@ -0,0 +1,2 @@
|
||||||
|
checkpoint*
|
||||||
|
tensorboard-log
|
|
@ -0,0 +1 @@
|
||||||
|
exports
|
|
@ -0,0 +1,11 @@
|
||||||
|
|
||||||
|
*******************************************************************************
|
||||||
|
Accuracy: 0.71956
|
||||||
|
F1 Score: 0.74142
|
||||||
|
Precision: 0.81529
|
||||||
|
Recall: 0.71956
|
||||||
|
********************************************************************************
|
||||||
|
Accuracy: 0.71710
|
||||||
|
F1 Score: 0.74095
|
||||||
|
Precision: 0.82181
|
||||||
|
Recall: 0.71710
|
|
@ -0,0 +1,6 @@
|
||||||
|
|
||||||
|
*******************************************************************************
|
||||||
|
Accuracy: 0.81591
|
||||||
|
F1 Score: 0.82162
|
||||||
|
Precision: 0.85519
|
||||||
|
Recall: 0.81591
|
|
@ -0,0 +1,6 @@
|
||||||
|
|
||||||
|
*******************************************************************************
|
||||||
|
Accuracy: 0.59943
|
||||||
|
F1 Score: 0.60266
|
||||||
|
Precision: 0.66956
|
||||||
|
Recall: 0.59943
|
|
@ -0,0 +1,265 @@
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# from datasets import load_from_disk
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
|
|
||||||
|
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||||
|
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||||
|
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||||
|
|
||||||
|
import re
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
DataCollatorWithPadding,
|
||||||
|
)
|
||||||
|
import evaluate
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
# import matplotlib.pyplot as plt
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
|
|
||||||
|
BATCH_SIZE = 256
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# construct the target id list from the training split
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# every distinct entity id, in ascending order, defines one class
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(set(entity_ids))


# %%
# class index <-> entity id lookup tables
id2label = dict(enumerate(target_id_list))
label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
|
||||||
|
|
||||||
|
|
||||||
|
# introduce pre-processing functions
|
||||||
|
def preprocess_text(text):
    """Normalise a mention string before tokenization.

    The text is lower-cased, every run of digits is replaced by a single
    ``#``, and every run of whitespace is squeezed to one space with leading
    and trailing whitespace removed.
    """
    lowered = text.lower()
    # digits carry little signal for the class, so mask them
    masked = re.sub(r'\d+', '#', lowered)
    # standardize spacing
    return re.sub(r'\s+', ' ', masked).strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# outputs a list of dictionaries
|
||||||
|
# processes dataframe into lists of dictionaries
|
||||||
|
# each element maps input to output
|
||||||
|
# input: mention text
|
||||||
|
# output: class label
|
||||||
|
def process_df_to_dict(df):
    """Convert each dataframe row into a ``{'text', 'label'}`` record.

    ``text`` is the row's ``mention`` after ``preprocess_text``; ``label`` is
    the class index that ``label2id`` assigns to the row's ``entity_id``.

    Returns:
        list[dict]: one record per row, in row order.
    """
    return [
        {
            'text': preprocess_text(row['mention']),
            'label': label2id[row['entity_id']],  # ensure labels starts from 0
        }
        for _, row in df.iterrows()
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset():
    """Load the evaluation CSV and return it as a ``Dataset`` of text/label rows."""
    # alternative split: '../../../esAppMod_data_import/test.csv'
    eval_csv = '../../../esAppMod_data_import/parent_test.csv'
    frame = pd.read_csv(eval_csv, skipinitialspace=True)
    return Dataset.from_list(process_df_to_dict(frame))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
def test():
    """Evaluate the fine-tuned classifier on the test CSV and export the results.

    Builds the test dataset, loads tokenizer and model from the first directory
    matching ``../checkpoint/checkpoint_part1-*``, runs batched inference,
    appends accuracy and weighted F1 / precision / recall to ``output_1.txt``
    and writes the predicted entity ids to ``exports/result_1.csv``.

    Raises:
        FileNotFoundError: if no checkpoint directory matches the pattern.
    """
    test_dataset = create_dataset()

    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint_part1-*'
    candidates = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not candidates:
        # fail with an actionable message instead of a bare IndexError
        raise FileNotFoundError(
            f"no checkpoint matching {pattern!r} found in {checkpoint_directory!r}"
        )
    model_checkpoint = candidates[0]

    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )

    # compute max token length (diagnostic only; printed for inspection)
    longest = max(
        (
            len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
            for sample in test_dataset['text']
        ),
        default=0,
    )
    print(longest)

    max_length = 128

    def preprocess_function(example):
        """Tokenize a batch of rows, padding every sequence to ``max_length``."""
        # NOTE(review): truncation is not enabled (it was commented out in the
        # original), so samples longer than max_length are not shortened.
        return tokenizer(
            example['text'],
            max_length=max_length,
            padding='max_length',
        )

    # map applies the function to each batch of rows
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []

    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # keep both label lists as plain Python ints
            actual_labels.extend(batch['label'].tolist())

            # Move to GPU if available and run inference on the batch
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask).logits
            pred_labels.extend(logits.argmax(dim=1).cpu().tolist())

    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_1.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result: map class indices back to entity ids
    label_list = [id2label[class_idx] for class_idx in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # the exports directory is git-ignored, so it may not exist yet
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result_1.csv", index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Guarded so that worker processes which re-import this module (e.g. the
# ``num_proc=8`` dataset map on spawn-based platforms) do not wipe the output
# file or start a second evaluation run.
if __name__ == "__main__":
    # reset file before writing to it
    with open("output_1.txt", "w") as f:
        print('', file=f)
    test()
|
|
@ -0,0 +1,265 @@
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# from datasets import load_from_disk
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
|
|
||||||
|
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||||
|
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||||
|
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||||
|
|
||||||
|
import re
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
DataCollatorWithPadding,
|
||||||
|
)
|
||||||
|
import evaluate
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
# import matplotlib.pyplot as plt
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
|
|
||||||
|
BATCH_SIZE = 256
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# construct the target id list from the training split
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# every distinct entity id, in ascending order, defines one class
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(set(entity_ids))


# %%
# class index <-> entity id lookup tables
id2label = dict(enumerate(target_id_list))
label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
|
||||||
|
|
||||||
|
|
||||||
|
# introduce pre-processing functions
|
||||||
|
def preprocess_text(text):
    """Normalise a mention string before tokenization.

    The text is lower-cased, every run of digits is replaced by a single
    ``#``, and every run of whitespace is squeezed to one space with leading
    and trailing whitespace removed.
    """
    lowered = text.lower()
    # digits carry little signal for the class, so mask them
    masked = re.sub(r'\d+', '#', lowered)
    # standardize spacing
    return re.sub(r'\s+', ' ', masked).strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# outputs a list of dictionaries
|
||||||
|
# processes dataframe into lists of dictionaries
|
||||||
|
# each element maps input to output
|
||||||
|
# input: mention text
|
||||||
|
# output: class label
|
||||||
|
def process_df_to_dict(df):
    """Convert each dataframe row into a ``{'text', 'label'}`` record.

    ``text`` is the row's ``mention`` after ``preprocess_text``; ``label`` is
    the class index that ``label2id`` assigns to the row's ``entity_id``.

    Returns:
        list[dict]: one record per row, in row order.
    """
    return [
        {
            'text': preprocess_text(row['mention']),
            'label': label2id[row['entity_id']],  # ensure labels starts from 0
        }
        for _, row in df.iterrows()
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset():
    """Load the evaluation CSV and return it as a ``Dataset`` of text/label rows."""
    eval_csv = '../../../esAppMod_data_import/test.csv'
    frame = pd.read_csv(eval_csv, skipinitialspace=True)
    return Dataset.from_list(process_df_to_dict(frame))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
def test():
    """Evaluate the fine-tuned classifier on the test CSV and export the results.

    Builds the test dataset, loads tokenizer and model from the first directory
    matching ``../checkpoint/checkpoint-*``, runs batched inference, appends
    accuracy and weighted F1 / precision / recall to ``output_2.txt`` and
    writes the predicted entity ids to ``exports/result_2.csv``.

    Raises:
        FileNotFoundError: if no checkpoint directory matches the pattern.
    """
    test_dataset = create_dataset()

    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    candidates = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not candidates:
        # fail with an actionable message instead of a bare IndexError
        raise FileNotFoundError(
            f"no checkpoint matching {pattern!r} found in {checkpoint_directory!r}"
        )
    model_checkpoint = candidates[0]

    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )

    # compute max token length (diagnostic only; printed for inspection)
    longest = max(
        (
            len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
            for sample in test_dataset['text']
        ),
        default=0,
    )
    print(longest)

    max_length = 128

    def preprocess_function(example):
        """Tokenize a batch of rows, padding every sequence to ``max_length``."""
        # NOTE(review): truncation is not enabled (it was commented out in the
        # original), so samples longer than max_length are not shortened.
        return tokenizer(
            example['text'],
            max_length=max_length,
            padding='max_length',
        )

    # map applies the function to each batch of rows
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []

    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # keep both label lists as plain Python ints
            actual_labels.extend(batch['label'].tolist())

            # Move to GPU if available and run inference on the batch
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask).logits
            pred_labels.extend(logits.argmax(dim=1).cpu().tolist())

    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_2.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result: map class indices back to entity ids
    label_list = [id2label[class_idx] for class_idx in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })

    # the exports directory is git-ignored, so it may not exist yet
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result_2.csv", index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Guarded so that worker processes which re-import this module (e.g. the
# ``num_proc=8`` dataset map on spawn-based platforms) do not wipe the output
# file or start a second evaluation run.
if __name__ == "__main__":
    # reset file before writing to it
    with open("output_2.txt", "w") as f:
        print('', file=f)
    test()
|
|
@ -45,7 +45,7 @@ def set_seed(seed):
|
||||||
|
|
||||||
set_seed(42)
|
set_seed(42)
|
||||||
|
|
||||||
SHUFFLES=2
|
SHUFFLES=5
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
||||||
|
@ -56,37 +56,6 @@ train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
entity_ids = train_df['entity_id'].to_list()
|
entity_ids = train_df['entity_id'].to_list()
|
||||||
target_id_list = sorted(list(set(entity_ids)))
|
target_id_list = sorted(list(set(entity_ids)))
|
||||||
|
|
||||||
def compute_normalized_class_weights(class_counts, max_resamples=SHUFFLES):
    """
    Turn per-class sample counts into integer resample counts.

    Each class gets a weight inversely proportional to its count. The weights
    are divided by the largest weight (so the rarest class has weight 1),
    multiplied by ``max_resamples`` and rounded to the nearest integer.

    Args:
        class_counts (array-like): number of samples for each class.
        max_resamples (int): resample count assigned to the rarest class.

    Returns:
        numpy.ndarray: integer resample count per class, in input order.
    """
    counts = np.array(class_counts)
    inverse_frequency = np.sum(counts) / counts
    # rescale so the highest weight corresponds to `max_resamples`
    scaled = inverse_frequency / np.max(inverse_frequency) * max_resamples
    return np.round(scaled).astype(int)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
id_counts = train_df['entity_id'].value_counts()
|
|
||||||
id_weights = compute_normalized_class_weights(id_counts, max_resamples=SHUFFLES)
|
|
||||||
id_index = id_counts.index
|
|
||||||
label2weight = {}
|
|
||||||
for idx, label in enumerate(id_index):
|
|
||||||
label2weight[label] = id_weights[idx]
|
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
id2label = {}
|
id2label = {}
|
||||||
label2id = {}
|
label2id = {}
|
||||||
|
@ -100,21 +69,9 @@ def preprocess_text(text):
|
||||||
|
|
||||||
# 1. Make all uppercase
|
# 1. Make all lowercase
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
|
||||||
# Remove any non alphanumeric character
|
|
||||||
# text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces
|
|
||||||
# replace dashes
|
|
||||||
text = re.sub(r"[-;:]", " ", text)
|
|
||||||
|
|
||||||
# Add space between digit followed by a letter
|
|
||||||
text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
|
|
||||||
|
|
||||||
# Add space between letter followed by a digit
|
|
||||||
text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
|
|
||||||
|
|
||||||
|
|
||||||
# Substitute digits with 'x'
|
# Substitute each run of digits with '#'
|
||||||
text = re.sub(r'\d+', 'x', text)
|
text = re.sub(r'\d+', '#', text)
|
||||||
|
|
||||||
# standardize spacing
|
# standardize spacing
|
||||||
text = re.sub(r'\s+', ' ', text).strip()
|
text = re.sub(r'\s+', ' ', text).strip()
|
||||||
|
@ -165,35 +122,143 @@ def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||||
|
|
||||||
return all_processed
|
return all_processed
|
||||||
|
|
||||||
term_to_abbrev = {
|
acronym_mapping = {
|
||||||
r'job entry system': 'jes',
|
'hpsa': 'hp server automation',
|
||||||
r'subversion': 'svn',
|
'tam': 'tivoli access manager',
|
||||||
r'borland database engine': 'bde',
|
'adf': 'application development facility',
|
||||||
r'business intelligence and reporting tools': 'birt',
|
'html': 'hypertext markup language',
|
||||||
r'lan management solution': 'lms',
|
'wff': 'microsoft web farm framework',
|
||||||
r'laboratory information management system': 'lims',
|
'jsp': 'javaserver pages',
|
||||||
r'ibm database 2': 'db/2',
|
'bw': 'business works',
|
||||||
r'integrated development environment': 'ide',
|
'ssrs': 'sql server reporting services',
|
||||||
r'software development kit': 'sdk',
|
'cl': 'control language',
|
||||||
r'hp operations orchestration': 'hpoo',
|
'vba': 'visual basic for applications',
|
||||||
r'hp server automation': 'hpsa',
|
'esapi': 'enterprise security api',
|
||||||
r'internet information server': 'iis',
|
'gwt': 'google web toolkit',
|
||||||
r'release 2': 'r2',
|
'pki': 'perkin elmer informatics',
|
||||||
r'red hat enterprise linux': 'rhel',
|
'rtd': 'oracle realtime decisions',
|
||||||
r'oracle enterprise linux': 'oel',
|
'jms': 'java message service',
|
||||||
r'websphere application server': 'was',
|
'db': 'database',
|
||||||
r'application development facility': 'adf',
|
'soa': 'service oriented architecture',
|
||||||
r'server analysis services': 'ssas'
|
'xsl': 'extensible stylesheet language',
|
||||||
|
'com': 'compopent object model',
|
||||||
|
'ldap': 'lightweight directory access protocol',
|
||||||
|
'odm': 'ibm operational decision manager',
|
||||||
|
'soql': 'salesforce object query language',
|
||||||
|
'oms': 'order management system',
|
||||||
|
'cfml': 'coldfusion markup language',
|
||||||
|
'nas': 'netscape application server',
|
||||||
|
'sql': 'structured query language',
|
||||||
|
'bde': 'borland database engine',
|
||||||
|
'imap': 'internet message access protocol',
|
||||||
|
'uws': 'ultidev web server',
|
||||||
|
'birt': 'business intelligence and reporting tools',
|
||||||
|
'mdw': 'model driven workflow',
|
||||||
|
'tws': 'tivoli workload scheduler',
|
||||||
|
'jre': 'java runtime environment',
|
||||||
|
'wcs': 'websphere commerce suite',
|
||||||
|
'was': 'websphere application server',
|
||||||
|
'ssis': 'sql server integration services',
|
||||||
|
'xhtml': 'extensible hypertext markup language',
|
||||||
|
'soap': 'simple object access protocol',
|
||||||
|
'san': 'storage area network',
|
||||||
|
'elk': 'elastic stack',
|
||||||
|
'arr': 'application request routing',
|
||||||
|
'xlst': 'extensible stylesheet language transformations',
|
||||||
|
'sccm': 'microsoft endpoint configuration manager',
|
||||||
|
'ejb': 'enterprise java beans',
|
||||||
|
'css': 'cascading style sheets',
|
||||||
|
'hpoo': 'hp operations orchestration',
|
||||||
|
'xml': 'extensible markup language',
|
||||||
|
'esb': 'enterprise service bus',
|
||||||
|
'edi': 'electronic data interchange',
|
||||||
|
'imsva': 'interscan messaging security virtual appliance',
|
||||||
|
'wtx': 'ibm websphere transformation extender',
|
||||||
|
'cgi': 'common gateway interface',
|
||||||
|
'bal': 'ibm basic assembly language',
|
||||||
|
'issow': 'integrated safe system of work',
|
||||||
|
'dcl': 'data control language',
|
||||||
|
'jdom': 'java document object model',
|
||||||
|
'fim': 'microsoft forefront identity manager',
|
||||||
|
'npl': 'niakwa programming language',
|
||||||
|
'wf': 'windows workflow foundation',
|
||||||
|
'lm': 'etap license manager',
|
||||||
|
'wts': 'windows terminal server',
|
||||||
|
'asp': 'active server pages',
|
||||||
|
'jil': 'job information language',
|
||||||
|
'mvc': 'model view controller',
|
||||||
|
'rmi': 'remote method invocation',
|
||||||
|
'ad': 'active directory',
|
||||||
|
'owb': 'oracle warehouse builder',
|
||||||
|
'rest': 'representational state transfer',
|
||||||
|
'jdk': 'java development kit',
|
||||||
|
'ids': 'integrated data store',
|
||||||
|
'bms': 'batch management software',
|
||||||
|
'vsx': 'vmware solution exchange',
|
||||||
|
'ssas': 'sql server analysis services',
|
||||||
|
'atl': 'atlas transformation language',
|
||||||
|
'ice': 'infobright community edition',
|
||||||
|
'esql': 'extended structured query language',
|
||||||
|
'corba': 'common object request broker architecture',
|
||||||
|
'dpe': 'device provisioning engines',
|
||||||
|
'rac': 'oracle real application clusters',
|
||||||
|
'iemt': 'iis easy migration tool',
|
||||||
|
'mes': 'manufacturing execution system',
|
||||||
|
'odbc': 'open database connectivity',
|
||||||
|
'lms': 'lan management solution',
|
||||||
|
'wcf': 'windows communication foundation',
|
||||||
|
'nes': 'netscape enterprise server',
|
||||||
|
'jsf': 'javaserver faces',
|
||||||
|
'alm': 'application lifecycle management',
|
||||||
|
'hlasm': 'high level assembler',
|
||||||
|
'cmod': 'content manager ondemand'}
|
||||||
|
|
||||||
|
external_source = {
|
||||||
|
'vb.net': 'visual basic dot net',
|
||||||
|
'jes': 'job entry subsystem',
|
||||||
|
'svn': 'subversion',
|
||||||
|
'vcs': 'version control system',
|
||||||
|
'lims': 'laboratory information management system',
|
||||||
|
'ide': 'integrated development environment',
|
||||||
|
'sdk': 'software development kit',
|
||||||
|
'mq': 'message queue',
|
||||||
|
'ims': 'information management system',
|
||||||
|
'isa': 'internet security and acceleration',
|
||||||
|
'vs': 'visual studio',
|
||||||
|
'esr': 'extended support release',
|
||||||
|
'ff': 'firefox',
|
||||||
|
'vb': 'visual basic',
|
||||||
|
'rhel': 'red hat enterprise linux',
|
||||||
|
'iis': 'internet information server',
|
||||||
|
'api': 'application programming interface',
|
||||||
|
'se': 'standard edition',
|
||||||
|
'\.net': 'dot net',
|
||||||
|
'c#': 'c sharp',
|
||||||
|
'ms': 'microsoft'
|
||||||
}
|
}
|
||||||
|
|
||||||
abbrev_to_term = {rf'\b{value}\b': key for key, value in term_to_abbrev.items()}
|
|
||||||
|
# synonyms = {
|
||||||
|
# 'windows server': 'windows nt',
|
||||||
|
# 'windows 7': 'windows desktop',
|
||||||
|
# 'windows 8': 'windows desktop',
|
||||||
|
# 'windows 10': 'windows desktop'
|
||||||
|
# }
|
||||||
|
|
||||||
|
|
||||||
|
# add more information
|
||||||
|
acronym_mapping.update(external_source)
|
||||||
|
|
||||||
|
|
||||||
|
# Regex pattern (acronym as a whole word) -> long-form term.
# The raw prefix is required: in a non-raw f-string '\b' is the backspace
# character, not the regex word-boundary assertion, so nothing would match.
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
|
||||||
|
# Regex pattern (long-form term as whole words) -> acronym.
# The raw prefix is required: in a non-raw f-string '\b' is the backspace
# character, not the regex word-boundary assertion, so nothing would match.
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||||
|
|
||||||
def replace_terms_with_abbreviations(text):
|
def replace_terms_with_abbreviations(text):
|
||||||
for input, replacement in term_to_abbrev.items():
|
for input, replacement in term_to_abbrev.items():
|
||||||
text = re.sub(input, replacement, text)
|
text = re.sub(input, replacement, text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def replace_abbreivations_with_terms(text):
|
def replace_abbreviations_with_terms(text):
|
||||||
for input, replacement in abbrev_to_term.items():
|
for input, replacement in abbrev_to_term.items():
|
||||||
text = re.sub(input, replacement, text)
|
text = re.sub(input, replacement, text)
|
||||||
return text
|
return text
|
||||||
|
@ -218,8 +283,19 @@ def process_df_to_dict(df):
|
||||||
# no_of_shuffles = label2weight[index] + 1
|
# no_of_shuffles = label2weight[index] + 1
|
||||||
no_of_shuffles = SHUFFLES
|
no_of_shuffles = SHUFFLES
|
||||||
processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
|
processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
|
||||||
|
|
||||||
for desc in processed_descs:
|
for desc in processed_descs:
|
||||||
|
if (desc != parent_desc):
|
||||||
|
element = {
|
||||||
|
'text' : desc,
|
||||||
|
'label': label2id[index], # ensure labels starts from 0
|
||||||
|
}
|
||||||
|
output_list.append(element)
|
||||||
|
|
||||||
|
|
||||||
|
# augmentation
|
||||||
|
# remove all non-alphanumerics
|
||||||
|
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||||
|
if (desc != parent_desc):
|
||||||
element = {
|
element = {
|
||||||
'text' : desc,
|
'text' : desc,
|
||||||
'label': label2id[index], # ensure labels starts from 0
|
'label': label2id[index], # ensure labels starts from 0
|
||||||
|
@ -227,24 +303,21 @@ def process_df_to_dict(df):
|
||||||
output_list.append(element)
|
output_list.append(element)
|
||||||
|
|
||||||
|
|
||||||
|
# augmentation
|
||||||
# perform abbrev_to_term
|
# perform term_to_abbrev (long-form terms -> acronyms)
|
||||||
desc = replace_terms_with_abbreviations(parent_desc)
|
temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||||
no_of_shuffles = SHUFFLES
|
desc = replace_terms_with_abbreviations(temp_desc)
|
||||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
if (desc != temp_desc):
|
||||||
|
|
||||||
for desc in processed_descs:
|
|
||||||
element = {
|
element = {
|
||||||
'text' : desc,
|
'text' : desc,
|
||||||
'label': label2id[index], # ensure labels starts from 0
|
'label': label2id[index], # ensure labels starts from 0
|
||||||
}
|
}
|
||||||
output_list.append(element)
|
output_list.append(element)
|
||||||
|
|
||||||
|
# augmentation
|
||||||
# perform term to abbrev
|
# perform abbrev to term (acronyms -> long-form terms)
|
||||||
desc = replace_abbreivations_with_terms(parent_desc)
|
desc = replace_abbreviations_with_terms(parent_desc)
|
||||||
no_of_shuffles = SHUFFLES
|
if (desc != parent_desc):
|
||||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
|
||||||
|
|
||||||
for desc in processed_descs:
|
|
||||||
element = {
|
element = {
|
||||||
'text' : desc,
|
'text' : desc,
|
||||||
'label': label2id[index], # ensure labels starts from 0
|
'label': label2id[index], # ensure labels starts from 0
|
||||||
|
@ -257,7 +330,7 @@ def process_df_to_dict(df):
|
||||||
|
|
||||||
def create_dataset():
|
def create_dataset():
|
||||||
# train
|
# train
|
||||||
data_path = '../../esAppMod_data_import/train.csv'
|
data_path = '../../esAppMod_data_import/parent_train.csv'
|
||||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -271,13 +344,13 @@ def create_dataset():
|
||||||
|
|
||||||
def train():
|
def train():
|
||||||
|
|
||||||
save_path = f'checkpoint'
|
save_path = f'checkpoint_part1'
|
||||||
split_datasets = create_dataset()
|
split_datasets = create_dataset()
|
||||||
|
|
||||||
# prepare tokenizer
|
# prepare tokenizer
|
||||||
|
|
||||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
# model_checkpoint = 'google-bert/bert-base-uncased'
|
||||||
# model_checkpoint = 'prajjwal1/bert-small'
|
# model_checkpoint = 'prajjwal1/bert-small'
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||||
# Define additional special tokens
|
# Define additional special tokens
|
||||||
|
@ -348,7 +421,6 @@ def train():
|
||||||
|
|
||||||
training_args = TrainingArguments(
|
training_args = TrainingArguments(
|
||||||
output_dir=f"{save_path}",
|
output_dir=f"{save_path}",
|
||||||
# eval_strategy="epoch",
|
|
||||||
eval_strategy="no",
|
eval_strategy="no",
|
||||||
logging_dir="tensorboard-log",
|
logging_dir="tensorboard-log",
|
||||||
logging_strategy="epoch",
|
logging_strategy="epoch",
|
|
@ -0,0 +1,469 @@
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# from datasets import load_from_disk
|
||||||
|
import os
|
||||||
|
|
||||||
|
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||||
|
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||||
|
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||||
|
|
||||||
|
import re
|
||||||
|
import random
|
||||||
|
import glob
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
DataCollatorWithPadding,
|
||||||
|
Trainer,
|
||||||
|
EarlyStoppingCallback,
|
||||||
|
TrainingArguments
|
||||||
|
)
|
||||||
|
import evaluate
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
# import matplotlib.pyplot as plt
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
|
# %%
|
||||||
|
def set_seed(seed):
    """
    Seed every random number generator used by this script.

    Args:
        seed (int): Value handed to Python's ``random``, NumPy and PyTorch
            (CPU, current CUDA device, all CUDA devices).

    cuDNN is also switched to deterministic mode with benchmarking disabled.
    """
    seeders = (
        random.seed,                  # Python random module
        np.random.seed,               # NumPy random
        torch.manual_seed,            # PyTorch CPU
        torch.cuda.manual_seed,       # PyTorch GPU
        torch.cuda.manual_seed_all,   # every GPU when several are visible
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True   # ensure deterministic behavior
    cudnn.benchmark = False      # disable autotuning for reproducibility


set_seed(42)

# Number of word-shuffled copies generated per description.
SHUFFLES=0
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# Load the training file; the entity ids found in it define the label space.
data_path = '../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
entity_ids = train_df['entity_id'].to_list()
# De-duplicate and sort so that each entity id gets a stable class index.
target_id_list = sorted(set(entity_ids))

# %%
# Two-way lookup between contiguous class indices (starting at 0) and entity ids.
id2label = dict(enumerate(target_id_list))
label2id = {entity: position for position, entity in id2label.items()}
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# introduce pre-processing functions
|
||||||
|
def preprocess_text(text):
    """
    Normalize a raw description string.

    Args:
        text (str): The input text.

    Returns:
        str: The text lowercased, with every run of digits replaced by a
        single '#', whitespace runs collapsed to one space, and leading and
        trailing whitespace removed.
    """
    lowered = text.lower()

    # each run of digits becomes one '#' placeholder
    masked = re.sub(r'\d+', '#', lowered)

    # standardize spacing
    return re.sub(r'\s+', ' ', masked).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_random_shuffles(text, n):
    """
    Build n word-order permutations of the input text.

    Args:
        text (str): The input text; it is split on whitespace into words.
        n (int): How many shuffled variants to produce.

    Returns:
        list: n strings, each the words of ``text`` in a random order joined
        by single spaces. Variants may repeat and may equal the input order.
    """
    tokens = text.split()

    def _one_variant():
        # shuffle a fresh copy so the word list itself is never reordered
        order = list(tokens)
        random.shuffle(order)
        return " ".join(order)

    return [_one_variant() for _ in range(n)]
|
||||||
|
|
||||||
|
|
||||||
|
# generate n more shuffled examples
|
||||||
|
def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return the input string together with random word-shuffled copies of it.

    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate.

    Returns:
        list: The original string first, followed by ``n_shuffles`` variants
        produced by ``generate_random_shuffles``.
    """
    variants = [text]
    variants += generate_random_shuffles(text, n_shuffles)
    return variants
|
||||||
|
|
||||||
|
# Acronym -> long-form term. Entry order is significant: the substitution
# helpers apply the derived patterns one after another in this order.
acronym_mapping = {
    'hpsa': 'hp server automation',
    'tam': 'tivoli access manager',
    'adf': 'application development facility',
    'html': 'hypertext markup language',
    'wff': 'microsoft web farm framework',
    'jsp': 'javaserver pages',
    'bw': 'business works',
    'ssrs': 'sql server reporting services',
    'cl': 'control language',
    'vba': 'visual basic for applications',
    'esapi': 'enterprise security api',
    'gwt': 'google web toolkit',
    'pki': 'perkin elmer informatics',
    'rtd': 'oracle realtime decisions',
    'jms': 'java message service',
    'db': 'database',
    'soa': 'service oriented architecture',
    'xsl': 'extensible stylesheet language',
    # NOTE(review): 'compopent' looks like a typo for 'component' - confirm
    # against the data before changing, since this string is used for matching.
    'com': 'compopent object model',
    'ldap': 'lightweight directory access protocol',
    'odm': 'ibm operational decision manager',
    'soql': 'salesforce object query language',
    'oms': 'order management system',
    'cfml': 'coldfusion markup language',
    'nas': 'netscape application server',
    'sql': 'structured query language',
    'bde': 'borland database engine',
    'imap': 'internet message access protocol',
    'uws': 'ultidev web server',
    'birt': 'business intelligence and reporting tools',
    'mdw': 'model driven workflow',
    'tws': 'tivoli workload scheduler',
    'jre': 'java runtime environment',
    'wcs': 'websphere commerce suite',
    'was': 'websphere application server',
    'ssis': 'sql server integration services',
    'xhtml': 'extensible hypertext markup language',
    'soap': 'simple object access protocol',
    'san': 'storage area network',
    'elk': 'elastic stack',
    'arr': 'application request routing',
    # NOTE(review): key is spelled 'xlst' (not 'xslt') - confirm it is intended.
    'xlst': 'extensible stylesheet language transformations',
    'sccm': 'microsoft endpoint configuration manager',
    'ejb': 'enterprise java beans',
    'css': 'cascading style sheets',
    'hpoo': 'hp operations orchestration',
    'xml': 'extensible markup language',
    'esb': 'enterprise service bus',
    'edi': 'electronic data interchange',
    'imsva': 'interscan messaging security virtual appliance',
    'wtx': 'ibm websphere transformation extender',
    'cgi': 'common gateway interface',
    'bal': 'ibm basic assembly language',
    'issow': 'integrated safe system of work',
    'dcl': 'data control language',
    'jdom': 'java document object model',
    'fim': 'microsoft forefront identity manager',
    'npl': 'niakwa programming language',
    'wf': 'windows workflow foundation',
    'lm': 'etap license manager',
    'wts': 'windows terminal server',
    'asp': 'active server pages',
    'jil': 'job information language',
    'mvc': 'model view controller',
    'rmi': 'remote method invocation',
    'ad': 'active directory',
    'owb': 'oracle warehouse builder',
    'rest': 'representational state transfer',
    'jdk': 'java development kit',
    'ids': 'integrated data store',
    'bms': 'batch management software',
    'vsx': 'vmware solution exchange',
    'ssas': 'sql server analysis services',
    'atl': 'atlas transformation language',
    'ice': 'infobright community edition',
    'esql': 'extended structured query language',
    'corba': 'common object request broker architecture',
    'dpe': 'device provisioning engines',
    'rac': 'oracle real application clusters',
    'iemt': 'iis easy migration tool',
    'mes': 'manufacturing execution system',
    'odbc': 'open database connectivity',
    'lms': 'lan management solution',
    'wcf': 'windows communication foundation',
    'nes': 'netscape enterprise server',
    'jsf': 'javaserver faces',
    'alm': 'application lifecycle management',
    'hlasm': 'high level assembler',
    'cmod': 'content manager ondemand',
}

# Additional acronym -> term pairs kept in a separate table and merged below.
external_source = {
    'vb.net': 'visual basic dot net',
    'jes': 'job entry subsystem',
    'svn': 'subversion',
    'vcs': 'version control system',
    'lims': 'laboratory information management system',
    'ide': 'integrated development environment',
    'sdk': 'software development kit',
    'mq': 'message queue',
    'ims': 'information management system',
    'isa': 'internet security and acceleration',
    'vs': 'visual studio',
    'esr': 'extended support release',
    'ff': 'firefox',
    'vb': 'visual basic',
    'rhel': 'red hat enterprise linux',
    'iis': 'internet information server',
    'api': 'application programming interface',
    'se': 'standard edition',
    # raw literal: same runtime value as '\.net', without the invalid-escape warning
    r'\.net': 'dot net',
    'c#': 'c sharp',
    'ms': 'microsoft',
}

# synonyms = {
#     'windows server': 'windows nt',
#     'windows 7': 'windows desktop',
#     'windows 8': 'windows desktop',
#     'windows 10': 'windows desktop'
# }

# add more information
acronym_mapping.update(external_source)

# Regex lookup tables derived from the merged mapping. The keys are inserted
# into the pattern verbatim (not escaped). The raw prefix is essential: in a
# non-raw f-string '\b' is the backspace character rather than the regex
# word-boundary assertion, which would stop every pattern from matching.
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||||
|
|
||||||
|
def replace_terms_with_abbreviations(text):
    """
    Rewrite long-form terms found in ``text`` as their acronyms.

    Each regex pattern of the module-level ``term_to_abbrev`` table is applied
    with ``re.sub`` in table order; every substitution works on the output of
    the previous one.

    Args:
        text (str): The input text.

    Returns:
        str: The text after all substitutions.
    """
    result = text
    for pattern in term_to_abbrev:
        result = re.sub(pattern, term_to_abbrev[pattern], result)
    return result
|
||||||
|
|
||||||
|
def replace_abbreviations_with_terms(text):
    """
    Expand acronyms found in ``text`` into their long-form terms.

    Each regex pattern of the module-level ``abbrev_to_term`` table is applied
    with ``re.sub`` in table order; every substitution works on the output of
    the previous one.

    Args:
        text (str): The input text.

    Returns:
        str: The text after all substitutions.
    """
    result = text
    for pattern in abbrev_to_term:
        result = re.sub(pattern, abbrev_to_term[pattern], result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def process_df_to_dict(df):
    """
    Turn a dataframe of labelled mentions into a list of training examples.

    For every row the 'mention' column is preprocessed and these candidate
    texts are derived from it:
      1. word-shuffled copies (``SHUFFLES`` of them),
      2. the description with every non-word, non-space character replaced
         by a space,
      3. the result of (2) with long-form terms replaced by acronyms,
      4. the description with acronyms expanded to long-form terms.
    A candidate is emitted only when it differs from the text it was derived
    from.

    NOTE(review): the preprocessed description itself is never emitted, only
    variants that differ from it - confirm this is intended.

    Args:
        df (pandas.DataFrame): Must provide 'entity_id' and 'mention' columns.

    Returns:
        list: Dictionaries of the form ``{'text': str, 'label': int}`` where
        the label is ``label2id[entity_id]``.
    """
    output_list = []

    def _emit(candidate, entity_id):
        # label2id is consulted only when an example is actually kept
        output_list.append({
            'text' : candidate,
            'label': label2id[entity_id], # ensure labels starts from 0
        })

    for _, row in df.iterrows():
        index = row['entity_id']
        parent_desc = preprocess_text(row['mention'])

        # augmentation: random word-order shuffles
        no_of_shuffles = SHUFFLES
        for variant in shuffle_text(parent_desc, n_shuffles=no_of_shuffles):
            if variant != parent_desc:
                _emit(variant, index)

        # augmentation: replace everything except word characters and whitespace
        stripped = re.sub(r'[^\w\s]', ' ', parent_desc)
        if stripped != parent_desc:
            _emit(stripped, index)

        # augmentation: long-form terms -> acronyms, on the stripped text
        abbreviated = replace_terms_with_abbreviations(stripped)
        if abbreviated != stripped:
            _emit(abbreviated, index)

        # augmentation: acronyms -> long-form terms, on the preprocessed text
        expanded = replace_abbreviations_with_terms(parent_desc)
        if expanded != parent_desc:
            _emit(expanded, index)

    return output_list
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset():
    """
    Read the training CSV and wrap its processed examples in a DatasetDict.

    Returns:
        DatasetDict: A single 'train' split built from the output of
        ``process_df_to_dict``.
    """
    # train
    train_csv = '../../esAppMod_data_import/train.csv'
    frame = pd.read_csv(train_csv, skipinitialspace=True)

    examples = process_df_to_dict(frame)
    return DatasetDict({'train': Dataset.from_list(examples)})
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
def train():
    """
    Continue training a sequence classifier from a previously saved checkpoint.

    Builds the dataset with ``create_dataset``, loads tokenizer and model from
    the first directory matching ``checkpoint/checkpoint_part1-*``, tokenizes
    the 'text' column, and runs a Hugging Face ``Trainer`` with evaluation
    disabled. Trainer output goes to the ``checkpoint`` directory.

    Reads the module-level ``target_id_list``, ``id2label`` and ``label2id``.
    """

    save_path = f'checkpoint'
    split_datasets = create_dataset()

    # prepare tokenizer

    # Start from the first checkpoint directory that matches the pattern.
    # NOTE(review): indexing [0] raises IndexError when nothing matches, and
    # glob does not document any ordering when several directories match.
    pattern = 'checkpoint_part1-*'
    checkpoint_directory = 'checkpoint'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    # model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-uncased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<DESC>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # token limit passed to the tokenizer (used together with truncation=True)
    max_length = 120

    # given a batch of dataset entries, run their text through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # only the text is tokenized here; this function does not touch the
        # 'label' column
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map is called with batched=True, so preprocess_function receives batches
    # of rows; the raw 'text' column is removed from the result
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")


    def compute_metrics(eval_preds):
        # argmax over axis 1 turns the first element of eval_preds into
        # predicted class indices before computing accuracy
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # id2label and label2id are the module-level mappings


    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    # (the vocabulary extension above is currently commented out)
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)


    # %%
    # Trainer

    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=300,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )


    # no eval_dataset is passed and eval_strategy is "no"
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
# NOTE: this runs at module level, so importing the file starts training
train()
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
|
@ -1,2 +0,0 @@
|
||||||
|
|
||||||
Accuracy for fold: 0.5846658466584665
|
|
|
@ -57,10 +57,10 @@ class Inference():
|
||||||
output_list = []
|
output_list = []
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
desc = row['mention']
|
desc = row['mention']
|
||||||
label = row['entity_name']
|
label = row['entity_seq']
|
||||||
element = {
|
element = {
|
||||||
'input' : desc,
|
'input' : desc,
|
||||||
'output': label
|
'output': f'{label}'
|
||||||
}
|
}
|
||||||
|
|
||||||
output_list.append(element)
|
output_list.append(element)
|
||||||
|
@ -101,7 +101,7 @@ class Inference():
|
||||||
|
|
||||||
|
|
||||||
def generate(self):
|
def generate(self):
|
||||||
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
MAX_GENERATE_LENGTH = 128
|
MAX_GENERATE_LENGTH = 128
|
||||||
|
|
||||||
pred_generations = []
|
pred_generations = []
|
|
@ -0,0 +1,2 @@
|
||||||
|
|
||||||
|
Accuracy for fold: 0.5022550225502255
|
|
@ -11,7 +11,7 @@ BATCH_SIZE = 512
|
||||||
def infer():
|
def infer():
|
||||||
print(f"Inference for data")
|
print(f"Inference for data")
|
||||||
# import test data
|
# import test data
|
||||||
data_path = '../../../data_import/test.csv'
|
data_path = '../../../esAppMod_data_import/test_seq.csv'
|
||||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -35,18 +35,19 @@ def infer():
|
||||||
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
|
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
|
||||||
# Convert the list to a Pandas DataFrame
|
# Convert the list to a Pandas DataFrame
|
||||||
df_out = pd.DataFrame({
|
df_out = pd.DataFrame({
|
||||||
'predictions': prediction_list
|
'class_prediction': prediction_list
|
||||||
})
|
})
|
||||||
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
|
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
|
||||||
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
|
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
|
||||||
df = pd.concat([df, df_out], axis=1)
|
# df = pd.concat([df, df_out], axis=1)
|
||||||
|
|
||||||
# we can save the t5 generation output here
|
# we can save the t5 generation output here
|
||||||
df.to_csv(f"exports/result.csv", index=False)
|
df_out.to_csv(f"exports/result.csv", index=False)
|
||||||
|
|
||||||
# here we want to evaluate mapping accuracy within the valid in mdm data only
|
# here we want to evaluate mapping accuracy within the valid in mdm data only
|
||||||
condition_correct = df['predictions'] == df['entity_name']
|
# predictions = pd.to_numeric(df_out['class_prediction'], errors="coerce")
|
||||||
pred_correct_proportion = sum(condition_correct)/len(df)
|
condition_correct = df_out['class_prediction'] == df['entity_seq']
|
||||||
|
pred_correct_proportion = sum(condition_correct)/len(df_out)
|
||||||
|
|
||||||
# write output to file output.txt
|
# write output to file output.txt
|
||||||
with open("output.txt", "a") as f:
|
with open("output.txt", "a") as f:
|
|
@ -33,10 +33,10 @@ def process_df_to_dict(df):
|
||||||
output_list = []
|
output_list = []
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
desc = row['mention']
|
desc = row['mention']
|
||||||
label = row['entity_name']
|
label = row['entity_seq']
|
||||||
element = {
|
element = {
|
||||||
'input' : desc,
|
'input' : desc,
|
||||||
'output': label
|
'output': f'{label}'
|
||||||
}
|
}
|
||||||
output_list.append(element)
|
output_list.append(element)
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ def process_df_to_dict(df):
|
||||||
|
|
||||||
def create_dataset():
|
def create_dataset():
|
||||||
# train
|
# train
|
||||||
data_path = f"../../data_import/train.csv"
|
data_path = f"../../esAppMod_data_import/train_seq.csv"
|
||||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
combined_data = DatasetDict({
|
combined_data = DatasetDict({
|
||||||
|
|
|
@ -3,8 +3,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# Load model and tokenizer
|
# Load model and tokenizer
|
||||||
# model_name = "bigscience/bloom-7b1" # Replace with your model
|
model_name = "bigscience/bloom-7b1" # Replace with your model
|
||||||
model_name = "bigscience/bloomz-1b1"
|
# model_name = "bigscience/bloomz-1b1"
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|
||||||
# Automatically map model layers to available GPUs
|
# Automatically map model layers to available GPUs
|
||||||
|
@ -26,13 +26,12 @@ outputs = model.generate(inputs["input_ids"], max_length=50)
|
||||||
# Decode and print result
|
# Decode and print result
|
||||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||||
# %%
|
# %%
|
||||||
# %%
|
|
||||||
# Prepare input
|
# Prepare input
|
||||||
|
|
||||||
def generate(text):
|
def generate(text):
|
||||||
|
|
||||||
# Define prompt
|
# Define prompt
|
||||||
prompt = f"Answer Concisely: Give me a mapping between the acronym and descriptor in the format '(acronym: description): '{text}'"
|
prompt = f"Give me past product names relating to: '{text}'"
|
||||||
|
|
||||||
# Generate acronym
|
# Generate acronym
|
||||||
inputs = tokenizer(prompt, return_tensors="pt")
|
inputs = tokenizer(prompt, return_tensors="pt")
|
||||||
|
@ -45,7 +44,7 @@ def generate(text):
|
||||||
|
|
||||||
# Example usage
|
# Example usage
|
||||||
# text = "Advanced Data Analytics Platform"
|
# text = "Advanced Data Analytics Platform"
|
||||||
text = 'ColdFusion Markup Language (CFML)'
|
text = 'windows server'
|
||||||
acronym = generate(text)
|
acronym = generate(text)
|
||||||
print(f"Acronym: {acronym}")
|
print(f"Generation: {acronym}")
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
# %%
|
||||||
|
import requests
|
||||||
|
|
||||||
|
def get_related_terms(term, language="en", limit=10, timeout=10):
    """Return labels of ConceptNet concepts linked to ``term``.

    Requests ``http://api.conceptnet.io/c/{language}/{term}`` and collects
    the ``end`` label of each edge in the response, skipping edges whose
    end label is the queried term itself.

    Args:
        term: Concept to look up, e.g. ``"windows_server"``.
        language: Language code placed in the concept URI.
        limit: Maximum number of labels to return; values <= 0 yield ``[]``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of at most ``limit`` label strings, in response order.

    Raises:
        requests.HTTPError: if the API answers with a non-2xx status.
            Other ``requests`` exceptions (connection errors, timeouts)
            propagate unchanged.
    """
    from urllib.parse import quote

    # The original appended before checking the limit, so limit=0 still
    # returned one item; bail out early instead.
    if limit <= 0:
        return []

    # Percent-encode the path components so spaces or other reserved
    # characters in the term cannot produce a malformed URL.
    url = f"http://api.conceptnet.io/c/{quote(language)}/{quote(term)}"

    # Without a timeout requests waits forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    def _normalize(text):
        # NOTE(review): edge labels presumably use spaces where the URI form
        # uses underscores ("windows_server" vs "windows server") - confirm.
        return text.replace("_", " ").strip().lower()

    wanted = _normalize(term)

    # Extract related terms
    related_terms = []
    for edge in payload.get("edges", []):
        related = edge.get("end", {}).get("label", None)
        if not related or _normalize(related) == wanted:
            continue
        related_terms.append(related)
        if len(related_terms) >= limit:
            break
    return related_terms
|
||||||
|
|
||||||
|
# Example
|
||||||
|
related_terms = get_related_terms("windows_server")
|
||||||
|
print("Related Terms:", related_terms)
|
||||||
|
# %%
|
|
@ -0,0 +1,38 @@
|
||||||
|
# %%
|
||||||
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Ask DBpedia for English alternative labels of the resource whose
# rdfs:label is "Windows Server".
sparql = SPARQLWrapper("https://dbpedia.org/sparql")
sparql.setQuery("""
SELECT ?altLabel WHERE {
?item rdfs:label "Windows Server"@en.
?item skos:altLabel ?altLabel.
FILTER (LANG(?altLabel) = "en")
}
LIMIT 10
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Bindings are keyed by the selected variable names. The query selects only
# ?altLabel, so reading "label" (as before) raised KeyError on the first row.
for result in results["results"]["bindings"]:
    print(result["altLabel"]["value"])
|
||||||
|
# %%
|
||||||
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
||||||
|
|
||||||
|
# Look up Wikidata items that carry the English string "Windows Server" on
# any predicate, and print each item's label plus its English alias if the
# optional altLabel binding is present.
WIKIDATA_QUERY = """
SELECT ?itemLabel ?altLabel WHERE {
?item ?label "Windows Server"@en.
OPTIONAL { ?item skos:altLabel ?altLabel. FILTER (LANG(?altLabel) = "en") }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 10
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(WIKIDATA_QUERY)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

bindings = results["results"]["bindings"]
for row in bindings:
    print("Label:", row["itemLabel"]["value"])
    # altLabel comes from an OPTIONAL clause, so it may be absent per row.
    if "altLabel" in row:
        print("Alias:", row["altLabel"]["value"])
|
||||||
|
# %%
|
|
@ -0,0 +1,626 @@
|
||||||
|
,mention,entity_id,entity_name,class_prediction,predicted_name
|
||||||
|
0,DOT NET,497,.NET Framework,579,Unix|BSD|*
|
||||||
|
2,Dot net - FW 4,497,.NET Framework,368,VB.NET
|
||||||
|
3,.Net 4.7.1 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||||
|
11,.NET,497,.NET Framework,579,Unix|BSD|*
|
||||||
|
13,.Net 4.5.2 Enterprise Lib,497,.NET Framework,368,VB.NET
|
||||||
|
40,APACHE LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||||
|
41,LOG4NET,483,.NET Framework|log4net,394,Java|Log4j
|
||||||
|
42,Magik,484,.NET Framework|Magick.NET,533,YAML
|
||||||
|
43,WCF,485,.NET Framework|Windows Communication Foundation (WCF),486,.NET Framework|Windows Workflow Foundation (WF)
|
||||||
|
45,WWF,486,.NET Framework|Windows Workflow Foundation (WF),443,OS/2
|
||||||
|
47,Ejes,1,(E)JES,101,Microsoft Dynamics AX
|
||||||
|
48,(UNIRITA) A-AUTO 7.2.2,2,A-Auto Job Scheduling Software,299,AutoIt
|
||||||
|
50,Active Directoy,498,Active Directory (AD),40,Connect Direct
|
||||||
|
54,APSX,592,Active Server Pages (ASP)|*,609,IIS|*
|
||||||
|
69,Andriod,418,Android,586,PHP|*
|
||||||
|
71,Apache Active Queue,6,Apache ActiveMQ,259,Apache HTTP Server
|
||||||
|
72,MQ Apache Active Queue,6,Apache ActiveMQ,81,IBM Websphere MQ
|
||||||
|
75,cordova-android,501,Apache Cordova,418,Android
|
||||||
|
77,Hive,8,Apache Hive,177,SAP NetWeaver Business Warehouse
|
||||||
|
99,solr,11,Apache Solr,375,Apache Lucene
|
||||||
|
135,ADF,13,Application Development Facility (ADF),130,Oracle ADF
|
||||||
|
144,WLS 10.2,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
149,BEA WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
152,Weblogic 12c,600,Oracle WebLogic Server|*,582,C#|*
|
||||||
|
160,WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||||
|
168,Web Logic,600,Oracle WebLogic Server|*,97,MarkLogic DB
|
||||||
|
174,BEA WLE,600,Oracle WebLogic Server|*,443,OS/2
|
||||||
|
175,WLS 10,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
176,WLS,600,Oracle WebLogic Server|*,442,OpenVMS
|
||||||
|
189,brain script,302,Brainscript,369,VBScript
|
||||||
|
190,BRAINScript,302,Brainscript,367,TypeScript
|
||||||
|
191,Business Intelligence and Reporting Tools,21,Business Intelligence and Reporting Tools (BIRT),133,Oracle Business Intelligence
|
||||||
|
192,Actuate Report Server,21,Business Intelligence and Reporting Tools (BIRT),42,Crystal Reports
|
||||||
|
194,CSHARP,582,C#|*,87,Informatica PowerCenter
|
||||||
|
218,WinFrame,30,Citrix Virtual Apps and Desktops,443,OS/2
|
||||||
|
221,METAFRAME,30,Citrix Virtual Apps and Desktops,406,JavaScript|AngularJS
|
||||||
|
225,Presentation Server,30,Citrix Virtual Apps and Desktops,541,File Server
|
||||||
|
226,NETSCALER-1.5,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||||
|
227,NETSCALER-11.,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||||
|
228,Citrix SD-WAN,563,Citrix ADC,30,Citrix Virtual Apps and Desktops
|
||||||
|
229,NetScaler SD-WAN,563,Citrix ADC,273,Netscape Enterprise Server (NES)
|
||||||
|
231,NetScaler ADC,563,Citrix ADC,272,Netscape Application Server (NAS)
|
||||||
|
236,NetScaler SDX,291,Citrix ADC SDX,273,Netscape Enterprise Server (NES)
|
||||||
|
240,Provisioning Services 7.15.8,32,Citrix Provisioning,538,Device Provisioning Engines (DPE)
|
||||||
|
241,Citrix PVS,32,Citrix Provisioning,30,Citrix Virtual Apps and Desktops
|
||||||
|
243,CLISTS,309,CLIST,329,IBM i Control Language (CL)
|
||||||
|
253,CFML,311,ColdFusion Markup Language (CFML),316,eXtensible HyperText Markup Language (XHTML)
|
||||||
|
254,ColdFusion Markup Language,311,ColdFusion Markup Language (CFML),37,Coldfusion
|
||||||
|
255,Sterling Connect,40,Connect Direct,542,General Ledger
|
||||||
|
264,Cormerstone,41,Cornerstone software,516,Compopent Object Model (COM)
|
||||||
|
265,Cornerstone,41,Cornerstone software,370,Visual Basic
|
||||||
|
279,DB2 UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||||
|
282,DB2-UDB,43,DB2,365,TCL
|
||||||
|
291,DB2/UDB,43,DB2,365,TCL
|
||||||
|
292,IBM DB2 ENTERPRISE SERVER EDITION PVU OPTION 10.5,43,DB2,163,PVCS Version Manager
|
||||||
|
300,IBM - IBM DB2 Advanced Enterprise Server Edition PVU Option 10.5,43,DB2,72,IBM Mobile Foundation
|
||||||
|
301,UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA)
|
||||||
|
302,IBM - IBM DB2 Enterprise Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||||
|
306,IBM - IBM DB2 Workgroup Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|*
|
||||||
|
313,EZTriev,314,Easytrieve,296,Intel Xeon Processor
|
||||||
|
314,Eztrieve,314,Easytrieve,296,Intel Xeon Processor
|
||||||
|
321,PrestoSoft - ExamDiff Application 1.6,49,ExamDiff,346,Pascal|Object Pascal
|
||||||
|
322,PrestoSoft - ExamDiff Application,49,ExamDiff,346,Pascal|Object Pascal
|
||||||
|
323,ExamDiff Application,49,ExamDiff,467,MS SQL Server|Log Reader Agent
|
||||||
|
324,Expect Scripts,315,Expect,109,Microsoft MQ
|
||||||
|
329,Microsoft - MSXML 4.0 SP2 4.2,318,Extensible Markup Language (XML)|MSXML,316,eXtensible HyperText Markup Language (XHTML)
|
||||||
|
331,XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||||
|
332,JAVA-XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST)
|
||||||
|
335,ServerCA Access GatewayF5,50,F5 Secure Web Gateway Services,290,CA API Gateway
|
||||||
|
347,HP C++,58,HP aC++ compiler,59,HP C/ANSI C compiler
|
||||||
|
350,HP C++ 10.20,58,HP aC++ compiler,59,HP C/ANSI C compiler
|
||||||
|
351,HPC 11.11,59,HP C/ANSI C compiler,58,HP aC++ compiler
|
||||||
|
358,HFS,505,HTTP File Server,486,.NET Framework|Windows Workflow Foundation (WF)
|
||||||
|
359,www.rejetto.com - HttpFileServer 2.3,505,HTTP File Server,55,Google Chrome
|
||||||
|
360,HttpFileServer,505,HTTP File Server,522,Application Web Server
|
||||||
|
367,IBM - IBM BigFix Platform Client Deploy Tool 9.5,457,IBM BigFix Platform|Client Deploy Tool,62,IBM BigFix Platform
|
||||||
|
369,IBM BPM,64,IBM Business Process Manager,328,IBM High Level Assembler (HLASM)
|
||||||
|
375,Data Power,294,IBM DataPower Gateway,295,IBM Power Systems
|
||||||
|
376,IDG.7.5.2.19hp,294,IBM DataPower Gateway,449,Unix|HP-UX
|
||||||
|
380,hlasm,328,IBM High Level Assembler (HLASM),438,macOS
|
||||||
|
383,IHS,265,IBM HTTP Server,424,IBM i
|
||||||
|
386,WebSphere and IHS,265,IBM HTTP Server,67,IBM InfoSphere DataStage
|
||||||
|
387,WebSphere http,265,IBM HTTP Server,284,Websphere Application Server (WAS)
|
||||||
|
391,IBM Websphere HTTP Server,265,IBM HTTP Server,285,WebSphere Liberty
|
||||||
|
393,WebSphere IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
|
||||||
|
394,WebSphere -IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS
|
||||||
|
397,OS400 V7R1,424,IBM i,443,OS/2
|
||||||
|
398,OS400,424,IBM i,443,OS/2
|
||||||
|
399,OS/400,424,IBM i,443,OS/2
|
||||||
|
408,IIB,68,IBM Integration Bus,370,Visual Basic
|
||||||
|
411,Extended Structured Query Language,458,IBM Integration Bus|Extended Structured Query Language (ESQL),572,Structured Query Language (SQL)
|
||||||
|
415,IBM WorkLight,72,IBM Mobile Foundation,649,IBM Cloud
|
||||||
|
417,ILOG JRules,73,IBM Operational Decision Manager (ODM),601,Websphere ILOG JRules BRMS
|
||||||
|
420,Decision Center 8.0.1.0,73,IBM Operational Decision Manager (ODM),252,IBM ILOG Views
|
||||||
|
423,AS400,295,IBM Power Systems,443,OS/2
|
||||||
|
424,AS/400,295,IBM Power Systems,443,OS/2
|
||||||
|
426,System i,295,IBM Power Systems,424,IBM i
|
||||||
|
427,P-series,295,IBM Power Systems,81,IBM Websphere MQ
|
||||||
|
428,IBM iSeries/AS400 system Model 520,295,IBM Power Systems,443,OS/2
|
||||||
|
439,Tivoli Asset Discovery for Distributed,459,IBM Tivoli Asset Management|Asset Discovery for Distributed,606,IBM Tivoli Asset Management|*
|
||||||
|
447,Database MS SQL Agent,77,IBM Tivoli Monitoring,469,MS SQL Server|SQL Server Database Engine
|
||||||
|
448,Linux OS Agent,77,IBM Tivoli Monitoring,576,Linux|*
|
||||||
|
449,Database DB2 Agent,77,IBM Tivoli Monitoring,520,Database (DB)
|
||||||
|
452,Windows OS Agent,77,IBM Tivoli Monitoring,580,Windows|*
|
||||||
|
454,IBM - IBM TSM FCM,604,IBM Tivoli Storage Manager|*,460,IBM Tivoli Storage Manager|TSM API
|
||||||
|
459,Databases Data Protection for Microsoft SQL,604,IBM Tivoli Storage Manager|*,572,Structured Query Language (SQL)
|
||||||
|
461,IBM - IBM Spectrum Protect Data Protection for Microsoft SQL Server 8.1,604,IBM Tivoli Storage Manager|*,469,MS SQL Server|SQL Server Database Engine
|
||||||
|
462,IBM Spectrum Protect Data Protection,604,IBM Tivoli Storage Manager|*,312,Data Language Interface (DL/I)
|
||||||
|
463,IBM - IBM Spectrum Protect API 7.1,460,IBM Tivoli Storage Manager|TSM API,294,IBM DataPower Gateway
|
||||||
|
464,IBM - IBM Spectrum Protect Client,461,IBM Tivoli Storage Manager|TSM Client,294,IBM DataPower Gateway
|
||||||
|
465,IBM - IBM Tivoli Storage Manager Client,461,IBM Tivoli Storage Manager|TSM Client,604,IBM Tivoli Storage Manager|*
|
||||||
|
467,VSS Requestor configured 8.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
|
||||||
|
468,VSS Requestor 7.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|*
|
||||||
|
469,TWS-WS,79,IBM Tivoli Workload Scheduler (TWS),239,Windows Terminal Server (WTS)
|
||||||
|
472,wbia 2.6,80,IBM WebSphere Business Integration Adaptor,627,XtraDB
|
||||||
|
473,IBM WBIA 2.6.0.12,80,IBM WebSphere Business Integration Adaptor,424,IBM i
|
||||||
|
475,MQ,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
476,MQ 9.1,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
479,MQ 7,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
480,MQ 6,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
481,MQ 9.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
482,MQ 5.3,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
483,MQ 7.01,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
484,MQ 7.5,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
485,MQSeries 8.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
488,WSMQ 8.0,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
489,MQ 9.0.5,81,IBM Websphere MQ,248,ZeroMQ
|
||||||
|
491,WTX,83,IBM WebSphere Transformation Extender (WTX),274,Nginx
|
||||||
|
505,Microsoft Internet Inf,609,IIS|*,130,Oracle ADF
|
||||||
|
508,Microsoft Internet Informat,609,IIS|*,330,IBM Informix-4GL
|
||||||
|
550,Microsoft - IIS 6.0 Migration Tool 1,489,IIS|Easy Migration Tool (IEMT),609,IIS|*
|
||||||
|
558,Infozip 6,85,Info-ZIP,677,Git
|
||||||
|
559,Infozip,85,Info-ZIP,677,Git
|
||||||
|
578,IMSVA 9.1,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
|
||||||
|
580,IMSVA,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB
|
||||||
|
581,Java 1.8,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
582,Java 7,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
583,Java on Weblogic server,584,Java|*,600,Oracle WebLogic Server|*
|
||||||
|
584,Java5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
585,Java 6,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
586,Java 6.0,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
587,Java 7 Update 25,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
589,Java (open source),584,Java|*,397,Java|Servlet
|
||||||
|
590,Java 5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
591,Java 1.5,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
593,Java 1.8.0_92,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
594,Java 1.6,584,Java|*,334,Java|Java Standard Edition (Java SE)
|
||||||
|
595,J2EE 6,584,Java|*,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
596,Java (J2EE),584,Java|*,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
598,JRE,506,Java Runtime Environment (JRE),84,IMS DB
|
||||||
|
629,JEE,333,Java|Java Enterprise Edition (Java EE),1,(E)JES
|
||||||
|
639,JSF,391,Java|JavaServer Faces (JSF),334,Java|Java Standard Edition (Java SE)
|
||||||
|
643,JSP Scriptlets,336,Java|JavaServer Pages (JSP)|Scriptlets,335,Java|JavaServer Pages (JSP)
|
||||||
|
644,Java Scriplet,336,Java|JavaServer Pages (JSP)|Scriptlets,88,Ingres
|
||||||
|
645,Core 9.2.0.0,393,Java|JRuby Core,583,C++|*
|
||||||
|
647,Java RMI,396,Java|Remote Method Invocation (RMI),584,Java|*
|
||||||
|
650,Java Servlets,397,Java|Servlet,453,Linux|Fedora
|
||||||
|
651,Java 6 Servlets,397,Java|Servlet,453,Linux|Fedora
|
||||||
|
652,J2EE Servlets,397,Java|Servlet,443,OS/2
|
||||||
|
653,Servlets,397,Java|Servlet,420,Cisco IOS
|
||||||
|
654,Servlets v2.3,397,Java|Servlet,370,Visual Basic
|
||||||
|
656,Spring BOOT,399,Java|Spring|Spring Boot,398,Java|Spring
|
||||||
|
657,Springboot,399,Java|Spring|Spring Boot,398,Java|Spring
|
||||||
|
661,javasript,589,JavaScript|*,335,Java|JavaServer Pages (JSP)
|
||||||
|
662,JS,589,JavaScript|*,507,Node.js
|
||||||
|
664,Java Script,589,JavaScript|*,584,Java|*
|
||||||
|
671,Sencha 4.2.0,409,JavaScript|Ext JS,589,JavaScript|*
|
||||||
|
674,jqueryui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
|
||||||
|
675,jquery-ui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery
|
||||||
|
679,Scriptaculous,414,JavaScript|script.aculo.us,582,C#|*
|
||||||
|
684,EAP,268,JBoss|*,174,SAP ERP
|
||||||
|
685,JBOSS-EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
686,JBoss Application Server 4,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
687,JBoss Application Server 7,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
688,JBoss Application Server 5,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
689,JBoss Application Server,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
690,Enterprise Application Platform,268,JBoss|*,670,EAServer
|
||||||
|
692,JBOSS 5.1.2 EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
693,server: Jboss,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
694,JBOSS 6.3.2 EAP,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
695,JBoss EAP 4.3,268,JBoss|*,493,JBoss|Wildfly
|
||||||
|
700,Job Information Language,339,Job Information Language (JIL),338,JCL
|
||||||
|
703,JoinIT by Acayosoft,91,joinIT,4,Adobe Acrobat Reader
|
||||||
|
704,Acayosoft JoinIT,91,joinIT,4,Adobe Acrobat Reader
|
||||||
|
705,JoinIT by Acayosoft v 9.0.8,91,joinIT,4,Adobe Acrobat Reader
|
||||||
|
706,LifeFlow Tool,92,LifeFlow,486,.NET Framework|Windows Workflow Foundation (WF)
|
||||||
|
707,Linux 2.6.32-696.28.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||||
|
709,Linux 2.6.32-696.30.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||||
|
710,Linux 2.6.9,576,Linux|*,437,Linux|zLinux
|
||||||
|
711,Linux 2.6.32-642.3.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||||
|
712,Linux - 2.6.18-371.1.2.el5,576,Linux|*,437,Linux|zLinux
|
||||||
|
713,Linux 2.6.32-696.23.1.el6.x86_64,576,Linux|*,437,Linux|zLinux
|
||||||
|
749,Gaia Kernel version 2.7,428,Linux|Check Point,432,Linux|Oracle Linux
|
||||||
|
752,Gaia Kernel version 2.6,428,Linux|Check Point,432,Linux|Oracle Linux
|
||||||
|
766,OEL,432,Linux|Oracle Linux,449,Unix|HP-UX
|
||||||
|
778,Oracle Enterprise Server 7.5,432,Linux|Oracle Linux,134,Oracle Database
|
||||||
|
780,OEL6.7 - 3.8.13-68.3.4.el6uek.x86_64,432,Linux|Oracle Linux,449,Unix|HP-UX
|
||||||
|
792,VMware Photon,433,Linux|Photon OS,569,VMware Server
|
||||||
|
793,VMware Photon 1,433,Linux|Photon OS,569,VMware Server
|
||||||
|
809,Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
818,Redhat - Redhat Linux 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
819,Linux RH6,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||||
|
865,Redhat - Redhat Linux 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
870,Redhat - RHEL 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
874,Red Hat Entreprise Linux 6.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
882,Redhat 6 64-Bit,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
893,RED HAT ADVANCED SERVER 5,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
910,redhat6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
912,Redhat - Redhat Linux 6.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
913,Linux RH,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||||
|
916,Redhat - Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
920,Linux RH7,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||||
|
926,Red Hat V6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
932,Linux RH5,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux
|
||||||
|
934,rehl5.9,434,Linux|Red Hat Enterprise Linux,43,DB2
|
||||||
|
964,Red Hat 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
979,red hat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
991,Redhat,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
996,RedHat 7.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
998,LINUX RED HAT 5 EL,434,Linux|Red Hat Enterprise Linux,268,JBoss|*
|
||||||
|
1003,SUSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1006,Linux SuSE12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1011,SUSE10,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1012,SUSE Linux 12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1017,SUSELinux Enterprise 11.x,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1023,SUSE Linux 11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1024,SUSE Linux 11 SP3,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1029,Linux SuSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1030,SUSE,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1038,SuseLinux,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE
|
||||||
|
1051,domino8.5,270,Lotus Domino,93,Lotus Notes
|
||||||
|
1052,Domino 8.x,270,Lotus Domino,93,Lotus Notes
|
||||||
|
1054,Lucee 5.2.6.60,271,Lucee,375,Apache Lucene
|
||||||
|
1056,Darwin,438,macOS,117,Mozilla Firefox
|
||||||
|
1061,Memcache,98,Memcached,18,BMC Control-M
|
||||||
|
1062,ACCDB,99,Microsoft Access,525,Open Database Connectivity (ODBC)
|
||||||
|
1070,ConfigMgr,102,Microsoft Endpoint Configuration Manager (SCCM),21,Business Intelligence and Reporting Tools (BIRT)
|
||||||
|
1080,FIM SQL Development Server,105,Microsoft Forefront Identity Manager (FIM),572,Structured Query Language (SQL)
|
||||||
|
1082,Microsoft - Internet Explor,107,Microsoft Internet Explorer,356,Rexx
|
||||||
|
1084,Internet Explor,107,Microsoft Internet Explorer,356,Rexx
|
||||||
|
1090,SCEP for Linux,110,Microsoft System Center Endpoint Protection,437,Linux|zLinux
|
||||||
|
1094,SCEP for Mac,110,Microsoft System Center Endpoint Protection,438,macOS
|
||||||
|
1101,msdeploy,112,Microsoft Web Deploy,56,Greenplum DB
|
||||||
|
1106,WebPI,114,Microsoft Web Platform Installer,522,Application Web Server
|
||||||
|
1109,Web PI,114,Microsoft Web Platform Installer,531,Simple Object Access Protocol (SOAP)
|
||||||
|
1111,MDW Framework,115,Model Driven Workflow (MDW),406,JavaScript|AngularJS
|
||||||
|
1115,Mango DB,116,MongoDB,43,DB2
|
||||||
|
1117,MangoDB,116,MongoDB,43,DB2
|
||||||
|
1125,O365,119,MS Office 365,424,IBM i
|
||||||
|
1141,MICROSOFT SQL SERVER 2012 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1153,MICROSOFT SQL SERVER 2012 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1154,MS SQL Server 2008 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
|
||||||
|
1156,MICROSOFT SQL SERVER 2008 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1167,MSSQL Database Server,581,MS SQL Server|*,122,MySQL
|
||||||
|
1173,MSSQL2008,581,MS SQL Server|*,122,MySQL
|
||||||
|
1192,Microsoft SQL Server Standard Edition,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1201,SQLServer,581,MS SQL Server|*,572,Structured Query Language (SQL)
|
||||||
|
1226,MICROSOFT SQL SERVER 2012 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1233,MICROSOFT SQL SERVER 2005 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1234,SQLSVR2008,581,MS SQL Server|*,352,PL/SQL
|
||||||
|
1235,MICROSOFT SQL SERVER 2008 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1239,MICROSOFT SQL SERVER 2008 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact
|
||||||
|
1244,MS SQL Server 2012 Developer,581,MS SQL Server|*,146,Oracle SQL Developer
|
||||||
|
1253,Microsoft - SQL Server Express LocalDB 2014,581,MS SQL Server|*,469,MS SQL Server|SQL Server Database Engine
|
||||||
|
1256,MSSQL,581,MS SQL Server|*,122,MySQL
|
||||||
|
1280,Microsoft - Microsoft SQL Server Analysis Services 2012 for Microsoft SQL Server 2012 Standard Edition 2012,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
|
||||||
|
1281,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
|
||||||
|
1283,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|*
|
||||||
|
1290,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
|
||||||
|
1293,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|*
|
||||||
|
1295,SQL Server Integration Services,470,MS SQL Server|SQL Server Integration Services (SSIS),473,MS SQL Server|SQL Server Reporting Services (SSRS)
|
||||||
|
1316,ZOS Base 1.12,441,MVS|z/OS,437,Linux|zLinux
|
||||||
|
1335,NAS,272,Netscape Application Server (NAS),443,OS/2
|
||||||
|
1337,NES,273,Netscape Enterprise Server (NES),443,OS/2
|
||||||
|
1349,Node.js 0.10 (Linux),507,Node.js,437,Linux|zLinux
|
||||||
|
1361,Node.js 4 (Linux),507,Node.js,437,Linux|zLinux
|
||||||
|
1371,Symas OpenLDAP,126,OpenLDAP,178,SAP SQL Anywhere
|
||||||
|
1377,OAM 12c,129,Oracle Access Management,303,C
|
||||||
|
1378,ADF 12c,130,Oracle ADF,343,Objective C
|
||||||
|
1381,OHS,610,Oracle Application Server|*,122,MySQL
|
||||||
|
1383,Oracle HTTP Server powered by Apache,610,Oracle Application Server|*,259,Apache HTTP Server
|
||||||
|
1384,j2eeoracleca,610,Oracle Application Server|*,453,Linux|Fedora
|
||||||
|
1385,Oracle HTTP,610,Oracle Application Server|*,134,Oracle Database
|
||||||
|
1389,9i AS server,610,Oracle Application Server|*,227,Virtual I/O Server
|
||||||
|
1391,Oracle Application R12.1.3,610,Oracle Application Server|*,134,Oracle Database
|
||||||
|
1394,Weblogic BI Publisher,132,Oracle BI Publisher,600,Oracle WebLogic Server|*
|
||||||
|
1396,OBI,133,Oracle Business Intelligence,343,Objective C
|
||||||
|
1397,OBIEE,133,Oracle Business Intelligence,343,Objective C
|
||||||
|
1398,OBI Reporting,133,Oracle Business Intelligence,343,Objective C
|
||||||
|
1401,Oracle 12.2 Client,134,Oracle Database,610,Oracle Application Server|*
|
||||||
|
1406,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0,134,Oracle Database,610,Oracle Application Server|*
|
||||||
|
1407,Oracle 11.2 (Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit) RAC,134,Oracle Database,610,Oracle Application Server|*
|
||||||
|
1415,Oracle 11 on AIX,134,Oracle Database,445,Unix|AIX
|
||||||
|
1416,Oracle Database 10g Enterprise Edition Release 10.1.0.4.0 - 64bit,134,Oracle Database,610,Oracle Application Server|*
|
||||||
|
1431,Oracle Database 10g Release 10.2.0.4.0 - 64bit Production,134,Oracle Database,298,Oracle Exadata
|
||||||
|
1432,Oarcle 11G,134,Oracle Database,218,TIBCO InConcert
|
||||||
|
1443,DB - Oracle inbuilt,134,Oracle Database,158,Powerbuilder
|
||||||
|
1460,Oracle Server,134,Oracle Database,610,Oracle Application Server|*
|
||||||
|
1475,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 - 64bit Production,134,Oracle Database,610,Oracle Application Server|*
|
||||||
|
1480,Oracle 12C on linux,134,Oracle Database,303,C
|
||||||
|
1484,Oracle9i Enterprise Edition Release 9.2.0.5.0,134,Oracle Database,610,Oracle Application Server|*
|
||||||
|
1486,Oracle 11g on linux,134,Oracle Database,432,Linux|Oracle Linux
|
||||||
|
1487,Oracle 11gEssbase,134,Oracle Database,298,Oracle Exadata
|
||||||
|
1490,JServer Release 9.2.0.5.0,474,Oracle Database|Jserver,335,Java|JavaServer Pages (JSP)
|
||||||
|
1492,Designer 6i,135,Oracle Designer,516,Compopent Object Model (COM)
|
||||||
|
1493,Enterprise Manager 12.2.1.1,136,Oracle Enterprise Manager,427,Linux|CentOS
|
||||||
|
1494,Enterprise Manager 12.2.1.2,136,Oracle Enterprise Manager,427,Linux|CentOS
|
||||||
|
1495,Enterprise Manager 11.1.1.7,136,Oracle Enterprise Manager,427,Linux|CentOS
|
||||||
|
1501,"Oracle, Nets",140,Oracle Net Services,273,Netscape Enterprise Server (NES)
|
||||||
|
1502,Oracle RAC,141,Oracle Real Application Clusters (RAC),134,Oracle Database
|
||||||
|
1504,ORPOS 13.3.3,142,Oracle Retail Point-of-Service,609,IIS|*
|
||||||
|
1505,ORPOS 13.3.5,142,Oracle Retail Point-of-Service,609,IIS|*
|
||||||
|
1506,ORPOS 13.3.4,142,Oracle Retail Point-of-Service,609,IIS|*
|
||||||
|
1509,OSB Servers,143,Oracle Service Bus,443,OS/2
|
||||||
|
1514,Oracle TT,147,Oracle TimesTen In-Memory Database,134,Oracle Database
|
||||||
|
1517,OWB 10g,148,Oracle Warehouse Builder (OWB),300,AWK
|
||||||
|
1524,Clascal,346,Pascal|Object Pascal,307,Cascading Style Sheets (CSS)
|
||||||
|
1526,Oracle-HR-9.2,151,PeopleSoft,134,Oracle Database
|
||||||
|
1529,ActiveState Tool Corp. - ActivePerl 5.12,348,Perl|ActivePerl,500,ActiveX
|
||||||
|
1530,ActiveState Tool Corp. - ActivePerl 5.8,348,Perl|ActivePerl,500,ActiveX
|
||||||
|
1531,ORAPERL,417,Perl|Oraperl,242,WinRAR
|
||||||
|
1532,REX,349,Perl|Rex,356,Rexx
|
||||||
|
1536,TCServer V6,277,Pivotal tc Server,365,TCL
|
||||||
|
1537,IBM PKWARE PKZip 2,155,PKZIP,387,Java|IBM SDK
|
||||||
|
1541,PLQSL,352,PL/SQL,351,PL/I
|
||||||
|
1542,Oracle - SQL,352,PL/SQL,581,MS SQL Server|*
|
||||||
|
1544,Oracle SQL,352,PL/SQL,134,Oracle Database
|
||||||
|
1545,PLSQL;,352,PL/SQL,351,PL/I
|
||||||
|
1547,Oracle PLSQL,352,PL/SQL,351,PL/I
|
||||||
|
1548,plsql,352,PL/SQL,351,PL/I
|
||||||
|
1551,Projectplace,156,Planview,21,Business Intelligence and Reporting Tools (BIRT)
|
||||||
|
1558,Power Builder,158,Powerbuilder,151,PeopleSoft
|
||||||
|
1560,Power Builder 6.5,158,Powerbuilder,27,Chef Automate
|
||||||
|
1565,ProjectWise Oracle Server,161,ProjectWise,162,ProjectWise Web Server
|
||||||
|
1576,RMQ,165,RabbitMQ,355,R
|
||||||
|
1579,Clearquest,167,Rational ClearQuest,455,Clarify|Clear Basic
|
||||||
|
1581,Remedy ARS,169,Remedy,322,Fortran
|
||||||
|
1584,RightFax client 10,171,RightFax,118,MQ Client
|
||||||
|
1585,SOQL,359,Salesforce Object Query Language (SOQL),621,ArangoDB
|
||||||
|
1587,SAP Business Objects,173,SAP BusinessObjects BI server,177,SAP NetWeaver Business Warehouse
|
||||||
|
1588,Business Objects 12,173,SAP BusinessObjects BI server,488,ActiveX|ADO
|
||||||
|
1590,SAP BI 4.2 Sp5,173,SAP BusinessObjects BI server,174,SAP ERP
|
||||||
|
1593,SAP HANA ON SUSEOracle 11g on Linux,175,SAP HANA DB,435,Linux|SUSE Linux Enterprise Server
|
||||||
|
1596,NetWeaver,279,SAP NetWeaver App Server,431,Linux|openSUSE
|
||||||
|
1605,SCSS,361,Sass,102,Microsoft Endpoint Configuration Manager (SCCM)
|
||||||
|
1606,Scalla,362,Scala,664,Forte
|
||||||
|
1609,Microsoft SPS 2010,603,SharePoint|*,577,MVS|*
|
||||||
|
1613,SQL Server SP2013 Database Server,603,SharePoint|*,581,MS SQL Server|*
|
||||||
|
1615,Siebel IP 2015,182,Siebel,583,C++|*
|
||||||
|
1616,Siebel 7.8.2.16,182,Siebel,43,DB2
|
||||||
|
1617,Siebel CRM,182,Siebel,583,C++|*
|
||||||
|
1619,Techsmith Corporation - SnagIt 8,184,SnagIt,183,SNA Manager
|
||||||
|
1620,Solid development server,185,solidDB,600,Oracle WebLogic Server|*
|
||||||
|
1622,Sixty-Five Software - SpaceMonger 1.4,187,SpaceMonger,296,Intel Xeon Processor
|
||||||
|
1623,SQLPlus,478,Oracle Database|SQL*Plus,572,Structured Query Language (SQL)
|
||||||
|
1625,SQLIO 1.0,189,SQLIO,178,SAP SQL Anywhere
|
||||||
|
1630,SunOne,281,Oracle iPlanet Web Server,448,Unix|BSD|SunOS
|
||||||
|
1637,SAP - Sybase Central 4.3,479,Sybase SQL Server|Sybase Central,190,Sybase SQL Server
|
||||||
|
1639,Sysncsort,191,Syncsort,178,SAP SQL Anywhere
|
||||||
|
1640,syncsort,191,Syncsort,98,Memcached
|
||||||
|
1641,Sysinternals LLC - AccessEnum 1 1,194,Sysinternal Tools|AccessEnum,124,Nexus Repository OSS
|
||||||
|
1642,Sysinternals LLC - ClockRes 2,195,Sysinternal Tools|ClockRes,374,Xbase++
|
||||||
|
1643,Sysinternals LLC - Coreinfo 3.21,196,Sysinternal Tools|Coreinfo,670,EAServer
|
||||||
|
1644,Sysinternals LLC - DiskExt 1.1,197,Sysinternal Tools|DiskExt,374,Xbase++
|
||||||
|
1645,Sysinternals LLC - DiskMon 2.01,198,Sysinternal Tools|DiskMon,670,EAServer
|
||||||
|
1647,Sysinternals LLC - Junction 1.6,200,Sysinternal Tools|Junction,374,Xbase++
|
||||||
|
1648,Sysinternals LLC - LDMDump 1.02,201,Sysinternal Tools|LDMDump,178,SAP SQL Anywhere
|
||||||
|
1649,Sysinternals LLC - LoadOrder 1,202,Sysinternal Tools|LoadOrder,374,Xbase++
|
||||||
|
1650,Sysinternals LLC - PipeList 1.01,203,Sysinternal Tools|PipeList,670,EAServer
|
||||||
|
1651,Sysinternals LLC - Process Explorer 16.5,204,Sysinternal Tools|Process Explorer,464,Microsoft Exchange Server|Veeam Explorer
|
||||||
|
1652,Sysinternals LLC - PsKill 1.15,205,Sysinternal Tools|PsKill,151,PeopleSoft
|
||||||
|
1653,Sysinternals LLC - PsPasswd 1.23,206,Sysinternal Tools|PsPasswd,231,VMware vCenter
|
||||||
|
1654,Sysinternals LLC - SDelete 1.61,207,Sysinternal Tools|SDelete,670,EAServer
|
||||||
|
1655,Sysinternals LLC - ShareEnum 1.6,208,Sysinternal Tools|ShareEnum,603,SharePoint|*
|
||||||
|
1656,Sysinternals LLC - Sync 2.2,209,Sysinternal Tools|Sync,374,Xbase++
|
||||||
|
1657,Sysinternals LLC - Sysinternals TCPView 3.5,210,Sysinternal Tools|TCPView,365,TCL
|
||||||
|
1658,Sysinternals LLC - VMMap 3.11,211,Sysinternal Tools|VMMap,176,SAP MaxDB
|
||||||
|
1659,Sysinternals LLC - Whois 1.11,212,Sysinternal Tools|Whois,178,SAP SQL Anywhere
|
||||||
|
1664,TERADATA QUERY SCHEDULER SERVER VERSION 15,216,Teradata QS Server,215,Teradata
|
||||||
|
1667,BusinessWorks,217,TIBCO Business Works (BW),111,Microsoft Visual Studio
|
||||||
|
1668,Tibco-IM,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
|
||||||
|
1669,Tibco Integration Manager,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous
|
||||||
|
1674,TSQL,366,Transact-SQL,621,ArangoDB
|
||||||
|
1675,Trasact SQL,366,Transact-SQL,352,PL/SQL
|
||||||
|
1746,Solaris 11.2 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
|
||||||
|
1747,Solaris UNIX,448,Unix|BSD|SunOS,578,Unix|*
|
||||||
|
1748,Unix Servers (Solaris,448,Unix|BSD|SunOS,578,Unix|*
|
||||||
|
1749,Oracle Solaris 11.3 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene
|
||||||
|
1753,Solaris 5.10 (Generic_150400-61),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
|
||||||
|
1754,Solaris 5.10 (Generic_150400-62),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
|
||||||
|
1756,Solaris 5.10 (Generic_150400-55),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI)
|
||||||
|
1760,Oracle Solaris,448,Unix|BSD|SunOS,134,Oracle Database
|
||||||
|
1762,Solaris 1 (SPARC),448,Unix|BSD|SunOS,375,Apache Lucene
|
||||||
|
1765,SunSolaris 10.0,448,Unix|BSD|SunOS,430,Linux|Junos OS
|
||||||
|
1771,Oracle Solaris 10,448,Unix|BSD|SunOS,134,Oracle Database
|
||||||
|
1800,VIO 2.2.0.10,227,Virtual I/O Server,159,Primavera P6
|
||||||
|
1801,VIOS,227,Virtual I/O Server,443,OS/2
|
||||||
|
1802,visibroker,228,Visibroker,420,Cisco IOS
|
||||||
|
1803,VB6,370,Visual Basic,368,VB.NET
|
||||||
|
1804,VB 6.0,370,Visual Basic,368,VB.NET
|
||||||
|
1805,visualbasic,370,Visual Basic,306,C++|Visual C++
|
||||||
|
1808,Visual Basic 6.0,370,Visual Basic,368,VB.NET
|
||||||
|
1811,VBA,371,Visual Basic for Applications (VBA),370,Visual Basic
|
||||||
|
1812,Access VB,371,Visual Basic for Applications (VBA),99,Microsoft Access
|
||||||
|
1813,vfoxpro,372,Visual FoxPro,117,Mozilla Firefox
|
||||||
|
1827,VMware Appliance,569,VMware Server,559,Virtual Appliance
|
||||||
|
1828,VSX,229,VMware Solution Exchange Marketplace (VSX),111,Microsoft Visual Studio
|
||||||
|
1830,VMware - VMware Tools 10.2,230,VMware Tools,569,VMware Server
|
||||||
|
1832,VXML,373,VoiceXML,316,eXtensible HyperText Markup Language (XHTML)
|
||||||
|
1833,Web Focus,232,WebFOCUS,321,FOCUS
|
||||||
|
1834,FOCEXEC,232,WebFOCUS,495,Oracle WebCenter Content Server|Idoc Script
|
||||||
|
1836,WLI 8,233,WebLogic Integration,442,OpenVMS
|
||||||
|
1842,IBM WEBSPHERE APPLICATION SERVER VERSION 6.1.0,284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1848,"IBM WebSphere Application Server Network Deployment, 8.0.0.5",284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1850,IBM WebSphere Application Server Network Deployment 7,284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1858,IBM WebSphere 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1861,IBM - WebSphere Application Server - Base 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1865,Websphere AS (JVM),284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1872,IBM WebSphere,284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1875,IBM WebSphere Application Server 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1877,IBM WebSphere Application,284,Websphere Application Server (WAS),285,WebSphere Liberty
|
||||||
|
1878,WAS 6.x,284,Websphere Application Server (WAS),521,Electronic Data Interchange (EDI)
|
||||||
|
1880,IBM OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
|
||||||
|
1882,Open Liberty,285,WebSphere Liberty,397,Java|Servlet
|
||||||
|
1883,IBM Open Liberty,285,WebSphere Liberty,62,IBM BigFix Platform
|
||||||
|
1887,WAS Liberty,285,WebSphere Liberty,397,Java|Servlet
|
||||||
|
1889,OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE
|
||||||
|
1891,WMB 6.1,235,WebSphere Message Broker,486,.NET Framework|Windows Workflow Foundation (WF)
|
||||||
|
1892,WebSphere Message Broker v6.0,235,WebSphere Message Broker,285,WebSphere Liberty
|
||||||
|
1899,WebSphere Portal Extend Limited Use 6.1,286,WebSphere Portal Server,285,WebSphere Liberty
|
||||||
|
1901,Windchill 11.1,237,Windchill,17,Bluebeam|Bluebeam Q
|
||||||
|
1908,Window,580,Windows|*,637,Microsoft Azure
|
||||||
|
1914,Windows Terminal Server,239,Windows Terminal Server (WTS),452,Windows|Windows Server
|
||||||
|
1915,Windows 7 Standard,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1916,WINDOWS 10 SERVER STANDARD EDITION X64,451,Windows|Windows Desktop,452,Windows|Windows Server
|
||||||
|
1917,Microsoft Windows 7 (64-bit),451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1918,Microsoft Windows XP Professional (32-bit),451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1919,Windows 7 Professional x64,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1920,Microsoft Microsoft Windows Entreprise,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1921,Microsoft Windows 2000,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1922,Microsoft Windows 10,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1923,MS Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1924,Microsoft Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1925,Microsoft Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1926,Microsoft Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1927,Win Desktop,451,Windows|Windows Desktop,560,Webtop
|
||||||
|
1928,Windows 10 Pro,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1929,Windows 10,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1930,Windows 7 Ultimate,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1931,Microsoft Windows 8 (64-bit),451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1932,Microsoft Windows XP,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1933,Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1934,Windows XP,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1935,Windows 10 Professional,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1936,Windows 7,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1937,Microsoft Windows 10 (64-bit),451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1938,Win 7,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
1939,windowsxp,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1940,Microsoft Windows Unknown,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1941,Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1942,Windows XP Professional,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1943,Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1944,Window XP,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1945,Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1946,Microsoft Windows 7 - SOE,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1947,Windows 7 Enterprise Edition,451,Windows|Windows Desktop,452,Windows|Windows Server
|
||||||
|
1948,Windows 8,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1949,Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1950,Microsoft Windows 7 (32-bit),451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1951,Windows Embedded Standard 7,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1952,Win10,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
1953,Windows 2003,451,Windows|Windows Desktop,580,Windows|*
|
||||||
|
1955,Windows 2003 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1956,Windows 2008 Enterprise R2 x64,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1960,WINDOWS 2008R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1961,Microsoft Windows Server 2008 Standard Editio,452,Windows|Windows Server,121,MS SQL Server Compact
|
||||||
|
1962,MICROSOFT WINDOWS NT 2003,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1967,Microsoft Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
|
||||||
|
1979,Windows 2008 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1982,Windows 2003 R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1983,Windows 2008 R2 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1988,Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1989,Windows 2012 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1992,Windows 2008 R2 Standard 6.1.7601 Service Pack 1,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1994,Windows 2008 Standard x64,452,Windows|Windows Server,580,Windows|*
|
||||||
|
1998,Windows 2012 R2 Standard 64-Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2007,w2k12,452,Windows|Windows Server,582,C#|*
|
||||||
|
2008,WINDOWS 2013,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2009,WINDOWS 2016 SE 64 BIT,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2011,Microsoft - Windows 2012,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2019,MICROSOFT WINDOWS 2008 TPM,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2021,MICROSOFT WINDOWS STD 2008,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2025,Windows 2008 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2028,MICROSOFT WINDOWS STD 2008 TPM,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2030,Windows 2012 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2031,MICROSOFT WINDOWS NT 2003 ENT,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2034,MICROSOFT WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2036,Windows 2003 Standard5.2.3790,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2040,Windows 2012 R,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2044,Windows 2008 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2045,MICROSOFT WINDOWS 2008 ENT,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2047,Windows 2012 R2 Standard 6.3.9600,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2053,Windows 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
|
||||||
|
2055,Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server
|
||||||
|
2061,Windows 2016 Datacenter10.0.14393,452,Windows|Windows Server,637,Microsoft Azure
|
||||||
|
2065,windows6.3.9600,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2066,Windows 2012 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2069,Windows 2008 Enterprise,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2080,Windows 2008 Standard without Hyper-V6.0.6003,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2084,Windows 2012 R2 Datacenter,452,Windows|Windows Server,110,Microsoft System Center Endpoint Protection
|
||||||
|
2089,Windows 2008 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2096,Windows 2000,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2097,W2K8R2 Standard 64 BIT,452,Windows|Windows Server,303,C
|
||||||
|
2099,Windows 2008 Standard6.0.6003,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2100,Windows2008 R2 Enterprise 64bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2105,Win2008R2,452,Windows|Windows Server,355,R
|
||||||
|
2107,Windows 2008 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2109,Windows Server 2003 Appliance,452,Windows|Windows Server,559,Virtual Appliance
|
||||||
|
2111,Windows 2008 ENT R2 (64 bits),452,Windows|Windows Server,355,R
|
||||||
|
2114,WIN2008R2 6.1.7601,452,Windows|Windows Server,355,R
|
||||||
|
2116,microsoft windows std 2012 tpm,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2118,microsoft windows 2008,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2120,Windows 2008 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2121,Microsoft Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2126,Window2008 R2,452,Windows|Windows Server,355,R
|
||||||
|
2130,Windows 2008 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2134,WS03,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
|
||||||
|
2136,Windows 2008 Enterprise x64,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2141,Windows 2008 R2 Enterprise,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2142,Windows Server 2003 Std 32-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2143,Windows 2008 R2 Standard 64bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2146,Microsoft Windows 2003 R2 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2148,MICROSOFT WINDOWS NT 2003 TPM,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2149,Win Server 2008,452,Windows|Windows Server,569,VMware Server
|
||||||
|
2150,Windows 2003 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2152,WIN2014,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
2156,Win 2012 R2,452,Windows|Windows Server,355,R
|
||||||
|
2160,Win Server,452,Windows|Windows Server,12,Apache Subversion
|
||||||
|
2161,Windows 2008 Standard R2 x64,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2163,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 6196 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
|
||||||
|
2164,Windows2012,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2165,Windows 2008 R2 Standard6.1.7601,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2166,Windows 2016,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2167,Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2179,Windows Server 2003 Std 64-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2180,Windows 2012 R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2181,Wintel,452,Windows|Windows Server,461,IBM Tivoli Storage Manager|TSM Client
|
||||||
|
2191,Windows 2003 Enterprise5.2.3790,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2192,WINDOWS 2012,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2193,Windows 2008 R2 OS,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2196,Windows 2003 Standard R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2197,Windows 2008 R2 Enterprise6.1.7601,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2198,Windows 2003 Standard 32 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2199,WINDOWS SERVER 2003 APPLIANCE 5.2,452,Windows|Windows Server,559,Virtual Appliance
|
||||||
|
2201,WS08R2,452,Windows|Windows Server,355,R
|
||||||
|
2204,Windows 2008 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2213,w2k8r2sp1,452,Windows|Windows Server,355,R
|
||||||
|
2217,Win 2003,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
2222,Windows 2012 R2 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2226,Windows 2008 R2 Standard 64-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2228,Windows 2003 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2230,Windows 2012 Storage R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2231,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 4096 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor
|
||||||
|
2235,MICROSOFT WINDOWS NT 2003 ENT TPM,452,Windows|Windows Server,239,Windows Terminal Server (WTS)
|
||||||
|
2237,Windows 2016 Standard10.0.14393,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2240,MICROSOFT WINDOWS 2003,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2242,Windows 2012 Standard R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2246,Win Server 2008 R2,452,Windows|Windows Server,355,R
|
||||||
|
2248,MICROSOFT WINDOWS STD 2012 TPM,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2249,Windows 2003 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2250,Windows 2008 Enterprise R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2251,Windows 2008,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2252,Microsoft Microsoft Windows 2008 R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2257,Win Server 2012,452,Windows|Windows Server,569,VMware Server
|
||||||
|
2258,Windows 2016 Standard,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2264,Windows 2008 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2267,Windows 2003 Standard 5.2.3790 Service Pack 2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2268,Windows 2012 Standard6.2.9200,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2269,MICROSOFT WINDOWS 2016 TPM,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2272,Windows 2003 Enterprise,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2275,Windows 2008 R2 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2277,Windows 2012 R2 Standard6.3.9600,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2286,Windows 2008 Standard R2,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2287,MicrosoftWindows Server 2008 R2 (64-bit),452,Windows|Windows Server,443,OS/2
|
||||||
|
2288,windows6.3,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2290,Windows 2016 64 Bit,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2296,Windows 2008 Enterprise6.0.6003,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2301,Win 2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
2302,Win2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
2303,Win2012R2,452,Windows|Windows Server,355,R
|
||||||
|
2305,win2008,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE)
|
||||||
|
2306,Windows 2003 Standard x64,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2315,WINDOWS 2016 STANDARD EDITION,452,Windows|Windows Server,580,Windows|*
|
||||||
|
2325,WinSCP.net - WinSCP 5.11,243,WinSCP,178,SAP SQL Anywhere
|
||||||
|
2332,Zerto Vritual Appliance,249,Zerto Virtual Replication,559,Virtual Appliance
|
||||||
|
2333,Oracle RTD,289,Oracle Real-Time Decisions (RTD),134,Oracle Database
|
||||||
|
2338,OMNIbus,251,Tivoli Netcool/OMNIbus,582,C#|*
|
||||||
|
2347,ALM,511,Application Lifecycle Management (ALM),421,DART
|
||||||
|
2349,BMS,513,Batch Management Software (BMS),442,OpenVMS
|
||||||
|
2354,COM,516,Compopent Object Model (COM),661,COM+
|
||||||
|
2357,CORBA Interface Definition Language,518,CORBA Interface Definition Language (CORBA IDL),517,Common Object Request Broker Architecture (CORBA)
|
||||||
|
2359,Data Control Language,519,Data Control Language (DCL),329,IBM i Control Language (CL)
|
||||||
|
2361,Database,520,Database (DB),43,DB2
|
||||||
|
2362,DB,520,Database (DB),43,DB2
|
||||||
|
2365,Electronic Data Interchange,521,Electronic Data Interchange (EDI),104,Microsoft Exchange Server
|
||||||
|
2369,JDOM,523,Java-based Document Object Model for XML (JDOM),84,IMS DB
|
||||||
|
2381,Simple Object Access Protocol,531,Simple Object Access Protocol (SOAP),547,Internet Message Access Protocol (IMAP)
|
||||||
|
2383,SQL,572,Structured Query Language (SQL),581,MS SQL Server|*
|
||||||
|
2386,DPE,538,Device Provisioning Engines (DPE),661,COM+
|
||||||
|
2388,ESB,540,Enterprise Service Bus(ESB),370,Visual Basic
|
||||||
|
2395,MES,553,Manufacturing Execution System (MES),623,Amazon S3
|
||||||
|
2401,Z/Virtual System Environment,591,z/VSE,441,MVS|z/OS
|
||||||
|
2403,DOS/VSE,591,z/VSE,597,DOS/360
|
||||||
|
2404,Microsoft Disk Operating System,593,MS-DOS,443,OS/2
|
||||||
|
2407,VME/B,595,VME,368,VB.NET
|
||||||
|
2408,Virtual Machine Environment,595,VME,111,Microsoft Visual Studio
|
||||||
|
2409,VME 2900,595,VME,107,Microsoft Internet Explorer
|
||||||
|
2410,OpenVME,595,VME,442,OpenVMS
|
||||||
|
2411,Disk Operating System/360,597,DOS/360,443,OS/2
|
||||||
|
2413,Transaction Processing Facility,598,z/TPF,572,Structured Query Language (SQL)
|
||||||
|
2419,NPL,653,Natural Programming Language,342,Niakwa Programming Language (NPL)
|
||||||
|
2426,IDMS/DB Data Manipulation Language,668,IDMS DML,312,Data Language Interface (DL/I)
|
||||||
|
2433,Basic Mapping Supprt,689,BMS Map,21,Business Intelligence and Reporting Tools (BIRT)
|
||||||
|
2434,DB/400,690,DB400,43,DB2
|
||||||
|
2435,IBM ISAM,693,ISAM,73,IBM Operational Decision Manager (ODM)
|
|
|
@ -32,21 +32,25 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||||
def generate_acronym(text):
|
def generate_acronym(text):
|
||||||
|
|
||||||
# Define prompt
|
# Define prompt
|
||||||
prompt = f"Answer concisely: make a possible acronym from the following: '{text}'"
|
# prompt = f"Imagine you are a diverse database. Given the following: '{text}', please suggest to me 5 possible variations. Give 5."
|
||||||
|
prompt = f"Give me a list of 10 historical product names related to: '{text}'. Format the output in a list, like this 1. Item, 2. Item, 3. ..."
|
||||||
|
|
||||||
# Generate acronym
|
# Generate acronym
|
||||||
inputs = tokenizer(prompt, return_tensors="pt")
|
inputs = tokenizer(prompt, return_tensors="pt")
|
||||||
inputs = inputs.to("cuda")
|
inputs = inputs.to("cuda")
|
||||||
outputs = model.generate(
|
outputs = model.generate(
|
||||||
inputs["input_ids"],
|
inputs["input_ids"],
|
||||||
max_length=100,
|
max_length=200,
|
||||||
no_repeat_ngram_size=3)
|
do_sample=True,
|
||||||
|
top_k=50,
|
||||||
|
temperature=0.8)
|
||||||
|
# no_repeat_ngram_size=3)
|
||||||
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# Example usage
|
# Example usage
|
||||||
# text = "Advanced Data Analytics Platform"
|
# text = "Advanced Data Analytics Platform"
|
||||||
text = "red hat enterprise linux"
|
text = "windows desktop"
|
||||||
acronym = generate_acronym(text)
|
acronym = generate_acronym(text)
|
||||||
print(f"Acronym: {acronym}")
|
print(f"Generation: {acronym}")
|
||||||
# %%
|
# %%
|
Loading…
Reference in New Issue