added more augmentations to finally beat sota
- class_bert_augmentation is now the reference training code
This commit is contained in:
		
							parent
							
								
									e90bc69ea9
								
							
						
					
					
						commit
						5312cfa06f
					
				|  | @ -0,0 +1,41 @@ | ||||||
|  | # %% | ||||||
|  | import random | ||||||
|  | import string | ||||||
|  | 
 | ||||||
def corrupt_word(word):
    """Return *word* with a single random typo applied.

    One of two corruptions is picked uniformly at random:

    * ``"delete"`` - drop one randomly chosen character;
    * ``"swap"``   - exchange one randomly chosen pair of adjacent characters.

    Args:
        word (str): The word to corrupt.

    Returns:
        str: The corrupted word. Words of length 0 or 1 are returned unchanged
        because neither corruption is meaningful for them.
    """
    if len(word) <= 1:  # nothing sensible to delete or swap
        return word

    corruption_type = random.choice(["delete", "swap"])

    if corruption_type == "delete":
        idx = random.randint(0, len(word) - 1)
        return word[:idx] + word[idx + 1:]

    # "swap": the guard above already guarantees at least two characters, so
    # idx + 1 is always a valid position and no extra length check is needed.
    idx = random.randint(0, len(word) - 2)
    return word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]
|  | 
 | ||||||
def corrupt_string(sentence, corruption_probability=0.01):
    """Independently corrupt each whitespace-separated word of *sentence*.

    Args:
        sentence (str): Input text; it is split on whitespace.
        corruption_probability (float): Chance, per word, that the word is
            passed through ``corrupt_word``.

    Returns:
        str: The words re-joined with single spaces.
    """
    output_tokens = []
    for token in sentence.split():
        # Draw first, corrupt only on a hit.
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        output_tokens.append(token)
    return " ".join(output_tokens)
|  | 
 | ||||||
|  | # Example usage | ||||||
|  | sentence = "This is a simple string for testing" | ||||||
|  | corrupted_sentence = corrupt_string(sentence, corruption_probability=0.1) | ||||||
|  | print("Original:", sentence) | ||||||
|  | print("Corrupted:", corrupted_sentence) | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | @ -1,95 +0,0 @@ | ||||||
| # %% |  | ||||||
| import json |  | ||||||
| import pandas as pd |  | ||||||
| 
 |  | ||||||
| ########################################## |  | ||||||
| # %% |  | ||||||
| 
 |  | ||||||
| # Load the JSON file |  | ||||||
| data_path = '../esAppMod/tca_entities.json' |  | ||||||
| with open(data_path, 'r') as file: |  | ||||||
|     data = json.load(file) |  | ||||||
| 
 |  | ||||||
| # Initialize an empty list to store the rows |  | ||||||
| rows = [] |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| # Loop through all entities in the JSON |  | ||||||
| for entity in data["data"].items(): |  | ||||||
|     entity_data = entity[1] |  | ||||||
|     entity_id = entity_data['entity_id'] |  | ||||||
|     entity_name = entity_data['entity_name'] |  | ||||||
|     entity_type_id = entity_data['entity_type_id'] |  | ||||||
|     entity_type_name = entity_data['entity_type_name'] |  | ||||||
|      |  | ||||||
|     # Add each mention and its entity_id to the rows list |  | ||||||
|     rows.append( |  | ||||||
|         { |  | ||||||
|         'id': entity_id, |  | ||||||
|         'name': entity_name, |  | ||||||
|         'type_id': entity_type_id, |  | ||||||
|         'type_name': entity_type_name |  | ||||||
|         }) |  | ||||||
| 
 |  | ||||||
| # Create a DataFrame from the rows |  | ||||||
| df = pd.DataFrame(rows) |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| # df.to_csv('entity.csv', index=False) |  | ||||||
| df |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| df['type_name'].value_counts() |  | ||||||
| # %% |  | ||||||
| df['type_id'].value_counts() |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| name_list = df['name'].to_list() |  | ||||||
| # %% |  | ||||||
| name_list |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| from scipy.cluster.hierarchy import dendrogram, linkage, fcluster |  | ||||||
| import numpy as np |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| # Define labels |  | ||||||
| labels = name_list |  | ||||||
| 
 |  | ||||||
| # Create a prefix-based distance matrix |  | ||||||
def prefix_distance(label1, label2):
    """Distance between two labels based on their shared leading words.

    The labels are split on whitespace and compared word by word from the
    start; counting stops at the first mismatch, so only a true common prefix
    contributes (words that merely coincide at later positions do not).

    Args:
        label1 (str): First label.
        label2 (str): Second label.

    Returns:
        float: ``1 / (k + 1)`` where ``k`` is the number of leading words the
        labels share. Labels with no common first word get 1.0.
    """
    common_prefix_length = 0
    for w1, w2 in zip(label1.split(), label2.split()):
        if w1 != w2:
            break  # prefix ends at the first differing word
        common_prefix_length += 1
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)
| 
 |  | ||||||
| # Create a pairwise distance matrix |  | ||||||
| n = len(labels) |  | ||||||
| distance_matrix = np.zeros((n, n)) |  | ||||||
| for i in range(n): |  | ||||||
|     for j in range(n): |  | ||||||
|         distance_matrix[i, j] = prefix_distance(labels[i], labels[j]) |  | ||||||
| 
 |  | ||||||
| # Perform hierarchical clustering |  | ||||||
| linkage_matrix = linkage(distance_matrix, method='average') |  | ||||||
| 
 |  | ||||||
| # Visualize as a dendrogram |  | ||||||
| import matplotlib.pyplot as plt |  | ||||||
| dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2) |  | ||||||
| plt.title("Prefix-Based Clustering") |  | ||||||
| plt.show() |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| linkage_matrix |  | ||||||
| # %% |  | ||||||
| # Extract flat clusters with a distance threshold |  | ||||||
| threshold = 0.5 |  | ||||||
| clusters = fcluster(linkage_matrix, t=threshold, criterion='distance') |  | ||||||
| 
 |  | ||||||
| # Display clusters |  | ||||||
| for i, cluster_id in enumerate(clusters): |  | ||||||
|     print(f"Label: {labels[i]}, Cluster ID: {cluster_id}") |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
|  | @ -3,53 +3,55 @@ import pandas as pd | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| # import training file | # import training file | ||||||
| data_path = '../data_import/train.csv' | data_path = '../esAppMod_data_import/train.csv' | ||||||
|  | # data_path = '../esAppMod_data_import/parent_train.csv' | ||||||
| train_df = pd.read_csv(data_path, skipinitialspace=True) | train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # import test file | # import test file | ||||||
| data_path = '../data_import/test.csv' | data_path = '../esAppMod_data_import/test.csv' | ||||||
|  | # data_path = '../esAppMod_data_import/parent_test.csv' | ||||||
| test_df = pd.read_csv(data_path, skipinitialspace=True) | test_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| 
 | 
 | ||||||
| # import entity file | # import entity file | ||||||
| data_path = '../data_import/entity.csv' | data_path = '../esAppMod_data_import/entity.csv' | ||||||
| entity_df = pd.read_csv(data_path, skipinitialspace=True) | entity_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| id2label = {} | id2label = {} | ||||||
| for _, row in entity_df.iterrows(): | for _, row in entity_df.iterrows(): | ||||||
|     id2label[row['id']] = row['name'] |     id2label[row['id']] = row['name'] | ||||||
| 
 | 
 | ||||||
| # %% |  | ||||||
| train_df.sort_values(by=['entity_id']).to_markdown('out.md') | train_df.sort_values(by=['entity_id']).to_markdown('out.md') | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| data_path = '../train/class_bert_process/prediction/exports/result.csv' | data_path = '../train/class_bert_augmentation/prediction/exports/result.csv' | ||||||
| prediction_df = pd.read_csv(data_path) | prediction_df = pd.read_csv(data_path) | ||||||
| 
 | 
 | ||||||
| # %% |  | ||||||
| predicted_entity_list = [] | predicted_entity_list = [] | ||||||
| for element in prediction_df['class_prediction']: | for element in prediction_df['class_prediction']: | ||||||
|     predicted_entity_list.append(id2label[element]) |     predicted_entity_list.append(id2label[element]) | ||||||
| 
 | 
 | ||||||
| prediction_df['predicted_name'] = predicted_entity_list | prediction_df['predicted_name'] = predicted_entity_list | ||||||
| # %% |  | ||||||
| new_df = pd.concat((test_df, prediction_df ), axis=1) | new_df = pd.concat((test_df, prediction_df ), axis=1) | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| mismatch_mask = new_df['entity_id'] != new_df['class_prediction'] | mismatch_mask = new_df['entity_id'] != new_df['class_prediction'] | ||||||
| mismatch_df = new_df[mismatch_mask] | mismatch_df = new_df[mismatch_mask] | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| len(mismatch_df) | len(mismatch_df) | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| # print the top 10 offending classes | # print the top 10 offending classes | ||||||
|  | # mask1 = mismatch_df['entity_id'] != 434 | ||||||
|  | # mask2 = mismatch_df['entity_id'] != 451 | ||||||
|  | # mask3 = mismatch_df['entity_id'] != 452 | ||||||
|  | # mask= mask1 & mask2 & mask3 | ||||||
|  | # masked_df = mismatch_df[mask] | ||||||
|  | # print(masked_df['entity_id'].value_counts()[:10]) | ||||||
| print(mismatch_df['entity_id'].value_counts()[:10]) | print(mismatch_df['entity_id'].value_counts()[:10]) | ||||||
|  | masked_df = mismatch_df | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| # Convert the whole dataframe as a string and display | # Convert the whole dataframe as a string and display | ||||||
| # print the mismatch_df | # print the mismatch_df | ||||||
| print(mismatch_df.sort_values(by=['entity_id']).to_markdown()) | print(masked_df.sort_values(by=['entity_id']).to_markdown()) | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| mismatch_df.to_csv('error.csv') | mismatch_df.to_csv('error.csv') | ||||||
|  | @ -62,14 +64,9 @@ mismatch_df[select_mask] | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| # let us see the train mentions | # let us see the train mentions | ||||||
| select_value = 452 | select_value = 130 | ||||||
| select_mask = train_df['entity_id'] == select_value | select_mask = train_df['entity_id'] == select_value | ||||||
| train_df[select_mask] | train_df[select_mask] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # %% |  | ||||||
| mismatch_df[select_mask]['class_prediction'].to_list() |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| # %% |  | ||||||
|  |  | ||||||
|  | @ -0,0 +1,62 @@ | ||||||
|  | # %% | ||||||
|  | import pandas as pd | ||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # import training file | ||||||
|  | data_path = '../esAppMod_data_import/train.csv' | ||||||
|  | train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # import test file | ||||||
|  | data_path = '../esAppMod_data_import/test.csv' | ||||||
|  | test_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  | # import entity file | ||||||
|  | data_path = '../esAppMod_data_import/entity.csv' | ||||||
|  | entity_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | id2label = {} | ||||||
|  | for _, row in entity_df.iterrows(): | ||||||
|  |     id2label[row['id']] = row['name'] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | train_df | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
def extract_acronym_mapping(names):
    """Build an ``acronym -> expanded term`` dictionary from entity names.

    A name contributes an entry when it contains a parenthesised word such as
    ``"Java Message Service (JMS)"``. The acronym is the text inside the first
    such pair of parentheses; the term is the name with that parenthesised
    group removed.

    Args:
        names (Iterable[str]): Entity names to scan.

    Returns:
        dict: Maps each acronym to its term. If an acronym appears in several
        names, the last one iterated wins.
    """
    mapping = {}
    for name in names:
        # Find acronym in parentheses
        match = re.search(r"\((\w+)\)", name)
        if not match:
            continue

        # Cut the "(ACR)" group out by position rather than re-matching the
        # whole name, so names containing punctuation (e.g. "E-Business",
        # "ASP.NET") no longer keep the acronym inside the term.
        before = name[:match.start()].strip()
        after = name[match.end():].strip()
        core_term = " ".join(part for part in (before, after) if part)

        mapping[match.group(1)] = core_term
    return mapping
|  | 
 | ||||||
|  | names = set(train_df['entity_name'].to_list()) | ||||||
|  | 
 | ||||||
|  | # Extract mappings | ||||||
|  | acronym_mapping = extract_acronym_mapping(names) | ||||||
|  | print(acronym_mapping) | ||||||
|  | # %% | ||||||
|  | del acronym_mapping['E']  # too many false matches | ||||||
|  | acronym_mapping = {key.lower():value.lower() for key, value in acronym_mapping.items()} | ||||||
|  | 
 | ||||||
|  | abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()} | ||||||
|  | term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | abbrev_to_term | ||||||
|  | # %% | ||||||
|  | term_to_abbrev | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | acronym_mapping | ||||||
|  | # %% | ||||||
|  | @ -0,0 +1,5 @@ | ||||||
|  | out.md | ||||||
|  | parent_test.csv | ||||||
|  | parent_train.csv | ||||||
|  | test_seq.csv | ||||||
|  | train_seq.csv | ||||||
|  | @ -0,0 +1,124 @@ | ||||||
|  | # %% | ||||||
|  | import json | ||||||
|  | import pandas as pd | ||||||
|  | 
 | ||||||
|  | ########################################## | ||||||
|  | # %% | ||||||
|  | # import training file | ||||||
|  | data_path = '../esAppMod_data_import/train.csv' | ||||||
|  | train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # import entity file | ||||||
|  | # Keep only one row per unique value in 'entity_name' | ||||||
|  | unique_df = train_df.drop_duplicates(subset="entity_name", keep="first") | ||||||
|  | id2label = {} | ||||||
|  | for _, row in unique_df.iterrows(): | ||||||
|  |     id2label[row['entity_id']] = row['entity_name'] | ||||||
|  | 
 | ||||||
|  | inverse_dict = {value:key for key,value in id2label.items()} | ||||||
|  | # %% | ||||||
|  | # Create a new dictionary with sorted keys | ||||||
|  | # sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())} | ||||||
|  | sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())} | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | sorted_dict | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | rule_set ={ | ||||||
|  |     '.NET': [497,482,484,487,485,486,483], | ||||||
|  |     'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376], | ||||||
|  |     'C++': [583,306], | ||||||
|  |     'CA': [290,22,23,24,25], | ||||||
|  |     'CSS': [307,377], | ||||||
|  |     'Cisco': [28,420,29], | ||||||
|  |     'Citrix': [563,565,31,292,291,564,32,30], | ||||||
|  |     'coldfusion': [311,37], | ||||||
|  |     'eclipse': [46,622,641,456], | ||||||
|  |     'xml': [596, 318], | ||||||
|  |     'xsl': [319,320], | ||||||
|  |     'HP': [59,293,60,61,58], | ||||||
|  |     'http': [505,543], | ||||||
|  |     'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605], | ||||||
|  |     'IBM BigFix': [62,457], | ||||||
|  |     'IBM ILOG': [253,255,254,256,252], | ||||||
|  |     'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79], | ||||||
|  |     'IBM WebSphere': [80,82,83,81], | ||||||
|  |     'IBM i': [424,329], | ||||||
|  |     'IDMS': [667,668], | ||||||
|  |     'IIS': [609,490,489,491], | ||||||
|  |     'JBoss': [268,492,493], | ||||||
|  |     'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414], | ||||||
|  |     'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404], | ||||||
|  |     'KVS': [549,550,551], | ||||||
|  |     'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437], | ||||||
|  |     'MS SQL': [581,121,466,467,465,468,469,470,471,472,473], | ||||||
|  |     'MVS': [577,440,441], | ||||||
|  |     'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114], | ||||||
|  |     'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281], | ||||||
|  |     'Oracle WebLogic': [600,233], | ||||||
|  |     'Oracle Application Server': [610,494], | ||||||
|  |     'Oracle Database': [134,474,475,478], | ||||||
|  |     'Oracle Hyperion': [607,138,139], | ||||||
|  |     'Oracle WebCenter': [276,495], | ||||||
|  |     'Pascal': [599,346], | ||||||
|  |     'Perl': [585,348,417,349], | ||||||
|  |     'ProjectWise': [161,162], | ||||||
|  |     'Rational': [166,167], | ||||||
|  |     'SAP': [173,175,695,176,676,178,179], | ||||||
|  |     'SAP ERP': [174,476,477], | ||||||
|  |     'SAP NetWeaver': [279,496,177], | ||||||
|  |     'Sybase SQL Server': [190,479,480], | ||||||
|  |     'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212], | ||||||
|  |     'TIBCO': [218,219], | ||||||
|  |     'TIBCO Business Works': [217,481], | ||||||
|  |     'Tivoli': [220,251], | ||||||
|  |     'Tortoise': [221,222], | ||||||
|  |     'Unix': [578,445,579,447,602,590,448,449], | ||||||
|  |     'VB': [368,369], | ||||||
|  |     'VMware': [568,569,229,230,231], | ||||||
|  |     'Visual Basic': [370,371,372], | ||||||
|  |     'WebSphere': [234,285,235,286,284,601,287], | ||||||
|  |     'Windows': [580,238,239,451,452], | ||||||
|  |     'z': [598,608,591] | ||||||
|  | 
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # iterate through the whole training set | ||||||
# Collapse every entity_id that belongs to a rule group onto the group's
# first id. The lookup is built once; when an id is listed in several groups
# the group defined last in rule_set wins, exactly as the row-by-row loop did.
parent_of = {
    member: members[0]
    for members in rule_set.values()
    for member in members
}
new_df = train_df.copy()
# ids that appear in no group are kept unchanged
new_df['entity_id'] = train_df['entity_id'].map(
    lambda entity_id: parent_of.get(entity_id, entity_id)
)
|  | # %% | ||||||
|  | len(set(new_df['entity_id'].to_list())) | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | new_df.to_csv('parent_train.csv') | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # now do the same for the test data | ||||||
|  | # import test file | ||||||
|  | data_path = '../esAppMod_data_import/test.csv' | ||||||
|  | test_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
# Collapse every entity_id that belongs to a rule group onto the group's
# first id. The lookup is built once; when an id is listed in several groups
# the group defined last in rule_set wins, exactly as the row-by-row loop did.
parent_of = {
    member: members[0]
    for members in rule_set.values()
    for member in members
}
new_df = test_df.copy()
# ids that appear in no group are kept unchanged
new_df['entity_id'] = test_df['entity_id'].map(
    lambda entity_id: parent_of.get(entity_id, entity_id)
)
|  | 
 | ||||||
|  | # %% | ||||||
|  | new_df | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | new_df.to_csv('parent_test.csv') | ||||||
|  | # %% | ||||||
|  | @ -0,0 +1,129 @@ | ||||||
|  | # %% | ||||||
|  | import json | ||||||
|  | import pandas as pd | ||||||
|  | 
 | ||||||
|  | ########################################## | ||||||
|  | # %% | ||||||
|  | # import training file | ||||||
|  | data_path = '../esAppMod_data_import/train.csv' | ||||||
|  | train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # import entity file | ||||||
|  | # Keep only one row per unique value in 'entity_name' | ||||||
|  | unique_df = train_df.drop_duplicates(subset="entity_name", keep="first") | ||||||
|  | id2label = {} | ||||||
|  | for _, row in unique_df.iterrows(): | ||||||
|  |     id2label[row['entity_id']] = row['entity_name'] | ||||||
|  | 
 | ||||||
|  | inverse_dict = {value:key for key,value in id2label.items()} | ||||||
|  | # %% | ||||||
|  | # Create a new dictionary with sorted keys | ||||||
|  | # sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())} | ||||||
|  | sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())} | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | sorted_dict | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | rule_set ={ | ||||||
|  |     '.NET': [497,482,484,487,485,486,483], | ||||||
|  |     'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376], | ||||||
|  |     'C++': [583,306], | ||||||
|  |     'CA': [290,22,23,24,25], | ||||||
|  |     'CSS': [307,377], | ||||||
|  |     'Cisco': [28,420,29], | ||||||
|  |     'Citrix': [563,565,31,292,291,564,32,30], | ||||||
|  |     'coldfusion': [311,37], | ||||||
|  |     'eclipse': [46,622,641,456], | ||||||
|  |     'xml': [596, 318], | ||||||
|  |     'xsl': [319,320], | ||||||
|  |     'HP': [59,293,60,61,58], | ||||||
|  |     'http': [505,543], | ||||||
|  |     'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605], | ||||||
|  |     'IBM BigFix': [62,457], | ||||||
|  |     'IBM ILOG': [253,255,254,256,252], | ||||||
|  |     'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79], | ||||||
|  |     'IBM WebSphere': [80,82,83,81], | ||||||
|  |     'IBM i': [424,329], | ||||||
|  |     'IDMS': [667,668], | ||||||
|  |     'IIS': [609,490,489,491], | ||||||
|  |     'JBoss': [268,492,493], | ||||||
|  |     'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414], | ||||||
|  |     'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404], | ||||||
|  |     'KVS': [549,550,551], | ||||||
|  |     'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437], | ||||||
|  |     'MS SQL': [581,121,466,467,465,468,469,470,471,472,473], | ||||||
|  |     'MVS': [577,440,441], | ||||||
|  |     'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114], | ||||||
|  |     'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281], | ||||||
|  |     'Oracle WebLogic': [600,233], | ||||||
|  |     'Oracle Application Server': [610,494], | ||||||
|  |     'Oracle Database': [134,474,475,478], | ||||||
|  |     'Oracle Hyperion': [607,138,139], | ||||||
|  |     'Oracle WebCenter': [276,495], | ||||||
|  |     'Pascal': [599,346], | ||||||
|  |     'Perl': [585,348,417,349], | ||||||
|  |     'ProjectWise': [161,162], | ||||||
|  |     'Rational': [166,167], | ||||||
|  |     'SAP': [173,175,695,176,676,178,179], | ||||||
|  |     'SAP ERP': [174,476,477], | ||||||
|  |     'SAP NetWeaver': [279,496,177], | ||||||
|  |     'Sybase SQL Server': [190,479,480], | ||||||
|  |     'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212], | ||||||
|  |     'TIBCO': [218,219], | ||||||
|  |     'TIBCO Business Works': [217,481], | ||||||
|  |     'Tivoli': [220,251], | ||||||
|  |     'Tortoise': [221,222], | ||||||
|  |     'Unix': [578,445,579,447,602,590,448,449], | ||||||
|  |     'VB': [368,369], | ||||||
|  |     'VMware': [568,569,229,230,231], | ||||||
|  |     'Visual Basic': [370,371,372], | ||||||
|  |     'WebSphere': [234,285,235,286,284,601,287], | ||||||
|  |     'Windows': [580,238,239,451,452], | ||||||
|  |     'z': [598,608,591] | ||||||
|  | 
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # iterate through the whole training set | ||||||
# Label each grouped entity_id as "<stem>_<leaf>", where stem is the first id
# of its rule group and leaf is the id's position inside that group. The
# lookup is built once; an id listed in several groups takes the label of the
# group defined last in rule_set, matching the previous row-by-row loop.
seq_of = {
    member: f"{members[0]}_{members.index(member)}"
    for members in rule_set.values()
    for member in members
}
new_df = train_df.copy()
# ids outside every group get NaN; the column now exists even if nothing matches
new_df['entity_seq'] = train_df['entity_id'].map(seq_of)
|  | # %% | ||||||
|  | len(set(new_df['entity_seq'].to_list())) | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | new_df.to_csv('train_seq.csv') | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # now do the same for the test data | ||||||
|  | # import test file | ||||||
|  | data_path = '../esAppMod_data_import/test.csv' | ||||||
|  | test_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | 
 | ||||||
# Label each grouped entity_id as "<stem>_<leaf>", where stem is the first id
# of its rule group and leaf is the id's position inside that group. The
# lookup is built once; an id listed in several groups takes the label of the
# group defined last in rule_set, matching the previous row-by-row loop.
seq_of = {
    member: f"{members[0]}_{members.index(member)}"
    for members in rule_set.values()
    for member in members
}
new_df = test_df.copy()
# ids outside every group get NaN; the column now exists even if nothing matches
new_df['entity_seq'] = test_df['entity_id'].map(seq_of)
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | new_df | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | new_df.to_csv('test_seq.csv') | ||||||
|  | # %% | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| 
 | 
 | ||||||
| ******************************************************************************* | ******************************************************************************* | ||||||
| Accuracy: 0.77655 | Accuracy: 0.80197 | ||||||
| F1 Score: 0.79605 | F1 Score: 0.81948 | ||||||
| Precision: 0.85637 | Precision: 0.88067 | ||||||
| Recall: 0.77655 | Recall: 0.80197 | ||||||
|  | @ -32,6 +32,8 @@ torch.set_float32_matmul_precision('high') | ||||||
| BATCH_SIZE = 256 | BATCH_SIZE = 256 | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
|  | # construct the target id list | ||||||
|  | # data_path = '../../../esAppMod_data_import/train.csv' | ||||||
| data_path = '../../../esAppMod_data_import/train.csv' | data_path = '../../../esAppMod_data_import/train.csv' | ||||||
| train_df = pd.read_csv(data_path, skipinitialspace=True) | train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| # rather than use pattern, we use the real thing and property | # rather than use pattern, we use the real thing and property | ||||||
|  | @ -52,19 +54,8 @@ def preprocess_text(text): | ||||||
|     # 1. Make all lowercase |     # 1. Make all lowercase | ||||||
|     text = text.lower() |     text = text.lower() | ||||||
| 
 | 
 | ||||||
|     # Remove any non alphanumeric character |  | ||||||
|     # text = re.sub(r'[^\w\s]', ' ', text)  # Retains only alphanumeric and spaces |  | ||||||
|     text = re.sub(r"[-;:]", " ", text) |  | ||||||
| 
 |  | ||||||
|     # Add space between digit followed by a letter |  | ||||||
|     text = re.sub(r"(\d)([A-Z])", r"\1 \2", text) |  | ||||||
| 
 |  | ||||||
|     # Add space between letter followed by a digit |  | ||||||
|     text = re.sub(r"([A-Z])(\d)", r"\1 \2", text) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     # Substitute digits with '#' |     # Substitute digits with '#' | ||||||
|     text = re.sub(r'\d+', 'x', text) |     # text = re.sub(r'\d+', '#', text) | ||||||
| 
 | 
 | ||||||
|     # standardize spacing |     # standardize spacing | ||||||
|     text = re.sub(r'\s+', ' ', text).strip() |     text = re.sub(r'\s+', ' ', text).strip() | ||||||
|  | @ -0,0 +1,562 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import re | ||||||
|  | import random | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | from transformers import ( | ||||||
|  |     AutoTokenizer, | ||||||
|  |     AutoModelForSequenceClassification, | ||||||
|  |     DataCollatorWithPadding, | ||||||
|  |     Trainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     TrainingArguments | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
def set_seed(seed):
    """Seed every random number generator used by the script.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU, current CUDA device
    and all CUDA devices), then switches cuDNN to deterministic mode with
    benchmarking turned off.

    Args:
        seed (int): The seed value passed to each generator.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch current GPU
        torch.cuda.manual_seed_all,  # PyTorch, every GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # deterministic kernels
    cudnn.benchmark = False  # no autotuned kernel selection
|  | 
 | ||||||
|  | set_seed(42) | ||||||
|  | 
 | ||||||
|  | SHUFFLES=10 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # import training file | ||||||
|  | data_path = '../../esAppMod_data_import/train.csv' | ||||||
|  | train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
|  | # rather than use pattern, we use the real thing and property | ||||||
|  | entity_ids = train_df['entity_id'].to_list() | ||||||
|  | target_id_list = sorted(list(set(entity_ids))) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | id2label = {} | ||||||
|  | label2id = {} | ||||||
|  | for idx, val in enumerate(target_id_list): | ||||||
|  |     id2label[idx] = val | ||||||
|  |     label2id[val] = idx | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | # introduce pre-processing functions | ||||||
def preprocess_text(text):
    """Normalise a mention string before tokenisation.

    The text is lower-cased, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed. Digits are left as-is.

    Args:
        text (str): Raw input string.

    Returns:
        str: The normalised string.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
|  | 
 | ||||||
|  | 
 | ||||||
def generate_random_shuffles(text, n):
    """Produce ``n`` copies of *text* with its words in random order.

    Args:
        text (str): Source string; it is split on whitespace.
        n (int): How many shuffled variants to create.

    Returns:
        list: ``n`` strings, each containing the same words as *text* joined
        by single spaces in a randomly permuted order.
    """
    tokens = text.split()
    variants = []

    for _ in range(n):
        permuted = list(tokens)  # fresh copy so the source order is never touched
        random.shuffle(permuted)
        variants.append(" ".join(permuted))

    return variants
|  | 
 | ||||||
|  | 
 | ||||||
|  | # generate n more shuffled examples | ||||||
def shuffle_text(text, n_shuffles=SHUFFLES):
    """Return *text* followed by ``n_shuffles`` word-shuffled copies of it.

    Args:
        text (str): A single input string.
        n_shuffles (int): Number of shuffled variants to append.

    Returns:
        list: The original string first, then the variants produced by
        ``generate_random_shuffles``.
    """
    variants = [text]  # the untouched original always comes first
    variants += generate_random_shuffles(text, n_shuffles)
    return variants
|  | 
 | ||||||
# Abbreviation -> expanded term. Keys and values are plain lowercase text
# (not regular expressions); they are escaped when the patterns are built.
acronym_mapping = {
    'hpsa': 'hp server automation',
    'tam': 'tivoli access manager',
    'adf': 'application development facility',
    'html': 'hypertext markup language',
    'wff': 'microsoft web farm framework',
    'jsp': 'javaserver pages',
    'bw': 'business works',
    'ssrs': 'sql server reporting services',
    'cl': 'control language',
    'vba': 'visual basic for applications',
    'esapi': 'enterprise security api',
    'gwt': 'google web toolkit',
    'pki': 'perkin elmer informatics',
    'rtd': 'oracle realtime decisions',
    'jms': 'java message service',
    'db': 'database',
    'soa': 'service oriented architecture',
    'xsl': 'extensible stylesheet language',
    'com': 'component object model',
    'ldap': 'lightweight directory access protocol',
    'odm': 'ibm operational decision manager',
    'soql': 'salesforce object query language',
    'oms': 'order management system',
    'cfml': 'coldfusion markup language',
    'nas': 'netscape application server',
    'sql': 'structured query language',
    'bde': 'borland database engine',
    'imap': 'internet message access protocol',
    'uws': 'ultidev web server',
    'birt': 'business intelligence and reporting tools',
    'mdw': 'model driven workflow',
    'tws': 'tivoli workload scheduler',
    'jre': 'java runtime environment',
    'wcs': 'websphere commerce suite',
    'was': 'websphere application server',
    'ssis': 'sql server integration services',
    'xhtml': 'extensible hypertext markup language',
    'soap': 'simple object access protocol',
    'san': 'storage area network',
    'elk': 'elastic stack',
    'arr': 'application request routing',
    'xlst': 'extensible stylesheet language transformations',
    'sccm': 'microsoft endpoint configuration manager',
    'ejb': 'enterprise java beans',
    'css': 'cascading style sheets',
    'hpoo': 'hp operations orchestration',
    'xml': 'extensible markup language',
    'esb': 'enterprise service bus',
    'edi': 'electronic data interchange',
    'imsva': 'interscan messaging security virtual appliance',
    'wtx': 'ibm websphere transformation extender',
    'cgi': 'common gateway interface',
    'bal': 'ibm basic assembly language',
    'issow': 'integrated safe system of work',
    'dcl': 'data control language',
    'jdom': 'java document object model',
    'fim': 'microsoft forefront identity manager',
    'npl': 'niakwa programming language',
    'wf': 'windows workflow foundation',
    'lm': 'etap license manager',
    'wts': 'windows terminal server',
    'asp': 'active server pages',
    'jil': 'job information language',
    'mvc': 'model view controller',
    'rmi': 'remote method invocation',
    'ad': 'active directory',
    'owb': 'oracle warehouse builder',
    'rest': 'representational state transfer',
    'jdk': 'java development kit',
    'ids': 'integrated data store',
    'bms': 'batch management software',
    'vsx': 'vmware solution exchange',
    'ssas': 'sql server analysis services',
    'atl': 'atlas transformation language',
    'ice': 'infobright community edition',
    'esql': 'extended structured query language',
    'corba': 'common object request broker architecture',
    'dpe': 'device provisioning engines',
    'rac': 'oracle real application clusters',
    'iemt': 'iis easy migration tool',
    'mes': 'manufacturing execution system',
    'odbc': 'open database connectivity',
    'lms': 'lan management solution',
    'wcf': 'windows communication foundation',
    'nes': 'netscape enterprise server',
    'jsf': 'javaserver faces',
    'alm': 'application lifecycle management',
    'hlasm': 'high level assembler',
    'cmod': 'content manager ondemand',
}

# Additional abbreviations merged into acronym_mapping below.
# Insertion order matters: 'vb.net' must be tried before 'vb' and '.net'.
external_source = {
    'vb.net': 'visual basic dot net',
    'jes': 'job entry subsystem',
    'svn': 'subversion',
    'vcs': 'version control system',
    'lims': 'laboratory information management system',
    'ide': 'integrated development environment',
    'sdk': 'software development kit',
    'mq': 'message queue',
    'ims': 'information management system',
    'isa': 'internet security and acceleration',
    'vs': 'visual studio',
    'esr': 'extended support release',
    'ff': 'firefox',
    'vb': 'visual basic',
    'rhel': 'red hat enterprise linux',
    'iis': 'internet information server',
    'api': 'application programming interface',
    'se': 'standard edition',
    '.net': 'dot net',
    'c#': 'c sharp',
}


# synonyms = {
#  'windows server': 'windows nt',
#  'windows 7': 'windows desktop',
#  'windows 8': 'windows desktop',
#  'windows 10': 'windows desktop'
# }


# add more information
acronym_mapping.update(external_source)


def _token_pattern(literal):
    """Return a regex that matches `literal` only as a standalone token.

    The text is escaped so characters such as '.' and '#' match themselves.
    Lookarounds are used instead of ``\\b`` because ``\\b`` cannot delimit
    tokens that begin or end with a non-word character ('.net', 'c#').
    """
    return rf'(?<!\w){re.escape(literal)}(?!\w)'


# regex pattern -> replacement text, in the insertion order of acronym_mapping
abbrev_to_term = {_token_pattern(key): value for key, value in acronym_mapping.items()}
term_to_abbrev = {_token_pattern(value): key for key, value in acronym_mapping.items()}


def _substitute_all(text, table):
    """Apply every pattern -> replacement pair of `table` to `text`, in order."""
    for pattern, replacement in table.items():
        # A function replacement inserts the text literally, so replacement
        # strings are never interpreted as regex escape sequences.
        text = re.sub(pattern, lambda _match, rep=replacement: rep, text)
    return text


def replace_terms_with_abbreviations(text):
    """Replace every expanded term found in `text` with its abbreviation.

    Args:
        text (str): Lowercased input text.

    Returns:
        str: The text with whole-token occurrences of known terms abbreviated.
    """
    return _substitute_all(text, term_to_abbrev)


def replace_abbreviations_with_terms(text):
    """Replace every abbreviation found in `text` with its expanded term.

    Only standalone tokens are replaced ('sql' in 'mysql' is left alone).
    NOTE(review): some keys are also ordinary English words ('was', 'rest',
    'com', 'ad'); they are expanded wherever they appear as a token.

    Args:
        text (str): Lowercased input text.

    Returns:
        str: The text with whole-token abbreviations expanded.
    """
    return _substitute_all(text, abbrev_to_term)
|  | 
 | ||||||
|  | ###################################### | ||||||
|  | 
 | ||||||
|  | # augmentation by text corruption | ||||||
|  | 
 | ||||||
def corrupt_word(word):
    """Return `word` with one random typo: a deleted character or an adjacent swap.

    Words of length 0 or 1 are returned unchanged.
    """
    if len(word) <= 1:
        return word

    mode = random.choice(["delete", "swap"])

    if mode == "delete":
        # drop the character at a random position
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos + 1:]

    # mode == "swap": exchange the characters at pos and pos + 1
    pos = random.randint(0, len(word) - 2)
    chars = list(word)
    chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
    return "".join(chars)
|  | 
 | ||||||
def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each whitespace-separated word independently with the given probability.

    The result is re-joined with single spaces.
    """
    pieces = []
    for token in sentence.split():
        # one random draw per word decides whether that word is corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        pieces.append(token)
    return " ".join(pieces)
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
# Turns a dataframe into a list of dictionaries.
# Each dictionary maps an input text to its output class label:
#   'text'  : a (possibly augmented) mention string
#   'label' : class index taken from label2id
# label_flag_list records the entity ids whose entity_name has already been
# added, so the name-based examples are generated once per entity.
label_flag_list = []


def process_df_to_dict(df):
    """Expand every row of `df` into original and augmented training examples.

    Reads the 'entity_id', 'mention' and 'entity_name' columns. For each row,
    in this order:
      1. mentions shorter than 3 words are added 10 times,
      2. the first time an entity id is seen: its entity_name, the
         punctuation-free mention (if different) and shuffles of the
         entity_name are added, and the id is appended to label_flag_list,
      3. shuffles of the mention that differ from it,
      4. a randomly corrupted mention, if it differs,
      5. the punctuation-free mention, if it differs,
      6. the mention with abbreviations expanded, if it differs.

    Returns:
        list: dictionaries with keys 'text' and 'label'.
    """
    output_list = []

    def emit(text, entity_id):
        # the label is looked up at the moment the example is created
        output_list.append({
            'text': text,
            'label': label2id[entity_id],
        })

    for _, row in df.iterrows():
        index = row['entity_id']
        parent_desc = preprocess_text(row['mention'])

        # short sequences are rare, so they are repeated to avoid being
        # drowned out by the augmentations of longer sequences
        if len(parent_desc.split()) < 3:
            for _ in range(10):
                emit(parent_desc, index)

        # punctuation replaced by spaces; word characters and whitespace kept
        punctuation_free = re.sub(r'[^\w\s]', ' ', parent_desc)
        has_punctuation = punctuation_free != parent_desc

        # first occurrence of this entity id: add name-based examples
        if index not in label_flag_list:
            entity_name = row['entity_name']
            emit(entity_name, index)

            if has_punctuation:
                emit(punctuation_free, index)

            # shuffles of the entity name (the unshuffled name is included by shuffle_text)
            for variant in shuffle_text(entity_name, n_shuffles=SHUFFLES):
                if variant != parent_desc:
                    emit(variant, index)

            label_flag_list.append(index)

        # shuffled versions of the mention
        for variant in shuffle_text(parent_desc, n_shuffles=SHUFFLES):
            if variant != parent_desc:
                emit(variant, index)

        # character-level corruption
        corrupted = corrupt_string(parent_desc, corruption_probability=0.1)
        if corrupted != parent_desc:
            emit(corrupted, index)

        # punctuation-free version of the mention
        if has_punctuation:
            emit(punctuation_free, index)

        # The reverse augmentation (replace_terms_with_abbreviations on the
        # punctuation-free mention) was commented out and is not applied.

        # abbreviations expanded to their full terms
        expanded = replace_abbreviations_with_terms(parent_desc)
        if expanded != parent_desc:
            emit(expanded, index)

    return output_list
|  | 
 | ||||||
|  | 
 | ||||||
def create_dataset():
    """Load the training CSV and wrap the augmented examples in a DatasetDict.

    Returns:
        DatasetDict: a single 'train' split built from process_df_to_dict.
    """
    train_df = pd.read_csv('../../esAppMod_data_import/train.csv', skipinitialspace=True)
    train_records = process_df_to_dict(train_df)
    return DatasetDict({'train': Dataset.from_list(train_records)})
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
def train():
    """Fine-tune a DistilBERT sequence classifier on the augmented training set.

    Builds the dataset with create_dataset(), tokenizes it, and runs a
    Hugging Face Trainer without evaluation. Checkpoints go to 'checkpoint'
    and logs to 'tensorboard-log'.
    """
    save_path = 'checkpoint'
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # alternatives that were tried:
    #   'google-bert/bert-base-cased', 'prajjwal1/bert-small'
    max_length = 120

    split_datasets = create_dataset()

    # tokenizer (no additional special tokens are registered)
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )

    def tokenize_batch(example):
        # tokenizes the 'text' column; the 'label' column is carried through untouched
        return tokenizer(
            example['text'],
            max_length=max_length,
            truncation=True,
            padding=True,
        )

    # map applies the function to batches of rows of every split
    tokenized_datasets = split_datasets.map(
        tokenize_batch,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %%
    # pads each batch at collation time
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # accuracy metric (only used if evaluation is enabled)
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        logits, references = eval_preds
        predictions = np.argmax(logits, axis=1)
        return metric.compute(predictions=predictions, references=references)

    # %%
    # id2label / label2id are module-level lookup tables
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # keeps the embedding matrix in sync with the tokenizer vocabulary size
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)

    # %%
    # Trainer configuration
    training_args = TrainingArguments(
        output_dir=save_path,
        eval_strategy="no",  # alternative: "epoch"
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # to resume from a checkpoint instead:
    # trainer.train(resume_from_checkpoint='default_40_1/checkpoint-5600')
    trainer.train()


# execute training
train()
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | checkpoint* | ||||||
|  | tensorboard-log | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | exports | ||||||
|  | @ -0,0 +1,11 @@ | ||||||
|  | 
 | ||||||
|  | ******************************************************************************* | ||||||
|  | Accuracy: 0.71956 | ||||||
|  | F1 Score: 0.74142 | ||||||
|  | Precision: 0.81529 | ||||||
|  | Recall: 0.71956 | ||||||
|  | ******************************************************************************** | ||||||
|  | Accuracy: 0.71710 | ||||||
|  | F1 Score: 0.74095 | ||||||
|  | Precision: 0.82181 | ||||||
|  | Recall: 0.71710 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | ******************************************************************************* | ||||||
|  | Accuracy: 0.81591 | ||||||
|  | F1 Score: 0.82162 | ||||||
|  | Precision: 0.85519 | ||||||
|  | Recall: 0.81591 | ||||||
|  | @ -0,0 +1,6 @@ | ||||||
|  | 
 | ||||||
|  | ******************************************************************************* | ||||||
|  | Accuracy: 0.59943 | ||||||
|  | F1 Score: 0.60266 | ||||||
|  | Precision: 0.66956 | ||||||
|  | Recall: 0.59943 | ||||||
|  | @ -0,0 +1,265 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import re | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | 
 | ||||||
|  | from transformers import ( | ||||||
|  |     AutoTokenizer, | ||||||
|  |     AutoModelForSequenceClassification, | ||||||
|  |     DataCollatorWithPadding, | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | from tqdm import tqdm | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 256 | ||||||
|  | 
 | ||||||
# %%
# Build the target id list from the entity ids that occur in the training CSV
# (the real ids are used rather than a pattern).
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(set(entity_ids))


# %%
# class index <-> entity id lookup tables; indices follow the sorted id order
id2label = dict(enumerate(target_id_list))
label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
|  | 
 | ||||||
|  | 
 | ||||||
# text normalisation applied to every mention
def preprocess_text(text):
    """Lowercase `text`, mask digit runs with '#', and collapse whitespace."""
    lowered = text.lower()

    # every run of digits becomes a single '#'
    masked = re.sub(r'\d+', '#', lowered)

    # runs of whitespace become one space; leading/trailing whitespace removed
    return re.sub(r'\s+', ' ', masked).strip()
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
# Turns a dataframe into a list of dictionaries, one per row:
#   'text'  : the preprocessed 'mention'
#   'label' : class index of the row's 'entity_id'
def process_df_to_dict(df):
    """Map each row of `df` to a {'text', 'label'} dictionary."""
    return [
        {
            'text': preprocess_text(row['mention']),
            'label': label2id[row['entity_id']],  # class indices start at 0
        }
        for _, row in df.iterrows()
    ]
|  | 
 | ||||||
|  | 
 | ||||||
def create_dataset():
    """Load the parent test CSV and return it as a Dataset of text/label rows."""
    # alternative input: '../../../esAppMod_data_import/test.csv'
    test_df = pd.read_csv('../../../esAppMod_data_import/parent_test.csv', skipinitialspace=True)
    test_records = process_df_to_dict(test_df)
    return Dataset.from_list(test_records)
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
def test():
    """Evaluate the saved classifier on the test set and export the results.

    Loads the first checkpoint matching '../checkpoint/checkpoint_part1-*',
    runs batched inference, appends accuracy and weighted F1 / precision /
    recall to 'output_1.txt', and writes the predicted entity ids to
    'exports/result_1.csv'.
    """
    test_dataset = create_dataset()

    # locate the checkpoint; training keeps a single checkpoint, so the
    # first glob match is used
    checkpoint_directory = '../checkpoint'
    pattern = 'checkpoint_part1-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )

    # %%
    # report the longest untruncated token sequence in the test set
    longest = 0
    for sample in test_dataset['text']:
        token_count = len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
        longest = max(longest, token_count)
    print(longest)

    # %%
    max_length = 128

    def tokenize_batch(example):
        # every sample is padded to max_length; truncation is not enabled
        return tokenizer(
            example['text'],
            max_length=max_length,
            padding='max_length',
        )

    # map applies the function to batches of rows
    datasets = test_dataset.map(
        tokenize_batch,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # keeps the embedding matrix in sync with the tokenizer vocabulary size
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []

    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # keep the reference labels of this batch
        actual_labels.extend(batch['label'])

        # move inputs to the inference device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with torch.no_grad():
            logits = model(input_ids, attention_mask).logits
            batch_predictions = logits.argmax(dim=1).to("cpu")
        pred_labels.extend(batch_predictions.tolist())

    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

    y_true = actual_labels
    y_pred = pred_labels

    # metrics are weighted by class support; undefined ratios count as 0
    average_parameter = 'weighted'
    zero_division_parameter = 0
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    # append the scores to the report file
    with open("output_1.txt", "a") as f:
        print('*' * 80, file=f)
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export the predicted entity ids, one row per test sample
    label_list = [id2label[class_idx] for class_idx in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    df.to_csv("exports/result_1.csv", index=False)
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
# Reset the results file, then run the evaluation.
# test() appends to output_1.txt through its own file handle, so this
# write-mode handle must be closed first: if it were still open, its buffered
# newline would only be flushed on close, at offset 0, overwriting the first
# byte of the results that test() had already appended.
with open("output_1.txt", "w") as f:
    print('', file=f)

test()
|  | @ -0,0 +1,265 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import re | ||||||
|  | import torch | ||||||
|  | from torch.utils.data import DataLoader | ||||||
|  | 
 | ||||||
|  | from transformers import ( | ||||||
|  |     AutoTokenizer, | ||||||
|  |     AutoModelForSequenceClassification, | ||||||
|  |     DataCollatorWithPadding, | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | from tqdm import tqdm | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | BATCH_SIZE = 256 | ||||||
|  | 
 | ||||||
# %%
# Build the class vocabulary from the training CSV: every distinct entity_id,
# in sorted order, is one target class.
data_path = '../../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(set(entity_ids))


# %%
# Two-way lookup between the contiguous class index (0..n-1) and the entity id.
id2label = dict(enumerate(target_id_list))
label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
|  | 
 | ||||||
|  | 
 | ||||||
|  | # introduce pre-processing functions | ||||||
def preprocess_text(text):
    """Normalise a mention string before it is tokenised.

    The text is lower-cased, every run of digits is collapsed into a single
    '#', and every run of whitespace is squeezed to one space with the ends
    trimmed.

    Args:
        text (str): Raw mention text.

    Returns:
        str: The normalised text.
    """
    lowered = text.lower()
    digits_masked = re.sub(r'\d+', '#', lowered)
    return re.sub(r'\s+', ' ', digits_masked).strip()
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def process_df_to_dict(df):
    """Turn a dataframe of mentions into a list of classifier examples.

    Args:
        df: Dataframe with a 'mention' column (input text) and an 'entity_id'
            column (target class).

    Returns:
        list[dict]: One dict per row, with 'text' holding the preprocessed
        mention and 'label' holding the entity id mapped through label2id so
        that labels start from 0.
    """
    return [
        {
            'text': preprocess_text(row['mention']),
            'label': label2id[row['entity_id']],
        }
        for _, row in df.iterrows()
    ]
|  | 
 | ||||||
|  | 
 | ||||||
def create_dataset():
    """Load the test CSV and wrap its examples in a `Dataset`.

    Returns:
        Dataset: One entry per row of test.csv, as produced by
        process_df_to_dict ('text' and 'label' fields).
    """
    test_csv_path = '../../../esAppMod_data_import/test.csv'
    test_df = pd.read_csv(test_csv_path, skipinitialspace=True)
    examples = process_df_to_dict(test_df)
    return Dataset.from_list(examples)
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
def test():
    """Evaluate the saved classifier on the test set and export the results.

    Steps:
      1. Build the test dataset with create_dataset().
      2. Load the tokenizer and model from the first 'checkpoint-*' directory
         found under '../checkpoint'.
      3. Tokenise every example padded to 128 tokens and run batched
         inference without gradients.
      4. Append accuracy and weighted F1 / precision / recall to
         'output_2.txt' and write the predicted entity ids to
         'exports/result_2.csv'.

    Raises:
        FileNotFoundError: If no checkpoint directory matches the pattern.
    """
    # sklearn is only needed by this function, so it is imported locally.
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    test_dataset = create_dataset()

    # Locate the checkpoint. Training is expected to leave exactly one
    # 'checkpoint-<step number>' directory; fail with a readable message
    # instead of a bare IndexError when there is none.
    checkpoint_directory = '../checkpoint'
    checkpoint_matches = glob.glob(os.path.join(checkpoint_directory, 'checkpoint-*'))
    if not checkpoint_matches:
        raise FileNotFoundError(
            f"no 'checkpoint-*' directory found under {checkpoint_directory!r}"
        )
    model_checkpoint = checkpoint_matches[0]

    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True)

    # Diagnostic only: print the longest untruncated token length in the test
    # set so it can be compared by eye against the fixed max_length below.
    longest_sample = max(
        (
            len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
            for sample in test_dataset['text']
        ),
        default=0,
    )
    print(longest_sample)

    max_length = 128

    def preprocess_function(example):
        """Tokenise a batch of texts, padding each one to max_length."""
        # NOTE(review): truncation is deliberately left off, as in the
        # original; presumably no sample exceeds max_length — confirm against
        # the length printed above.
        return tokenizer(
            example['text'],
            max_length=max_length,
            padding='max_length'
        )

    # map applies the function to batches of rows; the raw text column is
    # dropped once it has been tokenised.
    tokenized = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keep the embedding matrix in sync with the tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []

    dataloader = DataLoader(tokenized, batch_size=BATCH_SIZE, shuffle=False)
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Store labels as plain ints, the same representation used for
            # the predictions, rather than as 0-d tensors.
            actual_labels.extend(batch['label'].tolist())

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask).logits
            pred_labels.extend(logits.argmax(dim=1).to("cpu").tolist())

    # Compute metrics
    y_true = actual_labels
    y_pred = pred_labels
    average_parameter = 'weighted'
    zero_division_parameter = 0
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output_2.txt", "a") as f:
        print('*' * 80, file=f)
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # Export the predictions as entity ids. Create the target directory first
    # so a missing folder does not discard a finished inference run.
    os.makedirs("exports", exist_ok=True)
    label_list = [id2label[class_id] for class_id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    df.to_csv("exports/result_2.csv", index=False)
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
# Reset the results file, then run the evaluation.
# test() appends to output_2.txt through its own file handle, so this
# write-mode handle must be closed first: if it were still open, its buffered
# newline would only be flushed on close, at offset 0, overwriting the first
# byte of the results that test() had already appended.
with open("output_2.txt", "w") as f:
    print('', file=f)

test()
|  | @ -45,7 +45,7 @@ def set_seed(seed): | ||||||
| 
 | 
 | ||||||
| set_seed(42) | set_seed(42) | ||||||
| 
 | 
 | ||||||
| SHUFFLES=2 | SHUFFLES=5 | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| 
 | 
 | ||||||
|  | @ -56,37 +56,6 @@ train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| entity_ids = train_df['entity_id'].to_list() | entity_ids = train_df['entity_id'].to_list() | ||||||
| target_id_list = sorted(list(set(entity_ids))) | target_id_list = sorted(list(set(entity_ids))) | ||||||
| 
 | 
 | ||||||
| def compute_normalized_class_weights(class_counts, max_resamples=SHUFFLES): |  | ||||||
|     """ |  | ||||||
|     Compute normalized class weights inversely proportional to class counts. |  | ||||||
|     The weights are normalized so that they sum to 1. |  | ||||||
| 
 |  | ||||||
|     Args: |  | ||||||
|         class_counts (array-like): An array or list where each element represents the count of samples for a class. |  | ||||||
| 
 |  | ||||||
|     Returns: |  | ||||||
|         numpy.ndarray: A normalized array of weights for each class. |  | ||||||
|     """ |  | ||||||
|     class_counts = np.array(class_counts) |  | ||||||
|     total_samples = np.sum(class_counts) |  | ||||||
|     class_weights = total_samples / class_counts |  | ||||||
|     # so that highest weight is 1 |  | ||||||
|     normalized_weights = class_weights / np.max(class_weights) |  | ||||||
|     # Scale weights such that the highest weight corresponds to `max_resamples` |  | ||||||
|     resample_counts = normalized_weights * max_resamples |  | ||||||
|     # Round resamples to nearest integer |  | ||||||
|     resample_counts = np.round(resample_counts).astype(int) |  | ||||||
|     return resample_counts |  | ||||||
| 
 |  | ||||||
| # %% |  | ||||||
| id_counts = train_df['entity_id'].value_counts() |  | ||||||
| id_weights = compute_normalized_class_weights(id_counts, max_resamples=SHUFFLES) |  | ||||||
| id_index = id_counts.index |  | ||||||
| label2weight = {} |  | ||||||
| for idx, label in enumerate(id_index): |  | ||||||
|     label2weight[label] = id_weights[idx] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # %% | # %% | ||||||
| id2label = {} | id2label = {} | ||||||
| label2id = {} | label2id = {} | ||||||
|  | @ -101,20 +70,8 @@ def preprocess_text(text): | ||||||
|     # 1. Make all uppercase |     # 1. Make all uppercase | ||||||
|     text = text.lower() |     text = text.lower() | ||||||
| 
 | 
 | ||||||
|     # Remove any non alphanumeric character |  | ||||||
|     # text = re.sub(r'[^\w\s]', ' ', text)  # Retains only alphanumeric and spaces |  | ||||||
|     # replace dashes |  | ||||||
|     text = re.sub(r"[-;:]", " ", text) |  | ||||||
| 
 |  | ||||||
|     # Add space between digit followed by a letter |  | ||||||
|     text = re.sub(r"(\d)([A-Z])", r"\1 \2", text) |  | ||||||
| 
 |  | ||||||
|     # Add space between letter followed by a digit |  | ||||||
|     text = re.sub(r"([A-Z])(\d)", r"\1 \2", text) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     # Substitute digits with 'x' |     # Substitute digits with 'x' | ||||||
|     text = re.sub(r'\d+', 'x', text) |     text = re.sub(r'\d+', '#', text) | ||||||
| 
 | 
 | ||||||
|     # standardize spacing |     # standardize spacing | ||||||
|     text = re.sub(r'\s+', ' ', text).strip() |     text = re.sub(r'\s+', ' ', text).strip() | ||||||
|  | @ -165,35 +122,143 @@ def shuffle_text(text, n_shuffles=SHUFFLES): | ||||||
|      |      | ||||||
|     return all_processed |     return all_processed | ||||||
| 
 | 
 | ||||||
| term_to_abbrev = { | acronym_mapping = { | ||||||
|     r'job entry system': 'jes', |  'hpsa': 'hp server automation', | ||||||
|     r'subversion': 'svn', |  'tam': 'tivoli access manager', | ||||||
|     r'borland database engine': 'bde', |  'adf': 'application development facility', | ||||||
|     r'business intelligence and reporting tools': 'birt', |  'html': 'hypertext markup language', | ||||||
|     r'lan management solution': 'lms', |  'wff': 'microsoft web farm framework', | ||||||
|     r'laboratory information management system': 'lims', |  'jsp': 'javaserver pages', | ||||||
|     r'ibm database 2': 'db/2', |  'bw': 'business works', | ||||||
|     r'integrated development environment': 'ide', |  'ssrs': 'sql server reporting services', | ||||||
|     r'software development kit': 'sdk', |  'cl': 'control language', | ||||||
|     r'hp operations orchestration': 'hpoo', |  'vba': 'visual basic for applications', | ||||||
|     r'hp server automation': 'hpsa', |  'esapi': 'enterprise security api', | ||||||
|     r'internet information server': 'iis', |  'gwt': 'google web toolkit', | ||||||
|     r'release 2': 'r2', |  'pki': 'perkin elmer informatics', | ||||||
|     r'red hat enterprise linux': 'rhel', |  'rtd': 'oracle realtime decisions', | ||||||
|     r'oracle enterprise linux': 'oel', |  'jms': 'java message service', | ||||||
|     r'websphere application server': 'was', |  'db': 'database', | ||||||
|     r'application development facility': 'adf', |  'soa': 'service oriented architecture', | ||||||
|     r'server analysis services': 'ssas' |  'xsl': 'extensible stylesheet language', | ||||||
|  |  'com': 'compopent object model', | ||||||
|  |  'ldap': 'lightweight directory access protocol', | ||||||
|  |  'odm': 'ibm operational decision manager', | ||||||
|  |  'soql': 'salesforce object query language', | ||||||
|  |  'oms': 'order management system', | ||||||
|  |  'cfml': 'coldfusion markup language', | ||||||
|  |  'nas': 'netscape application server', | ||||||
|  |  'sql': 'structured query language', | ||||||
|  |  'bde': 'borland database engine', | ||||||
|  |  'imap': 'internet message access protocol', | ||||||
|  |  'uws': 'ultidev web server', | ||||||
|  |  'birt': 'business intelligence and reporting tools', | ||||||
|  |  'mdw': 'model driven workflow', | ||||||
|  |  'tws': 'tivoli workload scheduler', | ||||||
|  |  'jre': 'java runtime environment', | ||||||
|  |  'wcs': 'websphere commerce suite', | ||||||
|  |  'was': 'websphere application server', | ||||||
|  |  'ssis': 'sql server integration services', | ||||||
|  |  'xhtml': 'extensible hypertext markup language', | ||||||
|  |  'soap': 'simple object access protocol', | ||||||
|  |  'san': 'storage area network', | ||||||
|  |  'elk': 'elastic stack', | ||||||
|  |  'arr': 'application request routing', | ||||||
|  |  'xlst': 'extensible stylesheet language transformations', | ||||||
|  |  'sccm': 'microsoft endpoint configuration manager', | ||||||
|  |  'ejb': 'enterprise java beans', | ||||||
|  |  'css': 'cascading style sheets', | ||||||
|  |  'hpoo': 'hp operations orchestration', | ||||||
|  |  'xml': 'extensible markup language', | ||||||
|  |  'esb': 'enterprise service bus', | ||||||
|  |  'edi': 'electronic data interchange', | ||||||
|  |  'imsva': 'interscan messaging security virtual appliance', | ||||||
|  |  'wtx': 'ibm websphere transformation extender', | ||||||
|  |  'cgi': 'common gateway interface', | ||||||
|  |  'bal': 'ibm basic assembly language', | ||||||
|  |  'issow': 'integrated safe system of work', | ||||||
|  |  'dcl': 'data control language', | ||||||
|  |  'jdom': 'java document object model', | ||||||
|  |  'fim': 'microsoft forefront identity manager', | ||||||
|  |  'npl': 'niakwa programming language', | ||||||
|  |  'wf': 'windows workflow foundation', | ||||||
|  |  'lm': 'etap license manager', | ||||||
|  |  'wts': 'windows terminal server', | ||||||
|  |  'asp': 'active server pages', | ||||||
|  |  'jil': 'job information language', | ||||||
|  |  'mvc': 'model view controller', | ||||||
|  |  'rmi': 'remote method invocation', | ||||||
|  |  'ad': 'active directory', | ||||||
|  |  'owb': 'oracle warehouse builder', | ||||||
|  |  'rest': 'representational state transfer', | ||||||
|  |  'jdk': 'java development kit', | ||||||
|  |  'ids': 'integrated data store', | ||||||
|  |  'bms': 'batch management software', | ||||||
|  |  'vsx': 'vmware solution exchange', | ||||||
|  |  'ssas': 'sql server analysis services', | ||||||
|  |  'atl': 'atlas transformation language', | ||||||
|  |  'ice': 'infobright community edition', | ||||||
|  |  'esql': 'extended structured query language', | ||||||
|  |  'corba': 'common object request broker architecture', | ||||||
|  |  'dpe': 'device provisioning engines', | ||||||
|  |  'rac': 'oracle real application clusters', | ||||||
|  |  'iemt': 'iis easy migration tool', | ||||||
|  |  'mes': 'manufacturing execution system', | ||||||
|  |  'odbc': 'open database connectivity', | ||||||
|  |  'lms': 'lan management solution', | ||||||
|  |  'wcf': 'windows communication foundation', | ||||||
|  |  'nes': 'netscape enterprise server', | ||||||
|  |  'jsf': 'javaserver faces', | ||||||
|  |  'alm': 'application lifecycle management', | ||||||
|  |  'hlasm': 'high level assembler', | ||||||
|  |  'cmod': 'content manager ondemand'} | ||||||
|  | 
 | ||||||
|  | external_source = { | ||||||
|  |  'vb.net': 'visual basic dot net', | ||||||
|  |  'jes': 'job entry subsystem', | ||||||
|  |  'svn': 'subversion', | ||||||
|  |  'vcs': 'version control system', | ||||||
|  |  'lims': 'laboratory information management system', | ||||||
|  |  'ide': 'integrated development environment', | ||||||
|  |  'sdk': 'software development kit', | ||||||
|  |  'mq': 'message queue', | ||||||
|  |  'ims': 'information management system', | ||||||
|  |  'isa': 'internet security and acceleration', | ||||||
|  |  'vs': 'visual studio', | ||||||
|  |  'esr': 'extended support release', | ||||||
|  |  'ff': 'firefox', | ||||||
|  |  'vb': 'visual basic', | ||||||
|  |  'rhel': 'red hat enterprise linux', | ||||||
|  |  'iis': 'internet information server', | ||||||
|  |  'api': 'application programming interface', | ||||||
|  |  'se': 'standard edition', | ||||||
|  |  '\.net': 'dot net', | ||||||
|  |  'c#': 'c sharp', | ||||||
|  |  'ms': 'microsoft' | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| abbrev_to_term = {rf'\b{value}\b': key for key, value in term_to_abbrev.items()} | 
 | ||||||
|  | # synonyms = { | ||||||
|  | #  'windows server': 'windows nt', | ||||||
|  | #  'windows 7': 'windows desktop', | ||||||
|  | #  'windows 8': 'windows desktop', | ||||||
|  | #  'windows 10': 'windows desktop' | ||||||
|  | # } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # add more information | ||||||
|  | acronym_mapping.update(external_source) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
# Regex-pattern -> replacement tables used by the abbreviation augmentations.
# The patterns must be raw f-strings: in a plain f-string '\b' is the
# backspace character (0x08), not the regex word-boundary anchor, so no
# pattern would ever match real text and both augmentations would silently
# produce nothing.
# NOTE(review): keys and values are interpolated unescaped, so any regex
# metacharacter in them (e.g. the '.' in 'vb.net') keeps its regex meaning,
# and '\b' next to a non-word character such as '#' may not anchor where
# expected — confirm against the mapping entries.
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
| 
 | 
 | ||||||
def replace_terms_with_abbreviations(text):
    """Apply every term_to_abbrev substitution to text, in table order.

    Args:
        text (str): Text to rewrite.

    Returns:
        str: The text after each regex pattern in term_to_abbrev has been
        replaced by its mapped value.
    """
    for pattern, abbreviation in term_to_abbrev.items():
        text = re.sub(pattern, abbreviation, text)
    return text
| 
 | 
 | ||||||
def replace_abbreviations_with_terms(text):
    """Apply every abbrev_to_term substitution to text, in table order.

    Args:
        text (str): Text to rewrite.

    Returns:
        str: The text after each regex pattern in abbrev_to_term has been
        replaced by its mapped value.
    """
    for pattern, expansion in abbrev_to_term.items():
        text = re.sub(pattern, expansion, text)
    return text
|  | @ -218,8 +283,19 @@ def process_df_to_dict(df): | ||||||
|         # no_of_shuffles = label2weight[index] + 1 |         # no_of_shuffles = label2weight[index] + 1 | ||||||
|         no_of_shuffles = SHUFFLES |         no_of_shuffles = SHUFFLES | ||||||
|         processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles) |         processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles) | ||||||
| 
 |  | ||||||
|         for desc in processed_descs: |         for desc in processed_descs: | ||||||
|  |             if (desc != parent_desc): | ||||||
|  |                 element = { | ||||||
|  |                     'text' : desc, | ||||||
|  |                     'label': label2id[index], # ensure labels starts from 0 | ||||||
|  |                 } | ||||||
|  |                 output_list.append(element) | ||||||
|  | 
 | ||||||
|  |          | ||||||
|  |         # augmentation | ||||||
|  |         # remove all non-alphanumerics | ||||||
|  |         desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces | ||||||
|  |         if (desc != parent_desc): | ||||||
|             element = { |             element = { | ||||||
|                 'text' : desc, |                 'text' : desc, | ||||||
|                 'label': label2id[index], # ensure labels starts from 0 |                 'label': label2id[index], # ensure labels starts from 0 | ||||||
|  | @ -227,24 +303,21 @@ def process_df_to_dict(df): | ||||||
|             output_list.append(element) |             output_list.append(element) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |         # augmentation | ||||||
|         # perform abbrev_to_term |         # perform abbrev_to_term | ||||||
|         desc = replace_terms_with_abbreviations(parent_desc) |         temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc)  # Retains only alphanumeric and spaces | ||||||
|         no_of_shuffles = SHUFFLES |         desc = replace_terms_with_abbreviations(temp_desc) | ||||||
|         processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles) |         if (desc != temp_desc): | ||||||
| 
 |  | ||||||
|         for desc in processed_descs: |  | ||||||
|             element = { |             element = { | ||||||
|                 'text' : desc, |                 'text' : desc, | ||||||
|                 'label': label2id[index], # ensure labels starts from 0 |                 'label': label2id[index], # ensure labels starts from 0 | ||||||
|             } |             } | ||||||
|             output_list.append(element) |             output_list.append(element) | ||||||
| 
 | 
 | ||||||
|  |         # augmentation | ||||||
|         # perform term to abbrev |         # perform term to abbrev | ||||||
|         desc = replace_abbreivations_with_terms(parent_desc) |         desc = replace_abbreviations_with_terms(parent_desc) | ||||||
|         no_of_shuffles = SHUFFLES |         if (desc != parent_desc): | ||||||
|         processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles) |  | ||||||
| 
 |  | ||||||
|         for desc in processed_descs: |  | ||||||
|             element = { |             element = { | ||||||
|                 'text' : desc, |                 'text' : desc, | ||||||
|                 'label': label2id[index], # ensure labels starts from 0 |                 'label': label2id[index], # ensure labels starts from 0 | ||||||
|  | @ -257,7 +330,7 @@ def process_df_to_dict(df): | ||||||
| 
 | 
 | ||||||
| def create_dataset(): | def create_dataset(): | ||||||
|     # train  |     # train  | ||||||
|     data_path = '../../esAppMod_data_import/train.csv' |     data_path = '../../esAppMod_data_import/parent_train.csv' | ||||||
|     train_df = pd.read_csv(data_path, skipinitialspace=True) |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -271,13 +344,13 @@ def create_dataset(): | ||||||
| 
 | 
 | ||||||
| def train(): | def train(): | ||||||
| 
 | 
 | ||||||
|     save_path = f'checkpoint' |     save_path = f'checkpoint_part1' | ||||||
|     split_datasets = create_dataset() |     split_datasets = create_dataset() | ||||||
| 
 | 
 | ||||||
|     # prepare tokenizer |     # prepare tokenizer | ||||||
| 
 | 
 | ||||||
|     model_checkpoint = "distilbert/distilbert-base-uncased" |     model_checkpoint = "distilbert/distilbert-base-uncased" | ||||||
|     # model_checkpoint = 'google-bert/bert-base-cased' |     # model_checkpoint = 'google-bert/bert-base-uncased' | ||||||
|     # model_checkpoint = 'prajjwal1/bert-small' |     # model_checkpoint = 'prajjwal1/bert-small' | ||||||
|     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) |     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) | ||||||
|     # Define additional special tokens |     # Define additional special tokens | ||||||
|  | @ -348,7 +421,6 @@ def train(): | ||||||
| 
 | 
 | ||||||
|     training_args = TrainingArguments( |     training_args = TrainingArguments( | ||||||
|         output_dir=f"{save_path}", |         output_dir=f"{save_path}", | ||||||
|         # eval_strategy="epoch", |  | ||||||
|         eval_strategy="no", |         eval_strategy="no", | ||||||
|         logging_dir="tensorboard-log", |         logging_dir="tensorboard-log", | ||||||
|         logging_strategy="epoch", |         logging_strategy="epoch", | ||||||
|  | @ -0,0 +1,469 @@ | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
|  | # from datasets import load_from_disk | ||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | os.environ['NCCL_P2P_DISABLE'] = '1' | ||||||
|  | os.environ['NCCL_IB_DISABLE'] = '1' | ||||||
|  | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" | ||||||
|  | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" | ||||||
|  | 
 | ||||||
|  | import re | ||||||
|  | import random | ||||||
|  | import glob | ||||||
|  | 
 | ||||||
|  | import torch | ||||||
|  | from transformers import ( | ||||||
|  |     AutoTokenizer, | ||||||
|  |     AutoModelForSequenceClassification, | ||||||
|  |     DataCollatorWithPadding, | ||||||
|  |     Trainer, | ||||||
|  |     EarlyStoppingCallback, | ||||||
|  |     TrainingArguments | ||||||
|  | ) | ||||||
|  | import evaluate | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | # import matplotlib.pyplot as plt | ||||||
|  | from datasets import Dataset, DatasetDict | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | torch.set_float32_matmul_precision('high') | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
def set_seed(seed):
    """Seed every random number generator used by the script.

    Seeds Python's `random`, NumPy, and PyTorch (CPU, current CUDA device and
    all CUDA devices), then switches cuDNN to deterministic mode with
    benchmarking disabled.

    Args:
        seed (int): Seed value applied to all generators.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch current GPU
        torch.cuda.manual_seed_all,  # PyTorch, all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # favour reproducible kernels
    cudnn.benchmark = False  # no autotuning, which can vary between runs
|  | 
 | ||||||
|  | set_seed(42) | ||||||
|  | 
 | ||||||
|  | SHUFFLES=0 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
# Build the class vocabulary from the training CSV: every distinct entity_id,
# in sorted order, is one target class.
data_path = '../../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
entity_ids = train_df['entity_id'].to_list()
target_id_list = sorted(set(entity_ids))

# %%
# Two-way lookup between the contiguous class index (0..n-1) and the entity id.
id2label = dict(enumerate(target_id_list))
label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
|  | 
 | ||||||
|  | # %% | ||||||
|  | # introduce pre-processing functions | ||||||
def preprocess_text(text):
    """Normalise a mention string before it is augmented and tokenised.

    The text is lower-cased, every run of digits is collapsed into a single
    '#', and every run of whitespace is squeezed to one space with the ends
    trimmed.

    Args:
        text (str): Raw mention text.

    Returns:
        str: The normalised text.
    """
    lowered = text.lower()
    digits_masked = re.sub(r'\d+', '#', lowered)
    return re.sub(r'\s+', ' ', digits_masked).strip()
|  | 
 | ||||||
|  | 
 | ||||||
def generate_random_shuffles(text, n):
    """Produce n word-order permutations of the input text.

    Args:
        text (str): The input text; it is split on whitespace into words.
        n (int): How many shuffled variations to generate.

    Returns:
        list: n strings, each containing the words of `text` in a random
        order and joined by single spaces.
    """
    tokens = text.split()

    def _one_shuffle():
        # shuffle a copy so the original word order is preserved for the next draw
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)

    return [_one_shuffle() for _ in range(n)]
|  | 
 | ||||||
|  | 
 | ||||||
|  | # generate n more shuffled examples | ||||||
def shuffle_text(text, n_shuffles=SHUFFLES):
    """Return the input text followed by its randomly shuffled variants.

    Args:
        text (str): A single input string.
        n_shuffles (int): Number of random word-order shuffles to append.

    Returns:
        list: ``[text, shuffle_1, ..., shuffle_n]``; the untouched input is
        always the first element.
    """
    return [text, *generate_random_shuffles(text, n_shuffles)]
|  | 
 | ||||||
# Abbreviation -> expanded term. Everything is lower case because the
# replacements run on text that has already been lower-cased.
# NOTE(review): spellings such as 'compopent' and 'xlst' look like typos but
# match entity names that appear in the result data, so they are kept as-is.
acronym_mapping = {
 'hpsa': 'hp server automation',
 'tam': 'tivoli access manager',
 'adf': 'application development facility',
 'html': 'hypertext markup language',
 'wff': 'microsoft web farm framework',
 'jsp': 'javaserver pages',
 'bw': 'business works',
 'ssrs': 'sql server reporting services',
 'cl': 'control language',
 'vba': 'visual basic for applications',
 'esapi': 'enterprise security api',
 'gwt': 'google web toolkit',
 'pki': 'perkin elmer informatics',
 'rtd': 'oracle realtime decisions',
 'jms': 'java message service',
 'db': 'database',
 'soa': 'service oriented architecture',
 'xsl': 'extensible stylesheet language',
 'com': 'compopent object model',
 'ldap': 'lightweight directory access protocol',
 'odm': 'ibm operational decision manager',
 'soql': 'salesforce object query language',
 'oms': 'order management system',
 'cfml': 'coldfusion markup language',
 'nas': 'netscape application server',
 'sql': 'structured query language',
 'bde': 'borland database engine',
 'imap': 'internet message access protocol',
 'uws': 'ultidev web server',
 'birt': 'business intelligence and reporting tools',
 'mdw': 'model driven workflow',
 'tws': 'tivoli workload scheduler',
 'jre': 'java runtime environment',
 'wcs': 'websphere commerce suite',
 'was': 'websphere application server',
 'ssis': 'sql server integration services',
 'xhtml': 'extensible hypertext markup language',
 'soap': 'simple object access protocol',
 'san': 'storage area network',
 'elk': 'elastic stack',
 'arr': 'application request routing',
 'xlst': 'extensible stylesheet language transformations',
 'sccm': 'microsoft endpoint configuration manager',
 'ejb': 'enterprise java beans',
 'css': 'cascading style sheets',
 'hpoo': 'hp operations orchestration',
 'xml': 'extensible markup language',
 'esb': 'enterprise service bus',
 'edi': 'electronic data interchange',
 'imsva': 'interscan messaging security virtual appliance',
 'wtx': 'ibm websphere transformation extender',
 'cgi': 'common gateway interface',
 'bal': 'ibm basic assembly language',
 'issow': 'integrated safe system of work',
 'dcl': 'data control language',
 'jdom': 'java document object model',
 'fim': 'microsoft forefront identity manager',
 'npl': 'niakwa programming language',
 'wf': 'windows workflow foundation',
 'lm': 'etap license manager',
 'wts': 'windows terminal server',
 'asp': 'active server pages',
 'jil': 'job information language',
 'mvc': 'model view controller',
 'rmi': 'remote method invocation',
 'ad': 'active directory',
 'owb': 'oracle warehouse builder',
 'rest': 'representational state transfer',
 'jdk': 'java development kit',
 'ids': 'integrated data store',
 'bms': 'batch management software',
 'vsx': 'vmware solution exchange',
 'ssas': 'sql server analysis services',
 'atl': 'atlas transformation language',
 'ice': 'infobright community edition',
 'esql': 'extended structured query language',
 'corba': 'common object request broker architecture',
 'dpe': 'device provisioning engines',
 'rac': 'oracle real application clusters',
 'iemt': 'iis easy migration tool',
 'mes': 'manufacturing execution system',
 'odbc': 'open database connectivity',
 'lms': 'lan management solution',
 'wcf': 'windows communication foundation',
 'nes': 'netscape enterprise server',
 'jsf': 'javaserver faces',
 'alm': 'application lifecycle management',
 'hlasm': 'high level assembler',
 'cmod': 'content manager ondemand'}

# Additional abbreviations. Keys are plain literals: regex escaping is applied
# when the patterns are built below, so '.net' is written without a backslash.
# Order matters because replacements are applied sequentially: 'vb.net' must
# stay ahead of 'vb' and '.net'.
external_source = {
 'vb.net': 'visual basic dot net',
 'jes': 'job entry subsystem',
 'svn': 'subversion',
 'vcs': 'version control system',
 'lims': 'laboratory information management system',
 'ide': 'integrated development environment',
 'sdk': 'software development kit',
 'mq': 'message queue',
 'ims': 'information management system',
 'isa': 'internet security and acceleration',
 'vs': 'visual studio',
 'esr': 'extended support release',
 'ff': 'firefox',
 'vb': 'visual basic',
 'rhel': 'red hat enterprise linux',
 'iis': 'internet information server',
 'api': 'application programming interface',
 'se': 'standard edition',
 '.net': 'dot net',
 'c#': 'c sharp',
 'ms': 'microsoft'
}


# synonyms = {
#  'windows server': 'windows nt',
#  'windows 7': 'windows desktop',
#  'windows 8': 'windows desktop',
#  'windows 10': 'windows desktop'
# }


# add more information
acronym_mapping.update(external_source)


def _whole_token_regex(literal):
    """Return a regex source matching `literal` only as a stand-alone token.

    The literal is escaped so characters such as '.' and '#' match themselves.
    Look-arounds are used instead of ``\\b`` because ``\\b`` cannot anchor a
    token that starts or ends with a non-word character ('.net', 'c#').
    """
    return rf'(?<!\w){re.escape(literal)}(?!\w)'


# regex source -> replacement text, in the insertion order of acronym_mapping
abbrev_to_term = {_whole_token_regex(key): value for key, value in acronym_mapping.items()}
term_to_abbrev = {_whole_token_regex(value): key for key, value in acronym_mapping.items()}


def _apply_replacements(text, table):
    """Apply every (pattern -> replacement) pair of `table` to `text` in order.

    Replacements are sequential, so the output of one substitution is visible
    to the patterns that follow it.
    """
    for pattern, replacement in table.items():
        # A callable replacement is inserted verbatim; a plain string would be
        # parsed as a template and backslashes in it would be interpreted.
        text = re.sub(pattern, lambda _match, rep=replacement: rep, text)
    return text


def replace_terms_with_abbreviations(text):
    """Replace every expanded term found in `text` with its abbreviation.

    Args:
        text (str): Lower-cased input text.

    Returns:
        str: The text with whole-token occurrences of known terms abbreviated;
        unchanged if no term occurs.
    """
    return _apply_replacements(text, term_to_abbrev)


def replace_abbreviations_with_terms(text):
    """Replace every abbreviation found in `text` with its expanded term.

    Args:
        text (str): Lower-cased input text.

    Returns:
        str: The text with whole-token occurrences of known abbreviations
        expanded; unchanged if no abbreviation occurs.
    """
    return _apply_replacements(text, abbrev_to_term)
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # outputs a list of dictionaries | ||||||
|  | # processes dataframe into lists of dictionaries | ||||||
|  | # each element maps input to output | ||||||
|  | # input: tag_description | ||||||
|  | # output: class label | ||||||
def process_df_to_dict(df):
    """Expand each mention row into augmented ``{'text', 'label'}`` examples.

    For every row the mention is normalised with ``preprocess_text`` and the
    following variants are generated, in this order:

    1. ``SHUFFLES`` random word-order shuffles,
    2. the mention with every non-word, non-space character replaced by a space,
    3. that punctuation-free text with expanded terms turned into abbreviations,
    4. the normalised mention with abbreviations turned into expanded terms.

    A variant is emitted only when it differs from the text it was derived
    from, so a transformation that changes nothing adds no example.

    Args:
        df: DataFrame with 'entity_id' and 'mention' columns.

    Returns:
        list: Dictionaries with keys 'text' (str) and 'label' (class index
        looked up in ``label2id``).
    """
    output_list = []

    def emit_if_changed(candidate, reference, entity_id):
        if candidate == reference:
            return
        output_list.append({
            'text': candidate,
            'label': label2id[entity_id],  # class indices start at 0
        })

    for _, row in df.iterrows():
        entity_id = row['entity_id']
        base_text = preprocess_text(row['mention'])

        # word-order shuffles (the first element is the base text itself and
        # is filtered out by the comparison)
        for variant in shuffle_text(base_text, n_shuffles=SHUFFLES):
            emit_if_changed(variant, base_text, entity_id)

        # punctuation replaced by spaces
        no_punct = re.sub(r'[^\w\s]', ' ', base_text)
        emit_if_changed(no_punct, base_text, entity_id)

        # expanded terms -> abbreviations, applied to the punctuation-free text
        emit_if_changed(replace_terms_with_abbreviations(no_punct), no_punct, entity_id)

        # abbreviations -> expanded terms, applied to the normalised mention
        emit_if_changed(replace_abbreviations_with_terms(base_text), base_text, entity_id)

    return output_list
|  | 
 | ||||||
|  | 
 | ||||||
def create_dataset():
    """Build the training ``DatasetDict`` from the esAppMod training CSV.

    Returns:
        DatasetDict: A single 'train' split holding the examples produced by
        ``process_df_to_dict``.
    """
    frame = pd.read_csv('../../esAppMod_data_import/train.csv', skipinitialspace=True)
    examples = process_df_to_dict(frame)
    return DatasetDict({'train': Dataset.from_list(examples)})
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | 
 | ||||||
def train():
    """Fine-tune the sequence classifier on the augmented mention dataset.

    Training starts from the first (lexicographically) directory matching
    ``checkpoint/checkpoint_part1-*`` and writes its own checkpoints to
    ``checkpoint``. No evaluation is run during training.

    Raises:
        FileNotFoundError: If no starting checkpoint matches the pattern.
    """
    save_path = 'checkpoint'
    split_datasets = create_dataset()

    # locate the checkpoint to continue from
    pattern = 'checkpoint_part1-*'
    checkpoint_directory = 'checkpoint'
    # sorted() makes the choice deterministic when several directories match;
    # glob returns matches in arbitrary order
    candidates = sorted(glob.glob(os.path.join(checkpoint_directory, pattern)))
    if not candidates:
        raise FileNotFoundError(
            f"no checkpoint matching {os.path.join(checkpoint_directory, pattern)!r}"
        )
    model_checkpoint = candidates[0]

    # model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-uncased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # additional_special_tokens = ["<DESC>"]
    # Add the additional special tokens to the tokenizer
    # tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # tokenize one batch of dataset rows; the 'label' column is left untouched
    def preprocess_function(example):
        texts = example['text']
        model_inputs = tokenizer(
            texts,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # batched map over every split; the raw text column is dropped afterwards
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")

    # NOTE(review): with eval_strategy="no" below this is presumably never
    # invoked during training — confirm before relying on it.
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # id2label and label2id are the module-level lookup tables

    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)

    # %%
    # Trainer

    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=300,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
train()
|  | 
 | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
|  | @ -1,2 +0,0 @@ | ||||||
| 
 |  | ||||||
| Accuracy for fold: 0.5846658466584665 |  | ||||||
|  | @ -57,10 +57,10 @@ class Inference(): | ||||||
|             output_list = [] |             output_list = [] | ||||||
|             for _, row in df.iterrows(): |             for _, row in df.iterrows(): | ||||||
|                 desc = row['mention'] |                 desc = row['mention'] | ||||||
|                 label = row['entity_name'] |                 label = row['entity_seq'] | ||||||
|                 element = { |                 element = { | ||||||
|                     'input' : desc, |                     'input' : desc, | ||||||
|                     'output': label |                     'output': f'{label}' | ||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|                 output_list.append(element) |                 output_list.append(element) | ||||||
|  | @ -101,7 +101,7 @@ class Inference(): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def generate(self): |     def generate(self): | ||||||
|         device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu') |         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
|         MAX_GENERATE_LENGTH = 128 |         MAX_GENERATE_LENGTH = 128 | ||||||
| 
 | 
 | ||||||
|         pred_generations = [] |         pred_generations = [] | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | 
 | ||||||
|  | Accuracy for fold: 0.5022550225502255 | ||||||
|  | @ -11,7 +11,7 @@ BATCH_SIZE = 512 | ||||||
| def infer(): | def infer(): | ||||||
|     print(f"Inference for data") |     print(f"Inference for data") | ||||||
|     # import test data |     # import test data | ||||||
|     data_path = '../../../data_import/test.csv' |     data_path = '../../../esAppMod_data_import/test_seq.csv' | ||||||
|     df = pd.read_csv(data_path, skipinitialspace=True) |     df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -35,18 +35,19 @@ def infer(): | ||||||
|     # thing_actual_list, property_actual_list = decode_preds(pred_labels) |     # thing_actual_list, property_actual_list = decode_preds(pred_labels) | ||||||
|     # Convert the list to a Pandas DataFrame |     # Convert the list to a Pandas DataFrame | ||||||
|     df_out = pd.DataFrame({ |     df_out = pd.DataFrame({ | ||||||
|         'predictions': prediction_list |         'class_prediction': prediction_list | ||||||
|     }) |     }) | ||||||
|     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] |     # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] | ||||||
|     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] |     # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] | ||||||
|     df = pd.concat([df, df_out], axis=1) |     # df = pd.concat([df, df_out], axis=1) | ||||||
| 
 | 
 | ||||||
|     # we can save the t5 generation output here |     # we can save the t5 generation output here | ||||||
|     df.to_csv(f"exports/result.csv", index=False) |     df_out.to_csv(f"exports/result.csv", index=False) | ||||||
| 
 | 
 | ||||||
|     # here we want to evaluate mapping accuracy within the valid in mdm data only |     # here we want to evaluate mapping accuracy within the valid in mdm data only | ||||||
|     condition_correct = df['predictions'] == df['entity_name'] |     # predictions = pd.to_numeric(df_out['class_prediction'], errors="coerce") | ||||||
|     pred_correct_proportion = sum(condition_correct)/len(df) |     condition_correct = df_out['class_prediction'] == df['entity_seq'] | ||||||
|  |     pred_correct_proportion = sum(condition_correct)/len(df_out) | ||||||
| 
 | 
 | ||||||
|     # write output to file output.txt |     # write output to file output.txt | ||||||
|     with open("output.txt", "a") as f: |     with open("output.txt", "a") as f: | ||||||
|  | @ -33,10 +33,10 @@ def process_df_to_dict(df): | ||||||
|     output_list = [] |     output_list = [] | ||||||
|     for _, row in df.iterrows(): |     for _, row in df.iterrows(): | ||||||
|         desc = row['mention'] |         desc = row['mention'] | ||||||
|         label = row['entity_name'] |         label = row['entity_seq'] | ||||||
|         element = { |         element = { | ||||||
|             'input' : desc, |             'input' : desc, | ||||||
|             'output': label |             'output': f'{label}' | ||||||
|         } |         } | ||||||
|         output_list.append(element) |         output_list.append(element) | ||||||
| 
 | 
 | ||||||
|  | @ -45,7 +45,7 @@ def process_df_to_dict(df): | ||||||
| 
 | 
 | ||||||
| def create_dataset(): | def create_dataset(): | ||||||
|     # train  |     # train  | ||||||
|     data_path = f"../../data_import/train.csv" |     data_path = f"../../esAppMod_data_import/train_seq.csv" | ||||||
|     train_df = pd.read_csv(data_path, skipinitialspace=True) |     train_df = pd.read_csv(data_path, skipinitialspace=True) | ||||||
| 
 | 
 | ||||||
|     combined_data = DatasetDict({ |     combined_data = DatasetDict({ | ||||||
|  |  | ||||||
|  | @ -3,8 +3,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| # Load model and tokenizer | # Load model and tokenizer | ||||||
| # model_name = "bigscience/bloom-7b1"  # Replace with your model | model_name = "bigscience/bloom-7b1"  # Replace with your model | ||||||
| model_name = "bigscience/bloomz-1b1" | # model_name = "bigscience/bloomz-1b1" | ||||||
| tokenizer = AutoTokenizer.from_pretrained(model_name) | tokenizer = AutoTokenizer.from_pretrained(model_name) | ||||||
| 
 | 
 | ||||||
| # Automatically map model layers to available GPUs | # Automatically map model layers to available GPUs | ||||||
|  | @ -26,13 +26,12 @@ outputs = model.generate(inputs["input_ids"], max_length=50) | ||||||
| # Decode and print result | # Decode and print result | ||||||
| print(tokenizer.decode(outputs[0], skip_special_tokens=True)) | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) | ||||||
| # %% | # %% | ||||||
| # %% |  | ||||||
| # Prepare input | # Prepare input | ||||||
| 
 | 
 | ||||||
| def generate(text): | def generate(text): | ||||||
| 
 | 
 | ||||||
|     # Define prompt |     # Define prompt | ||||||
|     prompt = f"Answer Concisely: Give me a mapping between the acronym and descriptor in the format '(acronym: description): '{text}'" |     prompt = f"Give me past product names relating to: '{text}'" | ||||||
| 
 | 
 | ||||||
|     # Generate acronym |     # Generate acronym | ||||||
|     inputs = tokenizer(prompt, return_tensors="pt") |     inputs = tokenizer(prompt, return_tensors="pt") | ||||||
|  | @ -45,7 +44,7 @@ def generate(text): | ||||||
| 
 | 
 | ||||||
| # Example usage | # Example usage | ||||||
| # text = "Advanced Data Analytics Platform" | # text = "Advanced Data Analytics Platform" | ||||||
| text = 'ColdFusion Markup Language (CFML)' | text = 'windows server' | ||||||
| acronym = generate(text) | acronym = generate(text) | ||||||
| print(f"Acronym: {acronym}") | print(f"Generation: {acronym}") | ||||||
| # %% | # %% | ||||||
|  |  | ||||||
|  | @ -0,0 +1,21 @@ | ||||||
|  | # %% | ||||||
|  | import requests | ||||||
|  | 
 | ||||||
def get_related_terms(term, language="en", limit=10, timeout=10):
    """Fetch labels of concepts linked to `term` in ConceptNet.

    Args:
        term (str): Concept identifier as used in the ConceptNet URL path
            (e.g. "windows_server").
        language (str): Language code placed in the URL path.
        limit (int): Maximum number of labels to return; values <= 0 yield an
            empty list without contacting the service.
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        list: Up to `limit` labels taken from the "end" node of each returned
        edge, skipping empty labels and labels equal to the term itself.

    Raises:
        requests.RequestException: On connection problems, timeout, or a
            non-success HTTP status.
    """
    if limit <= 0:
        return []

    url = f"http://api.conceptnet.io/c/{language}/{term}"
    # a timeout keeps a stalled connection from hanging the caller forever
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    # NOTE(review): labels appear to use spaces where the URL form uses
    # underscores, so both spellings are treated as "the term itself" — confirm
    # against real responses.
    own_names = {term.lower(), term.replace("_", " ").lower()}

    related_terms = []
    for edge in payload.get("edges", []):
        related = edge.get("end", {}).get("label", None)
        if related and related.lower() not in own_names:
            related_terms.append(related)
            if len(related_terms) >= limit:
                break
    return related_terms
|  | 
 | ||||||
|  | # Example | ||||||
|  | related_terms = get_related_terms("windows_server") | ||||||
|  | print("Related Terms:", related_terms) | ||||||
|  | # %% | ||||||
|  | @ -0,0 +1,38 @@ | ||||||
|  | # %% | ||||||
|  | from SPARQLWrapper import SPARQLWrapper, JSON | ||||||
|  | 
 | ||||||
|  | # %% | ||||||
# Ask DBpedia for the English alternative labels of "Windows Server".
sparql = SPARQLWrapper("https://dbpedia.org/sparql")
sparql.setQuery("""
    SELECT ?altLabel WHERE {
    ?item rdfs:label "Windows Server"@en.
    ?item skos:altLabel ?altLabel.
    FILTER (LANG(?altLabel) = "en")
    }
    LIMIT 10
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# The query projects only ?altLabel, so that is the key present in each
# binding; reading "label" here raised KeyError.
for result in results["results"]["bindings"]:
    print(result["altLabel"]["value"])
|  | # %% | ||||||
|  | from SPARQLWrapper import SPARQLWrapper, JSON | ||||||
|  | 
 | ||||||
# Ask Wikidata for items labelled "Windows Server" together with their
# optional English aliases.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
    SELECT ?itemLabel ?altLabel WHERE {
        ?item ?label "Windows Server"@en.
        OPTIONAL { ?item skos:altLabel ?altLabel. FILTER (LANG(?altLabel) = "en") }
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 10
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print("Label:", result["itemLabel"]["value"])
    # altLabel comes from an OPTIONAL clause, so a binding may lack it
    alias = result.get("altLabel")
    if alias is not None:
        print("Alias:", alias["value"])
|  | # %% | ||||||
|  | @ -0,0 +1,626 @@ | ||||||
|  | ,mention,entity_id,entity_name,class_prediction,predicted_name | ||||||
|  | 0,DOT NET,497,.NET Framework,579,Unix|BSD|* | ||||||
|  | 2,Dot net - FW 4,497,.NET Framework,368,VB.NET | ||||||
|  | 3,.Net 4.7.1 Enterprise Lib,497,.NET Framework,368,VB.NET | ||||||
|  | 11,.NET,497,.NET Framework,579,Unix|BSD|* | ||||||
|  | 13,.Net 4.5.2 Enterprise Lib,497,.NET Framework,368,VB.NET | ||||||
|  | 40,APACHE LOG4NET,483,.NET Framework|log4net,394,Java|Log4j | ||||||
|  | 41,LOG4NET,483,.NET Framework|log4net,394,Java|Log4j | ||||||
|  | 42,Magik,484,.NET Framework|Magick.NET,533,YAML | ||||||
|  | 43,WCF,485,.NET Framework|Windows Communication Foundation (WCF),486,.NET Framework|Windows Workflow Foundation (WF) | ||||||
|  | 45,WWF,486,.NET Framework|Windows Workflow Foundation (WF),443,OS/2 | ||||||
|  | 47,Ejes,1,(E)JES,101,Microsoft Dynamics AX | ||||||
|  | 48,(UNIRITA) A-AUTO 7.2.2,2,A-Auto Job Scheduling Software,299,AutoIt | ||||||
|  | 50,Active Directoy,498,Active Directory (AD),40,Connect Direct | ||||||
|  | 54,APSX,592,Active Server Pages (ASP)|*,609,IIS|* | ||||||
|  | 69,Andriod,418,Android,586,PHP|* | ||||||
|  | 71,Apache Active Queue,6,Apache ActiveMQ,259,Apache HTTP Server | ||||||
|  | 72,MQ Apache Active Queue,6,Apache ActiveMQ,81,IBM Websphere MQ | ||||||
|  | 75,cordova-android,501,Apache Cordova,418,Android | ||||||
|  | 77,Hive,8,Apache Hive,177,SAP NetWeaver Business Warehouse | ||||||
|  | 99,solr,11,Apache Solr,375,Apache Lucene | ||||||
|  | 135,ADF,13,Application Development Facility (ADF),130,Oracle ADF | ||||||
|  | 144,WLS 10.2,600,Oracle WebLogic Server|*,442,OpenVMS | ||||||
|  | 149,BEA WLS,600,Oracle WebLogic Server|*,442,OpenVMS | ||||||
|  | 152,Weblogic 12c,600,Oracle WebLogic Server|*,582,C#|* | ||||||
|  | 160,WLE,600,Oracle WebLogic Server|*,443,OS/2 | ||||||
|  | 168,Web Logic,600,Oracle WebLogic Server|*,97,MarkLogic DB | ||||||
|  | 174,BEA WLE,600,Oracle WebLogic Server|*,443,OS/2 | ||||||
|  | 175,WLS 10,600,Oracle WebLogic Server|*,442,OpenVMS | ||||||
|  | 176,WLS,600,Oracle WebLogic Server|*,442,OpenVMS | ||||||
|  | 189,brain script,302,Brainscript,369,VBScript | ||||||
|  | 190,BRAINScript,302,Brainscript,367,TypeScript | ||||||
|  | 191,Business Intelligence and Reporting Tools,21,Business Intelligence and Reporting Tools (BIRT),133,Oracle Business Intelligence | ||||||
|  | 192,Actuate Report Server,21,Business Intelligence and Reporting Tools (BIRT),42,Crystal Reports | ||||||
|  | 194,CSHARP,582,C#|*,87,Informatica PowerCenter | ||||||
|  | 218,WinFrame,30,Citrix Virtual Apps and Desktops,443,OS/2 | ||||||
|  | 221,METAFRAME,30,Citrix Virtual Apps and Desktops,406,JavaScript|AngularJS | ||||||
|  | 225,Presentation Server,30,Citrix Virtual Apps and Desktops,541,File Server | ||||||
|  | 226,NETSCALER-1.5,563,Citrix ADC,273,Netscape Enterprise Server (NES) | ||||||
|  | 227,NETSCALER-11.,563,Citrix ADC,273,Netscape Enterprise Server (NES) | ||||||
|  | 228,Citrix SD-WAN,563,Citrix ADC,30,Citrix Virtual Apps and Desktops | ||||||
|  | 229,NetScaler SD-WAN,563,Citrix ADC,273,Netscape Enterprise Server (NES) | ||||||
|  | 231,NetScaler ADC,563,Citrix ADC,272,Netscape Application Server (NAS) | ||||||
|  | 236,NetScaler SDX,291,Citrix ADC SDX,273,Netscape Enterprise Server (NES) | ||||||
|  | 240,Provisioning Services 7.15.8,32,Citrix Provisioning,538,Device Provisioning Engines (DPE) | ||||||
|  | 241,Citrix PVS,32,Citrix Provisioning,30,Citrix Virtual Apps and Desktops | ||||||
|  | 243,CLISTS,309,CLIST,329,IBM i Control Language (CL) | ||||||
|  | 253,CFML,311,ColdFusion Markup Language (CFML),316,eXtensible HyperText Markup Language (XHTML) | ||||||
|  | 254,ColdFusion Markup Language,311,ColdFusion Markup Language (CFML),37,Coldfusion | ||||||
|  | 255,Sterling Connect,40,Connect Direct,542,General Ledger | ||||||
|  | 264,Cormerstone,41,Cornerstone software,516,Compopent Object Model (COM) | ||||||
|  | 265,Cornerstone,41,Cornerstone software,370,Visual Basic | ||||||
|  | 279,DB2 UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA) | ||||||
|  | 282,DB2-UDB,43,DB2,365,TCL | ||||||
|  | 291,DB2/UDB,43,DB2,365,TCL | ||||||
|  | 292,IBM DB2 ENTERPRISE SERVER EDITION PVU OPTION 10.5,43,DB2,163,PVCS Version Manager | ||||||
|  | 300,IBM - IBM DB2 Advanced Enterprise Server Edition PVU Option 10.5,43,DB2,72,IBM Mobile Foundation | ||||||
|  | 301,UDB,43,DB2,517,Common Object Request Broker Architecture (CORBA) | ||||||
|  | 302,IBM - IBM DB2 Enterprise Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|* | ||||||
|  | 306,IBM - IBM DB2 Workgroup Server Edition Product Trial 9.7,43,DB2,610,Oracle Application Server|* | ||||||
|  | 313,EZTriev,314,Easytrieve,296,Intel Xeon Processor | ||||||
|  | 314,Eztrieve,314,Easytrieve,296,Intel Xeon Processor | ||||||
|  | 321,PrestoSoft - ExamDiff Application 1.6,49,ExamDiff,346,Pascal|Object Pascal | ||||||
|  | 322,PrestoSoft - ExamDiff Application,49,ExamDiff,346,Pascal|Object Pascal | ||||||
|  | 323,ExamDiff Application,49,ExamDiff,467,MS SQL Server|Log Reader Agent | ||||||
|  | 324,Expect Scripts,315,Expect,109,Microsoft MQ | ||||||
|  | 329,Microsoft - MSXML 4.0 SP2 4.2,318,Extensible Markup Language (XML)|MSXML,316,eXtensible HyperText Markup Language (XHTML) | ||||||
|  | 331,XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST) | ||||||
|  | 332,JAVA-XSL,319,Extensible Stylesheet Language (XSL),320,Extensible Stylesheet Language Transformations (XLST) | ||||||
|  | 335,ServerCA Access GatewayF5,50,F5 Secure Web Gateway Services,290,CA API Gateway | ||||||
|  | 347,HP C++,58,HP aC++ compiler,59,HP C/ANSI C compiler | ||||||
|  | 350,HP C++ 10.20,58,HP aC++ compiler,59,HP C/ANSI C compiler | ||||||
|  | 351,HPC 11.11,59,HP C/ANSI C compiler,58,HP aC++ compiler | ||||||
|  | 358,HFS,505,HTTP File Server,486,.NET Framework|Windows Workflow Foundation (WF) | ||||||
|  | 359,www.rejetto.com - HttpFileServer 2.3,505,HTTP File Server,55,Google Chrome | ||||||
|  | 360,HttpFileServer,505,HTTP File Server,522,Application Web Server | ||||||
|  | 367,IBM - IBM BigFix Platform Client Deploy Tool 9.5,457,IBM BigFix Platform|Client Deploy Tool,62,IBM BigFix Platform | ||||||
|  | 369,IBM BPM,64,IBM Business Process Manager,328,IBM High Level Assembler (HLASM) | ||||||
|  | 375,Data Power,294,IBM DataPower Gateway,295,IBM Power Systems | ||||||
|  | 376,IDG.7.5.2.19hp,294,IBM DataPower Gateway,449,Unix|HP-UX | ||||||
|  | 380,hlasm,328,IBM High Level Assembler (HLASM),438,macOS | ||||||
|  | 383,IHS,265,IBM HTTP Server,424,IBM i | ||||||
|  | 386,WebSphere and IHS,265,IBM HTTP Server,67,IBM InfoSphere DataStage | ||||||
|  | 387,WebSphere http,265,IBM HTTP Server,284,Websphere Application Server (WAS) | ||||||
|  | 391,IBM Websphere HTTP Server,265,IBM HTTP Server,285,WebSphere Liberty | ||||||
|  | 393,WebSphere IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS | ||||||
|  | 394,WebSphere -IHS,265,IBM HTTP Server,601,Websphere ILOG JRules BRMS | ||||||
|  | 397,OS400 V7R1,424,IBM i,443,OS/2 | ||||||
|  | 398,OS400,424,IBM i,443,OS/2 | ||||||
|  | 399,OS/400,424,IBM i,443,OS/2 | ||||||
|  | 408,IIB,68,IBM Integration Bus,370,Visual Basic | ||||||
|  | 411,Extended Structured Query Language,458,IBM Integration Bus|Extended Structured Query Language (ESQL),572,Structured Query Language (SQL) | ||||||
|  | 415,IBM WorkLight,72,IBM Mobile Foundation,649,IBM Cloud | ||||||
|  | 417,ILOG JRules,73,IBM Operational Decision Manager (ODM),601,Websphere ILOG JRules BRMS | ||||||
|  | 420,Decision Center 8.0.1.0,73,IBM Operational Decision Manager (ODM),252,IBM ILOG Views | ||||||
|  | 423,AS400,295,IBM Power Systems,443,OS/2 | ||||||
|  | 424,AS/400,295,IBM Power Systems,443,OS/2 | ||||||
|  | 426,System i,295,IBM Power Systems,424,IBM i | ||||||
|  | 427,P-series,295,IBM Power Systems,81,IBM Websphere MQ | ||||||
|  | 428,IBM iSeries/AS400 system Model 520,295,IBM Power Systems,443,OS/2 | ||||||
|  | 439,Tivoli Asset Discovery for Distributed,459,IBM Tivoli Asset Management|Asset Discovery for Distributed,606,IBM Tivoli Asset Management|* | ||||||
|  | 447,Database MS SQL Agent,77,IBM Tivoli Monitoring,469,MS SQL Server|SQL Server Database Engine | ||||||
|  | 448,Linux OS Agent,77,IBM Tivoli Monitoring,576,Linux|* | ||||||
|  | 449,Database DB2 Agent,77,IBM Tivoli Monitoring,520,Database (DB) | ||||||
|  | 452,Windows OS Agent,77,IBM Tivoli Monitoring,580,Windows|* | ||||||
|  | 454,IBM - IBM TSM FCM,604,IBM Tivoli Storage Manager|*,460,IBM Tivoli Storage Manager|TSM API | ||||||
|  | 459,Databases Data Protection for Microsoft SQL,604,IBM Tivoli Storage Manager|*,572,Structured Query Language (SQL) | ||||||
|  | 461,IBM - IBM Spectrum Protect Data Protection for Microsoft SQL Server 8.1,604,IBM Tivoli Storage Manager|*,469,MS SQL Server|SQL Server Database Engine | ||||||
|  | 462,IBM Spectrum Protect Data Protection,604,IBM Tivoli Storage Manager|*,312,Data Language Interface (DL/I) | ||||||
|  | 463,IBM - IBM Spectrum Protect API 7.1,460,IBM Tivoli Storage Manager|TSM API,294,IBM DataPower Gateway | ||||||
|  | 464,IBM - IBM Spectrum Protect Client,461,IBM Tivoli Storage Manager|TSM Client,294,IBM DataPower Gateway | ||||||
|  | 465,IBM - IBM Tivoli Storage Manager Client,461,IBM Tivoli Storage Manager|TSM Client,604,IBM Tivoli Storage Manager|* | ||||||
|  | 467,VSS Requestor configured 8.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|* | ||||||
|  | 468,VSS Requestor 7.1,463,IBM Tivoli Storage Manager|VSS Requestor,577,MVS|* | ||||||
|  | 469,TWS-WS,79,IBM Tivoli Workload Scheduler (TWS),239,Windows Terminal Server (WTS) | ||||||
|  | 472,wbia 2.6,80,IBM WebSphere Business Integration Adaptor,627,XtraDB | ||||||
|  | 473,IBM WBIA 2.6.0.12,80,IBM WebSphere Business Integration Adaptor,424,IBM i | ||||||
|  | 475,MQ,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 476,MQ 9.1,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 479,MQ 7,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 480,MQ 6,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 481,MQ 9.0,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 482,MQ 5.3,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 483,MQ 7.01,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 484,MQ 7.5,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 485,MQSeries 8.0,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 488,WSMQ 8.0,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 489,MQ 9.0.5,81,IBM Websphere MQ,248,ZeroMQ | ||||||
|  | 491,WTX,83,IBM WebSphere Transformation Extender (WTX),274,Nginx | ||||||
|  | 505,Microsoft Internet Inf,609,IIS|*,130,Oracle ADF | ||||||
|  | 508,Microsoft Internet Informat,609,IIS|*,330,IBM Informix-4GL | ||||||
|  | 550,Microsoft - IIS 6.0 Migration Tool 1,489,IIS|Easy Migration Tool (IEMT),609,IIS|* | ||||||
|  | 558,Infozip 6,85,Info-ZIP,677,Git | ||||||
|  | 559,Infozip,85,Info-ZIP,677,Git | ||||||
|  | 578,IMSVA 9.1,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB | ||||||
|  | 580,IMSVA,566,InterScan Messaging Security Virtual Appliance (IMSVA),84,IMS DB | ||||||
|  | 581,Java 1.8,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 582,Java 7,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 583,Java on Weblogic server,584,Java|*,600,Oracle WebLogic Server|* | ||||||
|  | 584,Java5,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 585,Java 6,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 586,Java 6.0,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 587,Java 7 Update 25,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 589,Java (open source),584,Java|*,397,Java|Servlet | ||||||
|  | 590,Java 5,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 591,Java 1.5,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 593,Java 1.8.0_92,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 594,Java 1.6,584,Java|*,334,Java|Java Standard Edition (Java SE) | ||||||
|  | 595,J2EE 6,584,Java|*,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 596,Java (J2EE),584,Java|*,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 598,JRE,506,Java Runtime Environment (JRE),84,IMS DB | ||||||
|  | 629,JEE,333,Java|Java Enterprise Edition (Java EE),1,(E)JES | ||||||
|  | 639,JSF,391,Java|JavaServer Faces (JSF),334,Java|Java Standard Edition (Java SE) | ||||||
|  | 643,JSP Scriptlets,336,Java|JavaServer Pages (JSP)|Scriptlets,335,Java|JavaServer Pages (JSP) | ||||||
|  | 644,Java Scriplet,336,Java|JavaServer Pages (JSP)|Scriptlets,88,Ingres | ||||||
|  | 645,Core 9.2.0.0,393,Java|JRuby Core,583,C++|* | ||||||
|  | 647,Java RMI,396,Java|Remote Method Invocation (RMI),584,Java|* | ||||||
|  | 650,Java Servlets,397,Java|Servlet,453,Linux|Fedora | ||||||
|  | 651,Java 6 Servlets,397,Java|Servlet,453,Linux|Fedora | ||||||
|  | 652,J2EE Servlets,397,Java|Servlet,443,OS/2 | ||||||
|  | 653,Servlets,397,Java|Servlet,420,Cisco IOS | ||||||
|  | 654,Servlets v2.3,397,Java|Servlet,370,Visual Basic | ||||||
|  | 656,Spring BOOT,399,Java|Spring|Spring Boot,398,Java|Spring | ||||||
|  | 657,Springboot,399,Java|Spring|Spring Boot,398,Java|Spring | ||||||
|  | 661,javasript,589,JavaScript|*,335,Java|JavaServer Pages (JSP) | ||||||
|  | 662,JS,589,JavaScript|*,507,Node.js | ||||||
|  | 664,Java Script,589,JavaScript|*,584,Java|* | ||||||
|  | 671,Sencha 4.2.0,409,JavaScript|Ext JS,589,JavaScript|* | ||||||
|  | 674,jqueryui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery | ||||||
|  | 675,jquery-ui,412,JavaScript|Jquery|jQuery UI,411,JavaScript|JQuery | ||||||
|  | 679,Scriptaculous,414,JavaScript|script.aculo.us,582,C#|* | ||||||
|  | 684,EAP,268,JBoss|*,174,SAP ERP | ||||||
|  | 685,JBOSS-EAP,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 686,JBoss Application Server 4,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 687,JBoss Application Server 7,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 688,JBoss Application Server 5,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 689,JBoss Application Server,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 690,Enterprise Application Platform,268,JBoss|*,670,EAServer | ||||||
|  | 692,JBOSS 5.1.2 EAP,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 693,server: Jboss,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 694,JBOSS 6.3.2 EAP,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 695,JBoss EAP 4.3,268,JBoss|*,493,JBoss|Wildfly | ||||||
|  | 700,Job Information Language,339,Job Information Language (JIL),338,JCL | ||||||
|  | 703,JoinIT by Acayosoft,91,joinIT,4,Adobe Acrobat Reader | ||||||
|  | 704,Acayosoft JoinIT,91,joinIT,4,Adobe Acrobat Reader | ||||||
|  | 705,JoinIT by Acayosoft v 9.0.8,91,joinIT,4,Adobe Acrobat Reader | ||||||
|  | 706,LifeFlow Tool,92,LifeFlow,486,.NET Framework|Windows Workflow Foundation (WF) | ||||||
|  | 707,Linux 2.6.32-696.28.1.el6.x86_64,576,Linux|*,437,Linux|zLinux | ||||||
|  | 709,Linux 2.6.32-696.30.1.el6.x86_64,576,Linux|*,437,Linux|zLinux | ||||||
|  | 710,Linux 2.6.9,576,Linux|*,437,Linux|zLinux | ||||||
|  | 711,Linux 2.6.32-642.3.1.el6.x86_64,576,Linux|*,437,Linux|zLinux | ||||||
|  | 712,Linux - 2.6.18-371.1.2.el5,576,Linux|*,437,Linux|zLinux | ||||||
|  | 713,Linux 2.6.32-696.23.1.el6.x86_64,576,Linux|*,437,Linux|zLinux | ||||||
|  | 749,Gaia Kernel version 2.7,428,Linux|Check Point,432,Linux|Oracle Linux | ||||||
|  | 752,Gaia Kernel version 2.6,428,Linux|Check Point,432,Linux|Oracle Linux | ||||||
|  | 766,OEL,432,Linux|Oracle Linux,449,Unix|HP-UX | ||||||
|  | 778,Oracle Enterprise Server 7.5,432,Linux|Oracle Linux,134,Oracle Database | ||||||
|  | 780,OEL6.7 - 3.8.13-68.3.4.el6uek.x86_64,432,Linux|Oracle Linux,449,Unix|HP-UX | ||||||
|  | 792,VMware Photon,433,Linux|Photon OS,569,VMware Server | ||||||
|  | 793,VMware Photon 1,433,Linux|Photon OS,569,VMware Server | ||||||
|  | 809,Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 818,Redhat - Redhat Linux 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 819,Linux RH6,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux | ||||||
|  | 865,Redhat - Redhat Linux 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 870,Redhat - RHEL 7.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 874,Red Hat Entreprise Linux 6.2,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 882,Redhat 6 64-Bit,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 893,RED HAT ADVANCED SERVER 5,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 910,redhat6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 912,Redhat - Redhat Linux 6.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 913,Linux RH,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux | ||||||
|  | 916,Redhat - Red Hat(Linux),434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 920,Linux RH7,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux | ||||||
|  | 926,Red Hat V6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 932,Linux RH5,434,Linux|Red Hat Enterprise Linux,437,Linux|zLinux | ||||||
|  | 934,rehl5.9,434,Linux|Red Hat Enterprise Linux,43,DB2 | ||||||
|  | 964,Red Hat 6.6,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 979,red hat,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 991,Redhat,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 996,RedHat 7.3,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 998,LINUX RED HAT 5 EL,434,Linux|Red Hat Enterprise Linux,268,JBoss|* | ||||||
|  | 1003,SUSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1006,Linux SuSE12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1011,SUSE10,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1012,SUSE Linux 12,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1017,SUSELinux Enterprise 11.x,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1023,SUSE Linux 11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1024,SUSE Linux 11 SP3,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1029,Linux SuSE11,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1030,SUSE,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1038,SuseLinux,435,Linux|SUSE Linux Enterprise Server,431,Linux|openSUSE | ||||||
|  | 1051,domino8.5,270,Lotus Domino,93,Lotus Notes | ||||||
|  | 1052,Domino 8.x,270,Lotus Domino,93,Lotus Notes | ||||||
|  | 1054,Lucee 5.2.6.60,271,Lucee,375,Apache Lucene | ||||||
|  | 1056,Darwin,438,macOS,117,Mozilla Firefox | ||||||
|  | 1061,Memcache,98,Memcached,18,BMC Control-M | ||||||
|  | 1062,ACCDB,99,Microsoft Access,525,Open Database Connectivity (ODBC) | ||||||
|  | 1070,ConfigMgr,102,Microsoft Endpoint Configuration Manager (SCCM),21,Business Intelligence and Reporting Tools (BIRT) | ||||||
|  | 1080,FIM SQL Development Server,105,Microsoft Forefront Identity Manager (FIM),572,Structured Query Language (SQL) | ||||||
|  | 1082,Microsoft - Internet Explor,107,Microsoft Internet Explorer,356,Rexx | ||||||
|  | 1084,Internet Explor,107,Microsoft Internet Explorer,356,Rexx | ||||||
|  | 1090,SCEP for Linux,110,Microsoft System Center Endpoint Protection,437,Linux|zLinux | ||||||
|  | 1094,SCEP for Mac,110,Microsoft System Center Endpoint Protection,438,macOS | ||||||
|  | 1101,msdeploy,112,Microsoft Web Deploy,56,Greenplum DB | ||||||
|  | 1106,WebPI,114,Microsoft Web Platform Installer,522,Application Web Server | ||||||
|  | 1109,Web PI,114,Microsoft Web Platform Installer,531,Simple Object Access Protocol (SOAP) | ||||||
|  | 1111,MDW Framework,115,Model Driven Workflow (MDW),406,JavaScript|AngularJS | ||||||
|  | 1115,Mango DB,116,MongoDB,43,DB2 | ||||||
|  | 1117,MangoDB,116,MongoDB,43,DB2 | ||||||
|  | 1125,O365,119,MS Office 365,424,IBM i | ||||||
|  | 1141,MICROSOFT SQL SERVER 2012 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1153,MICROSOFT SQL SERVER 2012 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1154,MS SQL Server 2008 Developer,581,MS SQL Server|*,146,Oracle SQL Developer | ||||||
|  | 1156,MICROSOFT SQL SERVER 2008 DEVELOPER EDITION,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1167,MSSQL Database Server,581,MS SQL Server|*,122,MySQL | ||||||
|  | 1173,MSSQL2008,581,MS SQL Server|*,122,MySQL | ||||||
|  | 1192,Microsoft SQL Server Standard Edition,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1201,SQLServer,581,MS SQL Server|*,572,Structured Query Language (SQL) | ||||||
|  | 1226,MICROSOFT SQL SERVER 2012 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1233,MICROSOFT SQL SERVER 2005 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1234,SQLSVR2008,581,MS SQL Server|*,352,PL/SQL | ||||||
|  | 1235,MICROSOFT SQL SERVER 2008 ENTERPRISE EDITION,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1239,MICROSOFT SQL SERVER 2008 STANDARD EDITION,581,MS SQL Server|*,121,MS SQL Server Compact | ||||||
|  | 1244,MS SQL Server 2012 Developer,581,MS SQL Server|*,146,Oracle SQL Developer | ||||||
|  | 1253,Microsoft - SQL Server Express LocalDB 2014,581,MS SQL Server|*,469,MS SQL Server|SQL Server Database Engine | ||||||
|  | 1256,MSSQL,581,MS SQL Server|*,122,MySQL | ||||||
|  | 1280,Microsoft - Microsoft SQL Server Analysis Services 2012 for Microsoft SQL Server 2012 Standard Edition 2012,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|* | ||||||
|  | 1281,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|* | ||||||
|  | 1283,Microsoft - Microsoft SQL Server Analysis Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,468,MS SQL Server|SQL Server Analysis Services (SSAS),581,MS SQL Server|* | ||||||
|  | 1290,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Enterprise Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|* | ||||||
|  | 1293,Microsoft - Microsoft SQL Server Integration Services 2014 for Microsoft SQL Server 2014 Standard Edition 2014,470,MS SQL Server|SQL Server Integration Services (SSIS),581,MS SQL Server|* | ||||||
|  | 1295,SQL Server Integration Services,470,MS SQL Server|SQL Server Integration Services (SSIS),473,MS SQL Server|SQL Server Reporting Services (SSRS) | ||||||
|  | 1316,ZOS Base 1.12,441,MVS|z/OS,437,Linux|zLinux | ||||||
|  | 1335,NAS,272,Netscape Application Server (NAS),443,OS/2 | ||||||
|  | 1337,NES,273,Netscape Enterprise Server (NES),443,OS/2 | ||||||
|  | 1349,Node.js 0.10 (Linux),507,Node.js,437,Linux|zLinux | ||||||
|  | 1361,Node.js 4 (Linux),507,Node.js,437,Linux|zLinux | ||||||
|  | 1371,Symas OpenLDAP,126,OpenLDAP,178,SAP SQL Anywhere | ||||||
|  | 1377,OAM 12c,129,Oracle Access Management,303,C | ||||||
|  | 1378,ADF 12c,130,Oracle ADF,343,Objective C | ||||||
|  | 1381,OHS,610,Oracle Application Server|*,122,MySQL | ||||||
|  | 1383,Oracle HTTP Server powered by Apache,610,Oracle Application Server|*,259,Apache HTTP Server | ||||||
|  | 1384,j2eeoracleca,610,Oracle Application Server|*,453,Linux|Fedora | ||||||
|  | 1385,Oracle HTTP,610,Oracle Application Server|*,134,Oracle Database | ||||||
|  | 1389,9i AS server,610,Oracle Application Server|*,227,Virtual I/O Server | ||||||
|  | 1391,Oracle Application R12.1.3,610,Oracle Application Server|*,134,Oracle Database | ||||||
|  | 1394,Weblogic BI Publisher,132,Oracle BI Publisher,600,Oracle WebLogic Server|* | ||||||
|  | 1396,OBI,133,Oracle Business Intelligence,343,Objective C | ||||||
|  | 1397,OBIEE,133,Oracle Business Intelligence,343,Objective C | ||||||
|  | 1398,OBI Reporting,133,Oracle Business Intelligence,343,Objective C | ||||||
|  | 1401,Oracle 12.2 Client,134,Oracle Database,610,Oracle Application Server|* | ||||||
|  | 1406,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0,134,Oracle Database,610,Oracle Application Server|* | ||||||
|  | 1407,Oracle 11.2 (Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit) RAC,134,Oracle Database,610,Oracle Application Server|* | ||||||
|  | 1415,Oracle 11 on AIX,134,Oracle Database,445,Unix|AIX | ||||||
|  | 1416,Oracle Database 10g Enterprise Edition Release 10.1.0.4.0 - 64bit,134,Oracle Database,610,Oracle Application Server|* | ||||||
|  | 1431,Oracle Database 10g Release 10.2.0.4.0 - 64bit Production,134,Oracle Database,298,Oracle Exadata | ||||||
|  | 1432,Oarcle 11G,134,Oracle Database,218,TIBCO InConcert | ||||||
|  | 1443,DB - Oracle inbuilt,134,Oracle Database,158,Powerbuilder | ||||||
|  | 1460,Oracle Server,134,Oracle Database,610,Oracle Application Server|* | ||||||
|  | 1475,Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 - 64bit Production,134,Oracle Database,610,Oracle Application Server|* | ||||||
|  | 1480,Oracle 12C on linux,134,Oracle Database,303,C | ||||||
|  | 1484,Oracle9i Enterprise Edition Release 9.2.0.5.0,134,Oracle Database,610,Oracle Application Server|* | ||||||
|  | 1486,Oracle 11g on linux,134,Oracle Database,432,Linux|Oracle Linux | ||||||
|  | 1487,Oracle 11gEssbase,134,Oracle Database,298,Oracle Exadata | ||||||
|  | 1490,JServer Release 9.2.0.5.0,474,Oracle Database|Jserver,335,Java|JavaServer Pages (JSP) | ||||||
|  | 1492,Designer 6i,135,Oracle Designer,516,Compopent Object Model (COM) | ||||||
|  | 1493,Enterprise Manager 12.2.1.1,136,Oracle Enterprise Manager,427,Linux|CentOS | ||||||
|  | 1494,Enterprise Manager 12.2.1.2,136,Oracle Enterprise Manager,427,Linux|CentOS | ||||||
|  | 1495,Enterprise Manager 11.1.1.7,136,Oracle Enterprise Manager,427,Linux|CentOS | ||||||
|  | 1501,"Oracle, Nets",140,Oracle Net Services,273,Netscape Enterprise Server (NES) | ||||||
|  | 1502,Oracle RAC,141,Oracle Real Application Clusters (RAC),134,Oracle Database | ||||||
|  | 1504,ORPOS 13.3.3,142,Oracle Retail Point-of-Service,609,IIS|* | ||||||
|  | 1505,ORPOS 13.3.5,142,Oracle Retail Point-of-Service,609,IIS|* | ||||||
|  | 1506,ORPOS 13.3.4,142,Oracle Retail Point-of-Service,609,IIS|* | ||||||
|  | 1509,OSB Servers,143,Oracle Service Bus,443,OS/2 | ||||||
|  | 1514,Oracle TT,147,Oracle TimesTen In-Memory Database,134,Oracle Database | ||||||
|  | 1517,OWB 10g,148,Oracle Warehouse Builder (OWB),300,AWK | ||||||
|  | 1524,Clascal,346,Pascal|Object Pascal,307,Cascading Style Sheets (CSS) | ||||||
|  | 1526,Oracle-HR-9.2,151,PeopleSoft,134,Oracle Database | ||||||
|  | 1529,ActiveState Tool Corp. - ActivePerl 5.12,348,Perl|ActivePerl,500,ActiveX | ||||||
|  | 1530,ActiveState Tool Corp. - ActivePerl 5.8,348,Perl|ActivePerl,500,ActiveX | ||||||
|  | 1531,ORAPERL,417,Perl|Oraperl,242,WinRAR | ||||||
|  | 1532,REX,349,Perl|Rex,356,Rexx | ||||||
|  | 1536,TCServer V6,277,Pivotal tc Server,365,TCL | ||||||
|  | 1537,IBM PKWARE PKZip 2,155,PKZIP,387,Java|IBM SDK | ||||||
|  | 1541,PLQSL,352,PL/SQL,351,PL/I | ||||||
|  | 1542,Oracle - SQL,352,PL/SQL,581,MS SQL Server|* | ||||||
|  | 1544,Oracle SQL,352,PL/SQL,134,Oracle Database | ||||||
|  | 1545,PLSQL;,352,PL/SQL,351,PL/I | ||||||
|  | 1547,Oracle PLSQL,352,PL/SQL,351,PL/I | ||||||
|  | 1548,plsql,352,PL/SQL,351,PL/I | ||||||
|  | 1551,Projectplace,156,Planview,21,Business Intelligence and Reporting Tools (BIRT) | ||||||
|  | 1558,Power Builder,158,Powerbuilder,151,PeopleSoft | ||||||
|  | 1560,Power Builder 6.5,158,Powerbuilder,27,Chef Automate | ||||||
|  | 1565,ProjectWise Oracle Server,161,ProjectWise,162,ProjectWise Web Server | ||||||
|  | 1576,RMQ,165,RabbitMQ,355,R | ||||||
|  | 1579,Clearquest,167,Rational ClearQuest,455,Clarify|Clear Basic | ||||||
|  | 1581,Remedy ARS,169,Remedy,322,Fortran | ||||||
|  | 1584,RightFax client 10,171,RightFax,118,MQ Client | ||||||
|  | 1585,SOQL,359,Salesforce Object Query Language (SOQL),621,ArangoDB | ||||||
|  | 1587,SAP Business Objects,173,SAP BusinessObjects BI server,177,SAP NetWeaver Business Warehouse | ||||||
|  | 1588,Business Objects 12,173,SAP BusinessObjects BI server,488,ActiveX|ADO | ||||||
|  | 1590,SAP BI 4.2 Sp5,173,SAP BusinessObjects BI server,174,SAP ERP | ||||||
|  | 1593,SAP HANA ON SUSEOracle 11g on Linux,175,SAP HANA DB,435,Linux|SUSE Linux Enterprise Server | ||||||
|  | 1596,NetWeaver,279,SAP NetWeaver App Server,431,Linux|openSUSE | ||||||
|  | 1605,SCSS,361,Sass,102,Microsoft Endpoint Configuration Manager (SCCM) | ||||||
|  | 1606,Scalla,362,Scala,664,Forte | ||||||
|  | 1609,Microsoft SPS 2010,603,SharePoint|*,577,MVS|* | ||||||
|  | 1613,SQL Server SP2013 Database Server,603,SharePoint|*,581,MS SQL Server|* | ||||||
|  | 1615,Siebel IP 2015,182,Siebel,583,C++|* | ||||||
|  | 1616,Siebel 7.8.2.16,182,Siebel,43,DB2 | ||||||
|  | 1617,Siebel CRM,182,Siebel,583,C++|* | ||||||
|  | 1619,Techsmith Corporation - SnagIt 8,184,SnagIt,183,SNA Manager | ||||||
|  | 1620,Solid development server,185,solidDB,600,Oracle WebLogic Server|* | ||||||
|  | 1622,Sixty-Five Software - SpaceMonger 1.4,187,SpaceMonger,296,Intel Xeon Processor | ||||||
|  | 1623,SQLPlus,478,Oracle Database|SQL*Plus,572,Structured Query Language (SQL) | ||||||
|  | 1625,SQLIO 1.0,189,SQLIO,178,SAP SQL Anywhere | ||||||
|  | 1630,SunOne,281,Oracle iPlanet Web Server,448,Unix|BSD|SunOS | ||||||
|  | 1637,SAP - Sybase Central 4.3,479,Sybase SQL Server|Sybase Central,190,Sybase SQL Server | ||||||
|  | 1639,Sysncsort,191,Syncsort,178,SAP SQL Anywhere | ||||||
|  | 1640,syncsort,191,Syncsort,98,Memcached | ||||||
|  | 1641,Sysinternals LLC - AccessEnum 1 1,194,Sysinternal Tools|AccessEnum,124,Nexus Repository OSS | ||||||
|  | 1642,Sysinternals LLC - ClockRes 2,195,Sysinternal Tools|ClockRes,374,Xbase++ | ||||||
|  | 1643,Sysinternals LLC - Coreinfo 3.21,196,Sysinternal Tools|Coreinfo,670,EAServer | ||||||
|  | 1644,Sysinternals LLC - DiskExt 1.1,197,Sysinternal Tools|DiskExt,374,Xbase++ | ||||||
|  | 1645,Sysinternals LLC - DiskMon 2.01,198,Sysinternal Tools|DiskMon,670,EAServer | ||||||
|  | 1647,Sysinternals LLC - Junction 1.6,200,Sysinternal Tools|Junction,374,Xbase++ | ||||||
|  | 1648,Sysinternals LLC - LDMDump 1.02,201,Sysinternal Tools|LDMDump,178,SAP SQL Anywhere | ||||||
|  | 1649,Sysinternals LLC - LoadOrder 1,202,Sysinternal Tools|LoadOrder,374,Xbase++ | ||||||
|  | 1650,Sysinternals LLC - PipeList 1.01,203,Sysinternal Tools|PipeList,670,EAServer | ||||||
|  | 1651,Sysinternals LLC - Process Explorer 16.5,204,Sysinternal Tools|Process Explorer,464,Microsoft Exchange Server|Veeam Explorer | ||||||
|  | 1652,Sysinternals LLC - PsKill 1.15,205,Sysinternal Tools|PsKill,151,PeopleSoft | ||||||
|  | 1653,Sysinternals LLC - PsPasswd 1.23,206,Sysinternal Tools|PsPasswd,231,VMware vCenter | ||||||
|  | 1654,Sysinternals LLC - SDelete 1.61,207,Sysinternal Tools|SDelete,670,EAServer | ||||||
|  | 1655,Sysinternals LLC - ShareEnum 1.6,208,Sysinternal Tools|ShareEnum,603,SharePoint|* | ||||||
|  | 1656,Sysinternals LLC - Sync 2.2,209,Sysinternal Tools|Sync,374,Xbase++ | ||||||
|  | 1657,Sysinternals LLC - Sysinternals TCPView 3.5,210,Sysinternal Tools|TCPView,365,TCL | ||||||
|  | 1658,Sysinternals LLC - VMMap 3.11,211,Sysinternal Tools|VMMap,176,SAP MaxDB | ||||||
|  | 1659,Sysinternals LLC - Whois 1.11,212,Sysinternal Tools|Whois,178,SAP SQL Anywhere | ||||||
|  | 1664,TERADATA QUERY SCHEDULER SERVER VERSION 15,216,Teradata QS Server,215,Teradata | ||||||
|  | 1667,BusinessWorks,217,TIBCO Business Works (BW),111,Microsoft Visual Studio | ||||||
|  | 1668,Tibco-IM,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous | ||||||
|  | 1669,Tibco Integration Manager,481,TIBCO Business Works (BW)|Integration Manager,219,TIBCO Rendezvous | ||||||
|  | 1674,TSQL,366,Transact-SQL,621,ArangoDB | ||||||
|  | 1675,Trasact SQL,366,Transact-SQL,352,PL/SQL | ||||||
|  | 1746,Solaris 11.2 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene | ||||||
|  | 1747,Solaris UNIX,448,Unix|BSD|SunOS,578,Unix|* | ||||||
|  | 1748,Unix Servers (Solaris,448,Unix|BSD|SunOS,578,Unix|* | ||||||
|  | 1749,Oracle Solaris 11.3 SPARC,448,Unix|BSD|SunOS,375,Apache Lucene | ||||||
|  | 1753,Solaris 5.10 (Generic_150400-61),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI) | ||||||
|  | 1754,Solaris 5.10 (Generic_150400-62),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI) | ||||||
|  | 1756,Solaris 5.10 (Generic_150400-55),448,Unix|BSD|SunOS,521,Electronic Data Interchange (EDI) | ||||||
|  | 1760,Oracle Solaris,448,Unix|BSD|SunOS,134,Oracle Database | ||||||
|  | 1762,Solaris 1 (SPARC),448,Unix|BSD|SunOS,375,Apache Lucene | ||||||
|  | 1765,SunSolaris 10.0,448,Unix|BSD|SunOS,430,Linux|Junos OS | ||||||
|  | 1771,Oracle Solaris 10,448,Unix|BSD|SunOS,134,Oracle Database | ||||||
|  | 1800,VIO 2.2.0.10,227,Virtual I/O Server,159,Primavera P6 | ||||||
|  | 1801,VIOS,227,Virtual I/O Server,443,OS/2 | ||||||
|  | 1802,visibroker,228,Visibroker,420,Cisco IOS | ||||||
|  | 1803,VB6,370,Visual Basic,368,VB.NET | ||||||
|  | 1804,VB 6.0,370,Visual Basic,368,VB.NET | ||||||
|  | 1805,visualbasic,370,Visual Basic,306,C++|Visual C++ | ||||||
|  | 1808,Visual Basic 6.0,370,Visual Basic,368,VB.NET | ||||||
|  | 1811,VBA,371,Visual Basic for Applications (VBA),370,Visual Basic | ||||||
|  | 1812,Access VB,371,Visual Basic for Applications (VBA),99,Microsoft Access | ||||||
|  | 1813,vfoxpro,372,Visual FoxPro,117,Mozilla Firefox | ||||||
|  | 1827,VMware Appliance,569,VMware Server,559,Virtual Appliance | ||||||
|  | 1828,VSX,229,VMware Solution Exchange Marketplace (VSX),111,Microsoft Visual Studio | ||||||
|  | 1830,VMware - VMware Tools 10.2,230,VMware Tools,569,VMware Server | ||||||
|  | 1832,VXML,373,VoiceXML,316,eXtensible HyperText Markup Language (XHTML) | ||||||
|  | 1833,Web Focus,232,WebFOCUS,321,FOCUS | ||||||
|  | 1834,FOCEXEC,232,WebFOCUS,495,Oracle WebCenter Content Server|Idoc Script | ||||||
|  | 1836,WLI 8,233,WebLogic Integration,442,OpenVMS | ||||||
|  | 1842,IBM WEBSPHERE APPLICATION SERVER VERSION 6.1.0,284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1848,"IBM WebSphere Application Server Network Deployment, 8.0.0.5",284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1850,IBM WebSphere Application Server Network Deployment 7,284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1858,IBM WebSphere 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1861,IBM - WebSphere Application Server - Base 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1865,Websphere AS (JVM),284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1872,IBM WebSphere,284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1875,IBM WebSphere Application Server 8.5,284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1877,IBM WebSphere Application,284,Websphere Application Server (WAS),285,WebSphere Liberty | ||||||
|  | 1878,WAS 6.x,284,Websphere Application Server (WAS),521,Electronic Data Interchange (EDI) | ||||||
|  | 1880,IBM OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE | ||||||
|  | 1882,Open Liberty,285,WebSphere Liberty,397,Java|Servlet | ||||||
|  | 1883,IBM Open Liberty,285,WebSphere Liberty,62,IBM BigFix Platform | ||||||
|  | 1887,WAS Liberty,285,WebSphere Liberty,397,Java|Servlet | ||||||
|  | 1889,OpenStack Liberty,285,WebSphere Liberty,431,Linux|openSUSE | ||||||
|  | 1891,WMB 6.1,235,WebSphere Message Broker,486,.NET Framework|Windows Workflow Foundation (WF) | ||||||
|  | 1892,WebSphere Message Broker v6.0,235,WebSphere Message Broker,285,WebSphere Liberty | ||||||
|  | 1899,WebSphere Portal Extend Limited Use 6.1,286,WebSphere Portal Server,285,WebSphere Liberty | ||||||
|  | 1901,Windchill 11.1,237,Windchill,17,Bluebeam|Bluebeam Q | ||||||
|  | 1908,Window,580,Windows|*,637,Microsoft Azure | ||||||
|  | 1914,Windows Terminal Server,239,Windows Terminal Server (WTS),452,Windows|Windows Server | ||||||
|  | 1915,Windows 7 Standard,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1916,WINDOWS 10 SERVER STANDARD EDITION X64,451,Windows|Windows Desktop,452,Windows|Windows Server | ||||||
|  | 1917,Microsoft Windows 7 (64-bit),451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1918,Microsoft Windows XP Professional (32-bit),451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1919,Windows 7 Professional x64,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1920,Microsoft Microsoft Windows Entreprise,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1921,Microsoft Windows 2000,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1922,Microsoft Windows 10,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1923,MS Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1924,Microsoft Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1925,Microsoft Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1926,Microsoft Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1927,Win Desktop,451,Windows|Windows Desktop,560,Webtop | ||||||
|  | 1928,Windows 10 Pro,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1929,Windows 10,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1930,Windows 7 Ultimate,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1931,Microsoft Windows 8 (64-bit),451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1932,Microsoft Windows XP,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1933,Windows 10 Enterprise,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1934,Windows XP,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1935,Windows 10 Professional,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1936,Windows 7,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1937,Microsoft Windows 10 (64-bit),451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1938,Win 7,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 1939,windowsxp,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1940,Microsoft Windows Unknown,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1941,Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1942,Windows XP Professional,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1943,Windows 7 Professional,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1944,Window XP,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1945,Microsoft Windows 7 Enterprise,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1946,Microsoft Windows 7 - SOE,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1947,Windows 7 Enterprise Edition,451,Windows|Windows Desktop,452,Windows|Windows Server | ||||||
|  | 1948,Windows 8,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1949,Microsoft Windows 7,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1950,Microsoft Windows 7 (32-bit),451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1951,Windows Embedded Standard 7,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1952,Win10,451,Windows|Windows Desktop,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 1953,Windows 2003,451,Windows|Windows Desktop,580,Windows|* | ||||||
|  | 1955,Windows 2003 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1956,Windows 2008 Enterprise R2 x64,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1960,WINDOWS 2008R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1961,Microsoft Windows Server 2008 Standard Editio,452,Windows|Windows Server,121,MS SQL Server Compact | ||||||
|  | 1962,MICROSOFT WINDOWS NT 2003,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1967,Microsoft Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server | ||||||
|  | 1979,Windows 2008 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1982,Windows 2003 R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1983,Windows 2008 R2 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1988,Windows 2008 R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1989,Windows 2012 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1992,Windows 2008 R2 Standard 6.1.7601 Service Pack 1,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1994,Windows 2008 Standard x64,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 1998,Windows 2012 R2 Standard 64-Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2007,w2k12,452,Windows|Windows Server,582,C#|* | ||||||
|  | 2008,WINDOWS 2013,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2009,WINDOWS 2016 SE 64 BIT,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2011,Microsoft - Windows 2012,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2019,MICROSOFT WINDOWS 2008 TPM,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2021,MICROSOFT WINDOWS STD 2008,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2025,Windows 2008 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2028,MICROSOFT WINDOWS STD 2008 TPM,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2030,Windows 2012 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2031,MICROSOFT WINDOWS NT 2003 ENT,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2034,MICROSOFT WINDOWS 2012,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2036,Windows 2003 Standard5.2.3790,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2040,Windows 2012 R,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2044,Windows 2008 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2045,MICROSOFT WINDOWS 2008 ENT,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2047,Windows 2012 R2 Standard 6.3.9600,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2053,Windows 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server | ||||||
|  | 2055,Microsoft Windows Server 2016 Datacenter,452,Windows|Windows Server,276,Oracle WebCenter Content Server | ||||||
|  | 2061,Windows 2016 Datacenter10.0.14393,452,Windows|Windows Server,637,Microsoft Azure | ||||||
|  | 2065,windows6.3.9600,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2066,Windows 2012 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2069,Windows 2008 Enterprise,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2080,Windows 2008 Standard without Hyper-V6.0.6003,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2084,Windows 2012 R2 Datacenter,452,Windows|Windows Server,110,Microsoft System Center Endpoint Protection | ||||||
|  | 2089,Windows 2008 Standard 64-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2096,Windows 2000,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2097,W2K8R2 Standard 64 BIT,452,Windows|Windows Server,303,C | ||||||
|  | 2099,Windows 2008 Standard6.0.6003,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2100,Windows2008 R2 Enterprise 64bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2105,Win2008R2,452,Windows|Windows Server,355,R | ||||||
|  | 2107,Windows 2008 Standard 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2109,Windows Server 2003 Appliance,452,Windows|Windows Server,559,Virtual Appliance | ||||||
|  | 2111,Windows 2008 ENT R2 (64 bits),452,Windows|Windows Server,355,R | ||||||
|  | 2114,WIN2008R2 6.1.7601,452,Windows|Windows Server,355,R | ||||||
|  | 2116,microsoft windows std 2012  tpm,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2118,microsoft windows 2008,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2120,Windows 2008 Standard 32 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2121,Microsoft Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2126,Window2008 R2,452,Windows|Windows Server,355,R | ||||||
|  | 2130,Windows 2008 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2134,WS03,452,Windows|Windows Server,239,Windows Terminal Server (WTS) | ||||||
|  | 2136,Windows 2008 Enterprise x64,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2141,Windows 2008 R2 Enterprise,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2142,Windows Server 2003 Std 32-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2143,Windows 2008 R2 Standard 64bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2146,Microsoft Windows 2003 R2 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2148,MICROSOFT WINDOWS NT 2003 TPM,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2149,Win Server 2008,452,Windows|Windows Server,569,VMware Server | ||||||
|  | 2150,Windows 2003 R2 Standard 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2152,WIN2014,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 2156,Win 2012 R2,452,Windows|Windows Server,355,R | ||||||
|  | 2160,Win Server,452,Windows|Windows Server,12,Apache Subversion | ||||||
|  | 2161,Windows 2008 Standard R2 x64,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2163,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 6196 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor | ||||||
|  | 2164,Windows2012,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2165,Windows 2008 R2 Standard6.1.7601,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2166,Windows 2016,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2167,Windows 2008 R2 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2179,Windows Server 2003 Std 64-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2180,Windows 2012 R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2181,Wintel,452,Windows|Windows Server,461,IBM Tivoli Storage Manager|TSM Client | ||||||
|  | 2191,Windows 2003 Enterprise5.2.3790,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2192,WINDOWS 2012,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2193,Windows 2008 R2 OS,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2196,Windows 2003 Standard R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2197,Windows 2008 R2 Enterprise6.1.7601,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2198,Windows 2003 Standard 32 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2199,WINDOWS SERVER 2003 APPLIANCE 5.2,452,Windows|Windows Server,559,Virtual Appliance | ||||||
|  | 2201,WS08R2,452,Windows|Windows Server,355,R | ||||||
|  | 2204,Windows 2008 Enterprise 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2213,w2k8r2sp1,452,Windows|Windows Server,355,R | ||||||
|  | 2217,Win 2003,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 2222,Windows 2012 R2 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2226,Windows 2008 R2 Standard 64-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2228,Windows 2003 Enterprise 32-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2230,Windows 2012 Storage R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2231,Windows server 2008 Dual processor Intel Xeon x5660 @2.80 GHz 4096 MB memory installed,452,Windows|Windows Server,296,Intel Xeon Processor | ||||||
|  | 2235,MICROSOFT WINDOWS NT 2003 ENT TPM,452,Windows|Windows Server,239,Windows Terminal Server (WTS) | ||||||
|  | 2237,Windows 2016 Standard10.0.14393,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2240,MICROSOFT WINDOWS 2003,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2242,Windows 2012 Standard R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2246,Win Server 2008 R2,452,Windows|Windows Server,355,R | ||||||
|  | 2248,MICROSOFT WINDOWS STD 2012 TPM,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2249,Windows 2003 Enterprise 32 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2250,Windows 2008 Enterprise R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2251,Windows 2008,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2252,Microsoft Microsoft Windows 2008 R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2257,Win Server 2012,452,Windows|Windows Server,569,VMware Server | ||||||
|  | 2258,Windows 2016 Standard,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2264,Windows 2008 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2267,Windows 2003 Standard 5.2.3790 Service Pack 2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2268,Windows 2012 Standard6.2.9200,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2269,MICROSOFT WINDOWS 2016 TPM,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2272,Windows 2003 Enterprise,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2275,Windows 2008 R2 Enterprise 64-bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2277,Windows 2012 R2 Standard6.3.9600,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2286,Windows 2008 Standard R2,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2287,MicrosoftWindows Server 2008 R2 (64-bit),452,Windows|Windows Server,443,OS/2 | ||||||
|  | 2288,windows6.3,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2290,Windows 2016 64 Bit,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2296,Windows 2008 Enterprise6.0.6003,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2301,Win 2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 2302,Win2012,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 2303,Win2012R2,452,Windows|Windows Server,355,R | ||||||
|  | 2305,win2008,452,Windows|Windows Server,333,Java|Java Enterprise Edition (Java EE) | ||||||
|  | 2306,Windows 2003 Standard x64,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2315,WINDOWS 2016 STANDARD EDITION,452,Windows|Windows Server,580,Windows|* | ||||||
|  | 2325,WinSCP.net - WinSCP 5.11,243,WinSCP,178,SAP SQL Anywhere | ||||||
|  | 2332,Zerto Vritual Appliance,249,Zerto Virtual Replication,559,Virtual Appliance | ||||||
|  | 2333,Oracle RTD,289,Oracle Real-Time Decisions (RTD),134,Oracle Database | ||||||
|  | 2338,OMNIbus,251,Tivoli Netcool/OMNIbus,582,C#|* | ||||||
|  | 2347,ALM,511,Application Lifecycle Management (ALM),421,DART | ||||||
|  | 2349,BMS,513,Batch Management Software (BMS),442,OpenVMS | ||||||
|  | 2354,COM,516,Compopent Object Model (COM),661,COM+ | ||||||
|  | 2357,CORBA Interface Definition Language,518,CORBA Interface Definition Language (CORBA IDL),517,Common Object Request Broker Architecture (CORBA) | ||||||
|  | 2359,Data Control Language,519,Data Control Language (DCL),329,IBM i Control Language (CL) | ||||||
|  | 2361,Database,520,Database (DB),43,DB2 | ||||||
|  | 2362,DB,520,Database (DB),43,DB2 | ||||||
|  | 2365,Electronic Data Interchange,521,Electronic Data Interchange (EDI),104,Microsoft Exchange Server | ||||||
|  | 2369,JDOM,523,Java-based Document Object Model for XML (JDOM),84,IMS DB | ||||||
|  | 2381,Simple Object Access Protocol,531,Simple Object Access Protocol (SOAP),547,Internet Message Access Protocol (IMAP) | ||||||
|  | 2383,SQL,572,Structured Query Language (SQL),581,MS SQL Server|* | ||||||
|  | 2386,DPE,538,Device Provisioning Engines (DPE),661,COM+ | ||||||
|  | 2388,ESB,540,Enterprise Service Bus(ESB),370,Visual Basic | ||||||
|  | 2395,MES,553,Manufacturing Execution System (MES),623,Amazon S3 | ||||||
|  | 2401,Z/Virtual System Environment,591,z/VSE,441,MVS|z/OS | ||||||
|  | 2403,DOS/VSE,591,z/VSE,597,DOS/360 | ||||||
|  | 2404,Microsoft Disk Operating System,593,MS-DOS,443,OS/2 | ||||||
|  | 2407,VME/B,595,VME,368,VB.NET | ||||||
|  | 2408,Virtual Machine Environment,595,VME,111,Microsoft Visual Studio | ||||||
|  | 2409,VME 2900,595,VME,107,Microsoft Internet Explorer | ||||||
|  | 2410,OpenVME,595,VME,442,OpenVMS | ||||||
|  | 2411,Disk Operating System/360,597,DOS/360,443,OS/2 | ||||||
|  | 2413,Transaction Processing Facility,598,z/TPF,572,Structured Query Language (SQL) | ||||||
|  | 2419,NPL,653,Natural Programming Language,342,Niakwa Programming Language (NPL) | ||||||
|  | 2426,IDMS/DB Data Manipulation Language,668,IDMS DML,312,Data Language Interface (DL/I) | ||||||
|  | 2433,Basic Mapping Supprt,689,BMS Map,21,Business Intelligence and Reporting Tools (BIRT) | ||||||
|  | 2434,DB/400,690,DB400,43,DB2 | ||||||
|  | 2435,IBM ISAM,693,ISAM,73,IBM Operational Decision Manager (ODM) | ||||||
| 
 | 
|  | @ -32,21 +32,25 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) | ||||||
| def generate_acronym(text): | def generate_acronym(text): | ||||||
| 
 | 
 | ||||||
|     # Define prompt |     # Define prompt | ||||||
|     prompt = f"Answer concisely: make a possible acronym from the following: '{text}'" |     # prompt = f"Imagine you are a diverse database. Given the following: '{text}', please suggest to me 5 possible variations. Give 5." | ||||||
|  |     prompt = f"Give me a list of 10 historical product names related to: '{text}'. Format the output in a list, like this 1. Item, 2. Item, 3. ..." | ||||||
| 
 | 
 | ||||||
|     # Generate acronym |     # Generate acronym | ||||||
|     inputs = tokenizer(prompt, return_tensors="pt") |     inputs = tokenizer(prompt, return_tensors="pt") | ||||||
|     inputs = inputs.to("cuda") |     inputs = inputs.to("cuda") | ||||||
|     outputs = model.generate( |     outputs = model.generate( | ||||||
|         inputs["input_ids"], |         inputs["input_ids"], | ||||||
|         max_length=100, |         max_length=200, | ||||||
|         no_repeat_ngram_size=3)  |         do_sample=True, | ||||||
|  |         top_k=50, | ||||||
|  |         temperature=0.8) | ||||||
|  |         # no_repeat_ngram_size=3)  | ||||||
|     return tokenizer.decode(outputs[0], skip_special_tokens=True) |     return tokenizer.decode(outputs[0], skip_special_tokens=True) | ||||||
| 
 | 
 | ||||||
| # %% | # %% | ||||||
| # Example usage | # Example usage | ||||||
| # text = "Advanced Data Analytics Platform" | # text = "Advanced Data Analytics Platform" | ||||||
| text = "red hat enterprise linux" | text = "windows desktop" | ||||||
| acronym = generate_acronym(text) | acronym = generate_acronym(text) | ||||||
| print(f"Acronym: {acronym}") | print(f"Generation: {acronym}") | ||||||
| # %% | # %% | ||||||
		Loading…
	
		Reference in New Issue