# domain_mapping/esAppMod_data_import/entity_hierarchy.py

# %%
import json
import pandas as pd

##########################################
# %%
# Load the training split (skipinitialspace skips blanks after each delimiter).
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# Build the label lookups from the training data.
# Keep the first row seen for each distinct 'entity_name'.
unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
# entity_id -> entity_name; a later row overwrites an earlier one sharing the id
id2label = dict(zip(unique_df['entity_id'], unique_df['entity_name']))

# entity_name -> entity_id
inverse_dict = {name: entity_id for entity_id, name in id2label.items()}
# %%
# Same name -> id mapping, with entries inserted in sorted name order.
sorted_dict = {name: inverse_dict[name] for name in sorted(inverse_dict)}

# %%
sorted_dict

# %%
# Grouping rules used to collapse related entity ids onto a single id.
# Each key is a human-readable group label; the code that consumes this table
# only uses the key to fetch its list. Each value is a list of entity ids: the
# FIRST id in the list is the replacement target (the "parent", matching the
# parent_*.csv output names) and any id found in the list is rewritten to it.
# NOTE(review): the lists look disjoint; if an id were ever listed under two
# groups, the group defined later in this dict would win - confirm if editing.
rule_set ={
    '.NET': [497,482,484,487,485,486,483],
    'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
    'C++': [583,306],
    'CA': [290,22,23,24,25],
    'CSS': [307,377],
    'Cisco': [28,420,29],
    'Citrix': [563,565,31,292,291,564,32,30],
    'coldfusion': [311,37],
    'eclipse': [46,622,641,456],
    'xml': [596, 318],
    'xsl': [319,320],
    'HP': [59,293,60,61,58],
    'http': [505,543],
    'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
    'IBM BigFix': [62,457],
    'IBM ILOG': [253,255,254,256,252],
    'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
    'IBM WebSphere': [80,82,83,81],
    'IBM i': [424,329],
    'IDMS': [667,668],
    'IIS': [609,490,489,491],
    'JBoss': [268,492,493],
    'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
    'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
    'KVS': [549,550,551],
    'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
    'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
    'MVS': [577,440,441],
    'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
    'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
    'Oracle WebLogic': [600,233],
    'Oracle Application Server': [610,494],
    'Oracle Database': [134,474,475,478],
    'Oracle Hyperion': [607,138,139],
    'Oracle WebCenter': [276,495],
    'Pascal': [599,346],
    'Perl': [585,348,417,349],
    'ProjectWise': [161,162],
    'Rational': [166,167],
    'SAP': [173,175,695,176,676,178,179],
    'SAP ERP': [174,476,477],
    'SAP NetWeaver': [279,496,177],
    'Sybase SQL Server': [190,479,480],
    'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
    'TIBCO': [218,219],
    'TIBCO Business Works': [217,481],
    'Tivoli': [220,251],
    'Tortoise': [221,222],
    'Unix': [578,445,579,447,602,590,448,449],
    'VB': [368,369],
    'VMware': [568,569,229,230,231],
    'Visual Basic': [370,371,372],
    'WebSphere': [234,285,235,286,284,601,287],
    'Windows': [580,238,239,451,452],
    'z': [598,608,591]

}

# %%
def collapse_to_parent(df, rules):
    """Return a copy of *df* with grouped entity ids replaced by their parent id.

    Args:
        df: DataFrame that has an 'entity_id' column.
        rules: mapping of group label -> list of entity ids. The first id in
            each list is the parent that every id in that list is mapped to.

    Returns:
        A new DataFrame; *df* itself is not modified. Ids that appear in no
        rule list are kept unchanged.
    """
    # Flatten the rules into one id -> parent lookup up front, so each row
    # costs a single dict access instead of a scan over every rule list.
    # Rules are visited in definition order: if an id were listed under
    # several groups, the last group wins, exactly as a row-by-row scan
    # over all rules would behave.
    parent_of = {
        member: members[0]
        for members in rules.values()
        for member in members
    }
    collapsed = df.copy()
    # The lookup always reads the ORIGINAL id, so replacements never chain.
    collapsed['entity_id'] = df['entity_id'].map(
        lambda entity_id: parent_of.get(entity_id, entity_id)
    )
    return collapsed


# %%
# collapse the training labels onto their parent ids
new_df = collapse_to_parent(train_df, rule_set)
# %%
# number of distinct entity ids left after collapsing
len(set(new_df['entity_id'].to_list()))

# %%
# NOTE: the DataFrame index is written as the first CSV column
new_df.to_csv('parent_train.csv')

# %%
# now do the same for the test data
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

new_df = collapse_to_parent(test_df, rule_set)

# %%
new_df

# %%
# NOTE: the DataFrame index is written as the first CSV column
new_df.to_csv('parent_test.csv')
# %%
added more augmentations to finally beat sota - class_bert_augmentation is now the reference training code 2025-01-15 20:09:15 +09:00			`# %%`
			`import json`
			`import pandas as pd`

			`##########################################`
			`# %%`
			`# import training file`
			`data_path = '../esAppMod_data_import/train.csv'`
			`train_df = pd.read_csv(data_path, skipinitialspace=True)`

			`# %%`
			`# import entity file`
			`# Keep only one row per unique value in 'column1'`
			`unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")`
			`id2label = {}`
			`for _, row in unique_df.iterrows():`
			`id2label[row['entity_id']] = row['entity_name']`

			`inverse_dict = {value:key for key,value in id2label.items()}`
			`# %%`
			`# Create a new dictionary with sorted keys`
			`# sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}`
			`sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}`

			`# %%`
			`sorted_dict`

			`# %%`
			`rule_set ={`
			`'.NET': [497,482,484,487,485,486,483],`
			`'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],`
			`'C++': [583,306],`
			`'CA': [290,22,23,24,25],`
			`'CSS': [307,377],`
			`'Cisco': [28,420,29],`
			`'Citrix': [563,565,31,292,291,564,32,30],`
			`'coldfusion': [311,37],`
			`'eclipse': [46,622,641,456],`
			`'xml': [596, 318],`
			`'xsl': [319,320],`
			`'HP': [59,293,60,61,58],`
			`'http': [505,543],`
			`'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],`
			`'IBM BigFix': [62,457],`
			`'IBM ILOG': [253,255,254,256,252],`
			`'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],`
			`'IBM WebSphere': [80,82,83,81],`
			`'IBM i': [424,329],`
			`'IDMS': [667,668],`
			`'IIS': [609,490,489,491],`
			`'JBoss': [268,492,493],`
			`'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],`
			`'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],`
			`'KVS': [549,550,551],`
			`'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],`
			`'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],`
			`'MVS': [577,440,441],`
			`'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],`
			`'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],`
			`'Oracle WebLogic': [600,233],`
			`'Oracle Application Server': [610,494],`
			`'Oracle Database': [134,474,475,478],`
			`'Oracle Hyperion': [607,138,139],`
			`'Oracle WebCenter': [276,495],`
			`'Pascal': [599,346],`
			`'Perl': [585,348,417,349],`
			`'ProjectWise': [161,162],`
			`'Rational': [166,167],`
			`'SAP': [173,175,695,176,676,178,179],`
			`'SAP ERP': [174,476,477],`
			`'SAP NetWeaver': [279,496,177],`
			`'Sybase SQL Server': [190,479,480],`
			`'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],`
			`'TIBCO': [218,219],`
			`'TIBCO Business Works': [217,481],`
			`'Tivoli': [220,251],`
			`'Tortoise': [221,222],`
			`'Unix': [578,445,579,447,602,590,448,449],`
			`'VB': [368,369],`
			`'VMware': [568,569,229,230,231],`
			`'Visual Basic': [370,371,372],`
			`'WebSphere': [234,285,235,286,284,601,287],`
			`'Windows': [580,238,239,451,452],`
			`'z': [598,608,591]`

			`}`

			`# %%`
			`# iterate through the whole training set`
			`new_df = train_df.copy()`
			`for idx, row in train_df.iterrows():`
			`# we iterate through each rule set, replacing any matching values in the`
			`# list with the first element of the list`
			`for key in rule_set.keys():`
			`id = row['entity_id']`
			`if (id in rule_set[key]):`
			`new_df.loc[idx,('entity_id')] = rule_set[key][0]`
			`# %%`
			`len(set(new_df['entity_id'].to_list()))`

			`# %%`
			`new_df.to_csv('parent_train.csv')`

			`# %%`
			`# now do the same for the test data`
			`# import training file`
			`data_path = '../esAppMod_data_import/test.csv'`
			`test_df = pd.read_csv(data_path, skipinitialspace=True)`

			`new_df = test_df.copy()`
			`for idx, row in test_df.iterrows():`
			`# we iterate through each rule set, replacing any matching values in the`
			`# list with the first element of the list`
			`for key in rule_set.keys():`
			`id = row['entity_id']`
			`if (id in rule_set[key]):`
			`new_df.loc[idx,('entity_id')] = rule_set[key][0]`

			`# %%`
			`new_df`

			`# %%`
			`new_df.to_csv('parent_test.csv')`
			`# %%`