125 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			125 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Python
		
	
	
	
| # %%
 | |
| import json
 | |
| import pandas as pd
 | |
| 
 | |
| ##########################################
 | |
| # %%
 | |
| # import training file
 | |
| data_path = '../esAppMod_data_import/train.csv'
 | |
| train_df = pd.read_csv(data_path, skipinitialspace=True)
 | |
| 
 | |
| # %%
 | |
| # import entity file
 | |
| # Keep only one row per unique value in 'column1'
 | |
| unique_df = train_df.drop_duplicates(subset="entity_name", keep="first")
 | |
| id2label = {}
 | |
| for _, row in unique_df.iterrows():
 | |
|     id2label[row['entity_id']] = row['entity_name']
 | |
| 
 | |
| inverse_dict = {value:key for key,value in id2label.items()}
 | |
| # %%
 | |
| # Create a new dictionary with sorted keys
 | |
| # sorted_dict = {key: id2label[key] for key in sorted(id2label.keys())}
 | |
| sorted_dict = {key: inverse_dict[key] for key in sorted(inverse_dict.keys())}
 | |
| 
 | |
| # %%
 | |
| sorted_dict
 | |
| 
 | |
| # %%
 | |
| rule_set ={
 | |
|     '.NET': [497,482,484,487,485,486,483],
 | |
|     'apache': [6,634,501,646,259,7,8,9,375,697,10,11,12,260,376],
 | |
|     'C++': [583,306],
 | |
|     'CA': [290,22,23,24,25],
 | |
|     'CSS': [307,377],
 | |
|     'Cisco': [28,420,29],
 | |
|     'Citrix': [563,565,31,292,291,564,32,30],
 | |
|     'coldfusion': [311,37],
 | |
|     'eclipse': [46,622,641,456],
 | |
|     'xml': [596, 318],
 | |
|     'xsl': [319,320],
 | |
|     'HP': [59,293,60,61,58],
 | |
|     'http': [505,543],
 | |
|     'IBM': [698,63,64,649,65,666,294,66,265,328,67,330,68,458,69,70,71,72,672,73,295,250,605],
 | |
|     'IBM BigFix': [62,457],
 | |
|     'IBM ILOG': [253,255,254,256,252],
 | |
|     'IBM Tivoli': [606,459,76,77,604,460,461,462,463,79],
 | |
|     'IBM WebSphere': [80,82,83,81],
 | |
|     'IBM i': [424,329],
 | |
|     'IDMS': [667,668],
 | |
|     'IIS': [609,490,489,491],
 | |
|     'JBoss': [268,492,493],
 | |
|     'JavaScript': [589,405,406,407,408,409,411,412,413,415,410,414],
 | |
|     'Java': [506,523,584,378,379,380,381,384,382,383,385,386,387,392,393,388,333,389,334,390,391,335,336,394,395,396,397,398,399,400,401,402,403,404],
 | |
|     'KVS': [549,550,551],
 | |
|     'Linux': [576,454,427,428,429,453,430,432,433,434,435,436,431,437],
 | |
|     'MS SQL': [581,121,466,467,465,468,469,470,471,472,473],
 | |
|     'MVS': [577,440,441],
 | |
|     'Microsoft': [99,637,100,101,102,103,104,464,105,108,106,107,109,110,111,112,113,114],
 | |
|     'Oracle': [130,131,129,132,133,135,136,298,137,140,694,141,289,675,142,145,146,143,144,147,567,148,527,281],
 | |
|     'Oracle WebLogic': [600,233],
 | |
|     'Oracle Application Server': [610,494],
 | |
|     'Oracle Database': [134,474,475,478],
 | |
|     'Oracle Hyperion': [607,138,139],
 | |
|     'Oracle WebCenter': [276,495],
 | |
|     'Pascal': [599,346],
 | |
|     'Perl': [585,348,417,349],
 | |
|     'ProjectWise': [161,162],
 | |
|     'Rational': [166,167],
 | |
|     'SAP': [173,175,695,176,676,178,179],
 | |
|     'SAP ERP': [174,476,477],
 | |
|     'SAP NetWeaver': [279,496,177],
 | |
|     'Sybase SQL Server': [190,479,480],
 | |
|     'Sysinternal Tools': [194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212],
 | |
|     'TIBCO': [218,219],
 | |
|     'TIBCO Business Works': [217,481],
 | |
|     'Tivoli': [220,251],
 | |
|     'Tortoise': [221,222],
 | |
|     'Unix': [578,445,579,447,602,590,448,449],
 | |
|     'VB': [368,369],
 | |
|     'VMware': [568,569,229,230,231],
 | |
|     'Visual Basic': [370,371,372],
 | |
|     'WebSphere': [234,285,235,286,284,601,287],
 | |
|     'Windows': [580,238,239,451,452],
 | |
|     'z': [598,608,591]
 | |
| 
 | |
| }
 | |
| 
 | |
| # %%
 | |
| # iterate through the whole training set
 | |
| new_df = train_df.copy()
 | |
| for idx, row in train_df.iterrows():
 | |
|     # we iterate through each rule set, replacing any matching values in the
 | |
|     # list with the first element of the list
 | |
|     for key in rule_set.keys():
 | |
|         id = row['entity_id']
 | |
|         if (id in rule_set[key]):
 | |
|             new_df.loc[idx,('entity_id')] = rule_set[key][0]
 | |
| # %%
 | |
| len(set(new_df['entity_id'].to_list()))
 | |
| 
 | |
| # %%
 | |
| new_df.to_csv('parent_train.csv')
 | |
| 
 | |
| # %%
 | |
| # now do the same for the test data
 | |
| # import training file
 | |
| data_path = '../esAppMod_data_import/test.csv'
 | |
| test_df = pd.read_csv(data_path, skipinitialspace=True)
 | |
| 
 | |
| new_df = test_df.copy()
 | |
| for idx, row in test_df.iterrows():
 | |
|     # we iterate through each rule set, replacing any matching values in the
 | |
|     # list with the first element of the list
 | |
|     for key in rule_set.keys():
 | |
|         id = row['entity_id']
 | |
|         if (id in rule_set[key]):
 | |
|             new_df.loc[idx,('entity_id')] = rule_set[key][0]
 | |
| 
 | |
| # %%
 | |
| new_df
 | |
| 
 | |
| # %%
 | |
| new_df.to_csv('parent_test.csv')
 | |
| # %%
 |