Feat: added classification with number tokens

- added analysis for overall statistics

parent 1b6659a600
commit 1b9c4323c3
@@ -13,6 +13,10 @@ full_df
 # %%
 mdm_list
+
+# %%
+mdm_list = sorted(list((set(full_df['thing'] + full_df['property']))))
+
+# %%
+mdm_list
 # %%
 mask = full_df['pattern'] == 'GE#Flow FGMassFlow'
 full_df[mask]
@@ -1,13 +0,0 @@
# %%
import pandas as pd

# %%
data_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(data_path)

# %%
df

# %%
len(set(df['ships_idx']))
# %%
@@ -0,0 +1,58 @@
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# %%
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)

# %%
df = df[df['MDM']].reset_index(drop=True)

# %%
# we want to print the string length

# print summary stats
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }

# %%
ship_domain_data = df['tag_description'] + df['unit'].fillna('')
ship_domain_array = np.array([len(item) for item in ship_domain_data])
stats = summary_stats(ship_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")

# %%
plt.hist(ship_domain_array, bins=50)
# %%

# %%
platform_domain_data = df['thing'] + df['property']
platform_domain_array = np.array([len(item) for item in platform_domain_data])
stats = summary_stats(platform_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")

# %%
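Aside: summary_stats can be cross-checked against pandas. A minimal sketch with invented lengths (not data from this commit):

    import numpy as np
    import pandas as pd

    lengths = pd.Series([12, 30, 25, 18, 41])  # toy stand-in for the string lengths above
    print(lengths.describe())  # count, mean, std, min, quartiles, max
    print(np.ptp(lengths))     # range = max - min, as in summary_stats
    # caveat: .describe() reports the sample std (ddof=1), while np.std defaults to ddof=0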
@@ -0,0 +1 @@
exports
@@ -0,0 +1,62 @@
# %%
import pandas as pd
import numpy as np

# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['thing'] + full_df['property']))))

# %%
fold = 5
file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
df_bert = pd.read_csv(file_path)

# %%
file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
# file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
df_t5 = pd.read_csv(file_path)
df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)

# %%
df_t5['bert_prediction'] = df_bert['class_prediction']
df_bert['t5_prediction'] = df_t5['class_prediction']

# %%
bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']

# %%
t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])

# %%
sum(t5_correct)/len(t5_correct)

# %%
# replace t5 not in vocab with bert values
t5_correct_modified = t5_correct.copy()
condition = ~df_t5['in_vocab']
t5_correct_modified[condition] = np.array(bert_correct[condition])

# %%
# new replacement correctness
sum(t5_correct_modified)/len(t5_correct_modified)

# %%
# when bert is correct and t5 is wrong
cond_mask = bert_correct & (~t5_correct)
print(sum(cond_mask))
print(df_t5[cond_mask].to_string())

# %%
# when bert is wrong and t5 is correct
cond_mask = (~bert_correct) & (t5_correct)
print(sum(cond_mask))
print(df_bert[cond_mask].to_string())

# %%
# when both are wrong
cond_mask = (~bert_correct) & (~t5_correct)
print(sum(cond_mask))

# %%
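Aside: the grounding step above falls back to BERT wherever the T5 output string is not a valid class (not in mdm_list). A minimal sketch of the same replacement logic on invented booleans:

    import numpy as np
    import pandas as pd

    t5_correct = pd.Series([True, False, False, True])
    bert_correct = pd.Series([True, True, False, True])
    in_vocab = pd.Series([True, False, True, True])  # was the T5 output found in mdm_list?

    grounded = t5_correct.copy()
    grounded[~in_vocab] = np.array(bert_correct[~in_vocab])  # fall back to the BERT verdict
    print(grounded.tolist())  # [True, True, False, True]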
@@ -0,0 +1,72 @@
# %%
import pandas as pd
import numpy as np

# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['thing'] + full_df['property']))))

# %%
def run_mdm(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)
    df_bert = df_bert[df_bert['MDM']].reset_index(drop=True)

    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)

    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']

    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])

    t5_original_accuracy = sum(t5_correct)/len(t5_correct)

    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified).to_csv(f'exports/result_group_{fold}.csv')

    t5_new_accuracy = sum(t5_correct_modified)/len(t5_correct_modified)

    print('original accuracy', t5_original_accuracy)
    print('new accuracy', t5_new_accuracy)

# %%
# this does replacement for the full prediction
def run_full(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)

    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)

    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']

    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])

    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified, name='grounded_pred').to_csv(f'exports/result_group_{fold}.csv')

# %%
for fold in [1,2,3,4,5]:
    run_mdm(fold)
    run_full(fold)
# %%
@@ -0,0 +1,67 @@
,thing,property,ships_idx,tag_name,tag_description,signal_type,min,max,unit,data_type,thing_pattern,property_pattern,pattern,MDM,class_prediction
6,SB1Flow,FOMassFlowTotal,1003,FM6_XI001_Y,AUXILIARY BOILER FUEL OIL TOTAL FLOW RATE,AI,0,0,FLOW,1304.0,SB#Flow,FOMassFlowTotal,SB#Flow FOMassFlowTotal,True,SB1FlowFOMassFlowIn
38,ShipBoiler3,RunningState,1030,BC330,COMPOSITE BOILER FIRING,DI,0,0,NOVALUE,1301.0,ShipBoiler#,RunningState,ShipBoiler# RunningState,True,ShipBoiler1RunningState
61,GeneratorEngine5,CBNonClosed,1003,PMS_5ACBNCL_Y,NO5 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine5RunningState
72,CargoHandling,BoostPp_Port_Current,1018,IT_1400_Y,MP1400 BOOSTER PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,BoostPp_Port_Current,CargoHandling BoostPp_Port_Current,True,CargoHandlingBoostPp_Stbd_Current
81,Navigation,MidPDraft,1018,TL_200002_Y,MID DRAFTP_LV,A,0,0,NOVALUE,1310.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
86,ShipBoiler1,FOInletTemp,1018,AB_000001_Y,BOILER FUEL OIL IN BURNER_TEMP,A,0,0,NOVALUE,1310.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
140,Navigation,MidPDraft,1003,DCM_P3_Y,DRAUGHT MID PS (DRAFT SENSOR),AI,0,0,m ,1304.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
174,ShipBoiler1,FOInletPress,1051,MB.YO.IAS.Q3.40224,BOILER FUEL OIL IN BURNER_PRESS,Analog,0,4,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
200,GeneratorEngine3,VoltageB,1050,MB.KM.IAS.Q3.A40193,NO3 GENERATOR_ENGINE(B) GEN VOLTAGE,AO,0,655,VOLTAGE,1300.0,GeneratorEngine#,VoltageB,GeneratorEngine# VoltageB,True,GeneratorEngine3Voltage
342,EngineRoom,AirTemp,1018,MA_TT8612_Y,MAIN_ENGINE AMBIENT_TEMP,A,0,0,NOVALUE,1310.0,EngineRoom,AirTemp,EngineRoom AirTemp,True,GeneratorEngine1CBTrip
395,GeneratorEngine3,SAPress,1036,MB.KM.IAS.Q2.400121,NO3 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine3WindingTempR
396,MainEngine1,RPM,1051,MB.YO.IAS.Q1.40006,M/E_RPM,Analog,-120,120,RPM,1304.0,MainEngine#,RPM,MainEngine# RPM,True,Shaft1RPM
653,ShipBoiler1,FOInletTemp,1033,CB014,COMPOSITE BOILER FUEL OIL TEMPERATURE,AI,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
731,GeneratorEngine4,CBNonClosed,1003,PMS_4ACBNCL_Y,NO4 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine4CBClosed
745,ShipBoiler1,FOInletPress,1018,AB_000002_Y,BOILER FUEL OIL IN BURNER PRESSURE,A,0,0,PRESSURE,1310.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
783,GeneratorEngine1,LOFilterInletPress,1030,GA069,NO1 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine1LOInletPress
786,GeneratorEngine1,FOFilterInletPress,1030,GA085,NO1 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine1FOInletPress
812,GE1Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400031,GENERATOR_ENGINE FUEL OIL VISCOSITY INDICATION,AO,0,2346,VOLUME FLOW,1304.0,GE#Flow,FOViscosity,GE#Flow FOViscosity,True,GE1FlowFOVolumeFlowIn
813,ME2Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400025,MAIN_ENGINE(P) FUEL OIL VISCOSITY INDICATION,AO,0,2285,VOLUME FLOW,1304.0,ME#Flow,FOViscosity,ME#Flow FOViscosity,True,ME2FlowFOVolumeFlowIn
840,GeneratorEngine1,SAPress,1036,MB.KM.IAS.Q1.400051,NO1 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine1WindingTempR
891,GE1Flow,FOMassFlowIn,1051,MB.YO.IAS.Q2.40103,GENERATOR_ENGINE HFO_FLOW,Analog,0,1800,MASS FLOW,1304.0,GE#Flow,FOMassFlowIn,GE#Flow FOMassFlowIn,True,GE1FlowFGMassFlow
935,ShipBoiler1,FOInletTemp,1051,MB.YO.IAS.Q3.40223,BOILER FUEL OIL IN BURNER_TEMP,Analog,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
951,MainEngine2,CFWInletTemp,1020,MB.YO.IAS.Q1.A400388,MAIN_ENGINE(P) CYLINDER COOL WATER TEMPERATURE INLET,AO,-50,130,TEMPERATURE,1304.0,MainEngine#,CFWInletTemp,MainEngine# CFWInletTemp,True,MainEngine2Cy3CWTemp
1005,GeneratorEngine1,HFOUse,1051,MB.YO.IAS.Q1.10096,G/E_HFUEL OIL USE,Digital,0,1,-,1301.0,GeneratorEngine#,HFOUse,GeneratorEngine# HFOUse,True,MainEngine1HFOUse
1075,ME1Flow,FGMassFlow,1004,MB.YO.IAS.Q2.A400121,LP LPG FUEL P/P FLOW,AI,0,3500,MASS FLOW,1304.0,ME#Flow,FGMassFlow,ME#Flow FGMassFlow,True,ME2FlowFGMassFlow
1116,CargoHandling,LPGComp1MotorCurrent,1004,MB.YO.IAS.Q3.A400281,MP-2100 COMPRESSOR (P) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT3_DWPump_Port_Current
1117,CargoHandling,LPGComp2MotorCurrent,1004,MB.YO.IAS.Q3.A400282,MP-2200 COMPRESSOR (C) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT2_DWPump_Stbd_Current
1118,CargoHandling,LPGComp3MotorCurrent,1004,MB.YO.IAS.Q3.A400283,MP-2300 COMPRESSOR (S) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingBoostPp_Stbd_Current
1174,FuelOilSystem,LFOVolumeSettleTK,1003,LC_XI001_Y,NO2 LIGHT FUEL OIL SETTLING TANK VOLUME,AI,0,999999,VOLUME,1304.0,FuelOilSystem,LFOVolumeSettleTK,FuelOilSystem LFOVolumeSettleTK,True,FuelOilSystemLFOVolumeStorageTK2P
1198,GeneratorEngine4,BearingNDETemp1,1003,GE4_TIAH6_Y,NO4 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine4WindingTempT
1199,GeneratorEngine5,BearingNDETemp1,1003,GE5_TIAH6_Y,NO5 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine5WindingTempT
1200,MainEngine1,LoadPercent,1018,EG_0000005_Y,M/E_LOAD,D,0,0,%,1301.0,MainEngine#,LoadPercent,MainEngine# LoadPercent,True,GeneratorEngine2LoadPercent
1214,GE1TurboCharger1,ExhGasOutletTemp,1003,GE1_TE27_Y,NO1 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger1ExhGasOutletTemp
1226,GE2TurboCharger1,ExhGasOutletTemp,1003,GE2_TE27_Y,NO2 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger2ExhGasOutletTemp
1237,GE3TurboCharger1,ExhGasOutletTemp,1003,GE3_TE27_Y,NO3 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger3ExhGasOutletTemp
1246,GeneratorEngine3,BearingDETemp8,1003,GE3_TE698_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
1247,GeneratorEngine3,BearingDETemp9,1003,GE3_TE699_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1273,GeneratorEngine4,BearingDETemp8,1003,GE4_TE698_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1274,GeneratorEngine4,BearingDETemp9,1003,GE4_TE699_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1280,GeneratorEngine5,BearingDETemp2,1003,GE5_TE692_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP2,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2BearingDETemp6
1281,GeneratorEngine5,BearingDETemp3,1003,GE5_TE693_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP3,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
1282,GeneratorEngine5,BearingDETemp4,1003,GE5_TE694_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP4,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp4
1283,GeneratorEngine5,BearingDETemp5,1003,GE5_TE695_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP5,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
1284,GeneratorEngine5,BearingDETemp6,1003,GE5_TE696_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP6,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1285,GeneratorEngine5,BearingDETemp7,1003,GE5_TE697_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP7,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1286,GeneratorEngine5,BearingDETemp8,1003,GE5_TE698_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2Cy8KnockIntensity
1287,GeneratorEngine5,BearingDETemp9,1003,GE5_TE699_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1298,ME1TurboCharger1,ExhGasInletTemp,1003,AMSI_TT3721A_Y,EXHAUST GAS TEMPERATURE BEFORE TURBOCHARGER 1,AI,0,600,TEMPERATURE,1304.0,ME#TurboCharger#,ExhGasInletTemp,ME#TurboCharger# ExhGasInletTemp,True,ME1TurboCharger1ExhGasOutletTemp
1309,GeneratorEngine2,LOFilterInletPress,1030,GB069,NO2 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine2LOInletPress
1472,GeneratorEngine3,VoltageA,1050,MB.KM.IAS.Q3.A40189,NO3 GENERATOR_ENGINE(A) GEN VOLTAGE,AO,0,654,VOLTAGE,1300.0,GeneratorEngine#,VoltageA,GeneratorEngine# VoltageA,True,GeneratorEngine3Voltage
1524,GeneratorEngine2,FOFilterInletPress,1030,GB085,NO2 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine2FOInletPress
1536,ShipBoiler1,FOInletTemp,1028,MB.KM.IAS.Q2.A400184,OIL TEMPERATURE (4-20MA),AI,0,200,°C,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,GeneratorEngine4WindingTempT
1537,ShipBoiler1,FOInletPress,1028,MB.KM.IAS.Q2.A400185,FUEL OIL PRESSURE (4-20MA),AI,0,40,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,GeneratorEngine4FOInletPress
1594,GeneratorEngine3,LOFilterInletPress,1030,GC069,NO3 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine3LOInletPress
1597,GeneratorEngine3,FOFilterInletPress,1030,GC085,NO3 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine3FOInletPress
1679,GeneratorEngine3,busBarVoltage,1003,PMS_3BUSVOLA_Y,BUS VOLTAGE,AI,0,10000,VOLTAGE,1304.0,GeneratorEngine#,busBarVoltage,GeneratorEngine# busBarVoltage,True,GeneratorEngine1busBarVoltage
1727,GeneratorEngine2,SAPress,1036,MB.KM.IAS.Q1.400086,NO2 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine2WindingTempR
1763,GeneratorEngine5,BearingDETemp1,1003,GE5_TE691_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP1,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
1873,GeneratorEngine5,CBClosed,1003,PMS_5VCBCLED_Y,NO5 GENERATOR_ENGINE MVSB VCB CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBClosed,GeneratorEngine# CBClosed,True,GeneratorEngine5StopState
2034,CargoHandling,CT1_DWPump_Stbd_Current,1018,IT_1101_Y,MP1100 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
2035,CargoHandling,CT2_DWPump_Port_Current,1018,IT_1200_Y,MP1200 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2037,CargoHandling,CT3_DWPump_Stbd_Current,1018,IT_1501_Y,MP1501 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
2038,CargoHandling,CT4_DWPump_Port_Current,1018,IT_1700_Y,MP1700 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2048,GeneratorEngine5,RunningHour,1003,PMS_5GENWHRS_Y,NO5 GENERATOR_ENGINE WORKING HOURS,AI,0,10000,NOVALUE,1304.0,GeneratorEngine#,RunningHour,GeneratorEngine# RunningHour,True,GeneratorEngine4RunningHour
2057,CargoHandling,CT1_DWPump_Port_Current,1018,IT_1100_Y,MP1100 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2079,ShipBoiler1,ExhGasOutletTemp,1003,EG_G02_Y,EXHAUST GAS ECONOMIZER EXHAUST GAS OUTLET TEMPERATURE,AI,0,600,TEMPERATURE,1304.0,ShipBoiler#,ExhGasOutletTemp,ShipBoiler# ExhGasOutletTemp,True,MainEngine1Cy1ExhGasOutletTemp
@@ -0,0 +1,27 @@
type,fold,accuracy
1layer,1,0.8968291528632276
1layer,2,0.8859813084112149
1layer,3,0.9382530120481928
1layer,4,0.9586108468125595
1layer,5,0.8827301878149336
2layer,1,0.9318504495977283
2layer,2,0.8859813084112149
2layer,3,0.9678714859437751
2layer,4,0.9738344433872502
2layer,5,0.9015116811726981
4layer,1,0.9503076194983436
4layer,2,0.9135514018691588
4layer,3,0.9698795180722891
4layer,4,0.9790675547098002
4layer,5,0.907924874026569
6layer,1,0.9522006625650734
6layer,2,0.9093457943925234
6layer,3,0.9678714859437751
6layer,4,0.9814462416745956
6layer,5,0.890975721484196
8layer,1,0.9441552295314718
8layer,2,0.9121495327102803
8layer,3,0.963855421686747
8layer,4,0.9752616555661275
8layer,5,0.907924874026569
@@ -0,0 +1,12 @@
type,fold,accuracy
normal,1,0.9522006625650734
normal,2,0.9093457943925234
normal,3,0.9678714859437751
normal,4,0.9814462416745956
normal,5,0.890975721484196
frozen,1,0.9342167534311405
frozen,2,0.883177570093458
frozen,3,0.963855421686747
frozen,4,0.9705042816365367
frozen,5,0.9051763628034815
@@ -0,0 +1,199 @@
# %%
import pandas as pd
import numpy as np

####################################################################################
# stage 1
# %%
# stage 1a: binary classification
df_stage1a = pd.read_csv('stage1a.csv')
# %%
# desc only
mask = df_stage1a['type'] == 'desc'
df_stage1a[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage1a['type'] == 'desc_unit'
df_stage1a[mask].describe().loc[['mean', 'std']]

# %%
# stage 1b: similarity-based classification
df_stage1b = pd.read_csv('stage1b.csv')
# %%
# desc only
mask = df_stage1b['type'] == 'desc'
df_stage1b[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage1b['type'] == 'desc_unit'
df_stage1b[mask].describe().loc[['mean', 'std']]

# %%
#################################################################################
# stage 2: mapping model

# %%
# stage 2a: mapping by classification
df_stage2a = pd.read_csv('stage2a.csv')
# %%
# desc only
mask = df_stage2a['type'] == 'desc'
df_stage2a[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage2a['type'] == 'desc_unit'
df_stage2a[mask].describe().loc[['mean', 'std']]

# %%
# stage 2b: mapping by seq2seq
df_stage2b = pd.read_csv('stage2b.csv')
# %%
# desc only
mask = df_stage2b['type'] == 'desc'
df_stage2b[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage2b['type'] == 'desc_unit'
df_stage2b[mask].describe().loc[['mean', 'std']]

############################
# frozen encoder
# %%
df = pd.read_csv('frozen_encoder.csv')
# %%
# normal
mask = df['type'] == 'normal'
df[mask].describe().loc[['mean', 'std']]

# %%
# frozen
mask = df['type'] == 'frozen'
df[mask].describe().loc[['mean', 'std']]

# %%
############################
# decoder scaling
# %%
df = pd.read_csv('decoder_scaling.csv')
# %%
# 1 layer
mask = df['type'] == '1layer'
df[mask].describe().loc[['mean', 'std']]

# %%
# 2 layer
mask = df['type'] == '2layer'
df[mask].describe().loc[['mean', 'std']]

# %%
# 4 layer
mask = df['type'] == '4layer'
df[mask].describe().loc[['mean', 'std']]

# %%
# 6 layer
mask = df['type'] == '6layer'
df[mask].describe().loc[['mean', 'std']]

# %%
# 8 layer
mask = df['type'] == '8layer'
df[mask].describe().loc[['mean', 'std']]

# %%
#########################
# compute overall result

# $\frac{1808}{2113} = 0.856$ & $\frac{10692}{10961} = 0.975$ & $\frac{12500}{13074} = 0.956$ \\
# $\frac{1932}{2140} = 0.903$ & $\frac{8304}{8582} = 0.968$ & $\frac{10236}{10722} = 0.955$ \\
# $\frac{1789}{1992} = 0.898$ & $\frac{7613}{7863} = 0.968$ & $\frac{9402}{9855} = 0.954$ \\
# $\frac{1967}{2102} = 0.936$ & $\frac{12929}{13349} = 0.969$ & $\frac{14896}{15451} = 0.964$ \\
# $\frac{1915}{2183} = 0.877$ & $\frac{10381}{10786} = 0.962$ & $\frac{12296}{12969} = 0.948$ \\

# %%
matrix = np.array([
    [1808, 2113, 10692, 10961, 13074],
    [1932, 2140, 8304, 8582, 10722],
    [1789, 1992, 7613, 7863, 9855],
    [1967, 2102, 12929, 13349, 15451],
    [1915, 2183, 10381, 10786, 12969]
])
# %%
relevant_class = matrix[:,0]/matrix[:,1]
print(relevant_class)
print(np.std(relevant_class))

# %%
non_relevant_class = matrix[:,2]/matrix[:,3]
print(non_relevant_class)
print(np.std(non_relevant_class))

# %%
numerator = (matrix[:,0] + matrix[:,2])
denominator = (matrix[:,1] + matrix[:,3])
print(numerator)
print(denominator)  # same as last column
overall = numerator/denominator
print(overall)
print(np.std(overall))

######################
# compute mapping result
# %%

# $\frac{1761}{1808} = 0.974$ \\
# $\frac{1802}{1932} = 0.933$ \\
# $\frac{1760}{1789} = 0.984$ \\
# $\frac{1945}{1967} = 0.989$ \\
# $\frac{1837}{1915} = 0.959$ \\

matrix = np.array([
    [1761, 1808],
    [1802, 1932],
    [1760, 1789],
    [1945, 1967],
    [1837, 1915]
])

# %%
result = matrix[:,0]/matrix[:,1]
print(result)
print(np.mean(result))
print(np.std(result))

# %%
####################################
# compute overall result
# & 1761 & 10692 & $\frac{1761 + 10692}{13074} = 0.953$ \\
# & 1802 & 8304 & $\frac{1802 + 8304}{10722} = 0.943$ \\
# & 1760 & 7613 & $\frac{1760 + 7613}{9855} = 0.951$ \\
# & 1945 & 12929 & $\frac{1945 + 12929}{15451} = 0.963$ \\
# & 1837 & 10381 & $\frac{1837 + 10381}{12969} = 0.942$ \\

matrix = np.array([
    [1761, 10692, 13074],
    [1802, 8304, 10722],
    [1760, 7613, 9855],
    [1945, 12929, 15451],
    [1837, 10381, 12969]
])

# %%
overall = (matrix[:,0] + matrix[:,1])/matrix[:,2]
print(overall)
print(np.mean(overall))
print(np.std(overall))
# %%
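Aside: each "overall" entry is a per-fold micro-average (pooled counts), and np.mean(overall) then macro-averages the five folds. A quick by-hand check of the first commented fraction:

    fold1 = (1808 + 10692) / (2113 + 10961)
    print(round(fold1, 3))  # 0.956, matching $\frac{12500}{13074}$ in the comment above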
@@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.92588,0.74001,0.85440,0.65263
desc,2,0.88733,0.64239,0.87641,0.50701
desc,3,0.90583,0.71429,0.92357,0.58233
desc,4,0.93114,0.70929,0.83312,0.61751
desc,5,0.91171,0.67683,0.88162,0.54924
desc_unit,1,0.95610,0.86301,0.87049,0.85566
desc_unit,2,0.95467,0.88828,0.87421,0.90280
desc_unit,3,0.95403,0.88762,0.87739,0.89809
desc_unit,4,0.96408,0.87636,0.82405,0.93578
desc_unit,5,0.94811,0.85054,0.82543,0.87723
@@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.93162,0.79580,0.76909,0.82442
desc,2,0.92884,0.82440,0.81224,0.83692
desc,3,0.93201,0.83375,0.82434,0.84337
desc,4,0.94259,0.80937,0.73814,0.89581
desc,5,0.92228,0.78397,0.73661,0.83784
desc_unit,1,0.93353,0.79945,0.78018,0.81969
desc_unit,2,0.92184,0.81006,0.78653,0.83505
desc_unit,3,0.91821,0.80513,0.77659,0.83584
desc_unit,4,0.93334,0.78675,0.69648,0.90390
desc_unit,5,0.93084,0.80445,0.76747,0.84517
@@ -0,0 +1,11 @@
type,fold,accuracy
desc,1,0.93706
desc,2,0.88785
desc,3,0.96285
desc,4,0.95861
desc,5,0.89601
desc_unit,1,0.94226
desc_unit,2,0.90561
desc_unit,3,0.96436
desc_unit,4,0.96955
desc_unit,5,0.90289
@@ -0,0 +1,16 @@
type,fold,accuracy
desc,1,0.9427354472314246
desc,2,0.8981308411214953
desc,3,0.9588353413654619
desc,4,0.9633682207421503
desc,5,0.8928080622995878
desc_unit,1,0.9578797917652626
desc_unit,2,0.9088785046728972
desc_unit,3,0.9673694779116466
desc_unit,4,0.9785918173168411
desc_unit,5,0.8918918918918919
@@ -0,0 +1 @@
*.png
@@ -41,13 +41,26 @@ distance_array
 # %%
+plt.rcParams.update({'font.size': 14})  # Adjust the size as needed
 plt.figure(figsize=(8, 6))
 plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
 plt.xlabel("Normalized Levenshtein Distance")
 plt.ylabel("Count")
 plt.tight_layout()
-plt.savefig("histogram.png", dpi=300)
+# Add arrow for increasing dissimilarity
+plt.annotate(
+    "Decreasing Similarity",  # Text label
+    xy=(0.7, 500),            # Arrow end (near the end of x-axis)
+    xytext=(0.4, 500),        # Arrow start (near the middle of x-axis)
+    arrowprops=dict(arrowstyle="->", lw=2, color="black"),  # Arrow style
+    va='center',              # needed to make arrow centered
+    fontsize=14,              # Font size for the text
+    color="black"             # Text color
+)
+# Add arrows and text
+plt.savefig("input_output_similarity.png", dpi=300)
 #

 # %%
 # summary statistics of computed levenshtein distance
 def summary_stats(arr):
@@ -58,12 +58,24 @@ score_list
 # %%
 # plt.hist(score_list, bins=50)
+plt.rcParams.update({'font.size': 14})  # Adjust the size as needed
 plt.figure(figsize=(8, 6))
 plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
 plt.xlabel("Normalized Levenshtein Distance")
 plt.ylabel("Platform Domain Class Count")
+# Add arrow for increasing dissimilarity
+plt.annotate(
+    "Decreasing Similarity",  # Text label
+    xy=(0.7, 70),             # Arrow end (near the end of x-axis)
+    xytext=(0.2, 70),         # Arrow start (near the middle of x-axis)
+    arrowprops=dict(arrowstyle="->", lw=2, color="black"),  # Arrow style
+    va='center',              # needed to make arrow centered
+    fontsize=14,              # Font size for the text
+    color="black"             # Text color
+)
 plt.tight_layout()
-plt.savefig("histogram.png", dpi=300)
+plt.savefig("within_class_similarity.png", dpi=300)
 # %%
 # summary statistics of computed levenshtein distance
 def summary_stats(arr):
@@ -0,0 +1,26 @@
# %%
import pandas as pd

# %%
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
df_in = full_df[full_df['MDM']].reset_index(drop=True)
# %%
df_out = full_df[~full_df['MDM']].reset_index(drop=True)
# %%
label_counts_in = df_in['unit'].value_counts()
print(label_counts_in.to_string())

# %%
label_counts_out = df_out['unit'].value_counts()
print(label_counts_out.to_string())

# %%
label_counts_out['NOVALUE']/len(df_out)

# %%
label_counts_in['NOVALUE']/len(df_in)
# %%
@@ -9,14 +9,19 @@ def run(fold):
     df = pd.read_csv(data_path, skipinitialspace=True)
     p_mdm = df['p_mdm']

-    # data_path = f'../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv'
-    data_path = f'../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
+    data_path = f'../train/modified_t5_decoder_4_layers/mapping_prediction/exports/result_group_{fold}.csv'
     df = pd.read_csv(data_path, skipinitialspace=True)
     actual_mdm = df['MDM']

-    thing_correctness = df['thing'] == df['p_thing']
-    property_correctness = df['property'] == df['p_property']
-    answer = thing_correctness & property_correctness
+    # grounded labels
+    data_path = f'../analysis/delta_analysis/exports/result_group_{fold}.csv'
+    df_grounded = pd.read_csv(data_path, skipinitialspace=True)
+    answer = df_grounded['grounded_pred']
+
+    # original labels
+    # thing_correctness = df['thing'] == df['p_thing']
+    # property_correctness = df['property'] == df['p_property']
+    # answer = thing_correctness & property_correctness

     ##############
     # evaluate relevant-class prediction performance
@@ -53,6 +58,13 @@ def run(fold):
     print(mapping_rate)
     print('size', correct_positive_mdm_and_map, '/', sum(p_mdm & actual_mdm))

+    # evaluate relevant mappings
+    correct_positive_mdm_and_map = sum(p_mdm & actual_mdm & answer)
+    mapping_rate = correct_positive_mdm_and_map / sum(actual_mdm)
+    print('relevant data mapping rate')
+    print(mapping_rate)
+    print('size', correct_positive_mdm_and_map, '/', sum(actual_mdm))
+
     ##############
     # evaluate overall pipeline result
@@ -76,3 +88,5 @@ for fold in [1,2,3,4,5]:
     print('*' * 40)
     run(fold)

+
+# %%
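Aside: the newly added "relevant data mapping rate" is a recall-style figure: items that were both kept as relevant and mapped correctly, over all truly relevant items. A toy sketch with invented booleans:

    import numpy as np

    p_mdm = np.array([True, True, False, True])       # predicted relevant
    actual_mdm = np.array([True, True, True, False])  # actually relevant
    answer = np.array([True, False, True, True])      # mapping judged correct

    rate = np.sum(p_mdm & actual_mdm & answer) / np.sum(actual_mdm)
    print(rate)  # 1/3: one item is relevant, retrieved, and correctly mapped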
@@ -179,8 +179,8 @@ def train(fold):
         # save_strategy="epoch",
         load_best_model_at_end=False,
         learning_rate=1e-5,
-        per_device_train_batch_size=128,
-        per_device_eval_batch_size=128,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
         auto_find_batch_size=False,
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
@@ -180,8 +180,8 @@ def train(fold):
         # save_strategy="epoch",
         load_best_model_at_end=False,
         learning_rate=1e-5,
-        per_device_train_batch_size=128,
-        per_device_eval_batch_size=128,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
         auto_find_batch_size=False,
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,31 @@

********************************************************************************
Fold: 1
Accuracy: 0.94510
F1 Score: 0.94087
Precision: 0.94623
Recall: 0.94510
********************************************************************************
Fold: 2
Accuracy: 0.91682
F1 Score: 0.91698
Precision: 0.92824
Recall: 0.91682
********************************************************************************
Fold: 3
Accuracy: 0.96185
F1 Score: 0.95743
Precision: 0.96001
Recall: 0.96185
********************************************************************************
Fold: 4
Accuracy: 0.97479
F1 Score: 0.97074
Precision: 0.97072
Recall: 0.97479
********************************************************************************
Fold: 5
Accuracy: 0.90563
F1 Score: 0.89532
Precision: 0.90040
Recall: 0.90563
@@ -0,0 +1,289 @@
# %%

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import re
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from tqdm import tqdm

torch.set_float32_matmul_precision('high')


BATCH_SIZE = 128

# %%

# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
# mdm_list = sorted(list((set(full_df['pattern']))))
thing_property = full_df['thing'] + full_df['property']
thing_property = thing_property.to_list()
mdm_list = sorted(list(set(thing_property)))


# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
    id2label[idx] = val
    label2id[val] = idx

# %%
def substitute_and_append_digits(s):
    """
    Finds all digit groups in a string, substitutes them with a <digit> placeholder,
    and appends the extracted digit groups at the end of the string flanked by <digit> markers.

    Args:
        s (str): The input string.

    Returns:
        str: The transformed string.
    """
    # Find all digit groups in the string
    digit_groups = re.findall(r'\d+', s)

    # Substitute digit groups with <digit> placeholder
    substituted_string = re.sub(r'\d+', '<DIGIT>', s)

    # Append extracted digit groups to the end of the string
    appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])

    return substituted_string + appended_digits
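
# e.g. (illustrative, input invented): substitute_and_append_digits("NO1 GE MAIN BRG TEMP8")
# returns "NO<DIGIT> GE MAIN BRG TEMP<DIGIT><DIGIT>1<DIGIT><DIGIT>8<DIGIT>"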

# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
        processed_desc = substitute_and_append_digits(row['tag_description'])
        desc = f"<DESC>{processed_desc}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"

        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
        except ValueError:
            index = -1
        element = {
            'text' : f"{desc}{unit}",
            'label': index,
        }
        output_list.append(element)

    return output_list
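
# (illustrative) each element pairs the tagged text with a class index, e.g.
# {'text': '<DESC>BOILER FUEL OIL IN BURNER_TEMP<DESC><UNIT>NOVALUE<UNIT>', 'label': 42}
# where 42 stands in for the position of that thing+property string in mdm_list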

def create_dataset(fold, mdm_list):
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    test_df = pd.read_csv(data_path, skipinitialspace=True)
    # uncomment for mdm
    # we only use the mdm subset
    test_df = test_df[test_df['MDM']].reset_index(drop=True)

    test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))

    return test_dataset


# %%

# function to perform testing for a given fold
def test(fold):

    test_dataset = create_dataset(fold, mdm_list)

    # prepare tokenizer

    checkpoint_directory = f'../checkpoint_fold_{fold}'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%

    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []


    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])

        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]


    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        print(f'Fold: {fold}', file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)
    # uncomment if you want to predict for all
    # df = df[df['MDM']].reset_index(drop=True)

    label_list = [id2label[id] for id in pred_labels]
    df_out = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    df = pd.concat([df, df_out], axis=1)

    # we can save the t5 generation output here
    df.to_csv(f"exports/result_group_{fold}.csv", index=False)


# %%
# reset file before writing to it
with open("output.txt", "w") as f:
    print('', file=f)

for fold in [1,2,3,4,5]:
    test(fold)
@ -0,0 +1,241 @@
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# from datasets import load_from_disk
|
||||||
|
import os
|
||||||
|
|
||||||
|
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||||
|
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||||
|
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
DataCollatorWithPadding,
|
||||||
|
Trainer,
|
||||||
|
EarlyStoppingCallback,
|
||||||
|
TrainingArguments
|
||||||
|
)
|
||||||
|
import evaluate
|
||||||
|
import re
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
# import matplotlib.pyplot as plt
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# we need to create the mdm_list
|
||||||
|
# import the full mdm-only file
|
||||||
|
data_path = '../../data_import/exports/data_mapping_mdm.csv'
|
||||||
|
full_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
# rather than use pattern, we use the real thing and property
|
||||||
|
# mdm_list = sorted(list((set(full_df['pattern']))))
|
||||||
|
thing_property = full_df['thing'] + full_df['property']
|
||||||
|
thing_property = thing_property.to_list()
|
||||||
|
mdm_list = sorted(list(set(thing_property)))
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
id2label = {}
|
||||||
|
label2id = {}
|
||||||
|
for idx, val in enumerate(mdm_list):
|
||||||
|
id2label[idx] = val
|
||||||
|
label2id[val] = idx
|
||||||
|
|
||||||
|
# %%
|
||||||
|
def substitute_and_append_digits(s):
|
||||||
|
"""
|
||||||
|
Finds all digit groups in a string, substitutes them with a <digit> placeholder,
|
||||||
|
and appends the extracted digit groups at the end of the string flanked by <digit> markers.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
s (str): The input string.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The transformed string.
|
||||||
|
"""
|
||||||
|
# Find all digit groups in the string
|
||||||
|
digit_groups = re.findall(r'\d+', s)
|
||||||
|
|
||||||
|
# Substitute digit groups with <digit> placeholder
|
||||||
|
substituted_string = re.sub(r'\d+', '<DIGIT>', s)
|
||||||
|
|
||||||
|
# Append extracted digit groups to the end of the string
|
||||||
|
appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])
|
||||||
|
|
||||||
|
return substituted_string + appended_digits
|
||||||
|
|
||||||
|
|
||||||
|
# outputs a list of dictionaries
|
||||||
|
# processes dataframe into lists of dictionaries
|
||||||
|
# each element maps input to output
|
||||||
|
# input: tag_description
|
||||||
|
# output: class label
|
||||||
|
def process_df_to_dict(df, mdm_list):
|
||||||
|
output_list = []
|
||||||
|
for _, row in df.iterrows():
|
||||||
|
processed_desc = substitute_and_append_digits(row['tag_description'])
|
||||||
|
desc = f"<DESC>{processed_desc}<DESC>"
|
||||||
|
unit = f"<UNIT>{row['unit']}<UNIT>"
|
||||||
|
pattern = f"{row['thing'] + row['property']}"
|
||||||
|
try:
|
||||||
|
index = mdm_list.index(pattern)
|
||||||
|
except ValueError:
|
||||||
|
print("Error: value not found in MDM list")
|
||||||
|
index = -1
|
||||||
|
element = {
|
||||||
|
'text' : f"{desc}{unit}",
|
||||||
|
'label': index,
|
||||||
|
}
|
||||||
|
output_list.append(element)
|
||||||
|
|
||||||
|
return output_list
|
||||||
|
|
||||||
|
def create_split_dataset(fold, mdm_list):
    # train
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
        'validation': Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
    })
    return combined_data
# %%
# function to perform training for a given fold
def train(fold):

    save_path = f'checkpoint_fold_{fold}'
    split_datasets = create_split_dataset(fold, mdm_list)

    # prepare tokenizer
    # model_checkpoint = "distilbert/distilbert-base-uncased"
    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # for classification the tokenizer only needs the text;
        # the 'label' column is passed through untouched
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map applies the function to each batch of rows in the dataset
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()
    # %%
    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # id2label and label2id were already created above at module level
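    # %%
    # For intuition (toy numbers, not real logits): np.argmax collapses the
    # (n_examples, n_classes) logit matrix to predicted class ids, e.g.
    #   np.argmax([[0.1, 2.0, 0.3, 0.1],
    #              [1.5, 0.2, 0.1, 0.4],
    #              [0.0, 0.1, 0.2, 3.0]], axis=1) -> [1, 0, 3]
    # which metric.compute then compares against the reference labels.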
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! resize embeddings after extending the tokenizer vocab
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-4,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=80,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
for fold in [1,2,3,4,5]:
    print(fold)
    train(fold)

# %%
@@ -0,0 +1 @@
+exports
@ -1,31 +1,31 @@
|
||||||
|
|
||||||
********************************************************************************
|
********************************************************************************
|
||||||
Fold: 1
|
Fold: 1
|
||||||
Accuracy: 0.78277
|
Accuracy: 0.93706
|
||||||
F1 Score: 0.73629
|
F1 Score: 0.93286
|
||||||
Precision: 0.71419
|
Precision: 0.93920
|
||||||
Recall: 0.78277
|
Recall: 0.93706
|
||||||
********************************************************************************
|
********************************************************************************
|
||||||
Fold: 2
|
Fold: 2
|
||||||
Accuracy: 0.78598
|
Accuracy: 0.88785
|
||||||
F1 Score: 0.73708
|
F1 Score: 0.88726
|
||||||
Precision: 0.71578
|
Precision: 0.90566
|
||||||
Recall: 0.78598
|
Recall: 0.88785
|
||||||
********************************************************************************
|
********************************************************************************
|
||||||
Fold: 3
|
Fold: 3
|
||||||
Accuracy: 0.79819
|
Accuracy: 0.96285
|
||||||
F1 Score: 0.74411
|
F1 Score: 0.95930
|
||||||
Precision: 0.71749
|
Precision: 0.96310
|
||||||
Recall: 0.79819
|
Recall: 0.96285
|
||||||
********************************************************************************
|
********************************************************************************
|
||||||
Fold: 4
|
Fold: 4
|
||||||
Accuracy: 0.79543
|
Accuracy: 0.95861
|
||||||
F1 Score: 0.73902
|
F1 Score: 0.95320
|
||||||
Precision: 0.71094
|
Precision: 0.95615
|
||||||
Recall: 0.79543
|
Recall: 0.95861
|
||||||
********************************************************************************
|
********************************************************************************
|
||||||
Fold: 5
|
Fold: 5
|
||||||
Accuracy: 0.77279
|
Accuracy: 0.89601
|
||||||
F1 Score: 0.72098
|
F1 Score: 0.88613
|
||||||
Precision: 0.69817
|
Precision: 0.89038
|
||||||
Recall: 0.77279
|
Recall: 0.89601
|
||||||
|
|
|
@@ -235,6 +235,24 @@ def test(fold):
     print(f'Precision: {precision:.5f}', file=f)
     print(f'Recall: {recall:.5f}', file=f)
 
+    # export result
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df[df['MDM']].reset_index(drop=True)
+
+    label_list = [id2label[id] for id in pred_labels]
+    df_out = pd.DataFrame({
+        'class_prediction': pd.Series(label_list)
+    })
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
 # %%
 # reset file before writing to it
@@ -176,7 +176,7 @@ def train(fold):
         logging_strategy="epoch",
         # save_strategy="epoch",
         load_best_model_at_end=False,
-        learning_rate=1e-3,
+        learning_rate=1e-4,
         per_device_train_batch_size=64,
         per_device_eval_batch_size=64,
         auto_find_batch_size=False,
train/classification_bert_complete_desc_unit/classification_prediction/.gitignore
@@ -0,0 +1 @@
+exports
@@ -1,31 +1,31 @@
 
 ********************************************************************************
 Fold: 1
-Accuracy: 0.78940
-F1 Score: 0.73284
-Precision: 0.70389
-Recall: 0.78940
+Accuracy: 0.15229
+F1 Score: 0.07923
+Precision: 0.05929
+Recall: 0.15229
 ********************************************************************************
 Fold: 2
-Accuracy: 0.78411
-F1 Score: 0.73695
-Precision: 0.71914
-Recall: 0.78411
+Accuracy: 0.18075
+F1 Score: 0.09625
+Precision: 0.07243
+Recall: 0.18075
 ********************************************************************************
 Fold: 3
-Accuracy: 0.80522
-F1 Score: 0.75406
-Precision: 0.72847
-Recall: 0.80522
+Accuracy: 0.19493
+F1 Score: 0.10903
+Precision: 0.08332
+Recall: 0.19493
 ********************************************************************************
 Fold: 4
-Accuracy: 0.80780
-F1 Score: 0.75361
-Precision: 0.72432
-Recall: 0.80780
+Accuracy: 0.13190
+F1 Score: 0.05761
+Precision: 0.04173
+Recall: 0.13190
 ********************************************************************************
 Fold: 5
-Accuracy: 0.76958
-F1 Score: 0.71912
-Precision: 0.69965
-Recall: 0.76958
+Accuracy: 0.15198
+F1 Score: 0.07383
+Precision: 0.05411
+Recall: 0.15198
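Note: the collapse from roughly 0.78 to roughly 0.15 accuracy in this file most likely reflects a change of evaluation scope rather than a model regression: as the hunks below show, the MDM-subset filter is commented out, so predictions are now scored against every test row, including rows with no valid MDM target.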
@@ -80,8 +80,9 @@ def process_df_to_dict(df, mdm_list):
 def create_dataset(fold, mdm_list):
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     test_df = pd.read_csv(data_path, skipinitialspace=True)
+    # uncomment for mdm
     # we only use the mdm subset
-    test_df = test_df[test_df['MDM']].reset_index(drop=True)
+    # test_df = test_df[test_df['MDM']].reset_index(drop=True)
 
     test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
@@ -237,6 +238,22 @@ def test(fold):
     print(f'Precision: {precision:.5f}', file=f)
     print(f'Recall: {recall:.5f}', file=f)
 
+    # export result
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    # uncomment if you want to predict for all
+    # df = df[df['MDM']].reset_index(drop=True)
+
+    label_list = [id2label[id] for id in pred_labels]
+    df_out = pd.DataFrame({
+        'class_prediction': pd.Series(label_list)
+    })
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
 # %%
 # reset file before writing to it
@@ -177,7 +177,7 @@ def train(fold):
         logging_strategy="epoch",
         # save_strategy="epoch",
         load_best_model_at_end=False,
-        learning_rate=1e-5,
+        learning_rate=1e-4,
         per_device_train_batch_size=64,
         per_device_eval_batch_size=64,
         auto_find_batch_size=False,
@@ -202,7 +202,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -76,7 +76,7 @@ class Inference():
             text_target=target,
             max_length=max_length,
             return_tensors="pt",
-            padding="max_length",
+            padding='max_length',
             truncation=True,
         )
         return model_inputs
@@ -100,7 +100,7 @@ class Inference():
 
     def generate(self):
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
         MAX_GENERATE_LENGTH = 128
 
         pred_generations = []
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9536204448651207
+Accuracy for fold 2: 0.8845794392523364
+Accuracy for fold 3: 0.9618473895582329
+Accuracy for fold 4: 0.9576593720266413
+Accuracy for fold 5: 0.8928080622995878
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9588263132986276
+Accuracy for fold 2: 0.9182242990654206
+Accuracy for fold 3: 0.9633534136546185
+Accuracy for fold 4: 0.9809705042816366
+Accuracy for fold 5: 0.8891433806688044
@@ -26,7 +26,7 @@ def infer_and_select(fold):
     # run inference
     # checkpoint
    # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
     # Use glob to find matching paths
     # path is usually checkpoint_fold_1/checkpoint-<step number>
     # we are guaranteed to save only 1 checkpoint from training
@@ -70,5 +70,6 @@ def infer_and_select(fold):
 with open("output.txt", "w") as f:
     print('', file=f)
 
+# for fold in [1,2,3,4,5]:
 for fold in [1,2,3,4,5]:
     infer_and_select(fold)
@@ -2,7 +2,6 @@
 # from datasets import load_from_disk
 import os
-import glob
 
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
@@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 
 import torch
-from safetensors.torch import load_file
-
-from transformers.models.t5.modeling_t5 import T5Block
 from transformers import (
-    T5Config,
     T5TokenizerFast,
     AutoModelForSeq2SeqLM,
     DataCollatorForSeq2Seq,
     Seq2SeqTrainer,
     EarlyStoppingCallback,
-    Seq2SeqTrainingArguments,
-    T5ForConditionalGeneration,
-    T5Model
+    Seq2SeqTrainingArguments
 )
 import evaluate
 import numpy as np
@@ -35,23 +27,13 @@ from datasets import Dataset, DatasetDict
 
 torch.set_float32_matmul_precision('high')
 
-
-# %%
-
-# model_checkpoint = "t5-small"
-# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-# model.config
-
-# %%
-
 # outputs a list of dictionaries
 def process_df_to_dict(df):
     output_list = []
     for _, row in df.iterrows():
         desc = f"<DESC>{row['tag_description']}<DESC>"
-        unit = f"<UNIT>{row['unit']}<UNIT>"
         element = {
-            'input' : f"{desc}{unit}",
+            'input' : f"{desc}",
             'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
         }
         output_list.append(element)
@@ -77,11 +59,12 @@ def create_split_dataset(fold):
 
 # function to perform training for a given fold
 def train(fold):
-    save_path = f'checkpoint_fold_{fold}b'
+    save_path = f'checkpoint_fold_{fold}'
     split_datasets = create_split_dataset(fold)
 
     # prepare tokenizer
-    model_checkpoint = "t5-small"
+    model_checkpoint = "t5-base"
     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
     # Define additional special tokens
     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@@ -101,7 +84,7 @@ def train(fold):
             text_target=target,
             max_length=max_length,
             truncation=True,
-            padding="max_length"
+            padding=True
         )
         return model_inputs
@@ -119,52 +102,10 @@ def train(fold):
     # device_map set to auto to force it to load contiguous weights
     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
-    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
-    # # Use glob to find matching paths
-    # # path is usually checkpoint_fold_1/checkpoint-<step number>
-    # # we are guaranteed to save only 1 checkpoint from training
-    # pattern = 'checkpoint-*'
-    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
-    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
-    # # Load the checkpoint
-    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
-    # checkpoint = load_file(checkpoint_path)
-    # # Filter out weights related to the classification head
-    # # given name format: t5.encoder.embed_tokens.weight
-    # # we want: encoder.embed.tokens.weight
-    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
-
-    pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-
-    # Access the decoder stack
-    # config = T5Config("t5-small")
-    config = pretrained_model.config
-    config.num_layers = 6
-    config.num_decoder_layers = 3  # set new decoder layer count
-
-    model = T5ForConditionalGeneration(config)
-
-    model.shared = pretrained_model.shared
-    model.encoder = pretrained_model.encoder
-
-    pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
-    for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]):
-        model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights
-
-    # print number of decoder blocks
-    print(f'Number of decoder blocks: {len(model.decoder.block)}')
-    print(f'num_layers: {model.config.num_layers}')
-    print(f'num_decoder_layers: {model.config.num_decoder_layers}')
-
-    # change the token embedding size to match the shape
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
     model.resize_token_embeddings(len(tokenizer))
 
     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
     metric = evaluate.load("sacrebleu")
@@ -199,7 +140,7 @@ def train(fold):
     # Generation Config
     # from transformers import GenerationConfig
     gen_config = model.generation_config
-    gen_config.max_length = 128
+    gen_config.max_length = 64
 
     # compile
     # model = torch.compile(model, backend="inductor", dynamic=True)
@@ -222,7 +163,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
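For reference, the long block removed above implemented a decoder-truncation experiment: instantiate a T5 whose config asks for fewer decoder layers, then copy the first k pretrained decoder blocks into it. A minimal sketch of that pattern, reconstructed from the removed lines (t5-small with 3 decoder blocks, as in the old code; this is not part of the commit's final state):

from transformers import T5ForConditionalGeneration

# build a t5-small variant that keeps only the first 3 decoder blocks
pretrained = T5ForConditionalGeneration.from_pretrained("t5-small")
config = pretrained.config
config.num_decoder_layers = 3  # the new model's decoder gets 3 blocks

model = T5ForConditionalGeneration(config)
model.shared = pretrained.shared    # reuse shared token embeddings
model.encoder = pretrained.encoder  # reuse the full pretrained encoder
for i, block in enumerate(pretrained.decoder.block[:config.num_decoder_layers]):
    model.decoder.block[i].load_state_dict(block.state_dict())

The commit replaces this with a plain AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) load.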
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log/
@@ -76,7 +76,7 @@ class Inference():
             text_target=target,
             max_length=max_length,
             return_tensors="pt",
-            padding="max_length",
+            padding='max_length',
             truncation=True,
         )
         return model_inputs
@@ -100,7 +100,7 @@ class Inference():
 
     def generate(self):
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
         MAX_GENERATE_LENGTH = 128
 
         pred_generations = []
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9697113109323237
+Accuracy for fold 2: 0.9
+Accuracy for fold 3: 0.9613453815261044
+Accuracy for fold 4: 0.9686013320647003
+Accuracy for fold 5: 0.8932661475034357
@@ -6,13 +6,14 @@ from inference import Inference
 
 checkpoint_directory = '../'
 
-BATCH_SIZE = 512
+BATCH_SIZE = 128
 
 def infer_and_select(fold):
     print(f"Inference for fold {fold}")
     # import test data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
+    # uncomment for mdm only
     df = df[df['MDM']].reset_index(drop=True)
 
     # get target data
@@ -26,7 +27,7 @@ def infer_and_select(fold):
     # run inference
     # checkpoint
     # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
     # Use glob to find matching paths
     # path is usually checkpoint_fold_1/checkpoint-<step number>
     # we are guaranteed to save only 1 checkpoint from training
@@ -2,7 +2,6 @@
 # from datasets import load_from_disk
 import os
-import glob
 
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
@@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 
 import torch
-from safetensors.torch import load_file
-
-from transformers.models.t5.modeling_t5 import T5Block
 from transformers import (
-    T5Config,
     T5TokenizerFast,
     AutoModelForSeq2SeqLM,
     DataCollatorForSeq2Seq,
     Seq2SeqTrainer,
     EarlyStoppingCallback,
-    Seq2SeqTrainingArguments,
-    T5ForConditionalGeneration,
-    T5Model
+    Seq2SeqTrainingArguments
 )
 import evaluate
 import numpy as np
@@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict
 
 torch.set_float32_matmul_precision('high')
 
-
-# %%
-
-# model_checkpoint = "t5-small"
-# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-# model.config
-
-# %%
-
 # outputs a list of dictionaries
 def process_df_to_dict(df):
     output_list = []
@@ -77,11 +60,12 @@ def create_split_dataset(fold):
 
 # function to perform training for a given fold
 def train(fold):
-    save_path = f'checkpoint_fold_{fold}b'
+    save_path = f'checkpoint_fold_{fold}'
     split_datasets = create_split_dataset(fold)
 
     # prepare tokenizer
-    model_checkpoint = "t5-small"
+    model_checkpoint = "t5-base"
    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
     # Define additional special tokens
     additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@@ -101,7 +85,7 @@ def train(fold):
             text_target=target,
             max_length=max_length,
             truncation=True,
-            padding="max_length"
+            padding=True
         )
         return model_inputs
@@ -119,52 +103,10 @@ def train(fold):
     # device_map set to auto to force it to load contiguous weights
     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
-    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
-    # # Use glob to find matching paths
-    # # path is usually checkpoint_fold_1/checkpoint-<step number>
-    # # we are guaranteed to save only 1 checkpoint from training
-    # pattern = 'checkpoint-*'
-    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
-    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
-    # # Load the checkpoint
-    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
-    # checkpoint = load_file(checkpoint_path)
-    # # Filter out weights related to the classification head
-    # # given name format: t5.encoder.embed_tokens.weight
-    # # we want: encoder.embed.tokens.weight
-    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
-
-    pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-
-    # Access the decoder stack
-    # config = T5Config("t5-small")
-    config = pretrained_model.config
-    config.num_layers = 6
-    config.num_decoder_layers = 12  # set new decoder layer count
-
-    model = T5ForConditionalGeneration(config)
-
-    model.shared = pretrained_model.shared
-    model.encoder = pretrained_model.encoder
-
-    pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
-    for i, layer in enumerate(pretrained_decoder_weights):
-        model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights
-
-    # print number of decoder blocks
-    print(f'Number of decoder blocks: {len(model.decoder.block)}')
-    print(f'num_layers: {model.config.num_layers}')
-    print(f'num_decoder_layers: {model.config.num_decoder_layers}')
-
-    # change the token embedding size to match the shape
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
     model.resize_token_embeddings(len(tokenizer))
 
     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
     metric = evaluate.load("sacrebleu")
@@ -199,10 +141,11 @@ def train(fold):
     # Generation Config
     # from transformers import GenerationConfig
     gen_config = model.generation_config
-    gen_config.max_length = 128
+    gen_config.max_length = 64
 
     # compile
     # model = torch.compile(model, backend="inductor", dynamic=True)
+    # model = torch.compile(model)
 
 
     # Trainer
@@ -210,10 +153,10 @@ def train(fold):
     args = Seq2SeqTrainingArguments(
         f"{save_path}",
         # eval_strategy="epoch",
+        save_strategy="epoch",
         eval_strategy="no",
         logging_dir="tensorboard-log",
         logging_strategy="epoch",
-        # save_strategy="epoch",
         load_best_model_at_end=False,
         learning_rate=1e-3,
         per_device_train_batch_size=64,
@@ -222,12 +165,13 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
         generation_config=gen_config,
         remove_unused_columns=False,
+        warmup_steps=400
     )
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log/
@@ -76,7 +76,7 @@ class Inference():
             text_target=target,
             max_length=max_length,
             return_tensors="pt",
-            padding="max_length",
+            padding='max_length',
             truncation=True,
         )
         return model_inputs
@@ -100,7 +100,7 @@ class Inference():
 
     def generate(self):
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
         MAX_GENERATE_LENGTH = 128
 
         pred_generations = []
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.934690014197823
+Accuracy for fold 2: 0.9023364485981309
+Accuracy for fold 3: 0.9643574297188755
+Accuracy for fold 4: 0.9700285442435775
+Accuracy for fold 5: 0.8941823179111315
@@ -26,7 +26,7 @@ def infer_and_select(fold):
     # run inference
     # checkpoint
     # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
     # Use glob to find matching paths
     # path is usually checkpoint_fold_1/checkpoint-<step number>
     # we are guaranteed to save only 1 checkpoint from training
@@ -2,7 +2,6 @@
 # from datasets import load_from_disk
 import os
-import glob
 
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
@@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 
 import torch
-from safetensors.torch import load_file
-
-from transformers.models.t5.modeling_t5 import T5Block
 from transformers import (
-    T5Config,
     T5TokenizerFast,
     AutoModelForSeq2SeqLM,
     DataCollatorForSeq2Seq,
     Seq2SeqTrainer,
     EarlyStoppingCallback,
-    Seq2SeqTrainingArguments,
-    T5ForConditionalGeneration,
-    T5Model
+    Seq2SeqTrainingArguments
 )
 import evaluate
 import numpy as np
@@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict
 
 torch.set_float32_matmul_precision('high')
 
-
-# %%
-
-# model_checkpoint = "t5-small"
-# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-# model.config
-
-# %%
-
 # outputs a list of dictionaries
 def process_df_to_dict(df):
     output_list = []
@@ -77,10 +60,11 @@ def create_split_dataset(fold):
 
 # function to perform training for a given fold
 def train(fold):
-    save_path = f'checkpoint_fold_{fold}b'
+    save_path = f'checkpoint_fold_{fold}'
     split_datasets = create_split_dataset(fold)
 
     # prepare tokenizer
+
     model_checkpoint = "t5-small"
     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
     # Define additional special tokens
@@ -101,7 +85,7 @@ def train(fold):
             text_target=target,
             max_length=max_length,
             truncation=True,
-            padding="max_length"
+            padding=True
         )
         return model_inputs
@@ -119,52 +103,10 @@ def train(fold):
     # device_map set to auto to force it to load contiguous weights
     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
-    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
-    # # Use glob to find matching paths
-    # # path is usually checkpoint_fold_1/checkpoint-<step number>
-    # # we are guaranteed to save only 1 checkpoint from training
-    # pattern = 'checkpoint-*'
-    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
-    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
-    # # Load the checkpoint
-    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
-    # checkpoint = load_file(checkpoint_path)
-    # # Filter out weights related to the classification head
-    # # given name format: t5.encoder.embed_tokens.weight
-    # # we want: encoder.embed.tokens.weight
-    # t5_weights = {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
-
-    pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-
-    # Access the decoder stack
-    # config = T5Config("t5-small")
-    config = pretrained_model.config
-    config.num_layers = 6
-    config.num_decoder_layers = 9  # set new decoder layer count
-
-    model = T5ForConditionalGeneration(config)
-
-    model.shared = pretrained_model.shared
-    model.encoder = pretrained_model.encoder
-
-    pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
-    for i, layer in enumerate(pretrained_decoder_weights):
-        model.decoder.block[i].load_state_dict(layer)  # Load pretrained weights
-
-    # print number of decoder blocks
-    print(f'Number of decoder blocks: {len(model.decoder.block)}')
-    print(f'num_layers: {model.config.num_layers}')
-    print(f'num_decoder_layers: {model.config.num_decoder_layers}')
-
-    # change the token embedding size to match the shape
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
     model.resize_token_embeddings(len(tokenizer))
 
     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
     metric = evaluate.load("sacrebleu")
@@ -199,7 +141,7 @@ def train(fold):
     # Generation Config
     # from transformers import GenerationConfig
     gen_config = model.generation_config
-    gen_config.max_length = 128
+    gen_config.max_length = 64
 
     # compile
     # model = torch.compile(model, backend="inductor", dynamic=True)
@@ -215,14 +157,14 @@ def train(fold):
         logging_strategy="epoch",
         # save_strategy="epoch",
         load_best_model_at_end=False,
-        learning_rate=1e-3,
+        learning_rate=1e-4,
         per_device_train_batch_size=64,
         per_device_eval_batch_size=64,
         auto_find_batch_size=False,
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -1,6 +1,6 @@
 
-Accuracy for fold 1: 0.9455750118315192
-Accuracy for fold 2: 0.8864485981308411
-Accuracy for fold 3: 0.9558232931726908
-Accuracy for fold 4: 0.9686013320647003
-Accuracy for fold 5: 0.896930829134219
+Accuracy for fold 1: 0.9427354472314246
+Accuracy for fold 2: 0.8981308411214953
+Accuracy for fold 3: 0.9588353413654619
+Accuracy for fold 4: 0.9633682207421503
+Accuracy for fold 5: 0.8928080622995878
@@ -157,13 +157,13 @@ def train(fold):
         # save_strategy="epoch",
         load_best_model_at_end=False,
         learning_rate=1e-3,
-        per_device_train_batch_size=128,
-        per_device_eval_batch_size=128,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
         auto_find_batch_size=False,
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -1,6 +1,6 @@
 
-Accuracy for fold 1: 0.9522006625650734
-Accuracy for fold 2: 0.9093457943925234
-Accuracy for fold 3: 0.9678714859437751
-Accuracy for fold 4: 0.9814462416745956
-Accuracy for fold 5: 0.890975721484196
+Accuracy for fold 1: 0.9578797917652626
+Accuracy for fold 2: 0.9088785046728972
+Accuracy for fold 3: 0.9673694779116466
+Accuracy for fold 4: 0.9785918173168411
+Accuracy for fold 5: 0.8918918918918919
@@ -13,7 +13,8 @@ def infer_and_select(fold):
     # import test data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
-    df = df[df['MDM']].reset_index(drop=True)
+    # note: we need to uncomment this for overall evaluation
+    # df = df[df['MDM']].reset_index(drop=True)
 
     # get target data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
@@ -164,7 +164,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -1,6 +0,0 @@
-
-Accuracy for fold 1: 0.9403691433980123
-Accuracy for fold 2: 0.9046728971962616
-Accuracy for fold 3: 0.9678714859437751
-Accuracy for fold 4: 0.9695528068506185
-Accuracy for fold 5: 0.902427851580394
@@ -222,7 +222,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -222,7 +222,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -1,2 +0,0 @@
-checkpoint*
-tensorboard-log
@@ -1,6 +0,0 @@
-
-Accuracy for fold 1: 0.9427354472314246
-Accuracy for fold 2: 0.9098130841121496
-Accuracy for fold 3: 0.964859437751004
-Accuracy for fold 4: 0.9719314938154139
-Accuracy for fold 5: 0.9070087036188731
@@ -222,7 +222,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -222,7 +222,7 @@ def train(fold):
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
@@ -1,2 +0,0 @@
-checkpoint*
-tensorboard-log
@@ -1,6 +0,0 @@
-
-Accuracy for fold 1: 0.9441552295314718
-Accuracy for fold 2: 0.9121495327102803
-Accuracy for fold 3: 0.963855421686747
-Accuracy for fold 4: 0.9752616555661275
-Accuracy for fold 5: 0.907924874026569
@@ -1,28 +1,14 @@
 #!/bin/bash
 
-cd hybrid_t5_complete_desc_unit
-micromamba run -n hug accelerate launch train_encoder.py
-micromamba run -n hug accelerate launch train_decoder.py
+cd mapping_t5-base_desc
+micromamba run -n hug accelerate launch train.py
 cd ..
 
-cd hybrid_t5_pattern_desc_unit
-micromamba run -n hug accelerate launch train_encoder.py
-micromamba run -n hug accelerate launch train_decoder.py
+cd mapping_t5-base_desc_unit
+micromamba run -n hug accelerate launch train.py
 cd ..
 
-# cd classification_bert_complete_desc
-# micromamba run -n hug accelerate launch train.py
-# cd ..
-
-# cd classification_bert_complete_desc_unit
-# micromamba run -n hug accelerate launch train.py
-# cd ..
-
-# cd classification_bert_complete_desc_unit_name
-# micromamba run -n hug accelerate launch train.py
-# cd ..
-
 # cd mapping_t5_complete_desc
 # micromamba run -n hug accelerate launch train.py
 # cd ..
@@ -31,6 +17,31 @@ cd ..
 # micromamba run -n hug accelerate launch train.py
 # cd ..
 #
-# cd mapping_t5_complete_name_desc_unit
+# cd frozen_t5_encoder
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_1_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_2_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_4_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_8_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd classification_bert_complete_desc
 # micromamba run -n hug accelerate launch train.py
 # cd ..
+#
+# cd classification_bert_complete_desc_unit
+# micromamba run -n hug accelerate launch train.py
+# cd ..