Feat: added classification with number tokens
- added analysis for overall statistics
This commit is contained in: parent 1b6659a600, commit 1b9c4323c3
@ -13,6 +13,10 @@ full_df
# %%
mdm_list

# %%
mdm_list = sorted(list((set(full_df['thing'] + full_df['property']))))
# %%
mdm_list
# %%
mask = full_df['pattern'] == 'GE#Flow FGMassFlow'
full_df[mask]
@ -1,13 +0,0 @@
# %%
import pandas as pd

# %%
data_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(data_path)

# %%
df

# %%
len(set(df['ships_idx']))
# %%
@ -0,0 +1,58 @@
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# %%
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)

# %%
df = df[df['MDM']].reset_index(drop=True)

# %%
# we want to print the string length

# print summary stats
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }

# %%
ship_domain_data = df['tag_description'] + df['unit'].fillna('')

ship_domain_array = np.array([len(item) for item in ship_domain_data])

stats = summary_stats(ship_domain_array)

for key, value in stats.items():
    print(f"{key}: {value}")


# %%
plt.hist(ship_domain_array, bins=50)
# %%

# %%
platform_domain_data = df['thing'] + df['property']

platform_domain_array = np.array([len(item) for item in platform_domain_data])

stats = summary_stats(platform_domain_array)

for key, value in stats.items():
    print(f"{key}: {value}")


# %%
@ -0,0 +1 @@
exports
@ -0,0 +1,62 @@
# %%
import pandas as pd
import numpy as np

# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['thing'] + full_df['property']))))


# %%
fold = 5
file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
df_bert = pd.read_csv(file_path)
# %%
file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
# file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
df_t5 = pd.read_csv(file_path)
df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)

# %%
df_t5['bert_prediction'] = df_bert['class_prediction']
df_bert['t5_prediction'] = df_t5['class_prediction']
# %%
bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
# %%
t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])

# %%
sum(t5_correct)/len(t5_correct)

# %%
# replace t5 not in vocab with bert values
t5_correct_modified = t5_correct.copy()
condition = ~df_t5['in_vocab']
t5_correct_modified[condition] = np.array(bert_correct[condition])

# %%
# new replacement correctness
sum(t5_correct_modified)/len(t5_correct_modified)
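
# %%
# Minimal illustration of the fallback above (toy values, not taken from the
# exported results): the BERT outcome is only swapped in where the T5
# prediction falls outside the known thing+property vocabulary.
toy_t5_correct = pd.Series([True, False, False])
toy_bert_correct = pd.Series([True, True, False])
toy_in_vocab = pd.Series([True, False, True])
toy_grounded = toy_t5_correct.copy()
toy_grounded[~toy_in_vocab] = np.array(toy_bert_correct[~toy_in_vocab])
# toy_grounded is now [True, True, False]; only the out-of-vocab row changed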
# %%
# when bert is correct and t5 is wrong
cond_mask = bert_correct & (~t5_correct)
print(sum(cond_mask))
print(df_t5[cond_mask].to_string())
# %%
# when bert is wrong and t5 is correct
cond_mask = (~bert_correct) & (t5_correct)
print(sum(cond_mask))
print(df_bert[cond_mask].to_string())



# %%
# when both are wrong
cond_mask = (~bert_correct) & (~t5_correct)
print(sum(cond_mask))


# %%
@ -0,0 +1,72 @@
# %%
import pandas as pd
import numpy as np

# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['thing'] + full_df['property']))))

# %%
def run_mdm(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)
    df_bert = df_bert[df_bert['MDM']].reset_index(drop=True)

    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)

    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']

    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])

    t5_original_accuracy = sum(t5_correct)/len(t5_correct)

    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified).to_csv(f'exports/result_group_{fold}.csv')

    t5_new_accuracy = sum(t5_correct_modified)/len(t5_correct_modified)

    print('original accuracy', t5_original_accuracy)
    print('new accuracy', t5_new_accuracy)


# %%
# this does replacement for the full prediction
def run_full(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)

    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)

    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']

    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])

    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified, name='grounded_pred').to_csv(f'exports/result_group_{fold}.csv')


# %%
for fold in [1,2,3,4,5]:
    run_mdm(fold)
    run_full(fold)
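    # note: run_full writes to the same exports/result_group_{fold}.csv path,
    # so for each fold it overwrites the export produced by run_mdm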
# %%
@ -0,0 +1,67 @@
|
|||
,thing,property,ships_idx,tag_name,tag_description,signal_type,min,max,unit,data_type,thing_pattern,property_pattern,pattern,MDM,class_prediction
|
||||
6,SB1Flow,FOMassFlowTotal,1003,FM6_XI001_Y,AUXILIARY BOILER FUEL OIL TOTAL FLOW RATE,AI,0,0,FLOW,1304.0,SB#Flow,FOMassFlowTotal,SB#Flow FOMassFlowTotal,True,SB1FlowFOMassFlowIn
|
||||
38,ShipBoiler3,RunningState,1030,BC330,COMPOSITE BOILER FIRING,DI,0,0,NOVALUE,1301.0,ShipBoiler#,RunningState,ShipBoiler# RunningState,True,ShipBoiler1RunningState
|
||||
61,GeneratorEngine5,CBNonClosed,1003,PMS_5ACBNCL_Y,NO5 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine5RunningState
|
||||
72,CargoHandling,BoostPp_Port_Current,1018,IT_1400_Y,MP1400 BOOSTER PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,BoostPp_Port_Current,CargoHandling BoostPp_Port_Current,True,CargoHandlingBoostPp_Stbd_Current
|
||||
81,Navigation,MidPDraft,1018,TL_200002_Y,MID DRAFTP_LV,A,0,0,NOVALUE,1310.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
|
||||
86,ShipBoiler1,FOInletTemp,1018,AB_000001_Y,BOILER FUEL OIL IN BURNER_TEMP,A,0,0,NOVALUE,1310.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
|
||||
140,Navigation,MidPDraft,1003,DCM_P3_Y,DRAUGHT MID PS (DRAFT SENSOR),AI,0,0,m ,1304.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
|
||||
174,ShipBoiler1,FOInletPress,1051,MB.YO.IAS.Q3.40224,BOILER FUEL OIL IN BURNER_PRESS,Analog,0,4,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
|
||||
200,GeneratorEngine3,VoltageB,1050,MB.KM.IAS.Q3.A40193,NO3 GENERATOR_ENGINE(B) GEN VOLTAGE,AO,0,655,VOLTAGE,1300.0,GeneratorEngine#,VoltageB,GeneratorEngine# VoltageB,True,GeneratorEngine3Voltage
|
||||
342,EngineRoom,AirTemp,1018,MA_TT8612_Y,MAIN_ENGINE AMBIENT_TEMP,A,0,0,NOVALUE,1310.0,EngineRoom,AirTemp,EngineRoom AirTemp,True,GeneratorEngine1CBTrip
|
||||
395,GeneratorEngine3,SAPress,1036,MB.KM.IAS.Q2.400121,NO3 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine3WindingTempR
|
||||
396,MainEngine1,RPM,1051,MB.YO.IAS.Q1.40006,M/E_RPM,Analog,-120,120,RPM,1304.0,MainEngine#,RPM,MainEngine# RPM,True,Shaft1RPM
|
||||
653,ShipBoiler1,FOInletTemp,1033,CB014,COMPOSITE BOILER FUEL OIL TEMPERATURE,AI,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
|
||||
731,GeneratorEngine4,CBNonClosed,1003,PMS_4ACBNCL_Y,NO4 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine4CBClosed
|
||||
745,ShipBoiler1,FOInletPress,1018,AB_000002_Y,BOILER FUEL OIL IN BURNER PRESSURE,A,0,0,PRESSURE,1310.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
|
||||
783,GeneratorEngine1,LOFilterInletPress,1030,GA069,NO1 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine1LOInletPress
|
||||
786,GeneratorEngine1,FOFilterInletPress,1030,GA085,NO1 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine1FOInletPress
|
||||
812,GE1Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400031,GENERATOR_ENGINE FUEL OIL VISCOSITY INDICATION,AO,0,2346,VOLUME FLOW,1304.0,GE#Flow,FOViscosity,GE#Flow FOViscosity,True,GE1FlowFOVolumeFlowIn
|
||||
813,ME2Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400025,MAIN_ENGINE(P) FUEL OIL VISCOSITY INDICATION,AO,0,2285,VOLUME FLOW,1304.0,ME#Flow,FOViscosity,ME#Flow FOViscosity,True,ME2FlowFOVolumeFlowIn
|
||||
840,GeneratorEngine1,SAPress,1036,MB.KM.IAS.Q1.400051,NO1 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine1WindingTempR
|
||||
891,GE1Flow,FOMassFlowIn,1051,MB.YO.IAS.Q2.40103,GENERATOR_ENGINE HFO_FLOW,Analog,0,1800,MASS FLOW,1304.0,GE#Flow,FOMassFlowIn,GE#Flow FOMassFlowIn,True,GE1FlowFGMassFlow
|
||||
935,ShipBoiler1,FOInletTemp,1051,MB.YO.IAS.Q3.40223,BOILER FUEL OIL IN BURNER_TEMP,Analog,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
|
||||
951,MainEngine2,CFWInletTemp,1020,MB.YO.IAS.Q1.A400388,MAIN_ENGINE(P) CYLINDER COOL WATER TEMPERATURE INLET,AO,-50,130,TEMPERATURE,1304.0,MainEngine#,CFWInletTemp,MainEngine# CFWInletTemp,True,MainEngine2Cy3CWTemp
|
||||
1005,GeneratorEngine1,HFOUse,1051,MB.YO.IAS.Q1.10096,G/E_HFUEL OIL USE,Digital,0,1,-,1301.0,GeneratorEngine#,HFOUse,GeneratorEngine# HFOUse,True,MainEngine1HFOUse
|
||||
1075,ME1Flow,FGMassFlow,1004,MB.YO.IAS.Q2.A400121,LP LPG FUEL P/P FLOW,AI,0,3500,MASS FLOW,1304.0,ME#Flow,FGMassFlow,ME#Flow FGMassFlow,True,ME2FlowFGMassFlow
|
||||
1116,CargoHandling,LPGComp1MotorCurrent,1004,MB.YO.IAS.Q3.A400281,MP-2100 COMPRESSOR (P) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT3_DWPump_Port_Current
|
||||
1117,CargoHandling,LPGComp2MotorCurrent,1004,MB.YO.IAS.Q3.A400282,MP-2200 COMPRESSOR (C) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT2_DWPump_Stbd_Current
|
||||
1118,CargoHandling,LPGComp3MotorCurrent,1004,MB.YO.IAS.Q3.A400283,MP-2300 COMPRESSOR (S) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingBoostPp_Stbd_Current
|
||||
1174,FuelOilSystem,LFOVolumeSettleTK,1003,LC_XI001_Y,NO2 LIGHT FUEL OIL SETTLING TANK VOLUME,AI,0,999999,VOLUME,1304.0,FuelOilSystem,LFOVolumeSettleTK,FuelOilSystem LFOVolumeSettleTK,True,FuelOilSystemLFOVolumeStorageTK2P
|
||||
1198,GeneratorEngine4,BearingNDETemp1,1003,GE4_TIAH6_Y,NO4 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine4WindingTempT
|
||||
1199,GeneratorEngine5,BearingNDETemp1,1003,GE5_TIAH6_Y,NO5 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine5WindingTempT
|
||||
1200,MainEngine1,LoadPercent,1018,EG_0000005_Y,M/E_LOAD,D,0,0,%,1301.0,MainEngine#,LoadPercent,MainEngine# LoadPercent,True,GeneratorEngine2LoadPercent
|
||||
1214,GE1TurboCharger1,ExhGasOutletTemp,1003,GE1_TE27_Y,NO1 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger1ExhGasOutletTemp
|
||||
1226,GE2TurboCharger1,ExhGasOutletTemp,1003,GE2_TE27_Y,NO2 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger2ExhGasOutletTemp
|
||||
1237,GE3TurboCharger1,ExhGasOutletTemp,1003,GE3_TE27_Y,NO3 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger3ExhGasOutletTemp
|
||||
1246,GeneratorEngine3,BearingDETemp8,1003,GE3_TE698_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
|
||||
1247,GeneratorEngine3,BearingDETemp9,1003,GE3_TE699_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
|
||||
1273,GeneratorEngine4,BearingDETemp8,1003,GE4_TE698_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
|
||||
1274,GeneratorEngine4,BearingDETemp9,1003,GE4_TE699_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
|
||||
1280,GeneratorEngine5,BearingDETemp2,1003,GE5_TE692_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP2,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2BearingDETemp6
|
||||
1281,GeneratorEngine5,BearingDETemp3,1003,GE5_TE693_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP3,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
|
||||
1282,GeneratorEngine5,BearingDETemp4,1003,GE5_TE694_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP4,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp4
|
||||
1283,GeneratorEngine5,BearingDETemp5,1003,GE5_TE695_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP5,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
|
||||
1284,GeneratorEngine5,BearingDETemp6,1003,GE5_TE696_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP6,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
|
||||
1285,GeneratorEngine5,BearingDETemp7,1003,GE5_TE697_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP7,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
|
||||
1286,GeneratorEngine5,BearingDETemp8,1003,GE5_TE698_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2Cy8KnockIntensity
|
||||
1287,GeneratorEngine5,BearingDETemp9,1003,GE5_TE699_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
|
||||
1298,ME1TurboCharger1,ExhGasInletTemp,1003,AMSI_TT3721A_Y,EXHAUST GAS TEMPERATURE BEFORE TURBOCHARGER 1,AI,0,600,TEMPERATURE,1304.0,ME#TurboCharger#,ExhGasInletTemp,ME#TurboCharger# ExhGasInletTemp,True,ME1TurboCharger1ExhGasOutletTemp
|
||||
1309,GeneratorEngine2,LOFilterInletPress,1030,GB069,NO2 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine2LOInletPress
|
||||
1472,GeneratorEngine3,VoltageA,1050,MB.KM.IAS.Q3.A40189,NO3 GENERATOR_ENGINE(A) GEN VOLTAGE,AO,0,654,VOLTAGE,1300.0,GeneratorEngine#,VoltageA,GeneratorEngine# VoltageA,True,GeneratorEngine3Voltage
|
||||
1524,GeneratorEngine2,FOFilterInletPress,1030,GB085,NO2 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine2FOInletPress
|
||||
1536,ShipBoiler1,FOInletTemp,1028,MB.KM.IAS.Q2.A400184,OIL TEMPERATURE (4-20MA),AI,0,200,°C,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,GeneratorEngine4WindingTempT
|
||||
1537,ShipBoiler1,FOInletPress,1028,MB.KM.IAS.Q2.A400185,FUEL OIL PRESSURE (4-20MA),AI,0,40,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,GeneratorEngine4FOInletPress
|
||||
1594,GeneratorEngine3,LOFilterInletPress,1030,GC069,NO3 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine3LOInletPress
|
||||
1597,GeneratorEngine3,FOFilterInletPress,1030,GC085,NO3 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine3FOInletPress
|
||||
1679,GeneratorEngine3,busBarVoltage,1003,PMS_3BUSVOLA_Y,BUS VOLTAGE,AI,0,10000,VOLTAGE,1304.0,GeneratorEngine#,busBarVoltage,GeneratorEngine# busBarVoltage,True,GeneratorEngine1busBarVoltage
|
||||
1727,GeneratorEngine2,SAPress,1036,MB.KM.IAS.Q1.400086,NO2 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine2WindingTempR
|
||||
1763,GeneratorEngine5,BearingDETemp1,1003,GE5_TE691_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP1,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
|
||||
1873,GeneratorEngine5,CBClosed,1003,PMS_5VCBCLED_Y,NO5 GENERATOR_ENGINE MVSB VCB CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBClosed,GeneratorEngine# CBClosed,True,GeneratorEngine5StopState
|
||||
2034,CargoHandling,CT1_DWPump_Stbd_Current,1018,IT_1101_Y,MP1100 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
|
||||
2035,CargoHandling,CT2_DWPump_Port_Current,1018,IT_1200_Y,MP1200 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
|
||||
2037,CargoHandling,CT3_DWPump_Stbd_Current,1018,IT_1501_Y,MP1501 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
|
||||
2038,CargoHandling,CT4_DWPump_Port_Current,1018,IT_1700_Y,MP1700 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
|
||||
2048,GeneratorEngine5,RunningHour,1003,PMS_5GENWHRS_Y,NO5 GENERATOR_ENGINE WORKING HOURS,AI,0,10000,NOVALUE,1304.0,GeneratorEngine#,RunningHour,GeneratorEngine# RunningHour,True,GeneratorEngine4RunningHour
|
||||
2057,CargoHandling,CT1_DWPump_Port_Current,1018,IT_1100_Y,MP1100 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
|
||||
2079,ShipBoiler1,ExhGasOutletTemp,1003,EG_G02_Y,EXHAUST GAS ECONOMIZER EXHAUST GAS OUTLET TEMPERATURE,AI,0,600,TEMPERATURE,1304.0,ShipBoiler#,ExhGasOutletTemp,ShipBoiler# ExhGasOutletTemp,True,MainEngine1Cy1ExhGasOutletTemp
|
|
|
@ -0,0 +1,27 @@
type,fold,accuracy
1layer,1,0.8968291528632276
1layer,2,0.8859813084112149
1layer,3,0.9382530120481928
1layer,4,0.9586108468125595
1layer,5,0.8827301878149336
2layer,1,0.9318504495977283
2layer,2,0.8859813084112149
2layer,3,0.9678714859437751
2layer,4,0.9738344433872502
2layer,5,0.9015116811726981
4layer,1,0.9503076194983436
4layer,2,0.9135514018691588
4layer,3,0.9698795180722891
4layer,4,0.9790675547098002
4layer,5,0.907924874026569
6layer,1,0.9522006625650734
6layer,2,0.9093457943925234
6layer,3,0.9678714859437751
6layer,4,0.9814462416745956
6layer,5,0.890975721484196
8layer,1,0.9441552295314718
8layer,2,0.9121495327102803
8layer,3,0.963855421686747
8layer,4,0.9752616555661275
8layer,5,0.907924874026569
@ -0,0 +1,12 @@
type,fold,accuracy
normal,1,0.9522006625650734
normal,2,0.9093457943925234
normal,3,0.9678714859437751
normal,4,0.9814462416745956
normal,5,0.890975721484196
frozen,1,0.9342167534311405
frozen,2,0.883177570093458
frozen,3,0.963855421686747
frozen,4,0.9705042816365367
frozen,5,0.9051763628034815
@ -0,0 +1,199 @@
# %%
import pandas as pd
import numpy as np

####################################################################################
# stage 1
# %%
# stage 1a: binary classification
df_stage1a = pd.read_csv('stage1a.csv')
# %%
# desc only
mask = df_stage1a['type'] == 'desc'
df_stage1a[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage1a['type'] == 'desc_unit'
df_stage1a[mask].describe().loc[['mean', 'std']]

# %%
# stage 1b: similarity-based classification
df_stage1b = pd.read_csv('stage1b.csv')
# %%
# desc only
mask = df_stage1b['type'] == 'desc'
df_stage1b[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage1b['type'] == 'desc_unit'
df_stage1b[mask].describe().loc[['mean', 'std']]


# %%
#################################################################################
# stage 2: mapping model

# %%
# stage 2a: mapping by classification
df_stage2a = pd.read_csv('stage2a.csv')
# %%
# desc only
mask = df_stage2a['type'] == 'desc'
df_stage2a[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage2a['type'] == 'desc_unit'
df_stage2a[mask].describe().loc[['mean', 'std']]


# %%
# stage 2b: mapping by seq2seq
df_stage2b = pd.read_csv('stage2b.csv')
# %%
# desc only
mask = df_stage2b['type'] == 'desc'
df_stage2b[mask].describe().loc[['mean', 'std']]

# %%
# desc and unit
mask = df_stage2b['type'] == 'desc_unit'
df_stage2b[mask].describe().loc[['mean', 'std']]



############################
# frozen encoder
# %%
df = pd.read_csv('frozen_encoder.csv')
# %%
# normal
mask = df['type'] == 'normal'
df[mask].describe().loc[['mean', 'std']]

# %%
# frozen
mask = df['type'] == 'frozen'
df[mask].describe().loc[['mean', 'std']]


# %%
############################
# decoder scaling
# %%
df = pd.read_csv('decoder_scaling.csv')
# %%
# 1 layer
mask = df['type'] == '1layer'
df[mask].describe().loc[['mean', 'std']]


# %%
# 2 layer
mask = df['type'] == '2layer'
df[mask].describe().loc[['mean', 'std']]

# %%
# 4 layer
mask = df['type'] == '4layer'
df[mask].describe().loc[['mean', 'std']]

# %%
# 6 layer
mask = df['type'] == '6layer'
df[mask].describe().loc[['mean', 'std']]

# %%
# 8 layer
mask = df['type'] == '8layer'
df[mask].describe().loc[['mean', 'std']]



# %%
#########################
# compute overall result

# $\frac{1808}{2113} = 0.856$ & $\frac{10692}{10961} = 0.975$ & $\frac{12500}{13074} = 0.956$ \\
# $\frac{1932}{2140} = 0.903$ & $\frac{8304}{8582} = 0.968$ & $\frac{10236}{10722} = 0.955$ \\
# $\frac{1789}{1992} = 0.898$ & $\frac{7613}{7863} = 0.968$ & $\frac{9402}{9855} = 0.954$ \\
# $\frac{1967}{2102} = 0.936$ & $\frac{12929}{13349} = 0.969$ & $\frac{14896}{15451} = 0.964$ \\
# $\frac{1915}{2183} = 0.877$ & $\frac{10381}{10786} = 0.962$ & $\frac{12296}{12969} = 0.948$ \\

# %%
matrix = np.array([
    [1808, 2113, 10692, 10961, 13074],
    [1932, 2140, 8304, 8582, 10722],
    [1789, 1992, 7613, 7863, 9855],
    [1967, 2102, 12929, 13349, 15451],
    [1915, 2183, 10381, 10786, 12969]
])
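# columns: [relevant correct, relevant total, non-relevant correct, non-relevant total, overall total]
# (overall total = relevant total + non-relevant total, matching the fractions above)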
# %%
relevant_class = matrix[:,0]/matrix[:,1]
print(relevant_class)
print(np.std(relevant_class))

# %%
non_relevant_class = matrix[:,2]/matrix[:,3]
print(non_relevant_class)
print(np.std(non_relevant_class))

# %%
numerator = (matrix[:,0] + matrix[:,2])
denominator = (matrix[:,1] + matrix[:,3])
print(numerator)
print(denominator) # same as last column
overall = numerator/denominator
print(overall)
print(np.std(overall))


######################
# compute mapping result
# %%

# $\frac{1761}{1808} = 0.974$ \\
# $\frac{1802}{1932} = 0.933$ \\
# $\frac{1760}{1789} = 0.984$ \\
# $\frac{1945}{1967} = 0.989$ \\
# $\frac{1837}{1915} = 0.959$ \\

matrix = np.array([
    [1761, 1808],
    [1802, 1932],
    [1760, 1789],
    [1945, 1967],
    [1837, 1915]
])
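# columns: [correctly mapped, correctly classified as relevant], i.e. the
# numerators and denominators of the per-fold mapping fractions above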
# %%
result = matrix[:,0]/matrix[:,1]
print(result)
print(np.mean(result))
print(np.std(result))

# %%
####################################
# compute overall result
# & 1761 & 10692 & $\frac{1761 + 10692}{13074} = 0.953$ \\
# & 1802 & 8304 & $\frac{1802 + 8304}{10722} = 0.943$ \\
# & 1760 & 7613 & $\frac{1760 + 7613}{9855} = 0.951$ \\
# & 1945 & 12929 & $\frac{1945 + 12929}{15451} = 0.963$ \\
# & 1837 & 10381 & $\frac{1837 + 10381}{12969} = 0.942$ \\

matrix = np.array([
    [1761, 10692, 13074],
    [1802, 8304, 10722],
    [1760, 7613, 9855],
    [1945, 12929, 15451],
    [1837, 10381, 12969]
])

# %%
overall = (matrix[:,0] + matrix[:,1])/matrix[:,2]
print(overall)
print(np.mean(overall))
print(np.std(overall))
# %%
@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.92588,0.74001,0.85440,0.65263
desc,2,0.88733,0.64239,0.87641,0.50701
desc,3,0.90583,0.71429,0.92357,0.58233
desc,4,0.93114,0.70929,0.83312,0.61751
desc,5,0.91171,0.67683,0.88162,0.54924
desc_unit,1,0.95610,0.86301,0.87049,0.85566
desc_unit,2,0.95467,0.88828,0.87421,0.90280
desc_unit,3,0.95403,0.88762,0.87739,0.89809
desc_unit,4,0.96408,0.87636,0.82405,0.93578
desc_unit,5,0.94811,0.85054,0.82543,0.87723
@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.93162,0.79580,0.76909,0.82442
desc,2,0.92884,0.82440,0.81224,0.83692
desc,3,0.93201,0.83375,0.82434,0.84337
desc,4,0.94259,0.80937,0.73814,0.89581
desc,5,0.92228,0.78397,0.73661,0.83784
desc_unit,1,0.93353,0.79945,0.78018,0.81969
desc_unit,2,0.92184,0.81006,0.78653,0.83505
desc_unit,3,0.91821,0.80513,0.77659,0.83584
desc_unit,4,0.93334,0.78675,0.69648,0.90390
desc_unit,5,0.93084,0.80445,0.76747,0.84517
@ -0,0 +1,11 @@
type,fold,accuracy
desc,1,0.93706
desc,2,0.88785
desc,3,0.96285
desc,4,0.95861
desc,5,0.89601
desc_unit,1,0.94226
desc_unit,2,0.90561
desc_unit,3,0.96436
desc_unit,4,0.96955
desc_unit,5,0.90289
@ -0,0 +1,16 @@
type,fold,accuracy
desc,1,0.9427354472314246
desc,2,0.8981308411214953
desc,3,0.9588353413654619
desc,4,0.9633682207421503
desc,5,0.8928080622995878
desc_unit,1,0.9578797917652626
desc_unit,2,0.9088785046728972
desc_unit,3,0.9673694779116466
desc_unit,4,0.9785918173168411
desc_unit,5,0.8918918918918919
@ -0,0 +1 @@
*.png
@ -41,13 +41,26 @@ distance_array
# %%
plt.rcParams.update({'font.size': 14}) # Adjust the size as needed
plt.figure(figsize=(8, 6))
plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
# Add arrow for increasing dissimilarity
plt.annotate(
    "Decreasing Similarity", # Text label
    xy=(0.7, 500), # Arrow end (near the end of x-axis)
    xytext=(0.4, 500), # Arrow start (near the middle of x-axis)
    arrowprops=dict(arrowstyle="->", lw=2, color="black"), # Arrow style
    va='center', # needed to make arrow centered
    fontsize=14, # Font size for the text
    color="black" # Text color
)
# Add arrows and text
plt.savefig("input_output_similarity.png", dpi=300)
#

# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
@ -58,12 +58,24 @@ score_list
# %%
# plt.hist(score_list, bins=50)
plt.rcParams.update({'font.size': 14}) # Adjust the size as needed
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
# Add arrow for increasing dissimilarity
plt.annotate(
    "Decreasing Similarity", # Text label
    xy=(0.7, 70), # Arrow end (near the end of x-axis)
    xytext=(0.2, 70), # Arrow start (near the middle of x-axis)
    arrowprops=dict(arrowstyle="->", lw=2, color="black"), # Arrow style
    va='center', # needed to make arrow centered
    fontsize=14, # Font size for the text
    color="black" # Text color
)

plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
plt.savefig("within_class_similarity.png", dpi=300)
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
@ -0,0 +1,26 @@
# %%
import pandas as pd

# %%
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
df_in = full_df[full_df['MDM']].reset_index(drop=True)
# %%
df_out = full_df[~full_df['MDM']].reset_index(drop=True)
# %%
label_counts_in = df_in['unit'].value_counts()
print(label_counts_in.to_string())

# %%
label_counts_out = df_out['unit'].value_counts()
print(label_counts_out.to_string())


# %%
label_counts_out['NOVALUE']/len(df_out)

# %%
# fraction of NOVALUE units within the in-MDM subset
label_counts_in['NOVALUE']/len(df_in)
# %%
@ -9,14 +9,19 @@ def run(fold):
    df = pd.read_csv(data_path, skipinitialspace=True)
    p_mdm = df['p_mdm']

    # data_path = f'../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv'
    data_path = f'../train/modified_t5_decoder_4_layers/mapping_prediction/exports/result_group_{fold}.csv'
    data_path = f'../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df = pd.read_csv(data_path, skipinitialspace=True)
    actual_mdm = df['MDM']

    thing_correctness = df['thing'] == df['p_thing']
    property_correctness = df['property'] == df['p_property']
    answer = thing_correctness & property_correctness
    # grounded labels
    data_path = f'../analysis/delta_analysis/exports/result_group_{fold}.csv'
    df_grounded = pd.read_csv(data_path, skipinitialspace=True)
    answer = df_grounded['grounded_pred']

    # original labels
    # thing_correctness = df['thing'] == df['p_thing']
    # property_correctness = df['property'] == df['p_property']
    # answer = thing_correctness & property_correctness

    ##############
    # evaluate relevant-class prediction performance
@ -53,6 +58,13 @@ def run(fold):
    print(mapping_rate)
    print('size', correct_positive_mdm_and_map, '/', sum(p_mdm & actual_mdm))

    # evaluate relevant mappings
    correct_positive_mdm_and_map = sum(p_mdm & actual_mdm & answer)
    mapping_rate = correct_positive_mdm_and_map / sum(actual_mdm)
    print('relevant data mapping rate')
    print(mapping_rate)
    print('size', correct_positive_mdm_and_map, '/', sum(actual_mdm))
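    # (this rate uses all actually-relevant rows as the denominator, whereas the
    # block above conditions on rows that were also predicted as relevant)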

    ##############
    # evaluate overall pipeline result
@ -76,3 +88,5 @@ for fold in [1,2,3,4,5]:
    print('*' * 40)
    run(fold)


# %%
@ -179,8 +179,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
@ -180,8 +180,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@ -0,0 +1 @@
exports
@ -0,0 +1,31 @@
********************************************************************************
Fold: 1
Accuracy: 0.94510
F1 Score: 0.94087
Precision: 0.94623
Recall: 0.94510
********************************************************************************
Fold: 2
Accuracy: 0.91682
F1 Score: 0.91698
Precision: 0.92824
Recall: 0.91682
********************************************************************************
Fold: 3
Accuracy: 0.96185
F1 Score: 0.95743
Precision: 0.96001
Recall: 0.96185
********************************************************************************
Fold: 4
Accuracy: 0.97479
F1 Score: 0.97074
Precision: 0.97072
Recall: 0.97479
********************************************************************************
Fold: 5
Accuracy: 0.90563
F1 Score: 0.89532
Precision: 0.90040
Recall: 0.90563
@ -0,0 +1,289 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import re
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 128
|
||||
|
||||
# %%
|
||||
|
||||
# we need to create the mdm_list
|
||||
# import the full mdm-only file
|
||||
data_path = '../../../data_import/exports/data_mapping_mdm.csv'
|
||||
full_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
# mdm_list = sorted(list((set(full_df['pattern']))))
|
||||
thing_property = full_df['thing'] + full_df['property']
|
||||
thing_property = thing_property.to_list()
|
||||
mdm_list = sorted(list(set(thing_property)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(mdm_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
def substitute_and_append_digits(s):
|
||||
"""
|
||||
Finds all digit groups in a string, substitutes them with a <digit> placeholder,
|
||||
and appends the extracted digit groups at the end of the string flanked by <digit> markers.
|
||||
|
||||
Args:
|
||||
s (str): The input string.
|
||||
|
||||
Returns:
|
||||
str: The transformed string.
|
||||
"""
|
||||
# Find all digit groups in the string
|
||||
digit_groups = re.findall(r'\d+', s)
|
||||
|
||||
# Substitute digit groups with <digit> placeholder
|
||||
substituted_string = re.sub(r'\d+', '<DIGIT>', s)
|
||||
|
||||
# Append extracted digit groups to the end of the string
|
||||
appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])
|
||||
|
||||
return substituted_string + appended_digits
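# Illustrative example (hypothetical input, not from the dataset):
#   substitute_and_append_digits("NO3 GEN ENGINE TEMP2")
#   -> "NO<DIGIT> GEN ENGINE TEMP<DIGIT><DIGIT>3<DIGIT><DIGIT>2<DIGIT>"
# i.e. the free text keeps only digit placeholders, and the extracted digit
# groups are appended at the end between <DIGIT> markers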
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df, mdm_list):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
processed_desc = substitute_and_append_digits(row['tag_description'])
|
||||
desc = f"<DESC>{processed_desc}<DESC>"
|
||||
unit = f"<UNIT>{row['unit']}<UNIT>"
|
||||
|
||||
pattern = f"{row['thing'] + row['property']}"
|
||||
try:
|
||||
index = mdm_list.index(pattern)
|
||||
except ValueError:
|
||||
index = -1
|
||||
element = {
|
||||
'text' : f"{desc}{unit}",
|
||||
'label': index,
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset(fold, mdm_list):
|
||||
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# uncomment for mdm
|
||||
# we only use the mdm subset
|
||||
test_df = test_df[test_df['MDM']].reset_index(drop=True)
|
||||
|
||||
test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
|
||||
|
||||
return test_dataset
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# function to perform training for a given fold
|
||||
def test(fold):
|
||||
|
||||
test_dataset = create_dataset(fold, mdm_list)
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint_fold_{fold}'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(mdm_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
print(f'Fold: {fold}', file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# uncomment if you want to predict for all
|
||||
# df = df[df['MDM']].reset_index(drop=True)
|
||||
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df_out = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
df = pd.concat([df, df_out], axis=1)
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
|
||||
for fold in [1,2,3,4,5]:
|
||||
test(fold)
|
|
@ -0,0 +1,241 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import re
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
|
||||
# we need to create the mdm_list
|
||||
# import the full mdm-only file
|
||||
data_path = '../../data_import/exports/data_mapping_mdm.csv'
|
||||
full_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
# mdm_list = sorted(list((set(full_df['pattern']))))
|
||||
thing_property = full_df['thing'] + full_df['property']
|
||||
thing_property = thing_property.to_list()
|
||||
mdm_list = sorted(list(set(thing_property)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(mdm_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
def substitute_and_append_digits(s):
|
||||
"""
|
||||
Finds all digit groups in a string, substitutes them with a <digit> placeholder,
|
||||
and appends the extracted digit groups at the end of the string flanked by <digit> markers.
|
||||
|
||||
Args:
|
||||
s (str): The input string.
|
||||
|
||||
Returns:
|
||||
str: The transformed string.
|
||||
"""
|
||||
# Find all digit groups in the string
|
||||
digit_groups = re.findall(r'\d+', s)
|
||||
|
||||
# Substitute digit groups with <digit> placeholder
|
||||
substituted_string = re.sub(r'\d+', '<DIGIT>', s)
|
||||
|
||||
# Append extracted digit groups to the end of the string
|
||||
appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])
|
||||
|
||||
return substituted_string + appended_digits
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df, mdm_list):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
processed_desc = substitute_and_append_digits(row['tag_description'])
|
||||
desc = f"<DESC>{processed_desc}<DESC>"
|
||||
unit = f"<UNIT>{row['unit']}<UNIT>"
|
||||
pattern = f"{row['thing'] + row['property']}"
|
||||
try:
|
||||
index = mdm_list.index(pattern)
|
||||
except ValueError:
|
||||
print("Error: value not found in MDM list")
|
||||
index = -1
|
||||
element = {
|
||||
'text' : f"{desc}{unit}",
|
||||
'label': index,
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
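# Illustrative element produced for one row (hypothetical values):
#   {'text': '<DESC>BOILER FUEL OIL IN BURNER_TEMP<DESC><UNIT>TEMPERATURE<UNIT>',
#    'label': mdm_list.index('ShipBoiler1FOInletTemp')}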
|
||||
|
||||
|
||||
def create_split_dataset(fold, mdm_list):
|
||||
# train
|
||||
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
# valid
|
||||
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
|
||||
validation_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
|
||||
'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# function to perform training for a given fold
|
||||
def train(fold):
|
||||
|
||||
save_path = f'checkpoint_fold_{fold}'
|
||||
split_datasets = create_split_dataset(fold, mdm_list)
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
model_checkpoint = 'google-bert/bert-base-cased'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(mdm_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-4,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=80,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["validation"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
for fold in [1,2,3,4,5]:
|
||||
print(fold)
|
||||
train(fold)
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1 @@
exports
@ -1,31 +1,31 @@
|
|||
|
||||
********************************************************************************
|
||||
Fold: 1
|
||||
Accuracy: 0.78277
|
||||
F1 Score: 0.73629
|
||||
Precision: 0.71419
|
||||
Recall: 0.78277
|
||||
Accuracy: 0.93706
|
||||
F1 Score: 0.93286
|
||||
Precision: 0.93920
|
||||
Recall: 0.93706
|
||||
********************************************************************************
|
||||
Fold: 2
|
||||
Accuracy: 0.78598
|
||||
F1 Score: 0.73708
|
||||
Precision: 0.71578
|
||||
Recall: 0.78598
|
||||
Accuracy: 0.88785
|
||||
F1 Score: 0.88726
|
||||
Precision: 0.90566
|
||||
Recall: 0.88785
|
||||
********************************************************************************
|
||||
Fold: 3
|
||||
Accuracy: 0.79819
|
||||
F1 Score: 0.74411
|
||||
Precision: 0.71749
|
||||
Recall: 0.79819
|
||||
Accuracy: 0.96285
|
||||
F1 Score: 0.95930
|
||||
Precision: 0.96310
|
||||
Recall: 0.96285
|
||||
********************************************************************************
|
||||
Fold: 4
|
||||
Accuracy: 0.79543
|
||||
F1 Score: 0.73902
|
||||
Precision: 0.71094
|
||||
Recall: 0.79543
|
||||
Accuracy: 0.95861
|
||||
F1 Score: 0.95320
|
||||
Precision: 0.95615
|
||||
Recall: 0.95861
|
||||
********************************************************************************
|
||||
Fold: 5
|
||||
Accuracy: 0.77279
|
||||
F1 Score: 0.72098
|
||||
Precision: 0.69817
|
||||
Recall: 0.77279
|
||||
Accuracy: 0.89601
|
||||
F1 Score: 0.88613
|
||||
Precision: 0.89038
|
||||
Recall: 0.89601
|
||||
|
|
|
@ -235,6 +235,24 @@ def test(fold):
|
|||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
df[df['MDM']].reset_index(drop=True)
|
||||
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df_out = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
df = pd.concat([df, df_out], axis=1)
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
|
|
|
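The exported exports/result_group_{fold}.csv can then be scored offline. A rough sketch of that check, assuming the CSV keeps the thing, property and MDM columns from test_all.csv plus the class_prediction column written above (not part of the commit):

# sketch only, not part of the commit
import pandas as pd

fold = 1  # any of the five folds
df = pd.read_csv(f"exports/result_group_{fold}.csv", skipinitialspace=True)

# ground truth is the concatenated thing + property string; restrict to MDM rows
df['class_truth'] = df['thing'] + df['property']
mdm_only = df[df['MDM']].reset_index(drop=True)

accuracy = (mdm_only['class_prediction'] == mdm_only['class_truth']).mean()
print(f"Accuracy for fold {fold}: {accuracy}")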
@ -176,7 +176,7 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,

train/classification_bert_complete_desc_unit/classification_prediction/.gitignore (new file)
@ -0,0 +1 @@
exports

@ -1,31 +1,31 @@

********************************************************************************
Fold: 1
Accuracy: 0.78940
F1 Score: 0.73284
Precision: 0.70389
Recall: 0.78940
Accuracy: 0.15229
F1 Score: 0.07923
Precision: 0.05929
Recall: 0.15229
********************************************************************************
Fold: 2
Accuracy: 0.78411
F1 Score: 0.73695
Precision: 0.71914
Recall: 0.78411
Accuracy: 0.18075
F1 Score: 0.09625
Precision: 0.07243
Recall: 0.18075
********************************************************************************
Fold: 3
Accuracy: 0.80522
F1 Score: 0.75406
Precision: 0.72847
Recall: 0.80522
Accuracy: 0.19493
F1 Score: 0.10903
Precision: 0.08332
Recall: 0.19493
********************************************************************************
Fold: 4
Accuracy: 0.80780
F1 Score: 0.75361
Precision: 0.72432
Recall: 0.80780
Accuracy: 0.13190
F1 Score: 0.05761
Precision: 0.04173
Recall: 0.13190
********************************************************************************
Fold: 5
Accuracy: 0.76958
F1 Score: 0.71912
Precision: 0.69965
Recall: 0.76958
Accuracy: 0.15198
F1 Score: 0.07383
Precision: 0.05411
Recall: 0.15198
@ -80,8 +80,9 @@ def process_df_to_dict(df, mdm_list):
def create_dataset(fold, mdm_list):
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
test_df = pd.read_csv(data_path, skipinitialspace=True)
# uncomment for mdm
# we only use the mdm subset
test_df = test_df[test_df['MDM']].reset_index(drop=True)
# test_df = test_df[test_df['MDM']].reset_index(drop=True)

test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))

@ -237,6 +238,22 @@ def test(fold):
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)

# export result
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# uncomment if you want to predict for all
# df = df[df['MDM']].reset_index(drop=True)

label_list = [id2label[id] for id in pred_labels]
df_out = pd.DataFrame({
    'class_prediction': pd.Series(label_list)
})
df = pd.concat([df, df_out], axis=1)

# we can save the t5 generation output here
df.to_csv(f"exports/result_group_{fold}.csv", index=False)

# %%
# reset file before writing to it

@ -177,7 +177,7 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-5,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,

@ -202,7 +202,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs

@ -100,7 +100,7 @@ class Inference():

def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128

pred_generations = []
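For context, pred_generations is filled by a batched generate() loop further down in the Inference class; that loop is not shown in these hunks. A simplified sketch of one batch of that pattern (argument names here are assumptions):

# sketch only, not part of the commit
import torch

def generate_batch(model, tokenizer, input_ids, attention_mask, device, max_length=128):
    # run one batch through seq2seq generation on the chosen device
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            max_length=max_length,
        )
    # keep special tokens so <THING_START> / <PROPERTY_START> spans can be parsed afterwards
    return tokenizer.batch_decode(outputs, skip_special_tokens=False)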
@ -0,0 +1,6 @@

Accuracy for fold 1: 0.9536204448651207
Accuracy for fold 2: 0.8845794392523364
Accuracy for fold 3: 0.9618473895582329
Accuracy for fold 4: 0.9576593720266413
Accuracy for fold 5: 0.8928080622995878

@ -0,0 +1,6 @@

Accuracy for fold 1: 0.9588263132986276
Accuracy for fold 2: 0.9182242990654206
Accuracy for fold 3: 0.9633534136546185
Accuracy for fold 4: 0.9809705042816366
Accuracy for fold 5: 0.8891433806688044

@ -26,7 +26,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
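The "Use glob to find matching paths" comments refer to resolving the single surviving checkpoint-<step number> folder inside checkpoint_fold_{fold}. A sketch of that lookup, as implied by the comments (the actual lines sit outside this hunk):

# sketch only, not part of the commit
import glob
import os

def find_checkpoint(checkpoint_directory, fold):
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    # save_total_limit=1 during training means exactly one checkpoint-* folder remains
    return glob.glob(os.path.join(directory, 'checkpoint-*'))[0]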
@ -70,5 +70,6 @@ def infer_and_select(fold):
with open("output.txt", "w") as f:
print('', file=f)

# for fold in [1,2,3,4,5]:
for fold in [1,2,3,4,5]:
    infer_and_select(fold)

@ -2,7 +2,6 @@

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'

@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch

from safetensors.torch import load_file

from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np

@ -35,23 +27,13 @@ from datasets import Dataset, DatasetDict

torch.set_float32_matmul_precision('high')

# %%

# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config

# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'input' : f"{desc}",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
}
output_list.append(element)

@ -77,11 +59,12 @@ def create_split_dataset(fold):

# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)

# prepare tokenizer
model_checkpoint = "t5-small"

model_checkpoint = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
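The special-token list above is presumably registered on the tokenizer before resize_token_embeddings is called later in the same function; the registration call itself falls outside this hunk. A minimal sketch of that step:

# sketch only, not part of the commit
tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})
# ... model is constructed ...
# model.resize_token_embeddings(len(tokenizer))  # as in the hunk below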
@ -101,7 +84,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs

@ -119,52 +102,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')

# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}

pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Access the decoder stack
# config = T5Config("t5-small")

config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 3 # set new decoder layer count

model = T5ForConditionalGeneration(config)

model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder

pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]):
    model.decoder.block[i].load_state_dict(layer) # Load pretrained weights

# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')

# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
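A quick way to see what the removed decoder-truncation code actually changed is to compare parameter counts of the truncated model against the full checkpoint. A small sketch, reusing the pretrained_model and model names from the removed block above (not part of the commit):

# sketch only, not part of the commit
def count_parameters(m):
    return sum(p.numel() for p in m.parameters())

# pretrained_model: the full T5 checkpoint; model: the copy with fewer decoder layers
full = count_parameters(pretrained_model)
truncated = count_parameters(model)
print(f'truncated model keeps {truncated / full:.1%} of the original parameters')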
@ -199,7 +140,7 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64

# compile
# model = torch.compile(model, backend="inductor", dynamic=True)

@ -222,7 +163,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs

@ -100,7 +100,7 @@ class Inference():

def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128

pred_generations = []

@ -0,0 +1,6 @@

Accuracy for fold 1: 0.9697113109323237
Accuracy for fold 2: 0.9
Accuracy for fold 3: 0.9613453815261044
Accuracy for fold 4: 0.9686013320647003
Accuracy for fold 5: 0.8932661475034357

@ -6,13 +6,14 @@ from inference import Inference

checkpoint_directory = '../'

BATCH_SIZE = 512
BATCH_SIZE = 128

def infer_and_select(fold):
print(f"Inference for fold {fold}")
# import test data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# uncomment for mdm only
df = df[df['MDM']].reset_index(drop=True)

# get target data
@ -26,7 +27,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training

@ -2,7 +2,6 @@

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'

@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch

from safetensors.torch import load_file

from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np

@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict

torch.set_float32_matmul_precision('high')

# %%

# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config

# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []

@ -77,11 +60,12 @@ def create_split_dataset(fold):

# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)

# prepare tokenizer
model_checkpoint = "t5-small"

model_checkpoint = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]

@ -101,7 +85,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +103,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')

# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}

pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Access the decoder stack
# config = T5Config("t5-small")

config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 12 # set new decoder layer count

model = T5ForConditionalGeneration(config)

model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder

pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights):
    model.decoder.block[i].load_state_dict(layer) # Load pretrained weights

# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')

# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")

@ -199,10 +141,11 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64

# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
# model = torch.compile(model)

# Trainer

@ -210,10 +153,10 @@ def train(fold):
args = Seq2SeqTrainingArguments(
f"{save_path}",
# eval_strategy="epoch",
save_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=64,

@ -222,12 +165,13 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
generation_config=gen_config,
remove_unused_columns=False,
warmup_steps=400
)
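predict_with_generate=True and the sacrebleu metric loaded above imply a seq2seq compute_metrics function, which sits outside these hunks. A typical wiring would look roughly like this, reusing the tokenizer and metric names from the hunks above (assumed, not taken from the commit):

# sketch only, not part of the commit
import numpy as np

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # -100 marks ignored label positions; swap them for pad tokens before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = metric.compute(predictions=decoded_preds,
                            references=[[label] for label in decoded_labels])
    return {'bleu': result['score']}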
@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs

@ -100,7 +100,7 @@ class Inference():

def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128

pred_generations = []

@ -0,0 +1,6 @@

Accuracy for fold 1: 0.934690014197823
Accuracy for fold 2: 0.9023364485981309
Accuracy for fold 3: 0.9643574297188755
Accuracy for fold 4: 0.9700285442435775
Accuracy for fold 5: 0.8941823179111315

@ -26,7 +26,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training

@ -2,7 +2,6 @@

# from datasets import load_from_disk
import os
import glob

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'

@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch

from safetensors.torch import load_file

from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np

@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict

torch.set_float32_matmul_precision('high')

# %%

# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config

# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []

@ -77,10 +60,11 @@ def create_split_dataset(fold):

# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)

# prepare tokenizer

model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens

@ -101,7 +85,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +103,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')

# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}

pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Access the decoder stack
# config = T5Config("t5-small")

config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 9 # set new decoder layer count

model = T5ForConditionalGeneration(config)

model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder

pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights):
    model.decoder.block[i].load_state_dict(layer) # Load pretrained weights

# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')

# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")

@ -199,7 +141,7 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64

# compile
# model = torch.compile(model, backend="inductor", dynamic=True)

@ -215,14 +157,14 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
@ -1,6 +1,6 @@

Accuracy for fold 1: 0.9455750118315192
Accuracy for fold 2: 0.8864485981308411
Accuracy for fold 3: 0.9558232931726908
Accuracy for fold 4: 0.9686013320647003
Accuracy for fold 5: 0.896930829134219
Accuracy for fold 1: 0.9427354472314246
Accuracy for fold 2: 0.8981308411214953
Accuracy for fold 3: 0.9588353413654619
Accuracy for fold 4: 0.9633682207421503
Accuracy for fold 5: 0.8928080622995878

@ -157,13 +157,13 @@ def train(fold):
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -1,6 +1,6 @@

Accuracy for fold 1: 0.9522006625650734
Accuracy for fold 2: 0.9093457943925234
Accuracy for fold 3: 0.9678714859437751
Accuracy for fold 4: 0.9814462416745956
Accuracy for fold 5: 0.890975721484196
Accuracy for fold 1: 0.9578797917652626
Accuracy for fold 2: 0.9088785046728972
Accuracy for fold 3: 0.9673694779116466
Accuracy for fold 4: 0.9785918173168411
Accuracy for fold 5: 0.8918918918918919

@ -13,7 +13,8 @@ def infer_and_select(fold):
# import test data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
df = df[df['MDM']].reset_index(drop=True)
# note: we need to uncomment this for overall evaluation
# df = df[df['MDM']].reset_index(drop=True)

# get target data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"

@ -164,7 +164,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -1,6 +0,0 @@

Accuracy for fold 1: 0.9403691433980123
Accuracy for fold 2: 0.9046728971962616
Accuracy for fold 3: 0.9678714859437751
Accuracy for fold 4: 0.9695528068506185
Accuracy for fold 5: 0.902427851580394

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -1,2 +0,0 @@
checkpoint*
tensorboard-log

@ -1,6 +0,0 @@

Accuracy for fold 1: 0.9427354472314246
Accuracy for fold 2: 0.9098130841121496
Accuracy for fold 3: 0.964859437751004
Accuracy for fold 4: 0.9719314938154139
Accuracy for fold 5: 0.9070087036188731

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

@ -1,2 +0,0 @@
checkpoint*
tensorboard-log

@ -1,6 +0,0 @@

Accuracy for fold 1: 0.9441552295314718
Accuracy for fold 2: 0.9121495327102803
Accuracy for fold 3: 0.963855421686747
Accuracy for fold 4: 0.9752616555661275
Accuracy for fold 5: 0.907924874026569
@ -1,28 +1,14 @@
#!/bin/bash

cd hybrid_t5_complete_desc_unit
micromamba run -n hug accelerate launch train_encoder.py
micromamba run -n hug accelerate launch train_decoder.py
cd mapping_t5-base_desc
micromamba run -n hug accelerate launch train.py
cd ..

cd hybrid_t5_pattern_desc_unit
micromamba run -n hug accelerate launch train_encoder.py
micromamba run -n hug accelerate launch train_decoder.py
cd mapping_t5-base_desc_unit
micromamba run -n hug accelerate launch train.py
cd ..

# cd classification_bert_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..

# cd classification_bert_complete_desc_unit
# micromamba run -n hug accelerate launch train.py
# cd ..

# cd classification_bert_complete_desc_unit_name
# micromamba run -n hug accelerate launch train.py
# cd ..

# cd mapping_t5_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..

@ -31,6 +17,31 @@ cd ..
# micromamba run -n hug accelerate launch train.py
# cd ..
#
# cd mapping_t5_complete_name_desc_unit
# cd frozen_t5_encoder
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_1_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_2_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_4_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_8_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd classification_bert_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..
#
# cd classification_bert_complete_desc_unit
# micromamba run -n hug accelerate launch train.py
# cd ..