diff --git a/analysis/categories/label_print.py b/analysis/categories/label_print.py index 1968f8d..89158f3 100644 --- a/analysis/categories/label_print.py +++ b/analysis/categories/label_print.py @@ -13,6 +13,10 @@ full_df # %% mdm_list +# %% +mdm_list = sorted(list((set(full_df['thing'] + full_df['property'])))) +# %% +mdm_list # %% mask = full_df['pattern'] == 'GE#Flow FGMassFlow' full_df[mask] diff --git a/analysis/data_properties/basic_eda.py b/analysis/data_properties/basic_eda.py deleted file mode 100644 index 3b0bf98..0000000 --- a/analysis/data_properties/basic_eda.py +++ /dev/null @@ -1,13 +0,0 @@ -# %% -import pandas as pd - -# %% -data_path = '../../data_import/exports/raw_data.csv' -df = pd.read_csv(data_path) - -# %% -df - -# %% -len(set(df['ships_idx'])) -# %% diff --git a/analysis/data_properties/character_count.py b/analysis/data_properties/character_count.py new file mode 100644 index 0000000..afbe4e0 --- /dev/null +++ b/analysis/data_properties/character_count.py @@ -0,0 +1,58 @@ +# %% +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +# %% +# data_path = '../../data_import/exports/raw_data.csv' +data_path = '../../data_preprocess/exports/preprocessed_data.csv' +df = pd.read_csv(data_path) + +# %% +df = df[df['MDM']].reset_index(drop=True) + +# %% +# we want to print the string length + +# print summary stats +def summary_stats(arr): + return { + "Mean": np.mean(arr), + "Median": np.median(arr), + "Standard Deviation": np.std(arr), + "Variance": np.var(arr), + "Min": np.min(arr), + "Max": np.max(arr), + "Range": np.ptp(arr), + "25th Percentile": np.percentile(arr, 25), + "75th Percentile": np.percentile(arr, 75), + "Sum": np.sum(arr), + } + +# %% +ship_domain_data = df['tag_description'] + df['unit'].fillna('') + +ship_domain_array = np.array([len(item) for item in ship_domain_data]) + +stats = summary_stats(ship_domain_array) + +for key, value in stats.items(): + print(f"{key}: {value}") + + +# %% +plt.hist(ship_domain_array, bins=50) +# %% + +# %% +platform_domain_data = df['thing'] + df['property'] + +platform_domain_array = np.array([len(item) for item in platform_domain_data]) + +stats = summary_stats(platform_domain_array) + +for key, value in stats.items(): + print(f"{key}: {value}") + + +# %% diff --git a/analysis/data_properties/description_analysis.py b/analysis/data_properties/description_analysis.py new file mode 100644 index 0000000..e69de29 diff --git a/analysis/delta_analysis/.gitignore b/analysis/delta_analysis/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/analysis/delta_analysis/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/analysis/delta_analysis/delta.py b/analysis/delta_analysis/delta.py new file mode 100644 index 0000000..f6deddd --- /dev/null +++ b/analysis/delta_analysis/delta.py @@ -0,0 +1,62 @@ +# %% +import pandas as pd +import numpy as np + +# %% +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['thing'] + full_df['property'])))) + + +# %% +fold = 5 +file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv' +df_bert = pd.read_csv(file_path) +# %% +file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv' +# file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv' +df_t5 = pd.read_csv(file_path) 
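+# fix (mirrors run_mdm in replacement.py below): the desc_unit classifier
+# export contains every test row, because its MDM filter is commented out in
+# predict.py, so df_bert has to be cut down to the MDM subset as well or the
+# row-wise comparisons below will misalign with the filtered df_t5
+df_bert = df_bert[df_bert['MDM']].reset_index(drop=True)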
+df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True) +df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property']) +df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list) + +# %% +df_t5['bert_prediction'] = df_bert['class_prediction'] +df_bert['t5_prediction'] = df_t5['class_prediction'] +# %% +bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction'] +# %% +t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property']) + +# %% +sum(t5_correct)/len(t5_correct) + +# %% +# replace t5 not in vocab with bert values +t5_correct_modified = t5_correct.copy() +condition = ~df_t5['in_vocab'] +t5_correct_modified[condition] = np.array(bert_correct[condition]) + +# %% +# new replacement correctness +sum(t5_correct_modified)/len(t5_correct_modified) +# %% +# when bert is correct and t5 is wrong +cond_mask = bert_correct & (~t5_correct) +print(sum(cond_mask)) +print(df_t5[cond_mask].to_string()) +# %% +# when bert is wrong and t5 is correct +cond_mask = (~bert_correct) & (t5_correct) +print(sum(cond_mask)) +print(df_bert[cond_mask].to_string()) + + + +# %% +# when both are wrong +cond_mask = (~bert_correct) & (~t5_correct) +print(sum(cond_mask)) + + +# %% diff --git a/analysis/delta_analysis/replacement.py b/analysis/delta_analysis/replacement.py new file mode 100644 index 0000000..8ec0baf --- /dev/null +++ b/analysis/delta_analysis/replacement.py @@ -0,0 +1,72 @@ +# %% +import pandas as pd +import numpy as np + +# %% +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['thing'] + full_df['property'])))) + +# %% +def run_mdm(fold): + file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv' + df_bert = pd.read_csv(file_path) + df_bert = df_bert[df_bert['MDM']].reset_index(drop=True) + + file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv' + # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv' + df_t5 = pd.read_csv(file_path) + df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True) + df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property']) + df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list) + + df_t5['bert_prediction'] = df_bert['class_prediction'] + df_bert['t5_prediction'] = df_t5['class_prediction'] + + bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction'] + t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property']) + + t5_original_accuracy = sum(t5_correct)/len(t5_correct) + + # replace t5 not in vocab with bert values + t5_correct_modified = t5_correct.copy() + condition = ~df_t5['in_vocab'] + t5_correct_modified[condition] = np.array(bert_correct[condition]) + pd.Series(t5_correct_modified).to_csv(f'exports/result_group_{fold}.csv') + + t5_new_accuracy = sum(t5_correct_modified)/len(t5_correct_modified) + + print('original accuracy', t5_original_accuracy) + print('new accuracy', t5_new_accuracy) + + +# %% +# this does replacement for the full prediction +def run_full(fold): + file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv' + df_bert = pd.read_csv(file_path) + + file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv' + # file_path = 
f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv' + df_t5 = pd.read_csv(file_path) + df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property']) + df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list) + + df_t5['bert_prediction'] = df_bert['class_prediction'] + df_bert['t5_prediction'] = df_t5['class_prediction'] + + bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction'] + t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property']) + + # replace t5 not in vocab with bert values + t5_correct_modified = t5_correct.copy() + condition = ~df_t5['in_vocab'] + t5_correct_modified[condition] = np.array(bert_correct[condition]) + pd.Series(t5_correct_modified, name='grounded_pred').to_csv(f'exports/result_group_{fold}.csv') + + +# %% +for fold in [1,2,3,4,5]: + run_mdm(fold) + run_full(fold) +# %% diff --git a/analysis/delta_analysis/temp.csv b/analysis/delta_analysis/temp.csv new file mode 100644 index 0000000..72deeab --- /dev/null +++ b/analysis/delta_analysis/temp.csv @@ -0,0 +1,67 @@ +,thing,property,ships_idx,tag_name,tag_description,signal_type,min,max,unit,data_type,thing_pattern,property_pattern,pattern,MDM,class_prediction +6,SB1Flow,FOMassFlowTotal,1003,FM6_XI001_Y,AUXILIARY BOILER FUEL OIL TOTAL FLOW RATE,AI,0,0,FLOW,1304.0,SB#Flow,FOMassFlowTotal,SB#Flow FOMassFlowTotal,True,SB1FlowFOMassFlowIn +38,ShipBoiler3,RunningState,1030,BC330,COMPOSITE BOILER FIRING,DI,0,0,NOVALUE,1301.0,ShipBoiler#,RunningState,ShipBoiler# RunningState,True,ShipBoiler1RunningState +61,GeneratorEngine5,CBNonClosed,1003,PMS_5ACBNCL_Y,NO5 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine5RunningState +72,CargoHandling,BoostPp_Port_Current,1018,IT_1400_Y,MP1400 BOOSTER PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,BoostPp_Port_Current,CargoHandling BoostPp_Port_Current,True,CargoHandlingBoostPp_Stbd_Current +81,Navigation,MidPDraft,1018,TL_200002_Y,MID DRAFTP_LV,A,0,0,NOVALUE,1310.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft +86,ShipBoiler1,FOInletTemp,1018,AB_000001_Y,BOILER FUEL OIL IN BURNER_TEMP,A,0,0,NOVALUE,1310.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp +140,Navigation,MidPDraft,1003,DCM_P3_Y,DRAUGHT MID PS (DRAFT SENSOR),AI,0,0,m ,1304.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft +174,ShipBoiler1,FOInletPress,1051,MB.YO.IAS.Q3.40224,BOILER FUEL OIL IN BURNER_PRESS,Analog,0,4,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress +200,GeneratorEngine3,VoltageB,1050,MB.KM.IAS.Q3.A40193,NO3 GENERATOR_ENGINE(B) GEN VOLTAGE,AO,0,655,VOLTAGE,1300.0,GeneratorEngine#,VoltageB,GeneratorEngine# VoltageB,True,GeneratorEngine3Voltage +342,EngineRoom,AirTemp,1018,MA_TT8612_Y,MAIN_ENGINE AMBIENT_TEMP,A,0,0,NOVALUE,1310.0,EngineRoom,AirTemp,EngineRoom AirTemp,True,GeneratorEngine1CBTrip +395,GeneratorEngine3,SAPress,1036,MB.KM.IAS.Q2.400121,NO3 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine3WindingTempR +396,MainEngine1,RPM,1051,MB.YO.IAS.Q1.40006,M/E_RPM,Analog,-120,120,RPM,1304.0,MainEngine#,RPM,MainEngine# RPM,True,Shaft1RPM +653,ShipBoiler1,FOInletTemp,1033,CB014,COMPOSITE BOILER FUEL OIL TEMPERATURE,AI,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# 
FOInletTemp,True,ShipBoiler3FOInletTemp +731,GeneratorEngine4,CBNonClosed,1003,PMS_4ACBNCL_Y,NO4 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine4CBClosed +745,ShipBoiler1,FOInletPress,1018,AB_000002_Y,BOILER FUEL OIL IN BURNER PRESSURE,A,0,0,PRESSURE,1310.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress +783,GeneratorEngine1,LOFilterInletPress,1030,GA069,NO1 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine1LOInletPress +786,GeneratorEngine1,FOFilterInletPress,1030,GA085,NO1 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine1FOInletPress +812,GE1Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400031,GENERATOR_ENGINE FUEL OIL VISCOSITY INDICATION,AO,0,2346,VOLUME FLOW,1304.0,GE#Flow,FOViscosity,GE#Flow FOViscosity,True,GE1FlowFOVolumeFlowIn +813,ME2Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400025,MAIN_ENGINE(P) FUEL OIL VISCOSITY INDICATION,AO,0,2285,VOLUME FLOW,1304.0,ME#Flow,FOViscosity,ME#Flow FOViscosity,True,ME2FlowFOVolumeFlowIn +840,GeneratorEngine1,SAPress,1036,MB.KM.IAS.Q1.400051,NO1 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine1WindingTempR +891,GE1Flow,FOMassFlowIn,1051,MB.YO.IAS.Q2.40103,GENERATOR_ENGINE HFO_FLOW,Analog,0,1800,MASS FLOW,1304.0,GE#Flow,FOMassFlowIn,GE#Flow FOMassFlowIn,True,GE1FlowFGMassFlow +935,ShipBoiler1,FOInletTemp,1051,MB.YO.IAS.Q3.40223,BOILER FUEL OIL IN BURNER_TEMP,Analog,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp +951,MainEngine2,CFWInletTemp,1020,MB.YO.IAS.Q1.A400388,MAIN_ENGINE(P) CYLINDER COOL WATER TEMPERATURE INLET,AO,-50,130,TEMPERATURE,1304.0,MainEngine#,CFWInletTemp,MainEngine# CFWInletTemp,True,MainEngine2Cy3CWTemp +1005,GeneratorEngine1,HFOUse,1051,MB.YO.IAS.Q1.10096,G/E_HFUEL OIL USE,Digital,0,1,-,1301.0,GeneratorEngine#,HFOUse,GeneratorEngine# HFOUse,True,MainEngine1HFOUse +1075,ME1Flow,FGMassFlow,1004,MB.YO.IAS.Q2.A400121,LP LPG FUEL P/P FLOW,AI,0,3500,MASS FLOW,1304.0,ME#Flow,FGMassFlow,ME#Flow FGMassFlow,True,ME2FlowFGMassFlow +1116,CargoHandling,LPGComp1MotorCurrent,1004,MB.YO.IAS.Q3.A400281,MP-2100 COMPRESSOR (P) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT3_DWPump_Port_Current +1117,CargoHandling,LPGComp2MotorCurrent,1004,MB.YO.IAS.Q3.A400282,MP-2200 COMPRESSOR (C) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT2_DWPump_Stbd_Current +1118,CargoHandling,LPGComp3MotorCurrent,1004,MB.YO.IAS.Q3.A400283,MP-2300 COMPRESSOR (S) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingBoostPp_Stbd_Current +1174,FuelOilSystem,LFOVolumeSettleTK,1003,LC_XI001_Y,NO2 LIGHT FUEL OIL SETTLING TANK VOLUME,AI,0,999999,VOLUME,1304.0,FuelOilSystem,LFOVolumeSettleTK,FuelOilSystem LFOVolumeSettleTK,True,FuelOilSystemLFOVolumeStorageTK2P +1198,GeneratorEngine4,BearingNDETemp1,1003,GE4_TIAH6_Y,NO4 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine4WindingTempT 
+1199,GeneratorEngine5,BearingNDETemp1,1003,GE5_TIAH6_Y,NO5 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine5WindingTempT +1200,MainEngine1,LoadPercent,1018,EG_0000005_Y,M/E_LOAD,D,0,0,%,1301.0,MainEngine#,LoadPercent,MainEngine# LoadPercent,True,GeneratorEngine2LoadPercent +1214,GE1TurboCharger1,ExhGasOutletTemp,1003,GE1_TE27_Y,NO1 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger1ExhGasOutletTemp +1226,GE2TurboCharger1,ExhGasOutletTemp,1003,GE2_TE27_Y,NO2 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger2ExhGasOutletTemp +1237,GE3TurboCharger1,ExhGasOutletTemp,1003,GE3_TE27_Y,NO3 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger3ExhGasOutletTemp +1246,GeneratorEngine3,BearingDETemp8,1003,GE3_TE698_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5 +1247,GeneratorEngine3,BearingDETemp9,1003,GE3_TE699_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6 +1273,GeneratorEngine4,BearingDETemp8,1003,GE4_TE698_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6 +1274,GeneratorEngine4,BearingDETemp9,1003,GE4_TE699_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6 +1280,GeneratorEngine5,BearingDETemp2,1003,GE5_TE692_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP2,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2BearingDETemp6 +1281,GeneratorEngine5,BearingDETemp3,1003,GE5_TE693_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP3,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5 +1282,GeneratorEngine5,BearingDETemp4,1003,GE5_TE694_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP4,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp4 +1283,GeneratorEngine5,BearingDETemp5,1003,GE5_TE695_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP5,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5 +1284,GeneratorEngine5,BearingDETemp6,1003,GE5_TE696_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP6,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6 +1285,GeneratorEngine5,BearingDETemp7,1003,GE5_TE697_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP7,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6 +1286,GeneratorEngine5,BearingDETemp8,1003,GE5_TE698_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2Cy8KnockIntensity +1287,GeneratorEngine5,BearingDETemp9,1003,GE5_TE699_Y,NO5 GENERATOR_ENGINE MAIN BRG 
TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6 +1298,ME1TurboCharger1,ExhGasInletTemp,1003,AMSI_TT3721A_Y,EXHAUST GAS TEMPERATURE BEFORE TURBOCHARGER 1,AI,0,600,TEMPERATURE,1304.0,ME#TurboCharger#,ExhGasInletTemp,ME#TurboCharger# ExhGasInletTemp,True,ME1TurboCharger1ExhGasOutletTemp +1309,GeneratorEngine2,LOFilterInletPress,1030,GB069,NO2 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine2LOInletPress +1472,GeneratorEngine3,VoltageA,1050,MB.KM.IAS.Q3.A40189,NO3 GENERATOR_ENGINE(A) GEN VOLTAGE,AO,0,654,VOLTAGE,1300.0,GeneratorEngine#,VoltageA,GeneratorEngine# VoltageA,True,GeneratorEngine3Voltage +1524,GeneratorEngine2,FOFilterInletPress,1030,GB085,NO2 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine2FOInletPress +1536,ShipBoiler1,FOInletTemp,1028,MB.KM.IAS.Q2.A400184,OIL TEMPERATURE (4-20MA),AI,0,200,°C,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,GeneratorEngine4WindingTempT +1537,ShipBoiler1,FOInletPress,1028,MB.KM.IAS.Q2.A400185,FUEL OIL PRESSURE (4-20MA),AI,0,40,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,GeneratorEngine4FOInletPress +1594,GeneratorEngine3,LOFilterInletPress,1030,GC069,NO3 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine3LOInletPress +1597,GeneratorEngine3,FOFilterInletPress,1030,GC085,NO3 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine3FOInletPress +1679,GeneratorEngine3,busBarVoltage,1003,PMS_3BUSVOLA_Y,BUS VOLTAGE,AI,0,10000,VOLTAGE,1304.0,GeneratorEngine#,busBarVoltage,GeneratorEngine# busBarVoltage,True,GeneratorEngine1busBarVoltage +1727,GeneratorEngine2,SAPress,1036,MB.KM.IAS.Q1.400086,NO2 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine2WindingTempR +1763,GeneratorEngine5,BearingDETemp1,1003,GE5_TE691_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP1,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5 +1873,GeneratorEngine5,CBClosed,1003,PMS_5VCBCLED_Y,NO5 GENERATOR_ENGINE MVSB VCB CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBClosed,GeneratorEngine# CBClosed,True,GeneratorEngine5StopState +2034,CargoHandling,CT1_DWPump_Stbd_Current,1018,IT_1101_Y,MP1100 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current +2035,CargoHandling,CT2_DWPump_Port_Current,1018,IT_1200_Y,MP1200 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current +2037,CargoHandling,CT3_DWPump_Stbd_Current,1018,IT_1501_Y,MP1501 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current +2038,CargoHandling,CT4_DWPump_Port_Current,1018,IT_1700_Y,MP1700 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling 
CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current +2048,GeneratorEngine5,RunningHour,1003,PMS_5GENWHRS_Y,NO5 GENERATOR_ENGINE WORKING HOURS,AI,0,10000,NOVALUE,1304.0,GeneratorEngine#,RunningHour,GeneratorEngine# RunningHour,True,GeneratorEngine4RunningHour +2057,CargoHandling,CT1_DWPump_Port_Current,1018,IT_1100_Y,MP1100 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current +2079,ShipBoiler1,ExhGasOutletTemp,1003,EG_G02_Y,EXHAUST GAS ECONOMIZER EXHAUST GAS OUTLET TEMPERATURE,AI,0,600,TEMPERATURE,1304.0,ShipBoiler#,ExhGasOutletTemp,ShipBoiler# ExhGasOutletTemp,True,MainEngine1Cy1ExhGasOutletTemp diff --git a/analysis/result_report_statistics/decoder_scaling.csv b/analysis/result_report_statistics/decoder_scaling.csv new file mode 100644 index 0000000..7a12532 --- /dev/null +++ b/analysis/result_report_statistics/decoder_scaling.csv @@ -0,0 +1,27 @@ +type,fold,accuracy +1layer,1,0.8968291528632276 +1layer,2,0.8859813084112149 +1layer,3,0.9382530120481928 +1layer,4,0.9586108468125595 +1layer,5,0.8827301878149336 +2layer,1,0.9318504495977283 +2layer,2,0.8859813084112149 +2layer,3,0.9678714859437751 +2layer,4,0.9738344433872502 +2layer,5,0.9015116811726981 +4layer,1,0.9503076194983436 +4layer,2,0.9135514018691588 +4layer,3,0.9698795180722891 +4layer,4,0.9790675547098002 +4layer,5,0.907924874026569 +6layer,1,0.9522006625650734 +6layer,2,0.9093457943925234 +6layer,3,0.9678714859437751 +6layer,4,0.9814462416745956 +6layer,5,0.890975721484196 +8layer,1,0.9441552295314718 +8layer,2,0.9121495327102803 +8layer,3,0.963855421686747 +8layer,4,0.9752616555661275 +8layer,5,0.907924874026569 + diff --git a/analysis/result_report_statistics/frozen_encoder.csv b/analysis/result_report_statistics/frozen_encoder.csv new file mode 100644 index 0000000..5b9a0bf --- /dev/null +++ b/analysis/result_report_statistics/frozen_encoder.csv @@ -0,0 +1,12 @@ +type,fold,accuracy +normal,1,0.9522006625650734 +normal,2,0.9093457943925234 +normal,3,0.9678714859437751 +normal,4,0.9814462416745956 +normal,5,0.890975721484196 +frozen,1,0.9342167534311405 +frozen,2,0.883177570093458 +frozen,3,0.963855421686747 +frozen,4,0.9705042816365367 +frozen,5,0.9051763628034815 + diff --git a/analysis/result_report_statistics/result_statistics.py b/analysis/result_report_statistics/result_statistics.py new file mode 100644 index 0000000..0de4a81 --- /dev/null +++ b/analysis/result_report_statistics/result_statistics.py @@ -0,0 +1,199 @@ +# %% +import pandas as pd +import numpy as np + +#################################################################################### +# stage 1 +# %% +# stage 1a: binary classification +df_stage1a = pd.read_csv('stage1a.csv') +# %% +# desc only +mask = df_stage1a['type'] == 'desc' +df_stage1a[mask].describe().loc[['mean', 'std']] + +# %% +# desc and unit +mask = df_stage1a['type'] == 'desc_unit' +df_stage1a[mask].describe().loc[['mean', 'std']] + +# %% +# stage 1b: similarity-based classification +df_stage1b = pd.read_csv('stage1b.csv') +# %% +# desc only +mask = df_stage1b['type'] == 'desc' +df_stage1b[mask].describe().loc[['mean', 'std']] + +# %% +# desc and unit +mask = df_stage1b['type'] == 'desc_unit' +df_stage1b[mask].describe().loc[['mean', 'std']] + + +# %% +################################################################################# +# stage 2: mapping model + +# %% +# stage 2a: mapping by classification +df_stage2a = pd.read_csv('stage2a.csv') +# %% +# desc only 
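+# mean/std of per-fold accuracy; describe() also summarises the 'fold' column, which can be ignored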
+mask = df_stage2a['type'] == 'desc' +df_stage2a[mask].describe().loc[['mean', 'std']] + +# %% +# desc and unit +mask = df_stage2a['type'] == 'desc_unit' +df_stage2a[mask].describe().loc[['mean', 'std']] + + +# %% +# stage 2b: mapping by seq2seq +df_stage2b = pd.read_csv('stage2b.csv') +# %% +# desc only +mask = df_stage2b['type'] == 'desc' +df_stage2b[mask].describe().loc[['mean', 'std']] + +# %% +# desc and unit +mask = df_stage2b['type'] == 'desc_unit' +df_stage2b[mask].describe().loc[['mean', 'std']] + + + +############################ +# frozen encoder +# %% +df = pd.read_csv('frozen_encoder.csv') +# %% +# normal +mask = df['type'] == 'normal' +df[mask].describe().loc[['mean', 'std']] + +# %% +# frozen +mask = df['type'] == 'frozen' +df[mask].describe().loc[['mean', 'std']] + + +# %% +############################ +# decoder scaling +# %% +df = pd.read_csv('decoder_scaling.csv') +# %% +# 1 layer +mask = df['type'] == '1layer' +df[mask].describe().loc[['mean', 'std']] + + +# %% +# 2 layer +mask = df['type'] == '2layer' +df[mask].describe().loc[['mean', 'std']] + +# %% +# 4 layer +mask = df['type'] == '4layer' +df[mask].describe().loc[['mean', 'std']] + +# %% +# 6 layer +mask = df['type'] == '6layer' +df[mask].describe().loc[['mean', 'std']] + +# %% +# 8 layer +mask = df['type'] == '8layer' +df[mask].describe().loc[['mean', 'std']] + + + +# %% +######################### +# compute overall result + +# $\frac{1808}{2113} = 0.856$ & $\frac{10692}{10961} = 0.975$ & $\frac{12500}{13074} = 0.956$ \\ +# $\frac{1932}{2140} = 0.903$ & $\frac{8304}{8582} = 0.968$ & $\frac{10236}{10722} = 0.955$ \\ +# $\frac{1789}{1992} = 0.898$ & $\frac{7613}{7863} = 0.968$ & $\frac{9402}{9855} = 0.954$ \\ +# $\frac{1967}{2102} = 0.936$ & $\frac{12929}{13349} = 0.969$ & $\frac{14896}{15451} = 0.964$ \\ +# $\frac{1915}{2183} = 0.877$ & $\frac{10381}{10786} = 0.962$ & $\frac{12296}{12969} = 0.948$ \\ + +# %% +matrix = np.array([ + [1808, 2113, 10692, 10961, 13074], + [1932, 2140, 8304, 8582, 10722], + [1789, 1992, 7613, 7863, 9855], + [1967, 2102, 12929, 13349, 15451], + [1915, 2183, 10381, 10786, 12969] +]) +# %% +relevant_class = matrix[:,0]/matrix[:,1] +print(relevant_class) +print(np.std(relevant_class)) + +# %% +non_relevant_class = matrix[:,2]/matrix[:,3] +print(non_relevant_class) +print(np.std(non_relevant_class)) + +# %% +numerator = (matrix[:,0] + matrix[:,2]) +denominator = (matrix[:,1] + matrix[:,3]) +print(numerator) +print(denominator) # same as last column +overall = numerator/denominator +print(overall) +print(np.std(overall)) + + +###################### +# compute mapping result +# %% + +# $\frac{1761}{1808} = 0.974$ \\ +# $\frac{1802}{1932} = 0.933$ \\ +# $\frac{1760}{1789} = 0.984$ \\ +# $\frac{1945}{1967} = 0.989$ \\ +# $\frac{1837}{1915} = 0.959$ \\ + +matrix = np.array([ + [1761, 1808], + [1802, 1932], + [1760, 1789], + [1945, 1967], + [1837, 1915] +]) + +# %% +result = matrix[:,0]/matrix[:,1] +print(result) +print(np.mean(result)) +print(np.std(result)) + +# %% +#################################### +# compute overall result +# & 1761 & 10692 & $\frac{1761 + 10692}{13074} = 0.953$ \\ +# & 1802 & 8304 & $\frac{1802 + 8304}{10722} = 0.943$ \\ +# & 1760 & 7613 & $\frac{1760 + 7613}{9855} = 0.951$ \\ +# & 1945 & 12929 & $\frac{1945 + 12929}{15451} = 0.963$ \\ +# & 1837 & 10381 & $\frac{1837 + 10381}{12969} = 0.942$ \\ + +matrix = np.array([ + [1761,10692, 13074], + [1802, 8304, 10722], + [1760, 7613, 9855], + [1945,12929, 15451], + [1837,10381, 12969] +]) + +# %% +overall = (matrix[:,0] + 
matrix[:,1])/matrix[:,2] +print(overall) +print(np.mean(overall)) +print(np.std(overall)) +# %% diff --git a/analysis/result_report_statistics/stage1a.csv b/analysis/result_report_statistics/stage1a.csv new file mode 100644 index 0000000..56c4690 --- /dev/null +++ b/analysis/result_report_statistics/stage1a.csv @@ -0,0 +1,11 @@ +type,fold,accuracy,f1_score,precision,recall +desc,1,0.92588,0.74001,0.85440,0.65263 +desc,2,0.88733,0.64239,0.87641,0.50701 +desc,3,0.90583,0.71429,0.92357,0.58233 +desc,4,0.93114,0.70929,0.83312,0.61751 +desc,5,0.91171,0.67683,0.88162,0.54924 +desc_unit,1,0.95610,0.86301,0.87049,0.85566 +desc_unit,2,0.95467,0.88828,0.87421,0.90280 +desc_unit,3,0.95403,0.88762,0.87739,0.89809 +desc_unit,4,0.96408,0.87636,0.82405,0.93578 +desc_unit,5,0.94811,0.85054,0.82543,0.87723 \ No newline at end of file diff --git a/analysis/result_report_statistics/stage1b.csv b/analysis/result_report_statistics/stage1b.csv new file mode 100644 index 0000000..ce7b985 --- /dev/null +++ b/analysis/result_report_statistics/stage1b.csv @@ -0,0 +1,11 @@ +type,fold,accuracy,f1_score,precision,recall +desc,1,0.93162,0.79580,0.76909,0.82442 +desc,2,0.92884,0.82440,0.81224,0.83692 +desc,3,0.93201,0.83375,0.82434,0.84337 +desc,4,0.94259,0.80937,0.73814,0.89581 +desc,5,0.92228,0.78397,0.73661,0.83784 +desc_unit,1,0.93353,0.79945,0.78018,0.81969 +desc_unit,2,0.92184,0.81006,0.78653,0.83505 +desc_unit,3,0.91821,0.80513,0.77659,0.83584 +desc_unit,4,0.93334,0.78675,0.69648,0.90390 +desc_unit,5,0.93084,0.80445,0.76747,0.84517 diff --git a/analysis/result_report_statistics/stage2a.csv b/analysis/result_report_statistics/stage2a.csv new file mode 100644 index 0000000..a3ccae9 --- /dev/null +++ b/analysis/result_report_statistics/stage2a.csv @@ -0,0 +1,11 @@ +type,fold,accuracy +desc,1,0.93706 +desc,2,0.88785 +desc,3,0.96285 +desc,4,0.95861 +desc,5,0.89601 +desc_unit,1,0.94226 +desc_unit,2,0.90561 +desc_unit,3,0.96436 +desc_unit,4,0.96955 +desc_unit,5,0.90289 \ No newline at end of file diff --git a/analysis/result_report_statistics/stage2b.csv b/analysis/result_report_statistics/stage2b.csv new file mode 100644 index 0000000..aa0c2fa --- /dev/null +++ b/analysis/result_report_statistics/stage2b.csv @@ -0,0 +1,16 @@ +type,fold,accuracy +desc,1,0.9427354472314246 +desc,2,0.8981308411214953 +desc,3,0.9588353413654619 +desc,4,0.9633682207421503 +desc,5,0.8928080622995878 +desc_unit,1,0.9578797917652626 +desc_unit,2,0.9088785046728972 +desc_unit,3,0.9673694779116466 +desc_unit,4,0.9785918173168411 +desc_unit,5,0.8918918918918919 + + + + + diff --git a/analysis/string_levenshtein/.gitignore b/analysis/string_levenshtein/.gitignore new file mode 100644 index 0000000..aab52d9 --- /dev/null +++ b/analysis/string_levenshtein/.gitignore @@ -0,0 +1 @@ +*.png \ No newline at end of file diff --git a/analysis/string_levenshtein/between_ship_and_platform.py b/analysis/string_levenshtein/between_ship_and_platform.py index 02c796f..44f527d 100644 --- a/analysis/string_levenshtein/between_ship_and_platform.py +++ b/analysis/string_levenshtein/between_ship_and_platform.py @@ -41,13 +41,26 @@ distance_array # %% +plt.rcParams.update({'font.size': 14}) # Adjust the size as needed plt.figure(figsize=(8, 6)) plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7) plt.xlabel("Normalized Levenshtein Distance") plt.ylabel("Count") plt.tight_layout() -plt.savefig("histogram.png", dpi=300) +# Add arrow for increasing dissimilarity +plt.annotate( + "Decreasing Similarity", # Text label + xy=(0.7, 500), # Arrow 
end (near the end of x-axis) + xytext=(0.4, 500), # Arrow start (near the middle of x-axis) + arrowprops=dict(arrowstyle="->", lw=2, color="black"), # Arrow style + va='center', # needed to make arrow centered + fontsize=14, # Font size for the text + color="black" # Text color +) +# Add arrows and text +plt.savefig("input_output_similarity.png", dpi=300) # + # %% # summary statistics of computed levenshtein distance def summary_stats(arr): diff --git a/analysis/string_levenshtein/within_same_class.py b/analysis/string_levenshtein/within_same_class.py index eaf51d6..e642b99 100644 --- a/analysis/string_levenshtein/within_same_class.py +++ b/analysis/string_levenshtein/within_same_class.py @@ -58,12 +58,24 @@ score_list # %% # plt.hist(score_list, bins=50) +plt.rcParams.update({'font.size': 14}) # Adjust the size as needed plt.figure(figsize=(8, 6)) plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7) plt.xlabel("Normalized Levenshtein Distance") plt.ylabel("Platform Domain Class Count") +# Add arrow for increasing dissimilarity +plt.annotate( + "Decreasing Similarity", # Text label + xy=(0.7, 70), # Arrow end (near the end of x-axis) + xytext=(0.2, 70), # Arrow start (near the middle of x-axis) + arrowprops=dict(arrowstyle="->", lw=2, color="black"), # Arrow style + va='center', # needed to make arrow centered + fontsize=14, # Font size for the text + color="black" # Text color +) + plt.tight_layout() -plt.savefig("histogram.png", dpi=300) +plt.savefig("within_class_similarity.png", dpi=300) # %% # summary statistics of computed levenshtein distance def summary_stats(arr): diff --git a/analysis/unit_label_differences/unit.py b/analysis/unit_label_differences/unit.py new file mode 100644 index 0000000..a94ac61 --- /dev/null +++ b/analysis/unit_label_differences/unit.py @@ -0,0 +1,26 @@ +# %% +import pandas as pd + +# %% +data_path = '../../data_preprocess/exports/preprocessed_data.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) + +# %% +df_in = full_df[full_df['MDM']].reset_index(drop=True) +# %% +df_out = full_df[~full_df['MDM']].reset_index(drop=True) +# %% +label_counts_in = df_in['unit'].value_counts() +print(label_counts_in.to_string()) + +# %% +label_counts_out = df_out['unit'].value_counts() +print(label_counts_out.to_string()) + + +# %% +label_counts_out['NOVALUE']/len(df_out) + +# %% +label_counts_in['NOVALUE']/len(df_in) +# %% diff --git a/overall/pipeline_evaluation.py b/overall/pipeline_evaluation.py index 81a8db3..e9b66f0 100644 --- a/overall/pipeline_evaluation.py +++ b/overall/pipeline_evaluation.py @@ -9,14 +9,19 @@ def run(fold): df = pd.read_csv(data_path, skipinitialspace=True) p_mdm = df['p_mdm'] - # data_path = f'../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv' - data_path = f'../train/modified_t5_decoder_4_layers/mapping_prediction/exports/result_group_{fold}.csv' + data_path = f'../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv' df = pd.read_csv(data_path, skipinitialspace=True) actual_mdm = df['MDM'] - thing_correctness = df['thing'] == df['p_thing'] - property_correctness = df['property'] == df['p_property'] - answer = thing_correctness & property_correctness + # grounded labels + data_path = f'../analysis/delta_analysis/exports/result_group_{fold}.csv' + df_grounded = pd.read_csv(data_path, skipinitialspace=True) + answer = df_grounded['grounded_pred'] + + # original labels + # thing_correctness = df['thing'] == df['p_thing'] + # 
property_correctness = df['property'] == df['p_property'] + # answer = thing_correctness & property_correctness ############## # evaluate relevant-class prediction performance @@ -53,6 +58,13 @@ def run(fold): print(mapping_rate) print('size', correct_positive_mdm_and_map, '/', sum(p_mdm & actual_mdm)) + # evaluate relevant mappings + correct_positive_mdm_and_map = sum(p_mdm & actual_mdm & answer) + mapping_rate = correct_positive_mdm_and_map / sum(actual_mdm) + print('relevant data mapping rate') + print(mapping_rate) + print('size', correct_positive_mdm_and_map, '/', sum(actual_mdm)) + ############## # evaluate overall pipeline result @@ -76,3 +88,5 @@ for fold in [1,2,3,4,5]: print('*' * 40) run(fold) + +# %% diff --git a/relevant_class/binary_classifier_desc/train.py b/relevant_class/binary_classifier_desc/train.py index 0276a14..fc05072 100644 --- a/relevant_class/binary_classifier_desc/train.py +++ b/relevant_class/binary_classifier_desc/train.py @@ -179,8 +179,8 @@ def train(fold): # save_strategy="epoch", load_best_model_at_end=False, learning_rate=1e-5, - per_device_train_batch_size=128, - per_device_eval_batch_size=128, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, auto_find_batch_size=False, ddp_find_unused_parameters=False, weight_decay=0.01, diff --git a/relevant_class/binary_classifier_desc_unit/train.py b/relevant_class/binary_classifier_desc_unit/train.py index 58a8624..1fb2e6b 100644 --- a/relevant_class/binary_classifier_desc_unit/train.py +++ b/relevant_class/binary_classifier_desc_unit/train.py @@ -180,8 +180,8 @@ def train(fold): # save_strategy="epoch", load_best_model_at_end=False, learning_rate=1e-5, - per_device_train_batch_size=128, - per_device_eval_batch_size=128, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, auto_find_batch_size=False, ddp_find_unused_parameters=False, weight_decay=0.01, diff --git a/train/class_number_tokens/.gitignore b/train/class_number_tokens/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/train/class_number_tokens/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/train/class_number_tokens/classification_prediction/.gitignore b/train/class_number_tokens/classification_prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/train/class_number_tokens/classification_prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/train/class_number_tokens/classification_prediction/output.txt b/train/class_number_tokens/classification_prediction/output.txt new file mode 100644 index 0000000..2e8030d --- /dev/null +++ b/train/class_number_tokens/classification_prediction/output.txt @@ -0,0 +1,31 @@ + +******************************************************************************** +Fold: 1 +Accuracy: 0.94510 +F1 Score: 0.94087 +Precision: 0.94623 +Recall: 0.94510 +******************************************************************************** +Fold: 2 +Accuracy: 0.91682 +F1 Score: 0.91698 +Precision: 0.92824 +Recall: 0.91682 +******************************************************************************** +Fold: 3 +Accuracy: 0.96185 +F1 Score: 0.95743 +Precision: 0.96001 +Recall: 0.96185 +******************************************************************************** +Fold: 4 +Accuracy: 0.97479 +F1 Score: 0.97074 +Precision: 0.97072 +Recall: 0.97479 +******************************************************************************** +Fold: 5 +Accuracy: 0.90563 +F1 Score: 0.89532 +Precision: 0.90040 +Recall: 
0.90563 diff --git a/train/class_number_tokens/classification_prediction/predict.py b/train/class_number_tokens/classification_prediction/predict.py new file mode 100644 index 0000000..34b1221 --- /dev/null +++ b/train/class_number_tokens/classification_prediction/predict.py @@ -0,0 +1,289 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import re +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 128 + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +# mdm_list = sorted(list((set(full_df['pattern'])))) +thing_property = full_df['thing'] + full_df['property'] +thing_property = thing_property.to_list() +mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% +def substitute_and_append_digits(s): + """ + Finds all digit groups in a string, substitutes them with a placeholder, + and appends the extracted digit groups at the end of the string flanked by markers. + + Args: + s (str): The input string. + + Returns: + str: The transformed string. 
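+
+        Example (with the empty placeholder/marker strings used here, the
+        digit groups are removed in place and appended bare at the end):
+            "NO1 GE BRG TEMP2" -> "NO GE BRG TEMP12"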
+ """ + # Find all digit groups in the string + digit_groups = re.findall(r'\d+', s) + + # Substitute digit groups with placeholder + substituted_string = re.sub(r'\d+', '', s) + + # Append extracted digit groups to the end of the string + appended_digits = ''.join([f'{group}' for group in digit_groups]) + + return substituted_string + appended_digits + + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + processed_desc = substitute_and_append_digits(row['tag_description']) + desc = f"{processed_desc}" + unit = f"{row['unit']}" + + pattern = f"{row['thing'] + row['property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_dataset(fold, mdm_list): + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + # uncomment for mdm + # we only use the mdm subset + test_df = test_df[test_df['MDM']].reset_index(drop=True) + + test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list)) + + return test_dataset + + +# %% + +# function to perform training for a given fold +def test(fold): + + test_dataset = create_dataset(fold, mdm_list) + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint_fold_{fold}' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 128 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def 
compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + average_parameter = 'weighted' + zero_division_parameter = 0 + f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter) + + + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + print(f'Fold: {fold}', file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + # export result + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + # keep only the MDM rows so the frame aligns with the MDM-only pred_labels + df = df[df['MDM']].reset_index(drop=True) + + label_list = [id2label[id] for id in pred_labels] + df_out = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + df = pd.concat([df, df_out], axis=1) + + # save the classification output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + test(fold) diff --git a/train/class_number_tokens/train.py b/train/class_number_tokens/train.py new file mode 100644 index 0000000..3ad2fbc --- /dev/null +++ b/train/class_number_tokens/train.py @@ -0,0 +1,241 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import re +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + 
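+# variant experiment: digit groups are factored out of each tag description
+# and re-appended at the end (see substitute_and_append_digits below), so the
+# classifier sees the words and the equipment/class numbers as separate spans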
+ + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +# mdm_list = sorted(list((set(full_df['pattern'])))) +thing_property = full_df['thing'] + full_df['property'] +thing_property = thing_property.to_list() +mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% +def substitute_and_append_digits(s): + """ + Finds all digit groups in a string, substitutes them with a placeholder, + and appends the extracted digit groups at the end of the string flanked by markers. + + Args: + s (str): The input string. + + Returns: + str: The transformed string. + """ + # Find all digit groups in the string + digit_groups = re.findall(r'\d+', s) + + # Substitute digit groups with placeholder + substituted_string = re.sub(r'\d+', '', s) + + # Append extracted digit groups to the end of the string + appended_digits = ''.join([f'{group}' for group in digit_groups]) + + return substituted_string + appended_digits + + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + processed_desc = substitute_and_append_digits(row['tag_description']) + desc = f"{processed_desc}" + unit = f"{row['unit']}" + pattern = f"{row['thing'] + row['property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold, mdm_list) + + # prepare tokenizer + + # model_checkpoint = "distilbert/distilbert-base-uncased" + model_checkpoint = 'google-bert/bert-base-cased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return 
model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-4, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %% diff --git a/train/classification_bert_complete_desc/classification_prediction/.gitignore b/train/classification_bert_complete_desc/classification_prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/train/classification_bert_complete_desc/classification_prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/train/classification_bert_complete_desc/classification_prediction/output.txt b/train/classification_bert_complete_desc/classification_prediction/output.txt index 192305c..709e376 100644 --- a/train/classification_bert_complete_desc/classification_prediction/output.txt +++ b/train/classification_bert_complete_desc/classification_prediction/output.txt @@ -1,31 +1,31 @@ ******************************************************************************** Fold: 1 -Accuracy: 0.78277 -F1 Score: 0.73629 -Precision: 0.71419 -Recall: 0.78277 +Accuracy: 0.93706 +F1 Score: 0.93286 +Precision: 0.93920 +Recall: 0.93706 ******************************************************************************** Fold: 2 -Accuracy: 0.78598 -F1 Score: 0.73708 -Precision: 0.71578 -Recall: 0.78598 +Accuracy: 0.88785 +F1 Score: 0.88726 +Precision: 0.90566 +Recall: 0.88785 ******************************************************************************** Fold: 3 -Accuracy: 0.79819 -F1 Score: 0.74411 -Precision: 0.71749 -Recall: 0.79819 +Accuracy: 0.96285 +F1 Score: 0.95930 +Precision: 
0.96310 +Recall: 0.96285 ******************************************************************************** Fold: 4 -Accuracy: 0.79543 -F1 Score: 0.73902 -Precision: 0.71094 -Recall: 0.79543 +Accuracy: 0.95861 +F1 Score: 0.95320 +Precision: 0.95615 +Recall: 0.95861 ******************************************************************************** Fold: 5 -Accuracy: 0.77279 -F1 Score: 0.72098 -Precision: 0.69817 -Recall: 0.77279 +Accuracy: 0.89601 +F1 Score: 0.88613 +Precision: 0.89038 +Recall: 0.89601 diff --git a/train/classification_bert_complete_desc/classification_prediction/predict.py b/train/classification_bert_complete_desc/classification_prediction/predict.py index 991608b..2b38e29 100644 --- a/train/classification_bert_complete_desc/classification_prediction/predict.py +++ b/train/classification_bert_complete_desc/classification_prediction/predict.py @@ -235,6 +235,24 @@ def test(fold): print(f'Precision: {precision:.5f}', file=f) print(f'Recall: {recall:.5f}', file=f) + # export result + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) + + label_list = [id2label[id] for id in pred_labels] + df_out = pd.DataFrame({ + 'class_prediction': pd.Series(label_list) + }) + df = pd.concat([df, df_out], axis=1) + + # save the classification output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + + + + # %% # reset file before writing to it diff --git a/train/classification_bert_complete_desc/train.py b/train/classification_bert_complete_desc/train.py index c490a15..c380d2e 100644 --- a/train/classification_bert_complete_desc/train.py +++ b/train/classification_bert_complete_desc/train.py @@ -176,7 +176,7 @@ def train(fold): logging_strategy="epoch", # save_strategy="epoch", load_best_model_at_end=False, - learning_rate=1e-3, + learning_rate=1e-4, per_device_train_batch_size=64, per_device_eval_batch_size=64, auto_find_batch_size=False, diff --git a/train/classification_bert_complete_desc_unit/classification_prediction/.gitignore b/train/classification_bert_complete_desc_unit/classification_prediction/.gitignore new file mode 100644 index 0000000..dbe1a9b --- /dev/null +++ b/train/classification_bert_complete_desc_unit/classification_prediction/.gitignore @@ -0,0 +1 @@ +exports \ No newline at end of file diff --git a/train/classification_bert_complete_desc_unit/classification_prediction/output.txt b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt index 813a31b..e0fc03f 100644 --- a/train/classification_bert_complete_desc_unit/classification_prediction/output.txt +++ b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt @@ -1,31 +1,31 @@ ******************************************************************************** Fold: 1 -Accuracy: 0.78940 -F1 Score: 0.73284 -Precision: 0.70389 -Recall: 0.78940 +Accuracy: 0.15229 +F1 Score: 0.07923 +Precision: 0.05929 +Recall: 0.15229 ******************************************************************************** Fold: 2 -Accuracy: 0.78411 -F1 Score: 0.73695 -Precision: 0.71914 -Recall: 0.78411 +Accuracy: 0.18075 +F1 Score: 0.09625 +Precision: 0.07243 +Recall: 0.18075 ******************************************************************************** Fold: 3 -Accuracy: 0.80522 -F1 Score: 0.75406 -Precision: 0.72847 -Recall: 0.80522 +Accuracy: 0.19493 +F1 Score: 0.10903 +Precision: 0.08332 +Recall: 0.19493 
diff --git a/train/classification_bert_complete_desc/train.py b/train/classification_bert_complete_desc/train.py
index c490a15..c380d2e 100644
--- a/train/classification_bert_complete_desc/train.py
+++ b/train/classification_bert_complete_desc/train.py
@@ -176,7 +176,7 @@ def train(fold):
     logging_strategy="epoch",
     # save_strategy="epoch",
     load_best_model_at_end=False,
-    learning_rate=1e-3,
+    learning_rate=1e-4,
     per_device_train_batch_size=64,
     per_device_eval_batch_size=64,
     auto_find_batch_size=False,
diff --git a/train/classification_bert_complete_desc_unit/classification_prediction/.gitignore b/train/classification_bert_complete_desc_unit/classification_prediction/.gitignore
new file mode 100644
index 0000000..dbe1a9b
--- /dev/null
+++ b/train/classification_bert_complete_desc_unit/classification_prediction/.gitignore
@@ -0,0 +1 @@
+exports
\ No newline at end of file
diff --git a/train/classification_bert_complete_desc_unit/classification_prediction/output.txt b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt
index 813a31b..e0fc03f 100644
--- a/train/classification_bert_complete_desc_unit/classification_prediction/output.txt
+++ b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt
@@ -1,31 +1,31 @@
 ********************************************************************************
 Fold: 1
-Accuracy: 0.78940
-F1 Score: 0.73284
-Precision: 0.70389
-Recall: 0.78940
+Accuracy: 0.15229
+F1 Score: 0.07923
+Precision: 0.05929
+Recall: 0.15229
 ********************************************************************************
 Fold: 2
-Accuracy: 0.78411
-F1 Score: 0.73695
-Precision: 0.71914
-Recall: 0.78411
+Accuracy: 0.18075
+F1 Score: 0.09625
+Precision: 0.07243
+Recall: 0.18075
 ********************************************************************************
 Fold: 3
-Accuracy: 0.80522
-F1 Score: 0.75406
-Precision: 0.72847
-Recall: 0.80522
+Accuracy: 0.19493
+F1 Score: 0.10903
+Precision: 0.08332
+Recall: 0.19493
 ********************************************************************************
 Fold: 4
-Accuracy: 0.80780
-F1 Score: 0.75361
-Precision: 0.72432
-Recall: 0.80780
+Accuracy: 0.13190
+F1 Score: 0.05761
+Precision: 0.04173
+Recall: 0.13190
 ********************************************************************************
 Fold: 5
-Accuracy: 0.76958
-F1 Score: 0.71912
-Precision: 0.69965
-Recall: 0.76958
+Accuracy: 0.15198
+F1 Score: 0.07383
+Precision: 0.05411
+Recall: 0.15198
diff --git a/train/classification_bert_complete_desc_unit/classification_prediction/predict.py b/train/classification_bert_complete_desc_unit/classification_prediction/predict.py
index b645867..38f581f 100644
--- a/train/classification_bert_complete_desc_unit/classification_prediction/predict.py
+++ b/train/classification_bert_complete_desc_unit/classification_prediction/predict.py
@@ -80,8 +80,9 @@ def process_df_to_dict(df, mdm_list):
 def create_dataset(fold, mdm_list):
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     test_df = pd.read_csv(data_path, skipinitialspace=True)
+    # uncomment for mdm
     # we only use the mdm subset
-    test_df = test_df[test_df['MDM']].reset_index(drop=True)
+    # test_df = test_df[test_df['MDM']].reset_index(drop=True)
 
     test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
 
@@ -237,6 +238,22 @@ def test(fold):
     print(f'Precision: {precision:.5f}', file=f)
     print(f'Recall: {recall:.5f}', file=f)
 
+    # export result
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    # uncomment to restrict to the MDM subset (must match create_dataset above)
+    # df = df[df['MDM']].reset_index(drop=True)
+
+    label_list = [id2label[id] for id in pred_labels]
+    df_out = pd.DataFrame({
+        'class_prediction': pd.Series(label_list)
+    })
+    df = pd.concat([df, df_out], axis=1)
+
+    # save the classification output
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
 
 # %%
 # reset file before writing to it
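With the MDM filter commented out, predictions now cover every test row, which plausibly explains the sharp drop in the headline numbers in output.txt: the metrics are computed over many non-MDM rows whose targets are not in the label vocabulary. The MDM-only accuracy can still be recovered from an exported result file, roughly along these lines (a sketch; column names taken from the scripts above):

import pandas as pd

result = pd.read_csv('exports/result_group_1.csv')
mdm_only = result[result['MDM']].reset_index(drop=True)
correct = (mdm_only['thing'] + mdm_only['property']) == mdm_only['class_prediction']
print('MDM-only accuracy:', correct.mean())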
diff --git a/train/classification_bert_complete_desc_unit/train.py b/train/classification_bert_complete_desc_unit/train.py
index 654790a..c51388e 100644
--- a/train/classification_bert_complete_desc_unit/train.py
+++ b/train/classification_bert_complete_desc_unit/train.py
@@ -177,7 +177,7 @@ def train(fold):
     logging_strategy="epoch",
     # save_strategy="epoch",
     load_best_model_at_end=False,
-    learning_rate=1e-5,
+    learning_rate=1e-4,
     per_device_train_batch_size=64,
     per_device_eval_batch_size=64,
     auto_find_batch_size=False,
diff --git a/train/frozen_t5_encoder/train_decoder.py b/train/frozen_t5_encoder/train_decoder.py
index 41c3a39..404e6ac 100644
--- a/train/frozen_t5_encoder/train_decoder.py
+++ b/train/frozen_t5_encoder/train_decoder.py
@@ -202,7 +202,7 @@ def train(fold):
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
diff --git a/train/modified_t5_decoder_12_layers/.gitignore b/train/mapping_t5-base_desc/.gitignore
similarity index 100%
rename from train/modified_t5_decoder_12_layers/.gitignore
rename to train/mapping_t5-base_desc/.gitignore
diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/.gitignore b/train/mapping_t5-base_desc/mapping_prediction/.gitignore
similarity index 100%
rename from train/modified_t5_decoder_12_layers/mapping_prediction/.gitignore
rename to train/mapping_t5-base_desc/mapping_prediction/.gitignore
diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/inference.py b/train/mapping_t5-base_desc/mapping_prediction/inference.py
similarity index 98%
rename from train/modified_t5_decoder_12_layers/mapping_prediction/inference.py
rename to train/mapping_t5-base_desc/mapping_prediction/inference.py
index 9ea9c77..4e2b72f 100644
--- a/train/modified_t5_decoder_12_layers/mapping_prediction/inference.py
+++ b/train/mapping_t5-base_desc/mapping_prediction/inference.py
@@ -76,7 +76,7 @@ class Inference():
             text_target=target,
             max_length=max_length,
             return_tensors="pt",
-            padding="max_length",
+            padding='max_length',
             truncation=True,
         )
         return model_inputs
@@ -100,7 +100,7 @@ class Inference():
     def generate(self):
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
         MAX_GENERATE_LENGTH = 128
 
         pred_generations = []
diff --git a/train/mapping_t5-base_desc/mapping_prediction/output.txt b/train/mapping_t5-base_desc/mapping_prediction/output.txt
new file mode 100644
index 0000000..c7910ec
--- /dev/null
+++ b/train/mapping_t5-base_desc/mapping_prediction/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9536204448651207
+Accuracy for fold 2: 0.8845794392523364
+Accuracy for fold 3: 0.9618473895582329
+Accuracy for fold 4: 0.9576593720266413
+Accuracy for fold 5: 0.8928080622995878
diff --git a/train/mapping_t5-base_desc/mapping_prediction/output_with_abbreviation.txt b/train/mapping_t5-base_desc/mapping_prediction/output_with_abbreviation.txt
new file mode 100644
index 0000000..3bf5e42
--- /dev/null
+++ b/train/mapping_t5-base_desc/mapping_prediction/output_with_abbreviation.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9588263132986276
+Accuracy for fold 2: 0.9182242990654206
+Accuracy for fold 3: 0.9633534136546185
+Accuracy for fold 4: 0.9809705042816366
+Accuracy for fold 5: 0.8891433806688044
diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py b/train/mapping_t5-base_desc/mapping_prediction/predict.py
similarity index 98%
rename from train/modified_t5_decoder_9_layers/mapping_prediction/predict.py
rename to train/mapping_t5-base_desc/mapping_prediction/predict.py
index 29e45f8..198cc34 100644
--- a/train/modified_t5_decoder_9_layers/mapping_prediction/predict.py
+++ b/train/mapping_t5-base_desc/mapping_prediction/predict.py
@@ -26,7 +26,7 @@ def infer_and_select(fold):
     # run inference
     # checkpoint
     # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
     # Use glob to find matching paths
     # path is usually checkpoint_fold_1/checkpoint-
     # we are guaranteed to save only 1 checkpoint from training
@@ -70,5 +70,6 @@
 with open("output.txt", "w") as f:
     print('', file=f)
 
+# for fold in [1,2,3,4,5]:
 for fold in [1,2,3,4,5]:
     infer_and_select(fold)
diff --git a/train/modified_t5_decoder_3_layers/train_decoder.py b/train/mapping_t5-base_desc/train.py
similarity index 71%
rename from train/modified_t5_decoder_3_layers/train_decoder.py
rename to train/mapping_t5-base_desc/train.py
index d4d3170..3aecb20 100644
--- a/train/modified_t5_decoder_3_layers/train_decoder.py
+++ b/train/mapping_t5-base_desc/train.py
@@ -2,7 +2,6 @@
 # from datasets import load_from_disk
 import os
-import glob
 
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
@@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 
 import torch
-
-from safetensors.torch import load_file
-
-from transformers.models.t5.modeling_t5 import T5Block
 from transformers import (
-    T5Config,
     T5TokenizerFast,
     AutoModelForSeq2SeqLM,
     DataCollatorForSeq2Seq,
     Seq2SeqTrainer,
     EarlyStoppingCallback,
-    Seq2SeqTrainingArguments,
-    T5ForConditionalGeneration,
-    T5Model
+    Seq2SeqTrainingArguments
 )
 import evaluate
 import numpy as np
@@ -35,23 +27,13 @@
 from datasets import Dataset, DatasetDict
 
 torch.set_float32_matmul_precision('high')
-
-
-# %%
-
-# model_checkpoint = "t5-small"
-# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-# model.config
-
-# %%
 # outputs a list of dictionaries
 def process_df_to_dict(df):
     output_list = []
     for _, row in df.iterrows():
         desc = f"{row['tag_description']}"
-        unit = f"{row['unit']}"
         element = {
-            'input' : f"{desc}{unit}",
+            'input' : f"{desc}",
             'output': f"{row['thing']}{row['property']}",
         }
         output_list.append(element)
@@ -77,11 +59,12 @@ def create_split_dataset(fold):
 # function to perform training for a given fold
 def train(fold):
-    save_path = f'checkpoint_fold_{fold}b'
+    save_path = f'checkpoint_fold_{fold}'
     split_datasets = create_split_dataset(fold)
 
     # prepare tokenizer
-    model_checkpoint = "t5-small"
+
+    model_checkpoint = "t5-base"
     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
     # Define additional special tokens
     additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
@@ -101,7 +84,7 @@ def train(fold):
         text_target=target,
         max_length=max_length,
         truncation=True,
-        padding="max_length"
+        padding=True
     )
     return model_inputs
@@ -119,52 +102,10 @@ def train(fold):
     # device_map set to auto to force it to load contiguous weights
     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
-    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
-    # # Use glob to find matching paths
-    # # path is usually checkpoint_fold_1/checkpoint-
-    # # we are guaranteed to save only 1 checkpoint from training
-    # pattern = 'checkpoint-*'
-    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
-    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
-    # # Load the checkpoint
-    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
-    # checkpoint = load_file(checkpoint_path)
-    # # Filter out weights related to the classification head
-    # # given name format: t5.encoder.embed_tokens.weight
-    # # we want: encoder.embed.tokens.weight
-    # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
-
-
-    pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-
-    # Access the decoder stack
-    # config = T5Config("t5-small")
-
-    config = pretrained_model.config
-    config.num_layers = 6
-    config.num_decoder_layers = 3 # set new decoder layer count
-
-    model = T5ForConditionalGeneration(config)
-
-    model.shared = pretrained_model.shared
-    model.encoder = pretrained_model.encoder
-
-    pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
-    for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]):
-        model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
-
-
-    # print number of decoder blocks
-    print(f'Number of decoder blocks: {len(model.decoder.block)}')
-    print(f'num_layers: {model.config.num_layers}')
-    print(f'num_decoder_layers: {model.config.num_decoder_layers}')
-
-
-    # change the token embedding size to match the shape
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
     model.resize_token_embeddings(len(tokenizer))
-
-
     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
 
     metric = evaluate.load("sacrebleu")
@@ -199,7 +140,7 @@ def train(fold):
     # Generation Config
     # from transformers import GenerationConfig
     gen_config = model.generation_config
-    gen_config.max_length = 128
+    gen_config.max_length = 64
 
     # compile
     # model = torch.compile(model, backend="inductor", dynamic=True)
@@ -222,7 +163,7 @@ def train(fold):
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
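The switch from padding="max_length" to padding=True means sequences are no longer padded to a fixed maximum: the tokenizer pads each map-batch only to its longest member, and DataCollatorForSeq2Seq then re-pads per training batch, filling label positions with -100 so they are ignored by the loss. A small self-contained sketch of that behaviour (illustrative strings, not repo data):

from transformers import T5TokenizerFast, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# two toy examples of different lengths
features = [
    tokenizer("short desc", text_target="ThingAProperty"),
    tokenizer("a much longer tag description with units", text_target="ThingB"),
]
batch = collator(features)
# tensors are padded only to the longest sequence in this batch;
# padded label positions hold -100 and do not contribute to the loss
print(batch["input_ids"].shape, batch["labels"].shape)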
diff --git a/train/mapping_t5-base_desc_unit/.gitignore b/train/mapping_t5-base_desc_unit/.gitignore
new file mode 100644
index 0000000..2e7f3f7
--- /dev/null
+++ b/train/mapping_t5-base_desc_unit/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log/
diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/.gitignore b/train/mapping_t5-base_desc_unit/mapping_prediction/.gitignore
similarity index 100%
rename from train/modified_t5_decoder_3_layers/mapping_prediction/.gitignore
rename to train/mapping_t5-base_desc_unit/mapping_prediction/.gitignore
diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/inference.py b/train/mapping_t5-base_desc_unit/mapping_prediction/inference.py
similarity index 98%
rename from train/modified_t5_decoder_3_layers/mapping_prediction/inference.py
rename to train/mapping_t5-base_desc_unit/mapping_prediction/inference.py
index 9ea9c77..4e2b72f 100644
--- a/train/modified_t5_decoder_3_layers/mapping_prediction/inference.py
+++ b/train/mapping_t5-base_desc_unit/mapping_prediction/inference.py
@@ -76,7 +76,7 @@ class Inference():
             text_target=target,
             max_length=max_length,
             return_tensors="pt",
-            padding="max_length",
+            padding='max_length',
             truncation=True,
         )
         return model_inputs
@@ -100,7 +100,7 @@ class Inference():
     def generate(self):
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
         MAX_GENERATE_LENGTH = 128
 
         pred_generations = []
diff --git a/train/mapping_t5-base_desc_unit/mapping_prediction/output.txt b/train/mapping_t5-base_desc_unit/mapping_prediction/output.txt
new file mode 100644
index 0000000..15a3adb
--- /dev/null
+++ b/train/mapping_t5-base_desc_unit/mapping_prediction/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9697113109323237
+Accuracy for fold 2: 0.9
+Accuracy for fold 3: 0.9613453815261044
+Accuracy for fold 4: 0.9686013320647003
+Accuracy for fold 5: 0.8932661475034357
diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/predict.py b/train/mapping_t5-base_desc_unit/mapping_prediction/predict.py
similarity index 97%
rename from train/modified_t5_decoder_12_layers/mapping_prediction/predict.py
rename to train/mapping_t5-base_desc_unit/mapping_prediction/predict.py
index 29e45f8..afc77e8 100644
--- a/train/modified_t5_decoder_12_layers/mapping_prediction/predict.py
+++ b/train/mapping_t5-base_desc_unit/mapping_prediction/predict.py
@@ -6,13 +6,14 @@ from inference import Inference
 
 checkpoint_directory = '../'
 
-BATCH_SIZE = 512
+BATCH_SIZE = 128
 
 def infer_and_select(fold):
     print(f"Inference for fold {fold}")
     # import test data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
+    # uncomment for mdm only
     df = df[df['MDM']].reset_index(drop=True)
 
     # get target data
@@ -26,7 +27,7 @@ def infer_and_select(fold):
     # run inference
     # checkpoint
     # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
     # Use glob to find matching paths
     # path is usually checkpoint_fold_1/checkpoint-
     # we are guaranteed to save only 1 checkpoint from training
diff --git a/train/modified_t5_decoder_12_layers/train_decoder.py b/train/mapping_t5-base_desc_unit/train.py
similarity index 72%
rename from train/modified_t5_decoder_12_layers/train_decoder.py
rename to train/mapping_t5-base_desc_unit/train.py
index c1fd98b..de3d84c 100644
--- a/train/modified_t5_decoder_12_layers/train_decoder.py
+++ b/train/mapping_t5-base_desc_unit/train.py
@@ -2,7 +2,6 @@
 # from datasets import load_from_disk
 import os
-import glob
 
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
@@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 
 import torch
-
-from safetensors.torch import load_file
-
-from transformers.models.t5.modeling_t5 import T5Block
 from transformers import (
-    T5Config,
     T5TokenizerFast,
     AutoModelForSeq2SeqLM,
     DataCollatorForSeq2Seq,
     Seq2SeqTrainer,
     EarlyStoppingCallback,
-    Seq2SeqTrainingArguments,
-    T5ForConditionalGeneration,
-    T5Model
+    Seq2SeqTrainingArguments
)
 import evaluate
 import numpy as np
@@ -35,15 +27,6 @@
 from datasets import Dataset, DatasetDict
 
 torch.set_float32_matmul_precision('high')
-
-
-# %%
-
-# model_checkpoint = "t5-small"
-# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-# model.config
-
-# %%
 # outputs a list of dictionaries
 def process_df_to_dict(df):
     output_list = []
@@ -77,11 +60,12 @@ def create_split_dataset(fold):
 # function to perform training for a given fold
 def train(fold):
-    save_path = f'checkpoint_fold_{fold}b'
+    save_path = f'checkpoint_fold_{fold}'
     split_datasets = create_split_dataset(fold)
 
     # prepare tokenizer
-    model_checkpoint = "t5-small"
+
+    model_checkpoint = "t5-base"
     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
     # Define additional special tokens
     additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
@@ -101,7 +85,7 @@ def train(fold):
         text_target=target,
         max_length=max_length,
         truncation=True,
-        padding="max_length"
+        padding=True
     )
     return model_inputs
@@ -119,52 +103,10 @@ def train(fold):
     # device_map set to auto to force it to load contiguous weights
     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
-    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
-    # # Use glob to find matching paths
-    # # path is usually checkpoint_fold_1/checkpoint-
-    # # we are guaranteed to save only 1 checkpoint from training
-    # pattern = 'checkpoint-*'
-    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
-    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
-    # # Load the checkpoint
-    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
-    # checkpoint = load_file(checkpoint_path)
-    # # Filter out weights related to the classification head
-    # # given name format: t5.encoder.embed_tokens.weight
-    # # we want: encoder.embed.tokens.weight
-    # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
-
-
-    pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-
-    # Access the decoder stack
-    # config = T5Config("t5-small")
-
-    config = pretrained_model.config
-    config.num_layers = 6
-    config.num_decoder_layers = 12 # set new decoder layer count
-
-    model = T5ForConditionalGeneration(config)
-
-    model.shared = pretrained_model.shared
-    model.encoder = pretrained_model.encoder
-
-    pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
-    for i, layer in enumerate(pretrained_decoder_weights):
-        model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
-
-
-    # print number of decoder blocks
-    print(f'Number of decoder blocks: {len(model.decoder.block)}')
-    print(f'num_layers: {model.config.num_layers}')
-    print(f'num_decoder_layers: {model.config.num_decoder_layers}')
-
-
-    # change the token embedding size to match the shape
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
     model.resize_token_embeddings(len(tokenizer))
-
-
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
 
     metric = evaluate.load("sacrebleu")
@@ -199,10 +141,11 @@ def train(fold):
     # Generation Config
     # from transformers import GenerationConfig
     gen_config = model.generation_config
-    gen_config.max_length = 128
+    gen_config.max_length = 64
 
     # compile
     # model = torch.compile(model, backend="inductor", dynamic=True)
+    # model = torch.compile(model)
 
     # Trainer
@@ -210,10 +153,10 @@ def train(fold):
     args = Seq2SeqTrainingArguments(
         f"{save_path}",
         # eval_strategy="epoch",
+        save_strategy="epoch",
         eval_strategy="no",
         logging_dir="tensorboard-log",
         logging_strategy="epoch",
-        # save_strategy="epoch",
         load_best_model_at_end=False,
         learning_rate=1e-3,
         per_device_train_batch_size=64,
@@ -222,12 +165,13 @@
         ddp_find_unused_parameters=False,
         weight_decay=0.01,
         save_total_limit=1,
-        num_train_epochs=40,
+        num_train_epochs=80,
         predict_with_generate=True,
         bf16=True,
         push_to_hub=False,
         generation_config=gen_config,
         remove_unused_columns=False,
+        warmup_steps=400
     )
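The new warmup_steps=400 ramps the learning rate linearly from 0 to the peak over the first 400 optimizer steps; with the Trainer's default "linear" scheduler, a linear decay to 0 follows. A quick sketch of the resulting schedule (the numbers are illustrative, not from the repo):

# linear warmup then linear decay, as in the default HF scheduler
def lr_at_step(step, peak_lr=1e-3, warmup_steps=400, total_steps=10_000):
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    return peak_lr * max(0.0, (total_steps - step) / (total_steps - warmup_steps))

print(lr_at_step(200))   # halfway through warmup -> 5e-4
print(lr_at_step(400))   # peak -> 1e-3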
diff --git a/train/mapping_t5_1e4/.gitignore b/train/mapping_t5_1e4/.gitignore
new file mode 100644
index 0000000..2e7f3f7
--- /dev/null
+++ b/train/mapping_t5_1e4/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log/
diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/.gitignore b/train/mapping_t5_1e4/mapping_prediction/.gitignore
similarity index 100%
rename from train/modified_t5_decoder_9_layers/mapping_prediction/.gitignore
rename to train/mapping_t5_1e4/mapping_prediction/.gitignore
diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/inference.py b/train/mapping_t5_1e4/mapping_prediction/inference.py
similarity index 98%
rename from train/modified_t5_decoder_9_layers/mapping_prediction/inference.py
rename to train/mapping_t5_1e4/mapping_prediction/inference.py
index 9ea9c77..4e2b72f 100644
--- a/train/modified_t5_decoder_9_layers/mapping_prediction/inference.py
+++ b/train/mapping_t5_1e4/mapping_prediction/inference.py
@@ -76,7 +76,7 @@ class Inference():
             text_target=target,
             max_length=max_length,
             return_tensors="pt",
-            padding="max_length",
+            padding='max_length',
             truncation=True,
         )
         return model_inputs
@@ -100,7 +100,7 @@ class Inference():
     def generate(self):
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
         MAX_GENERATE_LENGTH = 128
 
         pred_generations = []
diff --git a/train/mapping_t5_1e4/mapping_prediction/output.txt b/train/mapping_t5_1e4/mapping_prediction/output.txt
new file mode 100644
index 0000000..f9ce777
--- /dev/null
+++ b/train/mapping_t5_1e4/mapping_prediction/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.934690014197823
+Accuracy for fold 2: 0.9023364485981309
+Accuracy for fold 3: 0.9643574297188755
+Accuracy for fold 4: 0.9700285442435775
+Accuracy for fold 5: 0.8941823179111315
diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/predict.py b/train/mapping_t5_1e4/mapping_prediction/predict.py
similarity index 99%
rename from train/modified_t5_decoder_3_layers/mapping_prediction/predict.py
rename to train/mapping_t5_1e4/mapping_prediction/predict.py
index 29e45f8..76212fa 100644
--- a/train/modified_t5_decoder_3_layers/mapping_prediction/predict.py
+++ b/train/mapping_t5_1e4/mapping_prediction/predict.py
@@ -26,7 +26,7 @@ def infer_and_select(fold):
     # run inference
     # checkpoint
     # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
     # Use glob to find matching paths
     # path is usually checkpoint_fold_1/checkpoint-
     # we are guaranteed to save only 1 checkpoint from training
diff --git a/train/modified_t5_decoder_9_layers/train_decoder.py b/train/mapping_t5_1e4/train.py
similarity index 73%
rename from train/modified_t5_decoder_9_layers/train_decoder.py
rename to train/mapping_t5_1e4/train.py
index 7969fa2..30091ae 100644
--- a/train/modified_t5_decoder_9_layers/train_decoder.py
+++ b/train/mapping_t5_1e4/train.py
@@ -2,7 +2,6 @@
 # from datasets import load_from_disk
 import os
-import glob
 
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
@@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 
 import torch
-
-from safetensors.torch import load_file
-
-from transformers.models.t5.modeling_t5 import T5Block
 from transformers import (
-    T5Config,
     T5TokenizerFast,
     AutoModelForSeq2SeqLM,
     DataCollatorForSeq2Seq,
     Seq2SeqTrainer,
     EarlyStoppingCallback,
-    Seq2SeqTrainingArguments,
-    T5ForConditionalGeneration,
-    T5Model
+    Seq2SeqTrainingArguments
)
 import evaluate
 import numpy as np
@@ -35,15 +27,6 @@
 from datasets import Dataset, DatasetDict
 
 torch.set_float32_matmul_precision('high')
-
-
-# %%
-
-# model_checkpoint = "t5-small"
-# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-# model.config
-
-# %%
 # outputs a list of dictionaries
 def process_df_to_dict(df):
     output_list = []
@@ -77,10 +60,11 @@ def create_split_dataset(fold):
 # function to perform training for a given fold
 def train(fold):
-    save_path = f'checkpoint_fold_{fold}b'
+    save_path = f'checkpoint_fold_{fold}'
     split_datasets = create_split_dataset(fold)
 
     # prepare tokenizer
+
     model_checkpoint = "t5-small"
     tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
     # Define additional special tokens
@@ -101,7 +85,7 @@ def train(fold):
         text_target=target,
         max_length=max_length,
         truncation=True,
-        padding="max_length"
+        padding=True
     )
     return model_inputs
@@ -119,52 +103,10 @@ def train(fold):
     # device_map set to auto to force it to load contiguous weights
     # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
-    # directory = os.path.join(".", f'checkpoint_fold_{fold}a')
-    # # Use glob to find matching paths
-    # # path is usually checkpoint_fold_1/checkpoint-
-    # # we are guaranteed to save only 1 checkpoint from training
-    # pattern = 'checkpoint-*'
-    # prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
-    # # t5_classify = T5Model.from_pretrained(prev_checkpoint)
-    # # Load the checkpoint
-    # checkpoint_path = f"{prev_checkpoint}/model.safetensors"
-    # checkpoint = load_file(checkpoint_path)
-    # # Filter out weights related to the classification head
-    # # given name format: t5.encoder.embed_tokens.weight
-    # # we want: encoder.embed.tokens.weight
-    # t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
-
-
-    pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
-
-    # Access the decoder stack
-    # config = T5Config("t5-small")
-
-    config = pretrained_model.config
-    config.num_layers = 6
-    config.num_decoder_layers = 9 # set new decoder layer count
-
-    model = T5ForConditionalGeneration(config)
-
-    model.shared = pretrained_model.shared
-    model.encoder = pretrained_model.encoder
-
-    pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
-    for i, layer in enumerate(pretrained_decoder_weights):
-        model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
-
-
-    # print number of decoder blocks
-    print(f'Number of decoder blocks: {len(model.decoder.block)}')
-    print(f'num_layers: {model.config.num_layers}')
-    print(f'num_decoder_layers: {model.config.num_decoder_layers}')
-
-
-    # change the token embedding size to match the shape
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
     model.resize_token_embeddings(len(tokenizer))
-
-
     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
 
     metric = evaluate.load("sacrebleu")
@@ -199,7 +141,7 @@ def train(fold):
     # Generation Config
     # from transformers import GenerationConfig
     gen_config = model.generation_config
-    gen_config.max_length = 128
+    gen_config.max_length = 64
 
     # compile
     # model = torch.compile(model, backend="inductor", dynamic=True)
@@ -215,14 +157,14 @@ def train(fold):
     logging_strategy="epoch",
     # save_strategy="epoch",
     load_best_model_at_end=False,
-    learning_rate=1e-3,
+    learning_rate=1e-4,
     per_device_train_batch_size=64,
     per_device_eval_batch_size=64,
     auto_find_batch_size=False,
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
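Dropping gen_config.max_length from 128 to 64 roughly halves worst-case decode time, and is safe only as long as no tokenized thing+property target exceeds 64 tokens. A quick check over a loaded training frame (a sketch, assuming `tokenizer` and `df` as in the training scripts above):

# rough check that 64 generated tokens cover the longest thing+property target
target_lengths = [
    len(tokenizer(f"{row['thing']}{row['property']}")["input_ids"])
    for _, row in df.iterrows()
]
print("longest target:", max(target_lengths), "tokens (should stay below 64)")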
diff --git a/train/mapping_t5_complete_desc/mapping_prediction/output.txt b/train/mapping_t5_complete_desc/mapping_prediction/output.txt
index ce5e3df..9506f38 100644
--- a/train/mapping_t5_complete_desc/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc/mapping_prediction/output.txt
@@ -1,6 +1,6 @@
 
-Accuracy for fold 1: 0.9455750118315192
-Accuracy for fold 2: 0.8864485981308411
-Accuracy for fold 3: 0.9558232931726908
-Accuracy for fold 4: 0.9686013320647003
-Accuracy for fold 5: 0.896930829134219
+Accuracy for fold 1: 0.9427354472314246
+Accuracy for fold 2: 0.8981308411214953
+Accuracy for fold 3: 0.9588353413654619
+Accuracy for fold 4: 0.9633682207421503
+Accuracy for fold 5: 0.8928080622995878
diff --git a/train/mapping_t5_complete_desc/train.py b/train/mapping_t5_complete_desc/train.py
index b44121e..6ad492f 100644
--- a/train/mapping_t5_complete_desc/train.py
+++ b/train/mapping_t5_complete_desc/train.py
@@ -157,13 +157,13 @@ def train(fold):
     # save_strategy="epoch",
     load_best_model_at_end=False,
     learning_rate=1e-3,
-    per_device_train_batch_size=128,
-    per_device_eval_batch_size=128,
+    per_device_train_batch_size=64,
+    per_device_eval_batch_size=64,
     auto_find_batch_size=False,
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
diff --git a/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt b/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt
index 720068d..e6c556a 100644
--- a/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt
@@ -1,6 +1,6 @@
 
-Accuracy for fold 1: 0.9522006625650734
-Accuracy for fold 2: 0.9093457943925234
-Accuracy for fold 3: 0.9678714859437751
-Accuracy for fold 4: 0.9814462416745956
-Accuracy for fold 5: 0.890975721484196
+Accuracy for fold 1: 0.9578797917652626
+Accuracy for fold 2: 0.9088785046728972
+Accuracy for fold 3: 0.9673694779116466
+Accuracy for fold 4: 0.9785918173168411
+Accuracy for fold 5: 0.8918918918918919
diff --git a/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py b/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py
index 76212fa..00af9ec 100644
--- a/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py
+++ b/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py
@@ -13,7 +13,8 @@ def infer_and_select(fold):
     # import test data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
-    df = df[df['MDM']].reset_index(drop=True)
+    # note: uncomment this to restrict evaluation to the MDM subset
+    # df = df[df['MDM']].reset_index(drop=True)
 
     # get target data
     data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
diff --git a/train/mapping_t5_complete_desc_unit/train.py b/train/mapping_t5_complete_desc_unit/train.py
index ba77309..c49189d 100644
--- a/train/mapping_t5_complete_desc_unit/train.py
+++ b/train/mapping_t5_complete_desc_unit/train.py
@@ -164,7 +164,7 @@ def train(fold):
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
diff --git a/train/modified_t5_decoder_12_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_12_layers/mapping_prediction/output.txt
deleted file mode 100644
index 877665d..0000000
--- a/train/modified_t5_decoder_12_layers/mapping_prediction/output.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-
-Accuracy for fold 1: 0.9403691433980123
-Accuracy for fold 2: 0.9046728971962616
-Accuracy for fold 3: 0.9678714859437751
-Accuracy for fold 4: 0.9695528068506185
-Accuracy for fold 5: 0.902427851580394
diff --git a/train/modified_t5_decoder_1_layers/train_decoder.py b/train/modified_t5_decoder_1_layers/train_decoder.py
index 7780901..172f0e9 100644
--- a/train/modified_t5_decoder_1_layers/train_decoder.py
+++ b/train/modified_t5_decoder_1_layers/train_decoder.py
@@ -222,7 +222,7 @@ def train(fold):
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
diff --git a/train/modified_t5_decoder_2_layers/train_decoder.py b/train/modified_t5_decoder_2_layers/train_decoder.py
index fa96896..bbc7015 100644
--- a/train/modified_t5_decoder_2_layers/train_decoder.py
+++ b/train/modified_t5_decoder_2_layers/train_decoder.py
@@ -222,7 +222,7 @@ def train(fold):
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
diff --git a/train/modified_t5_decoder_3_layers/.gitignore b/train/modified_t5_decoder_3_layers/.gitignore
deleted file mode 100644
index d943a39..0000000
--- a/train/modified_t5_decoder_3_layers/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-checkpoint*
-tensorboard-log
\ No newline at end of file
diff --git a/train/modified_t5_decoder_3_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_3_layers/mapping_prediction/output.txt
deleted file mode 100644
index 539366c..0000000
--- a/train/modified_t5_decoder_3_layers/mapping_prediction/output.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-
-Accuracy for fold 1: 0.9427354472314246
-Accuracy for fold 2: 0.9098130841121496
-Accuracy for fold 3: 0.964859437751004
-Accuracy for fold 4: 0.9719314938154139
-Accuracy for fold 5: 0.9070087036188731
diff --git a/train/modified_t5_decoder_4_layers/train_decoder.py b/train/modified_t5_decoder_4_layers/train_decoder.py
index 155cdfd..33f305a 100644
--- a/train/modified_t5_decoder_4_layers/train_decoder.py
+++ b/train/modified_t5_decoder_4_layers/train_decoder.py
@@ -222,7 +222,7 @@ def train(fold):
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
diff --git a/train/modified_t5_decoder_8_layers/train_decoder.py b/train/modified_t5_decoder_8_layers/train_decoder.py
index 7f4e233..f977e81 100644
--- a/train/modified_t5_decoder_8_layers/train_decoder.py
+++ b/train/modified_t5_decoder_8_layers/train_decoder.py
@@ -222,7 +222,7 @@ def train(fold):
     ddp_find_unused_parameters=False,
     weight_decay=0.01,
     save_total_limit=1,
-    num_train_epochs=40,
+    num_train_epochs=80,
     predict_with_generate=True,
     bf16=True,
     push_to_hub=False,
diff --git a/train/modified_t5_decoder_9_layers/.gitignore b/train/modified_t5_decoder_9_layers/.gitignore
deleted file mode 100644
index d943a39..0000000
--- a/train/modified_t5_decoder_9_layers/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-checkpoint*
-tensorboard-log
\ No newline at end of file
diff --git a/train/modified_t5_decoder_9_layers/mapping_prediction/output.txt b/train/modified_t5_decoder_9_layers/mapping_prediction/output.txt
deleted file mode 100644
index 37ce896..0000000
--- a/train/modified_t5_decoder_9_layers/mapping_prediction/output.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-
-Accuracy for fold 1: 0.9441552295314718
-Accuracy for fold 2: 0.9121495327102803
-Accuracy for fold 3: 0.963855421686747
-Accuracy for fold 4: 0.9752616555661275
-Accuracy for fold 5: 0.907924874026569
diff --git a/train/train.bash b/train/train.bash
index f95baab..53347d4 100644
--- a/train/train.bash
+++ b/train/train.bash
@@ -1,28 +1,14 @@
 #!/bin/bash
 
-cd hybrid_t5_complete_desc_unit
-micromamba run -n hug accelerate launch train_encoder.py
-micromamba run -n hug accelerate launch train_decoder.py
+cd mapping_t5-base_desc
+micromamba run -n hug accelerate launch train.py
 cd ..
 
-cd hybrid_t5_pattern_desc_unit
-micromamba run -n hug accelerate launch train_encoder.py
-micromamba run -n hug accelerate launch train_decoder.py
+cd mapping_t5-base_desc_unit
+micromamba run -n hug accelerate launch train.py
 cd ..
 
-# cd classification_bert_complete_desc
-# micromamba run -n hug accelerate launch train.py
-# cd ..
-
-# cd classification_bert_complete_desc_unit
-# micromamba run -n hug accelerate launch train.py
-# cd ..
-
-# cd classification_bert_complete_desc_unit_name
-# micromamba run -n hug accelerate launch train.py
-# cd ..
-
 # cd mapping_t5_complete_desc
 # micromamba run -n hug accelerate launch train.py
 # cd ..
@@ -31,6 +17,31 @@
 # micromamba run -n hug accelerate launch train.py
 # cd ..
 #
-# cd mapping_t5_complete_name_desc_unit
+# cd frozen_t5_encoder
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_1_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_2_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_4_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd modified_t5_decoder_8_layers
+# micromamba run -n hug accelerate launch train_decoder.py
+# cd ..
+#
+# cd classification_bert_complete_desc
 # micromamba run -n hug accelerate launch train.py
 # cd ..
+#
+# cd classification_bert_complete_desc_unit
+# micromamba run -n hug accelerate launch train.py
+# cd ..